
Commit 2d85159

Refactor PR and handle some corner cases

1 parent: 26dce26

4 files changed (+88, -48 lines)


docs/streaming-programming-guide.md

Lines changed: 1 addition & 1 deletion
@@ -653,7 +653,7 @@ methods for creating DStreams from files and Akka actors as input sources.
 </div>
 </div>

-Spark Streaming will monitor the directory `dataDirectory` and process any files created in that directory (files written in nested directories not supported). It can also monitor files in subdirectories by setting the optional `depth` parameter to a value greater than 1. Note that
+Spark Streaming will monitor the directory `dataDirectory` and process any files created in that directory. By default it does not search nested directories; set the optional `depth` parameter to a value greater than 1 to monitor files in subdirectories. Note that
 + The files must have the same data format.
 + The files must be created in the `dataDirectory` by atomically *moving* or *renaming* them into
   the data directory.
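
For reference, a minimal sketch of how a nested-directory stream might be created from Scala, using the four-argument `fileStream` signature exercised in the test suite below (the application name, master, and directory path are illustrative):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NestedDirStream {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NestedDirStream").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(2))
    // depth = 2: monitor /data/logs plus one level of subdirectories.
    val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](
      "/data/logs", (x: Path) => true, newFilesOnly = true, 2).map(_._2.toString)
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}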

python/pyspark/streaming/context.py

Lines changed: 4 additions & 6 deletions
@@ -253,18 +253,16 @@ def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_
         return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,
                        UTF8Deserializer())

-    def textFileStream(self, directory):
+    def textFileStream(self, directory, depth=1):
         """
         Create an input stream that monitors a Hadoop-compatible file system
         for new files and reads them as text files. Files must be written to the
         monitored directory by "moving" them from another location within the same
         file system. File names starting with . are ignored.
-        """
-        return textFileStream(self, directory, 1)

-    def textFileStream(self, directory, depth):
-        """
-        Create an input stream that monitor files in subdirectories.
+        @param directory: The directory to monitor
+        @param depth: The max depth to search in the directory. The default
+            value 1 means only searching files in the current directory.
         """
         return DStream(self._jssc.textFileStream(directory, depth), self, UTF8Deserializer())
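
As the diff shows, the Python method simply forwards to the JVM bridge via `self._jssc.textFileStream(directory, depth)`. Here is a hedged sketch of the equivalent call through `JavaStreamingContext` from Scala, assuming only the two-argument overload implied by that bridge call (the directory path is illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.Durations
import org.apache.spark.streaming.api.java.JavaStreamingContext

object TextFileStreamExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TextFileStreamExample").setMaster("local[2]")
    val jssc = new JavaStreamingContext(conf, Durations.seconds(2))
    // depth = 1 (the Python default) searches only the directory itself;
    // a larger value also searches that many levels of subdirectories.
    val lines = jssc.textFileStream("/data/incoming", 3)
    lines.print()
    jssc.start()
    jssc.awaitTermination()
  }
}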

streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala

Lines changed: 52 additions & 29 deletions
@@ -129,6 +129,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
   @transient private var lastNewFileFindingTime = 0L

   @transient private var path_ : Path = null
+  @transient private var directoryDepth_ : Int = -1
   @transient private var fs_ : FileSystem = null

   override def start() { }
@@ -186,48 +187,55 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
       )
       logDebug(s"Getting new files for time $currentTime, " +
         s"ignoring files older than $modTimeIgnoreThreshold")
-      val filter = new PathFilter {
+      val newFileFilter = new PathFilter {
         def accept(path: Path): Boolean = isNewFile(path, currentTime, modTimeIgnoreThreshold)
       }
-      val directoryDepth = fs.getFileStatus(directoryPath).getPath.depth()
+      val rootDirectoryDepth = directoryDepth

-      // Nested directories to find new files.
-      def dfs(status: FileStatus): List[FileStatus] = {
+      // Search nested directories to find new files.
+      def searchFilesRecursively(status: FileStatus, files: mutable.ArrayBuffer[String]): Unit = {
         val path = status.getPath
         if (status.isDir) {
-          val depthFilter = depth + directoryDepth - path.depth()
-          if (depthFilter - 1 >= 0) {
+          // Note: a user may set depth = Int.MaxValue to search all nested directories.
+          if (depth > path.depth() - rootDirectoryDepth) {
             if (lastFoundDirs.contains(path)) {
               if (status.getModificationTime > modTimeIgnoreThreshold) {
-                fs.listStatus(path).toList.flatMap(dfs(_))
-              } else Nil
+                fs.listStatus(path).foreach(searchFilesRecursively(_, files))
+              }
             } else {
               lastFoundDirs += path
-              fs.listStatus(path).toList.flatMap(dfs(_))
+              fs.listStatus(path).foreach(searchFilesRecursively(_, files))
             }
-          } else Nil
+          }
         } else {
-          if (filter.accept(path)) status :: Nil else Nil
+          if (newFileFilter.accept(path)) {
+            files += path.toString
+          }
         }
       }

-      val path = if (lastFoundDirs.isEmpty) Seq(fs.getFileStatus(directoryPath))
-      else {
-        lastFoundDirs.filter { path =>
-          // If the mod time of directory is more than ignore time, no new files in this directory.
-          try {
-            val status = fs.getFileStatus(path)
-            status != null && status.getModificationTime > modTimeIgnoreThreshold
-          } catch {
-            // If the directory don't find, remove the directory from `lastFoundDirs`
-            case e: FileNotFoundException =>
-              lastFoundDirs.remove(path)
-              false
+      val validDirs: Iterable[Path] =
+        if (lastFoundDirs.isEmpty) {
+          Seq(directoryPath)
+        } else {
+          lastFoundDirs.filter { path =>
+            // A directory whose mod time is not newer than the ignore threshold
+            // cannot contain new files.
+            try {
+              val status = fs.getFileStatus(path)
+              status != null && status.getModificationTime > modTimeIgnoreThreshold
+            } catch {
+              // If the directory is not found, remove it from `lastFoundDirs`.
+              case e: FileNotFoundException =>
+                lastFoundDirs.remove(path)
+                false
+            }
           }
         }
-      }.flatMap(fs.listStatus(_)).toSeq

-      val newFiles = path.flatMap(dfs(_)).map(_.getPath.toString).toArray
+      val newFiles = mutable.ArrayBuffer[String]()
+      // List the children of each valid directory and search them recursively.
+      validDirs.flatMap(fs.listStatus(_)).foreach(searchFilesRecursively(_, newFiles))
       val timeTaken = clock.getTimeMillis() - lastNewFileFindingTime
       logInfo("Finding new files took " + timeTaken + " ms")
       logDebug("# cached file times = " + fileToModTime.size)
@@ -238,7 +246,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
           "files in the monitored directory."
         )
       }
-      newFiles
+      newFiles.toArray
     } catch {
       case e: Exception =>
         logWarning("Error finding new files", e)
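
To make the `depth > path.depth() - rootDirectoryDepth` guard concrete, here is a small self-contained sketch (the directory names are made up) that evaluates the same predicate with Hadoop's `Path.depth()`:

import org.apache.hadoop.fs.Path

object DepthGuardCheck {
  def main(args: Array[String]): Unit = {
    val root = new Path("/data")
    val rootDirectoryDepth = root.depth() // 1: one path element under "/"

    // True when `dir` may still be descended into for the given search depth.
    def shouldDescend(dir: Path, depth: Int): Boolean =
      depth > dir.depth() - rootDirectoryDepth

    println(shouldDescend(new Path("/data/a"), 1))              // false: depth 1 stays in /data
    println(shouldDescend(new Path("/data/a"), 2))              // true: depth 2 reaches /data/a
    println(shouldDescend(new Path("/data/a/b"), 2))            // false: two levels down
    println(shouldDescend(new Path("/data/a/b"), Int.MaxValue)) // true: effectively unlimited
  }
}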
@@ -321,17 +329,32 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
   }

   private def directoryPath: Path = {
-    if (path_ == null) path_ = new Path(directory)
+    if (fs_ == null) init()
     path_
   }

+  private def directoryDepth: Int = {
+    if (fs_ == null) init()
+    directoryDepth_
+  }
+
   private def fs: FileSystem = {
-    if (fs_ == null) fs_ = directoryPath.getFileSystem(ssc.sparkContext.hadoopConfiguration)
+    if (fs_ == null) init()
     fs_
   }

-  private def reset() {
+  private def init(): Unit = {
+    val originPath = new Path(directory)
+    fs_ = originPath.getFileSystem(ssc.sparkContext.hadoopConfiguration)
+    // Get the absolute path
+    path_ = fs_.getFileStatus(originPath).getPath
+    directoryDepth_ = path_.depth()
+  }
+
+  private def reset() {
     fs_ = null
+    path_ = null
+    directoryDepth_ = -1
   }

   @throws(classOf[IOException])
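
These accessors follow a lazy-init-and-reset idiom: all three `@transient` fields are (re)computed by a single `init()`, and `reset()` (invoked elsewhere in this class) nulls them so a stale path, depth, or file-system handle is re-resolved on the next access; the same lazy path also covers fields left null after deserialization. A minimal sketch of the idiom in isolation (the `LazyHandle` type and its resolution step are hypothetical):

class LazyHandle(directory: String) {
  @transient private var handle_ : String = null

  private def handle: String = {
    if (handle_ == null) init() // resolve on first use, or after reset()
    handle_
  }

  private def init(): Unit = {
    // Hypothetical resolution step, standing in for getFileStatus/getFileSystem.
    handle_ = "resolved:" + directory
  }

  private def reset(): Unit = {
    handle_ = null // the next access to `handle` re-runs init()
  }
}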

streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala

Lines changed: 31 additions & 12 deletions
@@ -212,6 +212,18 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
     testFileStream(newFilesOnly = false, 3)
   }

+  test("file input stream - newFilesOnly = false and depth is too small") {
+    testFileStream(newFilesOnly = false, 3, 2)
+  }
+
+  test("file input stream - newFilesOnly = true and depth = Int.MaxValue") {
+    testFileStream(newFilesOnly = true, 3, Int.MaxValue)
+  }
+
+  test("file input stream - newFilesOnly = false and depth = Int.MaxValue") {
+    testFileStream(newFilesOnly = false, 3, Int.MaxValue)
+  }
+
   test("multi-thread receiver") {
     // set up the test receiver
     val numThreads = 10
@@ -364,12 +376,16 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
     assert(receiverInputStreams.map(_.id) === Array(0, 1))
   }

-  def testFileStream(newFilesOnly: Boolean, depth: Int = 1) {
-    val testDir: File = null
+  def testFileStream(newFilesOnly: Boolean, depth: Int = 1): Unit = {
+    testFileStream(newFilesOnly, depth, depth)
+  }
+
+  def testFileStream(newFilesOnly: Boolean, createDepth: Int, searchDepth: Int) {
+    val rootDir = Utils.createTempDir()
     try {
       val batchDuration = Seconds(2)
-      var testDir = Utils.createTempDir()
-      for (i <- 2 until depth) {
+      var testDir = rootDir
+      for (i <- 1 until createDepth) {
         testDir = Utils.createTempDir(testDir.toString)
       }
       // Create a file that exists before the StreamingContext is created:
// Create a file that exists before the StreamingContext is created:
@@ -384,8 +400,8 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
       clock.setTime(existingFile.lastModified + batchDuration.milliseconds)
       val batchCounter = new BatchCounter(ssc)
       val fileStream = ssc.fileStream[LongWritable, Text, TextInputFormat](
-        testDir.toString, (x: Path) => true,
-        newFilesOnly = newFilesOnly, depth).map(_._2.toString)
+        rootDir.toString, (x: Path) => true,
+        newFilesOnly = newFilesOnly, searchDepth).map(_._2.toString)
       val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String]]
       val outputStream = new TestOutputStream(fileStream, outputBuffer)
       outputStream.register()
@@ -412,15 +428,18 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
       }

       // Verify that all the files have been read
-      val expectedOutput = if (newFilesOnly) {
-        input.map(_.toString).toSet
-      } else {
-        (Seq(0) ++ input).map(_.toString).toSet
-      }
+      val expectedOutput =
+        if (createDepth > searchDepth) {
+          Set()
+        } else if (newFilesOnly) {
+          input.map(_.toString).toSet
+        } else {
+          (Seq(0) ++ input).map(_.toString).toSet
+        }
       assert(outputBuffer.flatten.toSet === expectedOutput)
     }
   } finally {
-    if (testDir != null) Utils.deleteRecursively(testDir)
+    Utils.deleteRecursively(rootDir)
   }
 }
