Commit ce1dd9c

[SPARK-17165][SQL] FileStreamSource should not track the list of seen files indefinitely
1 parent ba1737c commit ce1dd9c
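
For context, this commit introduces a `maxFileAge` option for file stream sources (default "24h") and moves the existing `maxFilesPerTrigger` parsing into a new FileStreamOptions class. A minimal, illustrative sketch of how these options would be set on a stream (the path and the option values below are made up, not part of the commit):

  // Illustrative only: these options are read by the new FileStreamOptions class below.
  val stream = spark.readStream
    .format("text")
    .option("maxFileAge", "1h")           // files older than 1h, relative to the newest file seen, stop being tracked
    .option("maxFilesPerTrigger", "100")  // must be a positive integer
    .load("/data/incoming")               // hypothetical input directory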

4 files changed: +259 additions, -34 deletions
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import scala.util.Try
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap
+import org.apache.spark.util.Utils
+
+/**
+ * User specified options for file streams.
+ */
+class FileStreamOptions(@transient private val parameters: Map[String, String])
+  extends Logging with Serializable {
+
+  val maxFilesPerTrigger: Option[Int] = parameters.get("maxFilesPerTrigger").map { str =>
+    Try(str.toInt).toOption.filter(_ > 0).getOrElse {
+      throw new IllegalArgumentException(
+        s"Invalid value '$str' for option 'maxFilesPerTrigger', must be a positive integer")
+    }
+  }
+
+  /** Maximum age of a file that can be found in this directory, before it is deleted. */
+  val maxFileAgeMs: Long =
+    Utils.timeStringAsMs(parameters.getOrElse("maxFileAge", "24h"))
+
+  /** Options as specified by the user, in a case-insensitive map, without "path" set. */
+  val optionMapWithoutPath: Map[String, String] =
+    new CaseInsensitiveMap(parameters).filterKeys(_ != "path")
+}
+
+
+object FileStreamOptions {
+
+  def apply(): FileStreamOptions = new FileStreamOptions(Map.empty)
+
+  def apply(paramName: String, paramValue: String): FileStreamOptions = {
+    new FileStreamOptions(Map(paramName -> paramValue))
+  }
+}
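
As a quick illustration of the parsing above (not part of the commit): `maxFileAge` is parsed with `Utils.timeStringAsMs`, so any Spark time string is accepted, while `maxFilesPerTrigger` must be a positive integer. A hypothetical REPL-style sketch, assuming the class above is on the classpath:

  val opts = new FileStreamOptions(Map("maxFileAge" -> "1h", "maxFilesPerTrigger" -> "10"))
  opts.maxFileAgeMs        // 3600000
  opts.maxFilesPerTrigger  // Some(10)
  // new FileStreamOptions(Map("maxFilesPerTrigger" -> "0")) would throw
  // IllegalArgumentException: Invalid value '0' for option 'maxFilesPerTrigger', ...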

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala

Lines changed: 106 additions & 31 deletions
@@ -17,21 +17,18 @@
 
 package org.apache.spark.sql.execution.streaming
 
-import scala.util.Try
+import scala.collection.JavaConverters._
 
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
-import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, DataSource, ListingFileCatalog, LogicalRelation}
+import org.apache.spark.sql.execution.datasources.{DataSource, ListingFileCatalog, LogicalRelation}
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.collection.OpenHashSet
 
 /**
- * A very simple source that reads text files from the given directory as they appear.
- *
- * TODO Clean up the metadata files periodically
+ * A very simple source that reads files from the given directory as they appear.
  */
 class FileStreamSource(
     sparkSession: SparkSession,
@@ -41,36 +38,59 @@ class FileStreamSource(
     metadataPath: String,
     options: Map[String, String]) extends Source with Logging {
 
+  import FileStreamSource._
+
+  private val sourceOptions = new FileStreamOptions(options)
+
   private val fs = new Path(path).getFileSystem(sparkSession.sessionState.newHadoopConf())
   private val qualifiedBasePath = fs.makeQualified(new Path(path)) // can contains glob patterns
-  private val metadataLog = new HDFSMetadataLog[Seq[String]](sparkSession, metadataPath)
+
+  private val metadataLog = new HDFSMetadataLog[Seq[FileEntry]](sparkSession, metadataPath)
+
   private var maxBatchId = metadataLog.getLatest().map(_._1).getOrElse(-1L)
 
   /** Maximum number of new files to be considered in each batch */
-  private val maxFilesPerBatch = getMaxFilesPerBatch()
+  private val maxFilesPerBatch = sourceOptions.maxFilesPerTrigger
+
+  /** A mapping from a file that we have processed to some timestamp it was last modified. */
+  // Visible for testing.
+  val seenFiles = new SeenFilesMap(sourceOptions.maxFileAgeMs)
 
-  private val seenFiles = new OpenHashSet[String]
-  metadataLog.get(None, Some(maxBatchId)).foreach { case (batchId, files) =>
-    files.foreach(seenFiles.add)
+  metadataLog.get(None, Some(maxBatchId)).foreach { case (batchId, entry) =>
+    entry.foreach(seenFiles.add)
+    seenFiles.purge()
   }
 
+  logInfo(s"maxFilesPerBatch = $maxFilesPerBatch, maxFileAge = ${sourceOptions.maxFileAgeMs}")
+
   /**
    * Returns the maximum offset that can be retrieved from the source.
    *
    * `synchronized` on this method is for solving race conditions in tests. In the normal usage,
    * there is no race here, so the cost of `synchronized` should be rare.
    */
   private def fetchMaxOffset(): LongOffset = synchronized {
-    val newFiles = fetchAllFiles().filter(!seenFiles.contains(_))
+    // All the new files found - ignore aged files and files that we have seen.
+    val newFiles = fetchAllFiles().filter(seenFiles.isNewFile)
+
+    // Obey user's setting to limit the number of files in this batch trigger.
     val batchFiles =
       if (maxFilesPerBatch.nonEmpty) newFiles.take(maxFilesPerBatch.get) else newFiles
+
     batchFiles.foreach { file =>
       seenFiles.add(file)
       logDebug(s"New file: $file")
     }
-    logTrace(s"Number of new files = ${newFiles.size})")
-    logTrace(s"Number of files selected for batch = ${batchFiles.size}")
-    logTrace(s"Number of seen files = ${seenFiles.size}")
+    val numPurged = seenFiles.purge()
+
+    logTrace(
+      s"""
+         |Number of new files = ${newFiles.size}
+         |Number of files selected for batch = ${batchFiles.size}
+         |Number of seen files = ${seenFiles.size}
+         |Number of files purged from tracking map = $numPurged
+       """.stripMargin)
+
     if (batchFiles.nonEmpty) {
       maxBatchId += 1
       metadataLog.add(maxBatchId, batchFiles)
@@ -104,22 +124,26 @@ class FileStreamSource(
     val files = metadataLog.get(Some(startId + 1), Some(endId)).flatMap(_._2)
     logInfo(s"Processing ${files.length} files from ${startId + 1}:$endId")
     logTrace(s"Files are:\n\t" + files.mkString("\n\t"))
-    val newOptions = new CaseInsensitiveMap(options).filterKeys(_ != "path")
     val newDataSource =
       DataSource(
         sparkSession,
-        paths = files,
+        paths = files.map(_.path),
         userSpecifiedSchema = Some(schema),
         className = fileFormatClassName,
-        options = newOptions)
+        options = sourceOptions.optionMapWithoutPath)
     Dataset.ofRows(sparkSession, LogicalRelation(newDataSource.resolveRelation()))
   }
 
-  private def fetchAllFiles(): Seq[String] = {
+  /**
+   * Returns a list of files found, sorted by their timestamp.
+   */
+  private def fetchAllFiles(): Seq[FileEntry] = {
     val startTime = System.nanoTime
     val globbedPaths = SparkHadoopUtil.get.globPathIfNecessary(qualifiedBasePath)
     val catalog = new ListingFileCatalog(sparkSession, globbedPaths, options, Some(new StructType))
-    val files = catalog.allFiles().sortBy(_.getModificationTime).map(_.getPath.toUri.toString)
+    val files = catalog.allFiles().sortBy(_.getModificationTime).map { status =>
+      FileEntry(status.getPath.toUri.toString, status.getModificationTime)
+    }
     val endTime = System.nanoTime
     val listingTimeMs = (endTime.toDouble - startTime) / 1000000
     if (listingTimeMs > 2000) {
@@ -132,20 +156,71 @@ class FileStreamSource(
     files
   }
 
-  private def getMaxFilesPerBatch(): Option[Int] = {
-    new CaseInsensitiveMap(options)
-      .get("maxFilesPerTrigger")
-      .map { str =>
-        Try(str.toInt).toOption.filter(_ > 0).getOrElse {
-          throw new IllegalArgumentException(
-            s"Invalid value '$str' for option 'maxFilesPerTrigger', must be a positive integer")
-        }
-      }
-  }
-
   override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.offset == -1)
 
   override def toString: String = s"FileStreamSource[$qualifiedBasePath]"
 
   override def stop() {}
 }
+
+
+object FileStreamSource {
+
+  /** Timestamp for file modification time, in ms since January 1, 1970 UTC. */
+  type Timestamp = Long
+
+  case class FileEntry(path: String, timestamp: Timestamp) extends Serializable
+
+  /**
+   * A custom hash map used to track the list of files seen. This map is not thread-safe.
+   *
+   * To prevent the hash map from growing indefinitely, a purge function is available to
+   * remove files "maxAgeMs" older than the latest file.
+   */
+  class SeenFilesMap(maxAgeMs: Long) {
+    require(maxAgeMs >= 0)
+
+    /** Mapping from file to its timestamp. */
+    private val map = new java.util.HashMap[String, Timestamp]
+
+    private var lastTimestamp: Timestamp = 0L
+
+    private def ageThreshold: Timestamp = lastTimestamp - maxAgeMs
+
+    /** Add a new file to the map. */
+    def add(file: FileEntry): Unit = {
+      map.put(file.path, file.timestamp)
+      if (file.timestamp > lastTimestamp) {
+        lastTimestamp = file.timestamp
+      }
+    }
+
+    /**
+     * Returns true if we should consider this file a new file. The file is only considered "new"
+     * if it is new enough that we are still tracking, and we have not seen it before.
+     */
+    def isNewFile(file: FileEntry): Boolean = {
+      file.timestamp > ageThreshold && !map.containsKey(file.path)
+    }
+
+    /** Removes aged entries and returns the number of files removed. */
+    def purge(): Int = {
+      val iter = map.entrySet().iterator()
+      var count = 0
+      while (iter.hasNext) {
+        val entry = iter.next()
+        if (entry.getValue < lastTimestamp - maxAgeMs) {
+          count += 1
+          iter.remove()
+        }
+      }
+      count
+    }
+
+    def size: Int = map.size()
+
+    def allEntries: Seq[FileEntry] = {
+      map.entrySet().asScala.map(entry => FileEntry(entry.getKey, entry.getValue)).toSeq
+    }
+  }
+}
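
Note that the purge threshold is tied to the timestamp of the newest file seen, not to wall-clock time. A small sketch of the arithmetic with made-up values, mirroring the ageThreshold / purge logic in SeenFilesMap above:

  val maxAgeMs = 24L * 60 * 60 * 1000   // the "24h" default, in milliseconds
  val lastTimestamp = 1471651200000L    // modification time of the newest file seen (example value)
  val ageThreshold = lastTimestamp - maxAgeMs
  // isNewFile: timestamp > ageThreshold and the path has not been seen before
  // purge: entries with timestamp < lastTimestamp - maxAgeMs are dropped from the map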
sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceSuite.scala

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.SparkFunSuite
+
+class FileStreamSourceSuite extends SparkFunSuite {
+
+  import FileStreamSource._
+
+  test("SeenFilesMap") {
+    val map = new SeenFilesMap(maxAgeMs = 10)
+
+    map.add(FileEntry("a", 5))
+    assert(map.size == 1)
+    map.purge()
+    assert(map.size == 1)
+
+    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
+    map.add(FileEntry("b", 15))
+    assert(map.size == 2)
+    map.purge()
+    assert(map.size == 2)
+
+    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
+    map.add(FileEntry("c", 16))
+    assert(map.size == 3)
+    map.purge()
+    assert(map.size == 2)
+
+    // Overriding an existing entry shouldn't change the size
+    map.add(FileEntry("c", 25))
+    assert(map.size == 2)
+
+    // Not a new file because we have seen c before
+    assert(!map.isNewFile(FileEntry("c", 20)))
+
+    // Not a new file because timestamp is too old
+    assert(!map.isNewFile(FileEntry("d", 5)))
+
+    // Finally a new file: never seen and not too old
+    assert(map.isNewFile(FileEntry("e", 20)))
+  }
+
+}

sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala

Lines changed: 37 additions & 3 deletions
@@ -104,12 +104,13 @@ class FileStreamSourceTest extends StreamTest with SharedSQLContext {
   def createFileStream(
       format: String,
       path: String,
-      schema: Option[StructType] = None): DataFrame = {
+      schema: Option[StructType] = None,
+      options: Map[String, String] = Map.empty): DataFrame = {
     val reader =
       if (schema.isDefined) {
-        spark.readStream.format(format).schema(schema.get)
+        spark.readStream.format(format).schema(schema.get).options(options)
       } else {
-        spark.readStream.format(format)
+        spark.readStream.format(format).options(options)
       }
     reader.load(path)
   }
@@ -331,6 +332,39 @@ class FileStreamSourceSuite extends FileStreamSourceTest {
     }
   }
 
+  test("SPARK-17165 should not track the list of seen files indefinitely") {
+    // This test works by:
+    // 1. Create a file
+    // 2. Get it processed
+    // 3. Sleeps for a very short amount of time (larger than maxFileAge)
+    // 4. Add another file (at this point the original file should have been purged)
+    // 5. Test the size of the seenFiles internal data structure
+
+    // Note that if we change maxFileAge to a very large number, the last step should fail.
+    withTempDirs { case (src, tmp) =>
+      val textStream: DataFrame =
+        createFileStream("text", src.getCanonicalPath, options = Map("maxFileAge" -> "5ms"))
+
+      testStream(textStream)(
+        AddTextFileData("a\nb", src, tmp),
+        CheckAnswer("a", "b"),
+
+        // Sleeps longer than 5ms (maxFileAge)
+        AssertOnQuery { _ => Thread.sleep(10); true },
+
+        AddTextFileData("c\nd", src, tmp),
+        CheckAnswer("a", "b", "c", "d"),
+
+        AssertOnQuery("seen files should contain only one entry") { streamExecution =>
+          val source = streamExecution.logicalPlan.collect { case e: StreamingExecutionRelation =>
+            e.source.asInstanceOf[FileStreamSource]
+          }.head
+          source.seenFiles.size == 1
+        }
+      )
+    }
+  }
+
   // =============== JSON file stream tests ================
 
   test("read from json files") {