-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-18726][SQL]resolveRelation for FileFormat DataSource don't need to listFiles twice #17081
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0082b76
6b5454a
f1da0a4
f79f12c
a8c1dea
60fa037
9a73947
850094c
c39eb26
f3332cb
9cadd41
28c8158
92618b3
f6ec4fe
3e495a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -106,10 +106,13 @@ case class DataSource( | |
| * be any further inference in any triggers. | ||
| * | ||
| * @param format the file format object for this DataSource | ||
| * @param fileStatusCache the shared cache for file statuses to speed up listing | ||
| * @return A pair of the data schema (excluding partition columns) and the schema of the partition | ||
| * columns. | ||
| */ | ||
| private def getOrInferFileFormatSchema(format: FileFormat): (StructType, StructType) = { | ||
| private def getOrInferFileFormatSchema( | ||
| format: FileFormat, | ||
| fileStatusCache: FileStatusCache = NoopCache): (StructType, StructType) = { | ||
| // the operations below are expensive therefore try not to do them if we don't need to, e.g., | ||
| // in streaming mode, we have already inferred and registered partition columns, we will | ||
| // never have to materialize the lazy val below | ||
|
|
@@ -122,7 +125,7 @@ case class DataSource( | |
| val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) | ||
| SparkHadoopUtil.get.globPathIfNecessary(qualified) | ||
| }.toArray | ||
| new InMemoryFileIndex(sparkSession, globbedPaths, options, None) | ||
| new InMemoryFileIndex(sparkSession, globbedPaths, options, None, fileStatusCache) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This also impacts the streaming code path. If it is fine to streaming, the code changes look good to me.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have made it local only in the non-streaming FileFormat match case~ |
||
| } | ||
| val partitionSchema = if (partitionColumns.isEmpty) { | ||
| // Try to infer partitioning, because no DataSource in the read path provides the partitioning | ||
|
|
@@ -354,7 +357,8 @@ case class DataSource( | |
| globPath | ||
| }.toArray | ||
|
|
||
| val (dataSchema, partitionSchema) = getOrInferFileFormatSchema(format) | ||
| val fileStatusCache = FileStatusCache.getOrCreate(sparkSession) | ||
| val (dataSchema, partitionSchema) = getOrInferFileFormatSchema(format, fileStatusCache) | ||
|
|
||
| val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions && | ||
| catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog) { | ||
|
|
@@ -364,7 +368,8 @@ case class DataSource( | |
| catalogTable.get, | ||
| catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(defaultTableSize)) | ||
| } else { | ||
| new InMemoryFileIndex(sparkSession, globbedPaths, options, Some(partitionSchema)) | ||
| new InMemoryFileIndex( | ||
| sparkSession, globbedPaths, options, Some(partitionSchema), fileStatusCache) | ||
| } | ||
|
|
||
| HadoopFsRelation( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please update the function description with a new
@param
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, thanks~