@@ -31,7 +31,7 @@ import org.apache.spark.SerializableWritable
 import org.apache.spark.sql.{Row, _}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
-import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.types.StructType
 
 /**
  * ::DeveloperApi::
@@ -378,24 +378,30 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
 
     def refresh(): Unit = {
+      // We don't filter files/directories whose names start with "_" or "." here, as specific data
+      // sources may take advantage of them (e.g. Parquet _metadata and _common_metadata files).
+      // But "_temporary" directories are explicitly ignored since failed tasks/jobs may leave
+      // partial/corrupted data files there.
       def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
-        val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
-        val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
-        files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+        if (status.getPath.getName.toLowerCase == "_temporary") {
+          Set.empty
+        } else {
+          val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
+          val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
+          files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+        }
       }
 
       leafFiles.clear()
 
-      // We don't filter files/directories like _temporary/_SUCCESS here, as specific data sources
-      // may take advantages over them (e.g. Parquet _metadata and _common_metadata files).
       val statuses = paths.flatMap { path =>
         val hdfsPath = new Path(path)
         val fs = hdfsPath.getFileSystem(hadoopConf)
         val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
         Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
       }
 
-      val (dirs, files) = statuses.partition(_.isDir)
+      val files = statuses.filterNot(_.isDir)
       leafFiles ++= files.map(f => f.getPath -> f).toMap
       leafDirToChildrenFiles ++= files.groupBy(_.getPath.getParent)
     }
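
For context, here is a minimal standalone sketch of the directory-walking technique the new code uses: recursively collect leaf files while short-circuiting at "_temporary" directories (which may hold partial output from failed tasks/jobs) but keeping other "_"/"."-prefixed entries such as Parquet's _metadata and _common_metadata files. The object name LeafFileListing and the command-line entry point are illustrative only, not part of the commit; the sketch assumes nothing beyond the standard Hadoop FileSystem API.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

object LeafFileListing {
  // Recursively collect leaf files and leaf directories, skipping "_temporary"
  // directories while keeping other "_"/"."-prefixed entries (e.g. _metadata).
  def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
    if (status.getPath.getName.toLowerCase == "_temporary") {
      Set.empty
    } else if (!status.isDirectory) {
      Set(status)
    } else {
      val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDirectory)
      // A directory with no subdirectories counts as a leaf directory.
      val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
      files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val root = new Path(args(0)) // e.g. an output dir holding _temporary/, _metadata, part-* files
    val fs = root.getFileSystem(conf)
    val qualified = root.makeQualified(fs.getUri, fs.getWorkingDirectory)
    // Everything under _temporary/ is excluded; _metadata, _common_metadata, etc. are kept.
    listLeafFilesAndDirs(fs, fs.getFileStatus(qualified)).map(_.getPath).foreach(println)
  }
}
```

Compared to filtering individual file names after the fact, pruning at the "_temporary" directory avoids descending into potentially large trees of partial task output.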