 
 package org.apache.spark.sql.execution.datasources.parquet
 
+import java.io.IOException
 import java.net.URI
+import java.util.concurrent.{Callable, ExecutionException, Executors, ExecutorService, Future}
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -30,6 +32,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileSplit
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 import org.apache.parquet.filter2.compat.FilterCompat
 import org.apache.parquet.filter2.predicate.FilterApi
+import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS
 import org.apache.parquet.hadoop._
 import org.apache.parquet.hadoop.codec.CodecConfig
 import org.apache.parquet.hadoop.util.ContextUtil
@@ -151,7 +154,7 @@ class ParquetFileFormat
     }
   }
 
-  def inferSchema(
+  override def inferSchema(
       sparkSession: SparkSession,
       parameters: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
@@ -542,6 +545,58 @@ object ParquetFileFormat extends Logging {
     StructType(parquetSchema ++ missingFields)
   }
 
+  /**
+   * Reads Parquet footers in a multi-threaded manner.
+   * If the config "spark.sql.files.ignoreCorruptFiles" is set to true, corrupted files are
+   * skipped while reading footers.
+   */
+  private def readParquetFootersInParallel(
+      conf: Configuration,
+      partFiles: Seq[FileStatus],
+      ignoreCorruptFiles: Boolean): Seq[Footer] = {
+    val footers = partFiles.map { currentFile =>
+      new Callable[Option[Footer]]() {
+        override def call(): Option[Footer] = {
+          try {
+            // Skips row group information since we only need the schema.
+            // ParquetFileReader.readFooter throws RuntimeException, instead of IOException,
+            // when it can't read the footer.
+            Some(new Footer(currentFile.getPath(),
+              ParquetFileReader.readFooter(
+                conf, currentFile, SKIP_ROW_GROUPS)))
+          } catch { case e: RuntimeException =>
+            if (ignoreCorruptFiles) {
+              logWarning(s"Skipped the footer in the corrupted file: $currentFile", e)
+              None
+            } else {
+              throw new IOException(s"Could not read footer for file: $currentFile", e)
+            }
+          }
+        }
+      }
+    }
+    val parallelism = conf.getInt(ParquetFileReader.PARQUET_READ_PARALLELISM, 5)
+    val threadPool: ExecutorService = Executors.newFixedThreadPool(parallelism)
+    try {
+      val futures: mutable.ArrayBuffer[Future[Option[Footer]]] = mutable.ArrayBuffer.empty
+      footers.foreach(callable => futures += threadPool.submit(callable))
+      val result: mutable.ArrayBuffer[Footer] = mutable.ArrayBuffer.empty
+      futures.foreach { future =>
+        try {
+          val footer = future.get()
+          footer.foreach(f => result += f)
+        } catch { case e: InterruptedException =>
+          throw new RuntimeException("The thread was interrupted", e)
+        }
+      }
+      result.toSeq
+    } catch { case e: ExecutionException =>
+      throw new IOException("Could not read footer: " + e.getMessage(), e.getCause())
+    } finally {
+      threadPool.shutdownNow()
+    }
+  }
+
   /**
    * Figures out a merged Parquet schema with a distributed Spark job.
    *
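The added helper fans footer reads out over a fixed-size thread pool (default parallelism 5, via ParquetFileReader.PARQUET_READ_PARALLELISM) and makes the per-file failure policy explicit: log and skip when ignoreCorruptFiles is on, rethrow as IOException otherwise. Below is a minimal, self-contained Scala sketch of the same submit-and-collect pattern; readOne, the part-file names, and the pool size are invented for illustration and are not part of the patch:

    import java.io.IOException
    import java.util.concurrent.{Callable, Executors}

    object FanOutSketch {
      // Hypothetical stand-in for ParquetFileReader.readFooter: any per-file
      // call that may throw RuntimeException on a corrupt input.
      def readOne(path: String): String =
        if (path.contains("corrupt")) throw new RuntimeException(s"bad footer: $path")
        else s"footer($path)"

      def main(args: Array[String]): Unit = {
        val paths = Seq("part-00000", "part-corrupt", "part-00002")
        val ignoreCorruptFiles = true
        val pool = Executors.newFixedThreadPool(2)
        try {
          // Submit one Callable per file; each resolves to Some(footer) or None.
          val futures = paths.map { p =>
            pool.submit(new Callable[Option[String]] {
              override def call(): Option[String] =
                try Some(readOne(p))
                catch { case e: RuntimeException =>
                  if (ignoreCorruptFiles) None // skip the corrupt file
                  else throw new IOException(s"Could not read $p", e)
                }
            })
          }
          // Future.get rethrows a Callable's failure wrapped in ExecutionException.
          val footers = futures.flatMap(_.get())
          println(footers) // List(footer(part-00000), footer(part-00002))
        } finally {
          pool.shutdownNow()
        }
      }
    }

Keeping shutdownNow in a finally block, as the patch does, ensures worker threads do not outlive a failed task.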
@@ -582,6 +637,8 @@
     val numParallelism = Math.min(Math.max(partialFileStatusInfo.size, 1),
       sparkSession.sparkContext.defaultParallelism)
 
+    val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles
+
     // Issues a Spark job to read Parquet schema in parallel.
     val partiallyMergedSchemas =
       sparkSession
@@ -593,13 +650,10 @@
             new FileStatus(length, false, 0, 0, 0, 0, null, null, null, new Path(path))
           }.toSeq
 
-          // Skips row group information since we only need the schema
-          val skipRowGroups = true
-
           // Reads footers in multi-threaded manner within each task
           val footers =
-            ParquetFileReader.readAllFootersInParallel(
-              serializedConf.value, fakeFileStatuses.asJava, skipRowGroups).asScala
+            ParquetFileFormat.readParquetFootersInParallel(
+              serializedConf.value, fakeFileStatuses, ignoreCorruptFiles)
 
           // Converter used to convert Parquet `MessageType` to Spark SQL `StructType`
           val converter =
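End to end, the new flag decides whether one unreadable part file aborts distributed schema merging or is merely skipped. A hedged usage sketch follows; the path and the SparkSession named spark are placeholders:

    // Placeholder path; assumes an existing SparkSession named `spark`.
    spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")

    // With the flag on, unreadable footers are logged and skipped, and schema
    // merging proceeds over the remaining readable part files.
    val df = spark.read
      .option("mergeSchema", "true")
      .parquet("/data/events")

    // With the flag off (the default), the same read fails with an IOException
    // that names the corrupt file.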