
Commit 586b347

Make ignoreCorruptFiles work for Parquet.
1 parent 5263622 commit 586b347
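
In short, with spark.sql.files.ignoreCorruptFiles enabled, Parquet reads now skip files whose footers or contents cannot be read instead of failing the whole job. A minimal usage sketch (the paths below are hypothetical, not taken from this commit):

  // Hypothetical paths; with the flag on, a corrupt or non-Parquet file under them is
  // skipped with a warning instead of failing the scan.
  spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
  val df = spark.read.parquet("/data/first", "/data/second", "/data/third")
  df.show()

  // With the flag off, the same read fails, e.g. with "... is not a Parquet file"
  // when one of the inputs is not valid Parquet (see the test added below).
  spark.conf.set("spark.sql.files.ignoreCorruptFiles", "false")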

File tree

3 files changed: +102 −8 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala

Lines changed: 12 additions & 2 deletions
@@ -135,11 +135,21 @@ class FileScanRDD(
       try {
         if (ignoreCorruptFiles) {
           currentIterator = new NextIterator[Object] {
-            private val internalIter = readFunction(currentFile)
+            private val internalIter = {
+              try {
+                // The readFunction may read files before consuming the iterator.
+                // E.g., vectorized Parquet reader.
+                readFunction(currentFile)
+              } catch {
+                case e @(_: RuntimeException | _: IOException) =>
+                  logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e)
+                  null
+              }
+            }
 
             override def getNext(): AnyRef = {
               try {
-                if (internalIter.hasNext) {
+                if (internalIter != null && internalIter.hasNext) {
                   internalIter.next()
                 } else {
                   finished = true
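
Note that the guard wraps the construction of the per-file iterator as well as each next() call: some readers (e.g. the vectorized Parquet reader) touch the file eagerly while the iterator is being built. A self-contained sketch of the same pattern, with simplified names that are not the actual Spark classes:

  import java.io.IOException

  // Guard both opening and consuming a per-file iterator; on corruption, report and stop.
  def safePerFileIterator[T](open: () => Iterator[T])(onCorrupt: Throwable => Unit): Iterator[T] = {
    // Opening the iterator may already read the file and fail on corrupt input.
    val underlying: Iterator[T] =
      try open()
      catch { case e @ (_: RuntimeException | _: IOException) => onCorrupt(e); null }
    new Iterator[T] {
      def hasNext: Boolean =
        try underlying != null && underlying.hasNext
        catch { case e @ (_: RuntimeException | _: IOException) => onCorrupt(e); false }
      def next(): T = underlying.next()
    }
  }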

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 60 additions & 6 deletions
@@ -17,7 +17,9 @@
 
 package org.apache.spark.sql.execution.datasources.parquet
 
+import java.io.IOException
 import java.net.URI
+import java.util.concurrent.{Callable, ExecutionException, Executors, ExecutorService, Future}
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -30,6 +32,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileSplit
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 import org.apache.parquet.filter2.compat.FilterCompat
 import org.apache.parquet.filter2.predicate.FilterApi
+import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS
 import org.apache.parquet.hadoop._
 import org.apache.parquet.hadoop.codec.CodecConfig
 import org.apache.parquet.hadoop.util.ContextUtil
@@ -151,7 +154,7 @@ class ParquetFileFormat
     }
   }
 
-  def inferSchema(
+  override def inferSchema(
       sparkSession: SparkSession,
       parameters: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
@@ -542,6 +545,58 @@ object ParquetFileFormat extends Logging {
     StructType(parquetSchema ++ missingFields)
   }
 
+  /**
+   * Reads Parquet footers in multi-threaded manner.
+   * If the config "spark.sql.files.ignoreCorruptFiles" is set to true, we will ignore the corrupted
+   * files when reading footers.
+   */
+  private def readParquetFootersInParallel(
+      conf: Configuration,
+      partFiles: Seq[FileStatus],
+      ignoreCorruptFiles: Boolean): Seq[Footer] = {
+    val footers = partFiles.map { currentFile =>
+      new Callable[Option[Footer]]() {
+        override def call(): Option[Footer] = {
+          try {
+            // Skips row group information since we only need the schema.
+            // ParquetFileReader.readFooter throws RuntimeException, instead of IOException,
+            // when it can't read the footer.
+            Some(new Footer(currentFile.getPath(),
+              ParquetFileReader.readFooter(
+                conf, currentFile, SKIP_ROW_GROUPS)))
+          } catch { case e: RuntimeException =>
+            if (ignoreCorruptFiles) {
+              logWarning(s"Skipped the footer in the corrupted file: $currentFile", e)
+              None
+            } else {
+              throw new IOException(s"Could not read footer for file: $currentFile", e)
+            }
+          }
+        }
+      }
+    }
+    val parallelism = conf.getInt(ParquetFileReader.PARQUET_READ_PARALLELISM, 5)
+    val threadPool: ExecutorService = Executors.newFixedThreadPool(parallelism)
+    try {
+      val futures: mutable.ArrayBuffer[Future[Option[Footer]]] = mutable.ArrayBuffer.empty
+      footers.foreach(callable => futures += threadPool.submit(callable))
+      val result: mutable.ArrayBuffer[Footer] = mutable.ArrayBuffer.empty
+      futures.foreach { future =>
+        try {
+          val footer = future.get()
+          footer.foreach(f => result += f)
+        } catch { case e: InterruptedException =>
+          throw new RuntimeException("The thread was interrupted", e)
+        }
+      }
+      result.toSeq
+    } catch { case e: ExecutionException =>
+      throw new IOException("Could not read footer: " + e.getMessage(), e.getCause())
+    } finally {
+      threadPool.shutdownNow()
+    }
+  }
+
   /**
    * Figures out a merged Parquet schema with a distributed Spark job.
    *
@@ -582,6 +637,8 @@ object ParquetFileFormat extends Logging {
     val numParallelism = Math.min(Math.max(partialFileStatusInfo.size, 1),
       sparkSession.sparkContext.defaultParallelism)
 
+    val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles
+
     // Issues a Spark job to read Parquet schema in parallel.
     val partiallyMergedSchemas =
       sparkSession
@@ -593,13 +650,10 @@ object ParquetFileFormat extends Logging {
           new FileStatus(length, false, 0, 0, 0, 0, null, null, null, new Path(path))
         }.toSeq
 
-        // Skips row group information since we only need the schema
-        val skipRowGroups = true
-
         // Reads footers in multi-threaded manner within each task
        val footers =
-          ParquetFileReader.readAllFootersInParallel(
-            serializedConf.value, fakeFileStatuses.asJava, skipRowGroups).asScala
+          ParquetFileFormat.readParquetFootersInParallel(
+            serializedConf.value, fakeFileStatuses, ignoreCorruptFiles)
 
         // Converter used to convert Parquet `MessageType` to Spark SQL `StructType`
         val converter =
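
The new readParquetFootersInParallel replaces Parquet's readAllFootersInParallel inside the distributed schema-merge job, so corrupt files can also be skipped while schemas are being merged. A hedged usage sketch of what drives this code path (the path is hypothetical):

  // Hypothetical path; mergeSchema makes Spark read footers from the part files,
  // which now goes through readParquetFootersInParallel and honors ignoreCorruptFiles.
  spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
  val merged = spark.read.option("mergeSchema", "true").parquet("/data/events")
  merged.printSchema()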

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 30 additions & 0 deletions
@@ -22,6 +22,7 @@ import java.io.File
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.parquet.hadoop.ParquetOutputFormat
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
@@ -212,6 +213,35 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
     }
   }
 
+  test("Enabling/disabling ignoreCorruptFiles") {
+    def testIgnoreCorruptFiles(): Unit = {
+      withTempDir { dir =>
+        val basePath = dir.getCanonicalPath
+        spark.range(1).toDF("a").write.parquet(new Path(basePath, "first").toString)
+        spark.range(1, 2).toDF("a").write.parquet(new Path(basePath, "second").toString)
+        spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString)
+        val df = spark.read.parquet(
+          new Path(basePath, "first").toString,
+          new Path(basePath, "second").toString,
+          new Path(basePath, "third").toString)
+        checkAnswer(
+          df,
+          Seq(Row(0), Row(1)))
+      }
+    }
+
+    withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") {
+      testIgnoreCorruptFiles()
+    }
+
+    withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") {
+      val exception = intercept[SparkException] {
+        testIgnoreCorruptFiles()
+      }
+      assert(exception.getMessage().contains("is not a Parquet file"))
+    }
+  }
+
   test("SPARK-8990 DataFrameReader.parquet() should respect user specified options") {
     withTempPath { dir =>
       val basePath = dir.getCanonicalPath
