Commit c71b254

ericl authored and cloud-fan committed
[SPARK-19183][SQL] Add deleteWithJob hook to internal commit protocol API
## What changes were proposed in this pull request?

Currently in SQL we implement overwrites by calling fs.delete() directly on the original data. This is not ideal since the original files end up deleted even if the job aborts. We should extend the commit protocol to allow file overwrites to be managed as well.

## How was this patch tested?

Existing tests. I also fixed a bunch of tests that were depending on the commit protocol implementation being set to the legacy mapreduce one.

cc rxin cloud-fan

Author: Eric Liang <[email protected]>
Author: Eric Liang <[email protected]>

Closes #16554 from ericl/add-delete-protocol.
1 parent 5db35b3 commit c71b254

File tree: 9 files changed (+149, -110 lines)


core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala

Lines changed: 9 additions & 0 deletions
@@ -17,6 +17,7 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
 
 import org.apache.spark.util.Utils
@@ -112,6 +113,14 @@ abstract class FileCommitProtocol {
    * just crashes (or killed) before it can call abort.
    */
   def abortTask(taskContext: TaskAttemptContext): Unit
+
+  /**
+   * Specifies that a file should be deleted with the commit of this job. The default
+   * implementation deletes the file immediately.
+   */
+  def deleteWithJob(fs: FileSystem, path: Path, recursive: Boolean): Boolean = {
+    fs.delete(path, recursive)
+  }
 }
 
 
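The default implementation keeps the old eager behavior (delete immediately), so existing protocols are unaffected. A protocol that wants overwrites to survive a job abort can override the hook and defer the deletes until the job commits. A minimal sketch of that idea, not part of this patch — the class name and deferral strategy are assumptions, and it presumes the two-argument HadoopMapReduceCommitProtocol constructor of this Spark version:

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.mapreduce.JobContext

import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol

// Hypothetical protocol: record the paths passed to deleteWithJob and only remove
// them once the job commits, so an aborted overwrite leaves the old files in place.
class DeferredDeleteCommitProtocol(jobId: String, path: String)
  extends HadoopMapReduceCommitProtocol(jobId, path) {

  // Deletes requested by the write path; used only on the driver, hence @transient.
  @transient private lazy val pendingDeletes = ArrayBuffer[(FileSystem, Path, Boolean)]()

  override def deleteWithJob(fs: FileSystem, path: Path, recursive: Boolean): Boolean = {
    pendingDeletes += ((fs, path, recursive))
    true // report success now; the actual delete happens in commitJob
  }

  override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
    super.commitJob(jobContext, taskCommits)
    // The job has committed, so it is now safe to drop the superseded files.
    pendingDeletes.foreach { case (fs, p, recursive) => fs.delete(p, recursive) }
  }
}

In this sketch deleteWithJob only records the request; commitJob performs the deletes after the superclass has committed the new output, so an aborted job never touches the original files.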

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala

Lines changed: 14 additions & 11 deletions
@@ -88,11 +88,20 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val pathExists = fs.exists(qualifiedOutputPath)
+    // If we are appending data to an existing dir.
+    val isAppend = pathExists && (mode == SaveMode.Append)
+
+    val committer = FileCommitProtocol.instantiate(
+      sparkSession.sessionState.conf.fileCommitProtocolClass,
+      jobId = java.util.UUID.randomUUID().toString,
+      outputPath = outputPath.toString,
+      isAppend = isAppend)
+
     val doInsertion = (mode, pathExists) match {
       case (SaveMode.ErrorIfExists, true) =>
         throw new AnalysisException(s"path $qualifiedOutputPath already exists.")
       case (SaveMode.Overwrite, true) =>
-        deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations)
+        deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer)
         true
       case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
         true
@@ -101,15 +110,8 @@
       case (s, exists) =>
        throw new IllegalStateException(s"unsupported save mode $s ($exists)")
    }
-    // If we are appending data to an existing dir.
-    val isAppend = pathExists && (mode == SaveMode.Append)
 
    if (doInsertion) {
-      val committer = FileCommitProtocol.instantiate(
-        sparkSession.sessionState.conf.fileCommitProtocolClass,
-        jobId = java.util.UUID.randomUUID().toString,
-        outputPath = outputPath.toString,
-        isAppend = isAppend)
 
      // Callback for updating metastore partition metadata after the insertion job completes.
      def refreshPartitionsCallback(updatedPartitions: Seq[TablePartitionSpec]): Unit = {
@@ -160,7 +162,8 @@
  private def deleteMatchingPartitions(
      fs: FileSystem,
      qualifiedOutputPath: Path,
-      customPartitionLocations: Map[TablePartitionSpec, String]): Unit = {
+      customPartitionLocations: Map[TablePartitionSpec, String],
+      committer: FileCommitProtocol): Unit = {
    val staticPartitionPrefix = if (staticPartitions.nonEmpty) {
      "/" + partitionColumns.flatMap { p =>
        staticPartitions.get(p.name) match {
@@ -175,7 +178,7 @@
    }
    // first clear the path determined by the static partition keys (e.g. /table/foo=1)
    val staticPrefixPath = qualifiedOutputPath.suffix(staticPartitionPrefix)
-    if (fs.exists(staticPrefixPath) && !fs.delete(staticPrefixPath, true /* recursively */)) {
+    if (fs.exists(staticPrefixPath) && !committer.deleteWithJob(fs, staticPrefixPath, true)) {
      throw new IOException(s"Unable to clear output " +
        s"directory $staticPrefixPath prior to writing to it")
    }
@@ -185,7 +188,7 @@
        (staticPartitions.toSet -- spec).isEmpty,
        "Custom partition location did not match static partitioning keys")
      val path = new Path(customLoc)
-      if (fs.exists(path) && !fs.delete(path, true)) {
+      if (fs.exists(path) && !committer.deleteWithJob(fs, path, true)) {
        throw new IOException(s"Unable to clear partition " +
          s"directory $path prior to writing to it")
      }
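Because the committer is now instantiated before the overwrite branch runs, whichever class spark.sql.sources.commitProtocolClass names decides how existing data is cleared. A hedged usage sketch — the custom protocol class is the hypothetical one from the earlier example, and it assumes FileCommitProtocol.instantiate can fall back to a (jobId, path) constructor:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("overwrite-via-commit-protocol")
  .master("local[*]")
  // Hypothetical protocol from the sketch above; not part of this patch.
  .config("spark.sql.sources.commitProtocolClass",
    "com.example.DeferredDeleteCommitProtocol")
  .getOrCreate()

// SaveMode.Overwrite now routes its deletes through deleteWithJob on that protocol
// instead of calling fs.delete() directly.
spark.range(10).write.mode("overwrite").parquet("/tmp/overwrite-demo")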

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {
      // ignore hidden files
      val allFiles = dir.listFiles(new FilenameFilter {
        override def accept(dir: File, name: String): Boolean = {
-          !name.startsWith(".")
+          !name.startsWith(".") && !name.startsWith("_")
        }
      })
      val totalSize = allFiles.map(_.length()).sum

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala

Lines changed: 66 additions & 56 deletions
@@ -40,6 +40,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection}
 import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
@@ -462,16 +463,19 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
  }

  test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") {
-    val extraOptions = Map(
-      SQLConf.OUTPUT_COMMITTER_CLASS.key -> classOf[ParquetOutputCommitter].getCanonicalName,
-      "spark.sql.parquet.output.committer.class" ->
-        classOf[JobCommitFailureParquetOutputCommitter].getCanonicalName
-    )
-    withTempPath { dir =>
-      val message = intercept[SparkException] {
-        spark.range(0, 1).write.options(extraOptions).parquet(dir.getCanonicalPath)
-      }.getCause.getMessage
-      assert(message === "Intentional exception for testing purposes")
+    withSQLConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+        classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) {
+      val extraOptions = Map(
+        SQLConf.OUTPUT_COMMITTER_CLASS.key -> classOf[ParquetOutputCommitter].getCanonicalName,
+        "spark.sql.parquet.output.committer.class" ->
+          classOf[JobCommitFailureParquetOutputCommitter].getCanonicalName
+      )
+      withTempPath { dir =>
+        val message = intercept[SparkException] {
+          spark.range(0, 1).write.options(extraOptions).parquet(dir.getCanonicalPath)
+        }.getCause.getMessage
+        assert(message === "Intentional exception for testing purposes")
+      }
    }
  }

@@ -488,58 +492,64 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
  }

  test("SPARK-7837 Do not close output writer twice when commitTask() fails") {
-    // Using a output committer that always fail when committing a task, so that both
-    // `commitTask()` and `abortTask()` are invoked.
-    val extraOptions = Map[String, String](
-      "spark.sql.parquet.output.committer.class" ->
-        classOf[TaskCommitFailureParquetOutputCommitter].getCanonicalName
-    )
+    withSQLConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+        classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) {
+      // Using a output committer that always fail when committing a task, so that both
+      // `commitTask()` and `abortTask()` are invoked.
+      val extraOptions = Map[String, String](
+        "spark.sql.parquet.output.committer.class" ->
+          classOf[TaskCommitFailureParquetOutputCommitter].getCanonicalName
+      )
+
+      // Before fixing SPARK-7837, the following code results in an NPE because both
+      // `commitTask()` and `abortTask()` try to close output writers.

-    // Before fixing SPARK-7837, the following code results in an NPE because both
-    // `commitTask()` and `abortTask()` try to close output writers.
-
-    withTempPath { dir =>
-      val m1 = intercept[SparkException] {
-        spark.range(1).coalesce(1).write.options(extraOptions).parquet(dir.getCanonicalPath)
-      }.getCause.getMessage
-      assert(m1.contains("Intentional exception for testing purposes"))
-    }
+      withTempPath { dir =>
+        val m1 = intercept[SparkException] {
+          spark.range(1).coalesce(1).write.options(extraOptions).parquet(dir.getCanonicalPath)
+        }.getCause.getMessage
+        assert(m1.contains("Intentional exception for testing purposes"))
+      }

-    withTempPath { dir =>
-      val m2 = intercept[SparkException] {
-        val df = spark.range(1).select('id as 'a, 'id as 'b).coalesce(1)
-        df.write.partitionBy("a").options(extraOptions).parquet(dir.getCanonicalPath)
-      }.getCause.getMessage
-      assert(m2.contains("Intentional exception for testing purposes"))
+      withTempPath { dir =>
+        val m2 = intercept[SparkException] {
+          val df = spark.range(1).select('id as 'a, 'id as 'b).coalesce(1)
+          df.write.partitionBy("a").options(extraOptions).parquet(dir.getCanonicalPath)
+        }.getCause.getMessage
+        assert(m2.contains("Intentional exception for testing purposes"))
+      }
    }
  }

  test("SPARK-11044 Parquet writer version fixed as version1 ") {
-    // For dictionary encoding, Parquet changes the encoding types according to its writer
-    // version. So, this test checks one of the encoding types in order to ensure that
-    // the file is written with writer version2.
-    val extraOptions = Map[String, String](
-      // Write a Parquet file with writer version2.
-      ParquetOutputFormat.WRITER_VERSION -> ParquetProperties.WriterVersion.PARQUET_2_0.toString,
-      // By default, dictionary encoding is enabled from Parquet 1.2.0 but
-      // it is enabled just in case.
-      ParquetOutputFormat.ENABLE_DICTIONARY -> "true"
-    )
-
-    val hadoopConf = spark.sessionState.newHadoopConfWithOptions(extraOptions)
-
-    withSQLConf(ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true") {
-      withTempPath { dir =>
-        val path = s"${dir.getCanonicalPath}/part-r-0.parquet"
-        spark.range(1 << 16).selectExpr("(id % 4) AS i")
-          .coalesce(1).write.options(extraOptions).mode("overwrite").parquet(path)
-
-        val blockMetadata = readFooter(new Path(path), hadoopConf).getBlocks.asScala.head
-        val columnChunkMetadata = blockMetadata.getColumns.asScala.head
-
-        // If the file is written with version2, this should include
-        // Encoding.RLE_DICTIONARY type. For version1, it is Encoding.PLAIN_DICTIONARY
-        assert(columnChunkMetadata.getEncodings.contains(Encoding.RLE_DICTIONARY))
+    withSQLConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+        classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) {
+      // For dictionary encoding, Parquet changes the encoding types according to its writer
+      // version. So, this test checks one of the encoding types in order to ensure that
+      // the file is written with writer version2.
+      val extraOptions = Map[String, String](
+        // Write a Parquet file with writer version2.
+        ParquetOutputFormat.WRITER_VERSION -> ParquetProperties.WriterVersion.PARQUET_2_0.toString,
+        // By default, dictionary encoding is enabled from Parquet 1.2.0 but
+        // it is enabled just in case.
+        ParquetOutputFormat.ENABLE_DICTIONARY -> "true"
+      )
+
+      val hadoopConf = spark.sessionState.newHadoopConfWithOptions(extraOptions)
+
+      withSQLConf(ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true") {
+        withTempPath { dir =>
+          val path = s"${dir.getCanonicalPath}/part-r-0.parquet"
+          spark.range(1 << 16).selectExpr("(id % 4) AS i")
+            .coalesce(1).write.options(extraOptions).mode("overwrite").parquet(path)
+
+          val blockMetadata = readFooter(new Path(path), hadoopConf).getBlocks.asScala.head
+          val columnChunkMetadata = blockMetadata.getColumns.asScala.head
+
+          // If the file is written with version2, this should include
+          // Encoding.RLE_DICTIONARY type. For version1, it is Encoding.PLAIN_DICTIONARY
+          assert(columnChunkMetadata.getEncodings.contains(Encoding.RLE_DICTIONARY))
+        }
      }
    }
  }
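These tests assert on the behavior of a specific Hadoop output committer, so they now pin the commit protocol explicitly instead of relying on whatever the default happens to be. The recurring pattern, shown here as a standalone sketch inside a SharedSQLContext test suite (the write and the assertion are placeholders):

withSQLConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
    classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) {
  withTempPath { dir =>
    // Any write that should go through the legacy MapReduce-based committer path.
    spark.range(10).write.parquet(dir.getCanonicalPath)
    // ... assertions that depend on that committer's on-disk output ...
  }
}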

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala

Lines changed: 6 additions & 3 deletions
@@ -455,15 +455,15 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
      assert(partDf.schema.map(_.name) === Seq("intField", "stringField"))

      path.listFiles().foreach { f =>
-        if (f.getName.toLowerCase().endsWith(".parquet")) {
+        if (!f.getName.startsWith("_") && f.getName.toLowerCase().endsWith(".parquet")) {
          // when the input is a path to a parquet file
          val df = spark.read.parquet(f.getCanonicalPath)
          assert(df.schema.map(_.name) === Seq("intField", "stringField"))
        }
      }

      path.listFiles().foreach { f =>
-        if (f.getName.toLowerCase().endsWith(".parquet")) {
+        if (!f.getName.startsWith("_") && f.getName.toLowerCase().endsWith(".parquet")) {
          // when the input is a path to a parquet file but `basePath` is overridden to
          // the base path containing partitioning directories
          val df = spark
@@ -932,7 +932,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
    withTempPath { dir =>
      val path = dir.getCanonicalPath

-      withSQLConf(ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true") {
+      withSQLConf(
+          ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true",
+          "spark.sql.sources.commitProtocolClass" ->
+            classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) {
        spark.range(3).write.parquet(s"$path/p0=0/p1=0")
      }

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 5 additions & 0 deletions
@@ -26,6 +26,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
 import org.apache.spark.sql.execution.FileSourceScanExec
+import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol
 import org.apache.spark.sql.execution.datasources.parquet.TestingUDT.{NestedStruct, NestedStructUDT, SingleElement}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
@@ -178,6 +179,8 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
    }

    withSQLConf(
+      SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+        classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName,
      SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true",
      SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "true",
      ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true"
@@ -186,6 +189,8 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
    }

    withSQLConf(
+      SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+        classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName,
      SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true",
      SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "false"
    ) {

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala

Lines changed: 3 additions & 1 deletion
@@ -93,7 +93,9 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest {
        .orc(path)

      // Check if this is compressed as ZLIB.
-      val maybeOrcFile = new File(path).listFiles().find(_.getName.endsWith(".zlib.orc"))
+      val maybeOrcFile = new File(path).listFiles().find { f =>
+        !f.getName.startsWith("_") && f.getName.endsWith(".zlib.orc")
+      }
      assert(maybeOrcFile.isDefined)
      val orcFilePath = maybeOrcFile.get.toPath.toString
      val expectedCompressionKind =
