
Commit dee6a4e

Do not write empty files for internal datasources as well.
1 parent ab2d092 commit dee6a4e

2 files changed: 131 additions & 108 deletions

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala

Lines changed: 113 additions & 108 deletions
@@ -239,48 +239,50 @@ private[sql] class DefaultWriterContainer(
   extends BaseWriterContainer(relation, job, isAppend) {
 
   def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = {
-    executorSideSetup(taskContext)
-    val configuration = taskAttemptContext.getConfiguration
-    configuration.set("spark.sql.sources.output.path", outputPath)
-    var writer = newOutputWriter(getWorkPath)
-    writer.initConverter(dataSchema)
-
-    // If anything below fails, we should abort the task.
-    try {
-      Utils.tryWithSafeFinallyAndFailureCallbacks {
-        while (iterator.hasNext) {
-          val internalRow = iterator.next()
-          writer.writeInternal(internalRow)
-        }
-        commitTask()
-      }(catchBlock = abortTask())
-    } catch {
-      case t: Throwable =>
-        throw new SparkException("Task failed while writing rows", t)
-    }
+    if (iterator.hasNext) {
+      executorSideSetup(taskContext)
+      val configuration = taskAttemptContext.getConfiguration
+      configuration.set("spark.sql.sources.output.path", outputPath)
+      var writer = newOutputWriter(getWorkPath)
+      writer.initConverter(dataSchema)
 
-    def commitTask(): Unit = {
+      // If anything below fails, we should abort the task.
       try {
-        if (writer != null) {
-          writer.close()
-          writer = null
-        }
-        super.commitTask()
+        Utils.tryWithSafeFinallyAndFailureCallbacks {
+          while (iterator.hasNext) {
+            val internalRow = iterator.next()
+            writer.writeInternal(internalRow)
+          }
+          commitTask()
+        }(catchBlock = abortTask())
       } catch {
-        case cause: Throwable =>
-          // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and
-          // will cause `abortTask()` to be invoked.
-          throw new RuntimeException("Failed to commit task", cause)
+        case t: Throwable =>
+          throw new SparkException("Task failed while writing rows", t)
       }
-    }
 
-    def abortTask(): Unit = {
-      try {
-        if (writer != null) {
-          writer.close()
+      def commitTask(): Unit = {
+        try {
+          if (writer != null) {
+            writer.close()
+            writer = null
+          }
+          super.commitTask()
+        } catch {
+          case cause: Throwable =>
+            // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and
+            // will cause `abortTask()` to be invoked.
+            throw new RuntimeException("Failed to commit task", cause)
+        }
+      }
+
+      def abortTask(): Unit = {
+        try {
+          if (writer != null) {
+            writer.close()
+          }
+        } finally {
+          super.abortTask()
         }
-      } finally {
-        super.abortTask()
       }
     }
   }
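
In DefaultWriterContainer the change is purely a guard: executor-side setup and the OutputWriter are now created only when the task's input iterator has at least one row, so an empty task no longer leaves a zero-byte output file. A minimal, self-contained sketch of that pattern (not the Spark code itself; `SimpleWriter` and the callback parameters are hypothetical stand-ins for the container's real writer and task hooks, and error handling is simplified):

// Sketch of the hasNext guard; all names below are illustrative stand-ins.
trait SimpleWriter {
  def write(row: Seq[Any]): Unit
  def close(): Unit
}

def writeTask(
    rows: Iterator[Seq[Any]],
    newWriter: () => SimpleWriter,
    commitTask: () => Unit,
    abortTask: () => Unit): Unit = {
  // Before the fix the writer (and hence an output file) was created
  // unconditionally; with the guard, an empty task sets nothing up.
  if (rows.hasNext) {
    val writer = newWriter()
    try {
      rows.foreach(writer.write)
      writer.close()   // flush the file before committing it
      commitTask()
    } catch {
      case t: Throwable =>
        abortTask()
        throw new RuntimeException("Task failed while writing rows", t)
    }
  }
}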
@@ -363,84 +365,87 @@ private[sql] class DynamicPartitionWriterContainer(
   }
 
   def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = {
-    executorSideSetup(taskContext)
-
-    // We should first sort by partition columns, then bucket id, and finally sorting columns.
-    val sortingExpressions: Seq[Expression] = partitionColumns ++ bucketIdExpression ++ sortColumns
-    val getSortingKey = UnsafeProjection.create(sortingExpressions, inputSchema)
-
-    val sortingKeySchema = StructType(sortingExpressions.map {
-      case a: Attribute => StructField(a.name, a.dataType, a.nullable)
-      // The sorting expressions are all `Attribute` except bucket id.
-      case _ => StructField("bucketId", IntegerType, nullable = false)
-    })
-
-    // Returns the data columns to be written given an input row
-    val getOutputRow = UnsafeProjection.create(dataColumns, inputSchema)
-
-    // Returns the partition path given a partition key.
-    val getPartitionString =
-      UnsafeProjection.create(Concat(partitionStringExpression) :: Nil, partitionColumns)
-
-    // Sorts the data before write, so that we only need one writer at the same time.
-    // TODO: inject a local sort operator in planning.
-    val sorter = new UnsafeKVExternalSorter(
-      sortingKeySchema,
-      StructType.fromAttributes(dataColumns),
-      SparkEnv.get.blockManager,
-      SparkEnv.get.serializerManager,
-      TaskContext.get().taskMemoryManager().pageSizeBytes)
-
-    while (iterator.hasNext) {
-      val currentRow = iterator.next()
-      sorter.insertKV(getSortingKey(currentRow), getOutputRow(currentRow))
-    }
-    logInfo(s"Sorting complete. Writing out partition files one at a time.")
-
-    val getBucketingKey: InternalRow => InternalRow = if (sortColumns.isEmpty) {
-      identity
-    } else {
-      UnsafeProjection.create(sortingExpressions.dropRight(sortColumns.length).zipWithIndex.map {
-        case (expr, ordinal) => BoundReference(ordinal, expr.dataType, expr.nullable)
+    if (iterator.hasNext) {
+      executorSideSetup(taskContext)
+
+      // We should first sort by partition columns, then bucket id, and finally sorting columns.
+      val sortingExpressions: Seq[Expression] =
+        partitionColumns ++ bucketIdExpression ++ sortColumns
+      val getSortingKey = UnsafeProjection.create(sortingExpressions, inputSchema)
+
+      val sortingKeySchema = StructType(sortingExpressions.map {
+        case a: Attribute => StructField(a.name, a.dataType, a.nullable)
+        // The sorting expressions are all `Attribute` except bucket id.
+        case _ => StructField("bucketId", IntegerType, nullable = false)
       })
-    }
 
-    val sortedIterator = sorter.sortedIterator()
+      // Returns the data columns to be written given an input row
+      val getOutputRow = UnsafeProjection.create(dataColumns, inputSchema)
+
+      // Returns the partition path given a partition key.
+      val getPartitionString =
+        UnsafeProjection.create(Concat(partitionStringExpression) :: Nil, partitionColumns)
+
+      // Sorts the data before write, so that we only need one writer at the same time.
+      // TODO: inject a local sort operator in planning.
+      val sorter = new UnsafeKVExternalSorter(
+        sortingKeySchema,
+        StructType.fromAttributes(dataColumns),
+        SparkEnv.get.blockManager,
+        SparkEnv.get.serializerManager,
+        TaskContext.get().taskMemoryManager().pageSizeBytes)
+
+      while (iterator.hasNext) {
+        val currentRow = iterator.next()
+        sorter.insertKV(getSortingKey(currentRow), getOutputRow(currentRow))
+      }
+      logInfo(s"Sorting complete. Writing out partition files one at a time.")
+
+      val getBucketingKey: InternalRow => InternalRow = if (sortColumns.isEmpty) {
+        identity
+      } else {
+        UnsafeProjection.create(sortingExpressions.dropRight(sortColumns.length).zipWithIndex.map {
+          case (expr, ordinal) => BoundReference(ordinal, expr.dataType, expr.nullable)
+        })
+      }
 
-    // If anything below fails, we should abort the task.
-    var currentWriter: OutputWriter = null
-    try {
-      Utils.tryWithSafeFinallyAndFailureCallbacks {
-        var currentKey: UnsafeRow = null
-        while (sortedIterator.next()) {
-          val nextKey = getBucketingKey(sortedIterator.getKey).asInstanceOf[UnsafeRow]
-          if (currentKey != nextKey) {
-            if (currentWriter != null) {
-              currentWriter.close()
-              currentWriter = null
-            }
-            currentKey = nextKey.copy()
-            logDebug(s"Writing partition: $currentKey")
+      val sortedIterator = sorter.sortedIterator()
 
-            currentWriter = newOutputWriter(currentKey, getPartitionString)
+      // If anything below fails, we should abort the task.
+      var currentWriter: OutputWriter = null
+      try {
+        Utils.tryWithSafeFinallyAndFailureCallbacks {
+          var currentKey: UnsafeRow = null
+          while (sortedIterator.next()) {
+            val nextKey = getBucketingKey(sortedIterator.getKey).asInstanceOf[UnsafeRow]
+            if (currentKey != nextKey) {
+              if (currentWriter != null) {
+                currentWriter.close()
+                currentWriter = null
+              }
+              currentKey = nextKey.copy()
+              logDebug(s"Writing partition: $currentKey")
+
+              currentWriter = newOutputWriter(currentKey, getPartitionString)
+            }
+            currentWriter.writeInternal(sortedIterator.getValue)
+          }
+          if (currentWriter != null) {
+            currentWriter.close()
+            currentWriter = null
           }
-          currentWriter.writeInternal(sortedIterator.getValue)
-        }
-        if (currentWriter != null) {
-          currentWriter.close()
-          currentWriter = null
-        }
 
-        commitTask()
-      }(catchBlock = {
-        if (currentWriter != null) {
-          currentWriter.close()
-        }
-        abortTask()
-      })
-    } catch {
-      case t: Throwable =>
-        throw new SparkException("Task failed while writing rows", t)
+          commitTask()
+        }(catchBlock = {
+          if (currentWriter != null) {
+            currentWriter.close()
+          }
+          abortTask()
+        })
+      } catch {
+        case t: Throwable =>
+          throw new SparkException("Task failed while writing rows", t)
+      }
     }
   }
 }
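
Empty write tasks are common after an aggregation: the write stage runs one task per shuffle partition (spark.sql.shuffle.partitions, 200 by default), while a groupBy over a handful of keys fills only a few of those partitions. Before this change, an empty task in DefaultWriterContainer still opened a writer and left a zero-byte part file, and DynamicPartitionWriterContainer still went through executor-side setup and the external sorter for nothing. A quick, illustrative way to see the imbalance (assuming a `sqlContext` is in scope, as in the test suite below; the variable names are arbitrary):

// Count the rows each output partition of the grouped result would write.
val grouped = sqlContext.range(0, 5).groupBy("id").count()

val rowsPerPartition = grouped.rdd
  .mapPartitions(it => Iterator.single(it.size))
  .collect()

// With the default 200 shuffle partitions, only a few entries are non-zero;
// each zero entry used to turn into an empty output file.
println(s"non-empty partitions: ${rowsPerPartition.count(_ > 0)} of ${rowsPerPartition.length}")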

sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala

Lines changed: 18 additions & 0 deletions
@@ -695,6 +695,24 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
       }
     }
   }
+
+  test("SPARK-10216: Avoid empty files during overwriting table with group by query") {
+    withTempPath { path =>
+      val df = sqlContext.range(0, 5)
+      val groupedDF = df.groupBy("id").count()
+      groupedDF.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .save(path.getCanonicalPath)
+
+      val overwrittenFiles = path.listFiles()
+        .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        .sortBy(_.getName)
+      val overwrittenFilesWithoutEmpty = overwrittenFiles.filter(_.length > 0)
+
+      assert(overwrittenFiles === overwrittenFilesWithoutEmpty)
+    }
+  }
 }
 
 // This class is used to test SPARK-8578. We should not use any custom output committer when
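
Outside the test harness, the same behaviour can be checked end to end by writing a grouped DataFrame and verifying that no zero-length part files remain. A sketch under stated assumptions (a `sqlContext` in scope, a hypothetical local output path, Parquet chosen arbitrarily as the format; the file-name filter mirrors the one in the test above):

import java.io.File

val out = new File("/tmp/spark-empty-files-check")  // hypothetical output directory

sqlContext.range(0, 5).groupBy("id").count()
  .write
  .mode("overwrite")
  .parquet(out.getCanonicalPath)

// Data files only: skip hidden and metadata files such as _SUCCESS.
val partFiles = out.listFiles()
  .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))

assert(partFiles.nonEmpty && partFiles.forall(_.length > 0),
  "expected only non-empty part files after this change")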
