
Commit e83910f

[SPARK-26164][SQL][FOLLOWUP] WriteTaskStatsTracker should know which file the row is written to
### What changes were proposed in this pull request?

This is a follow-up of #32198. Before #32198, `WriteTaskStatsTracker.newRow` implicitly knew that the row was written to the current file; after #32198 that connection is lost. This PR adds a file path parameter to `WriteTaskStatsTracker.newRow` to bring the connection back.

### Why are the changes needed?

To avoid breaking some custom `WriteTaskStatsTracker` implementations.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

N/A

Closes #32459 from cloud-fan/minor.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 33c1034 commit e83910f
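
For custom tracker authors, the practical impact is a one-line signature change. Below is a minimal sketch of how an existing implementation would adapt; the class `RowCountingStatsTracker`, the `RowCountStats` payload, and the counting logic are illustrative assumptions, not part of this commit:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{WriteTaskStats, WriteTaskStatsTracker}

// Hypothetical stats payload for the sketch below.
case class RowCountStats(numRows: Long) extends WriteTaskStats

// Hypothetical custom tracker showing only the newRow signature change.
class RowCountingStatsTracker extends WriteTaskStatsTracker {
  private var numRows = 0L

  override def newPartition(partitionValues: InternalRow): Unit = {}
  override def newFile(filePath: String): Unit = {}
  override def closeFile(filePath: String): Unit = {}

  // Before this follow-up the callback only received the row:
  //   override def newRow(row: InternalRow): Unit = { numRows += 1 }
  // After it, the tracker is also told which file the row was written to.
  override def newRow(filePath: String, row: InternalRow): Unit = {
    numRows += 1
  }

  override def getFinalStats(): WriteTaskStats = RowCountStats(numRows)
}
```

Trackers that do not care about the file can simply ignore the new parameter, as `BasicWriteTaskStatsTracker` does in the first diff below.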

4 files changed (+77, -4 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ class BasicWriteTaskStatsTracker(hadoopConf: Configuration)
     }
   }
 
-  override def newRow(row: InternalRow): Unit = {
+  override def newRow(filePath: String, row: InternalRow): Unit = {
     numRows += 1
   }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala

Lines changed: 2 additions & 2 deletions
@@ -157,7 +157,7 @@ class SingleDirectoryDataWriter(
     }
 
     currentWriter.write(record)
-    statsTrackers.foreach(_.newRow(record))
+    statsTrackers.foreach(_.newRow(currentWriter.path, record))
     recordsInFile += 1
   }
 }
@@ -301,7 +301,7 @@ abstract class BaseDynamicPartitionDataWriter(
   protected def writeRecord(record: InternalRow): Unit = {
     val outputRow = getOutputRow(record)
     currentWriter.write(outputRow)
-    statsTrackers.foreach(_.newRow(outputRow))
+    statsTrackers.foreach(_.newRow(currentWriter.path, outputRow))
     recordsInFile += 1
   }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteStatsTracker.scala

Lines changed: 2 additions & 1 deletion
@@ -59,9 +59,10 @@ trait WriteTaskStatsTracker {
    * Process the fact that a new row to update the tracked statistics accordingly.
    * @note Keep in mind that any overhead here is per-row, obviously,
    *       so implementations should be as lightweight as possible.
+   * @param filePath Path of the file which the row is written to.
    * @param row Current data row to be processed.
    */
-  def newRow(row: InternalRow): Unit
+  def newRow(filePath: String, row: InternalRow): Unit
 
   /**
    * Returns the final statistics computed so far.
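
For reference, piecing together the diff above with the overrides exercised in the new test suite below, the task-level callback surface after this change looks roughly as follows. This is a sketch reconstructed from this commit, not the authoritative trait definition (scaladoc and any other members are omitted):

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.WriteTaskStats

// Sketch of the WriteTaskStatsTracker callbacks after this change.
trait WriteTaskStatsTrackerSketch {
  def newPartition(partitionValues: InternalRow): Unit
  def newFile(filePath: String): Unit
  def closeFile(filePath: String): Unit
  def newRow(filePath: String, row: InternalRow): Unit // filePath is the new parameter
  def getFinalStats(): WriteTaskStats
}
```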
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CustomWriteTaskStatsTrackerSuite.scala

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import scala.collection.mutable
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
+
+class CustomWriteTaskStatsTrackerSuite extends SparkFunSuite {
+
+  def checkFinalStats(tracker: CustomWriteTaskStatsTracker, result: Map[String, Int]): Unit = {
+    assert(tracker.getFinalStats().asInstanceOf[CustomWriteTaskStats].numRowsPerFile == result)
+  }
+
+  test("sequential file writing") {
+    val tracker = new CustomWriteTaskStatsTracker
+    tracker.newFile("a")
+    tracker.newRow("a", null)
+    tracker.newRow("a", null)
+    tracker.newFile("b")
+    checkFinalStats(tracker, Map("a" -> 2, "b" -> 0))
+  }
+
+  test("random file writing") {
+    val tracker = new CustomWriteTaskStatsTracker
+    tracker.newFile("a")
+    tracker.newRow("a", null)
+    tracker.newFile("b")
+    tracker.newRow("a", null)
+    tracker.newRow("b", null)
+    checkFinalStats(tracker, Map("a" -> 2, "b" -> 1))
+  }
+}
+
+class CustomWriteTaskStatsTracker extends WriteTaskStatsTracker {
+
+  val numRowsPerFile = mutable.Map.empty[String, Int]
+
+  override def newPartition(partitionValues: InternalRow): Unit = {}
+
+  override def newFile(filePath: String): Unit = {
+    numRowsPerFile.put(filePath, 0)
+  }
+
+  override def closeFile(filePath: String): Unit = {}
+
+  override def newRow(filePath: String, row: InternalRow): Unit = {
+    numRowsPerFile(filePath) += 1
+  }
+
+  override def getFinalStats(): WriteTaskStats = {
+    CustomWriteTaskStats(numRowsPerFile.toMap)
+  }
+}
+
+case class CustomWriteTaskStats(numRowsPerFile: Map[String, Int]) extends WriteTaskStats
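
For context on how such a tracker is wired into a write job: task-level trackers are produced by a job-level `WriteJobStatsTracker`, whose instances are shipped to executors and whose collected stats are aggregated back on the driver. The sketch below is an assumption-laden illustration, not part of this commit: the wrapper class name is hypothetical, and the exact `newTaskInstance()` / `processStats(...)` signatures should be checked against the Spark version you build against.

```scala
package org.apache.spark.sql.execution.datasources

// Hypothetical job-level companion for the CustomWriteTaskStatsTracker above
// (placed in the same package so the test classes are visible without imports).
// Assumption: WriteJobStatsTracker exposes newTaskInstance() and processStats(stats)
// with roughly these signatures around the time of this commit.
class CustomWriteJobStatsTracker extends WriteJobStatsTracker {

  // One fresh task-level tracker per write task; instances are serialized to executors.
  override def newTaskInstance(): WriteTaskStatsTracker = new CustomWriteTaskStatsTracker

  // Invoked on the driver with the stats each task returned from getFinalStats().
  override def processStats(stats: Seq[WriteTaskStats]): Unit = {
    val rowsPerFile = stats
      .map(_.asInstanceOf[CustomWriteTaskStats].numRowsPerFile)
      .reduceOption(_ ++ _)
      .getOrElse(Map.empty[String, Int])
    // Illustrative only: surface the aggregated per-file row counts somewhere useful.
    println(s"Rows written per file: $rowsPerFile")
  }
}
```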
