WriteTaskStatsTracker should know which file the row is written to

cloud-fan · cloud-fan · commit 88478b70c364 · 2021-05-07T00:28:55.000+08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala
@@ -151,7 +151,7 @@ class BasicWriteTaskStatsTracker(hadoopConf: Configuration)
     }
   }
 
-  override def newRow(row: InternalRow): Unit = {
+  override def newRow(filePath: String, row: InternalRow): Unit = {
     numRows += 1
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
@@ -157,7 +157,7 @@ class SingleDirectoryDataWriter(
     }
 
     currentWriter.write(record)
-    statsTrackers.foreach(_.newRow(record))
+    statsTrackers.foreach(_.newRow(currentWriter.path, record))
     recordsInFile += 1
   }
 }
@@ -301,7 +301,7 @@ abstract class BaseDynamicPartitionDataWriter(
   protected def writeRecord(record: InternalRow): Unit = {
     val outputRow = getOutputRow(record)
     currentWriter.write(outputRow)
-    statsTrackers.foreach(_.newRow(outputRow))
+    statsTrackers.foreach(_.newRow(currentWriter.path, outputRow))
     recordsInFile += 1
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteStatsTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteStatsTracker.scala
@@ -59,9 +59,10 @@ trait WriteTaskStatsTracker {
    * Process the fact that a new row to update the tracked statistics accordingly.
    * @note Keep in mind that any overhead here is per-row, obviously,
    *       so implementations should be as lightweight as possible.
+   * @param filePath Path of the file which the row is written to.
    * @param row Current data row to be processed.
    */
-  def newRow(row: InternalRow): Unit
+  def newRow(filePath: String, row: InternalRow): Unit
 
   /**
    * Returns the final statistics computed so far.

Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@ class BasicWriteTaskStatsTracker(hadoopConf: Configuration)`
`151`	`151`	`}`
`152`	`152`	`}`
`153`	`153`
`154`		`- override def newRow(row: InternalRow): Unit = {`
	`154`	`+ override def newRow(filePath: String, row: InternalRow): Unit = {`
`155`	`155`	`numRows += 1`
`156`	`156`	`}`
`157`	`157`
Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ class SingleDirectoryDataWriter(`
`157`	`157`	`}`
`158`	`158`
`159`	`159`	`currentWriter.write(record)`
`160`		`- statsTrackers.foreach(_.newRow(record))`
	`160`	`+ statsTrackers.foreach(_.newRow(currentWriter.path, record))`
`161`	`161`	`recordsInFile += 1`
`162`	`162`	`}`
`163`	`163`	`}`
`@@ -301,7 +301,7 @@ abstract class BaseDynamicPartitionDataWriter(`
`301`	`301`	`protected def writeRecord(record: InternalRow): Unit = {`
`302`	`302`	`val outputRow = getOutputRow(record)`
`303`	`303`	`currentWriter.write(outputRow)`
`304`		`- statsTrackers.foreach(_.newRow(outputRow))`
	`304`	`+ statsTrackers.foreach(_.newRow(currentWriter.path, outputRow))`
`305`	`305`	`recordsInFile += 1`
`306`	`306`	`}`
`307`	`307`	`}`