
Commit eebf9c6

Add custom metrics.
1 parent ac8307d commit eebf9c6

9 files changed: 150 additions, 6 deletions

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala

Lines changed: 9 additions & 0 deletions
@@ -23,6 +23,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
+import org.apache.spark.sql.connector.read.streaming.{CustomMetric, CustomSumMetric}
 import org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer
 
 /** A [[InputPartition]] for reading Kafka data in a batch based streaming query. */
@@ -105,4 +106,12 @@ private case class KafkaBatchPartitionReader(
       range
     }
   }
+
+  override def getCustomMetrics(): Array[CustomMetric] = {
+    Array(
+      CustomSumMetric("offsetOutOfRange", "estimated number of fetched offsets out of range",
+        consumer.getNumOffsetOutOfRange()),
+      CustomSumMetric("dataLoss", "number of data loss error",
+        consumer.getNumDataLoss()))
+  }
 }

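This reader-side pattern generalizes to any DSv2 source: the PartitionReader keeps plain counters while it reads and snapshots them in getCustomMetrics(). A minimal sketch under that assumption; RecordCountingReader, its delegate, and the recordsRead metric name are hypothetical and not part of this commit:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.read.PartitionReader
import org.apache.spark.sql.connector.read.streaming.{CustomMetric, CustomSumMetric}

// Hypothetical wrapper that counts how many rows pass through it and exposes the
// count as a custom metric once the partition has been consumed.
class RecordCountingReader(delegate: PartitionReader[InternalRow])
  extends PartitionReader[InternalRow] {

  private var recordsRead = 0L

  override def next(): Boolean = {
    val hasNext = delegate.next()
    if (hasNext) recordsRead += 1
    hasNext
  }

  override def get(): InternalRow = delegate.get()

  override def close(): Unit = delegate.close()

  // Snapshot of the counter; read on the executor once the scan's iterator is drained.
  override def getCustomMetrics(): Array[CustomMetric] =
    Array(CustomSumMetric("recordsRead", "number of records read", recordsRead))
}
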
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala

Lines changed: 6 additions & 1 deletion
@@ -24,7 +24,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory}
-import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset, ReadAllAvailable, ReadLimit, ReadMaxRows, SupportsAdmissionControl}
+import org.apache.spark.sql.connector.read.streaming.{CustomMetric, CustomSumMetric, MicroBatchStream, Offset, ReadAllAvailable, ReadLimit, ReadMaxRows, SupportsAdmissionControl}
 import org.apache.spark.sql.kafka010.KafkaSourceProvider._
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.util.UninterruptibleThread
@@ -217,4 +217,9 @@ private[kafka010] class KafkaMicroBatchStream(
       logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE")
     }
   }
+
+  override def supportedCustomMetrics(): Array[CustomMetric] =
+    Array(
+      CustomSumMetric("offsetOutOfRange", "estimated number of fetched offsets out of range", 0L),
+      CustomSumMetric("dataLoss", "number of data loss error", 0L))
 }

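supportedCustomMetrics() only declares names and descriptions up front so the driver can register a SQLMetric for each one (see MicroBatchScanExec below); the 0L values are placeholders, and the names must match what the partition readers later report through getCustomMetrics(), because the driver looks metrics up by name. A minimal sketch of the declaration side, reusing the hypothetical recordsRead metric from the reader sketch above (the trait name is illustrative):

import org.apache.spark.sql.connector.read.streaming.{CustomMetric, CustomSumMetric}

// Hypothetical mix-in for a stream implementation: it only declares the metric's
// name and description; the real values come from the partition readers at runtime.
trait DeclaresRecordCount {
  def supportedCustomMetrics(): Array[CustomMetric] =
    Array(CustomSumMetric("recordsRead", "number of records read", 0L))
}
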
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala

Lines changed: 14 additions & 0 deletions
@@ -239,6 +239,9 @@ private[kafka010] class KafkaDataConsumer(
     fetchedDataPool: FetchedDataPool) extends Logging {
   import KafkaDataConsumer._
 
+  private var offsetOutOfRange = 0L
+  private var dataLoss = 0L
+
   private val isTokenProviderEnabled =
     HadoopDelegationTokenManager.isServiceEnabled(SparkEnv.get.conf, "kafka")
 
@@ -329,7 +332,14 @@ private[kafka010] class KafkaDataConsumer(
 
           reportDataLoss(topicPartition, groupId, failOnDataLoss,
             s"Cannot fetch offset $toFetchOffset", e)
+
+          val oldToFetchOffsetd = toFetchOffset
           toFetchOffset = getEarliestAvailableOffsetBetween(consumer, toFetchOffset, untilOffset)
+          if (toFetchOffset == UNKNOWN_OFFSET) {
+            offsetOutOfRange += (untilOffset - oldToFetchOffsetd)
+          } else {
+            offsetOutOfRange += (toFetchOffset - oldToFetchOffsetd)
+          }
       }
     }
 
@@ -350,6 +360,9 @@ private[kafka010] class KafkaDataConsumer(
       consumer.getAvailableOffsetRange()
     }
 
+  def getNumOffsetOutOfRange(): Long = offsetOutOfRange
+  def getNumDataLoss(): Long = dataLoss
+
   /**
    * Release borrowed objects in data reader to the pool. Once the instance is created, caller
    * must call method after using the instance to make sure resources are not leaked.
@@ -596,6 +609,7 @@ private[kafka010] class KafkaDataConsumer(
       message: String,
       cause: Throwable = null): Unit = {
     val finalMessage = s"$message ${additionalMessage(topicPartition, groupId, failOnDataLoss)}"
+    dataLoss += 1
     reportDataLoss0(failOnDataLoss, finalMessage, cause)
   }

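The offsetOutOfRange bookkeeping above estimates how many offsets were skipped after a failed fetch: if the consumer can resume from a later offset, the gap up to that offset is counted; if nothing in the requested range is still available (UNKNOWN_OFFSET), the remainder up to untilOffset is counted. A worked example with illustrative numbers; the sentinel value used for UNKNOWN_OFFSET below is assumed for the sketch:

// Fetching offset 100 failed while reading the range [100, 200).
val untilOffset = 200L
val oldToFetchOffset = 100L
val UNKNOWN_OFFSET = -2L  // assumed sentinel meaning "no offset left to fetch"

def skippedOffsets(newToFetchOffset: Long): Long =
  if (newToFetchOffset == UNKNOWN_OFFSET) untilOffset - oldToFetchOffset
  else newToFetchOffset - oldToFetchOffset

skippedOffsets(150L)            // earliest available offset is now 150 -> 50 offsets skipped
skippedOffsets(UNKNOWN_OFFSET)  // nothing left in the range           -> 100 offsets skipped
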
sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReader.java

Lines changed: 9 additions & 0 deletions
@@ -21,6 +21,7 @@
 import java.io.IOException;
 
 import org.apache.spark.annotation.Evolving;
+import org.apache.spark.sql.connector.read.streaming.CustomMetric;
 
 /**
  * A partition reader returned by {@link PartitionReaderFactory#createReader(InputPartition)} or
@@ -48,4 +49,12 @@ public interface PartitionReader<T> extends Closeable {
    * Return the current record. This method should return same value until `next` is called.
    */
   T get();
+
+  /**
+   * Returns an array of custom metrics. By default it returns empty array.
+   */
+  default CustomMetric[] getCustomMetrics() {
+    CustomMetric[] NO_METRICS = {};
+    return NO_METRICS;
+  }
 }
sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/CustomMetric.java (new file)

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.streaming;
+
+import org.apache.spark.annotation.Evolving;
+
+/**
+ * A custom metric for {@link SparkDataStream}.
+ *
+ * @since 3.2.0
+ */
+@Evolving
+public interface CustomMetric {
+  /**
+   * Returns the name of custom metric.
+   */
+  String getName();
+
+  /**
+   * Returns the description of custom metric.
+   */
+  String getDescription();
+
+  /**
+   * Returns the value of custom metric.
+   */
+  Long getValue();
+}

sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/streaming/MicroBatchStream.java

Lines changed: 9 additions & 0 deletions
@@ -56,4 +56,13 @@ public interface MicroBatchStream extends SparkDataStream {
    * Returns a factory to create a {@link PartitionReader} for each {@link InputPartition}.
    */
   PartitionReaderFactory createReaderFactory();
+
+  /**
+   * Returns an array of supported custom metrics with name and description.
+   * By default it returns empty array.
+   */
+  default CustomMetric[] supportedCustomMetrics() {
+    CustomMetric[] NO_METRICS = {};
+    return NO_METRICS;
+  }
 }
sql/catalyst/src/main/scala/org/apache/spark/sql/connector/read/streaming/CustomSumMetric.scala (new file)

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.streaming
+
+case class CustomSumMetric(name: String, desc: String, value: Long) extends CustomMetric {
+  override def getName(): String = name
+  override def getDescription: String = desc
+  override def getValue: java.lang.Long = value
+}

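CustomSumMetric is just an immutable (name, description, value) snapshot; the "sum" semantics come from the driver side, where the values reported by every partition reader are added into the SQLMetric registered under the same name. A small usage sketch:

import org.apache.spark.sql.connector.read.streaming.CustomSumMetric

// Construct a snapshot and read it back through the CustomMetric accessors.
val metric = CustomSumMetric("dataLoss", "number of data loss error", 3L)
assert(metric.getName == "dataLoss")
assert(metric.getDescription == "number of data loss error")
assert(metric.getValue == 3L)
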
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala

Lines changed: 15 additions & 3 deletions
@@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
 import org.apache.spark.sql.vectorized.ColumnarBatch
+import org.apache.spark.util.CompletionIterator
 
 class DataSourceRDDPartition(val index: Int, val inputPartition: InputPartition)
   extends Partition with Serializable
@@ -36,7 +37,8 @@ class DataSourceRDD(
     sc: SparkContext,
     @transient private val inputPartitions: Seq[InputPartition],
     partitionReaderFactory: PartitionReaderFactory,
-    columnarReads: Boolean)
+    columnarReads: Boolean,
+    onCompletion: PartitionReader[_] => Unit = _ => {})
   extends RDD[InternalRow](sc, Nil) {
 
   override protected def getPartitions: Array[Partition] = {
@@ -55,11 +57,21 @@ class DataSourceRDD(
     val (iter, reader) = if (columnarReads) {
       val batchReader = partitionReaderFactory.createColumnarReader(inputPartition)
       val iter = new MetricsBatchIterator(new PartitionIterator[ColumnarBatch](batchReader))
-      (iter, batchReader)
+      def completionFunction = {
+        onCompletion(batchReader)
+      }
+      val completionIterator = CompletionIterator[ColumnarBatch, Iterator[ColumnarBatch]](
+        iter, completionFunction)
+      (completionIterator, batchReader)
     } else {
       val rowReader = partitionReaderFactory.createReader(inputPartition)
       val iter = new MetricsRowIterator(new PartitionIterator[InternalRow](rowReader))
-      (iter, rowReader)
+      def completionFunction = {
+        onCompletion(rowReader)
+      }
+      val completionIterator = CompletionIterator[InternalRow, Iterator[InternalRow]](
+        iter, completionFunction)
+      (completionIterator, rowReader)
     }
     context.addTaskCompletionListener[Unit](_ => reader.close())
     // TODO: SPARK-25083 remove the type erasure hack in data source scan

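The point of this change is when onCompletion runs: CompletionIterator fires the callback once the wrapped per-partition iterator is fully consumed, so the reader's final counter values can be harvested before the task-completion listener closes the reader. A minimal stand-in for that pattern, not Spark's actual CompletionIterator implementation:

// Wrap an iterator so that a callback runs exactly once when it is drained.
class OnDrainIterator[A](underlying: Iterator[A], onDrain: () => Unit) extends Iterator[A] {
  private var completed = false

  override def hasNext: Boolean = {
    val more = underlying.hasNext
    if (!more && !completed) {
      completed = true
      onDrain()  // e.g. read reader.getCustomMetrics() here and publish the values
    }
    more
  }

  override def next(): A = underlying.next()
}
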
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MicroBatchScanExec.scala

Lines changed: 21 additions & 2 deletions
@@ -20,8 +20,9 @@ package org.apache.spark.sql.execution.datasources.v2
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan}
+import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory, Scan}
 import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset}
+import org.apache.spark.sql.execution.metric.SQLMetrics
 
 /**
  * Physical plan node for scanning a micro-batch of data from a data source.
@@ -33,6 +34,14 @@ case class MicroBatchScanExec(
     @transient start: Offset,
     @transient end: Offset) extends DataSourceV2ScanExecBase {
 
+  override lazy val metrics = {
+    val customMetrics = stream.supportedCustomMetrics().map { customMetric =>
+      customMetric.getName -> SQLMetrics.createMetric(sparkContext, customMetric.getDescription)
+    }
+    Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) ++
+      customMetrics
+  }
+
   // TODO: unify the equal/hashCode implementation for all data source v2 query plans.
   override def equals(other: Any): Boolean = other match {
     case other: MicroBatchScanExec => this.stream == other.stream
@@ -45,7 +54,17 @@ case class MicroBatchScanExec(
 
   override lazy val readerFactory: PartitionReaderFactory = stream.createReaderFactory()
 
+  /**
+   * The callback function which is called when the output iterator of input RDD is consumed
+   * completely.
+   */
+  private def onOutputCompletion(reader: PartitionReader[_]) = {
+    reader.getCustomMetrics.foreach { metric =>
+      longMetric(metric.getName) += metric.getValue
+    }
+  }
+
   override lazy val inputRDD: RDD[InternalRow] = {
-    new DataSourceRDD(sparkContext, partitions, readerFactory, supportsColumnar)
+    new DataSourceRDD(sparkContext, partitions, readerFactory, supportsColumnar, onOutputCompletion)
   }
 }

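Taken together, the pieces above form the full path of a custom metric: the MicroBatchStream declares metric names and descriptions via supportedCustomMetrics(), MicroBatchScanExec registers a SQLMetric for each of them alongside numOutputRows, every PartitionReader accumulates its own values on the executor, and once a partition's output iterator is drained, DataSourceRDD's completion callback passes the reader to onOutputCompletion, which adds each reported value into the SQLMetric with the matching name.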