
Commit f938614

address comments
1 parent ca80080 commit f938614

11 files changed, +58 -38 lines


external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchReadSupport.scala

Lines changed: 9 additions & 5 deletions
@@ -73,6 +73,8 @@ private[kafka010] class KafkaMicroBatchReadSupport(
 
   private val rangeCalculator = KafkaOffsetRangeCalculator(options)
 
+  private var endPartitionOffsets: KafkaSourceOffset = _
+
   /**
    * Lazily initialize `initialPartitionOffsets` to make sure that `KafkaConsumer.poll` is only
    * called in StreamExecutionThread. Otherwise, interrupting a thread while running
@@ -85,11 +87,12 @@ private[kafka010] class KafkaMicroBatchReadSupport(
   override def latestOffset(start: Offset): Offset = {
     val startPartitionOffsets = start.asInstanceOf[KafkaSourceOffset].partitionToOffsets
     val latestPartitionOffsets = kafkaOffsetReader.fetchLatestOffsets()
-    KafkaSourceOffset(maxOffsetsPerTrigger.map { maxOffsets =>
+    endPartitionOffsets = KafkaSourceOffset(maxOffsetsPerTrigger.map { maxOffsets =>
       rateLimit(maxOffsets, startPartitionOffsets, latestPartitionOffsets)
     }.getOrElse {
       latestPartitionOffsets
     })
+    endPartitionOffsets
   }
 
   override def fullSchema(): StructType = KafkaOffsetReader.kafkaSchema
@@ -153,10 +156,11 @@ private[kafka010] class KafkaMicroBatchReadSupport(
     KafkaMicroBatchReaderFactory
   }
 
-  override def getCustomMetrics(config: ScanConfig): CustomMetrics = {
-    val endPartitionOffsets = config.asInstanceOf[SimpleStreamingScanConfig]
-      .end.get.asInstanceOf[KafkaSourceOffset].partitionToOffsets
-    KafkaCustomMetrics(kafkaOffsetReader.fetchLatestOffsets(), endPartitionOffsets)
+  // TODO: figure out the life cycle of custom metrics, and make this method take `ScanConfig` as
+  // a parameter.
+  override def getCustomMetrics(): CustomMetrics = {
+    KafkaCustomMetrics(
+      kafkaOffsetReader.fetchLatestOffsets(), endPartitionOffsets.partitionToOffsets)
   }
 
   override def deserializeOffset(json: String): Offset = {
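
Note: the substantive change above is that the end offsets computed in latestOffset() are now cached in a field so the parameterless getCustomMetrics() can report them without a ScanConfig. A minimal, self-contained Scala sketch of that pattern; FakeOffset, FakeMetrics and FakeMicroBatchSource are illustrative stand-ins (not Spark or Kafka classes) and the rate limiting is deliberately naive.

object OffsetCachingSketch {

  final case class FakeOffset(partitionToOffsets: Map[Int, Long])
  final case class FakeMetrics(latestOffsets: Map[Int, Long], processedOffsets: Map[Int, Long])

  class FakeMicroBatchSource(
      fetchLatest: () => Map[Int, Long],
      maxOffsetsPerTrigger: Option[Long]) {

    // Cached by latestOffset() so the parameterless getCustomMetrics() can read it later.
    private var endPartitionOffsets: FakeOffset = _

    def latestOffset(start: FakeOffset): FakeOffset = {
      val latest = fetchLatest()
      val limited = maxOffsetsPerTrigger match {
        case Some(max) =>
          // Naive rate limit: advance each partition by at most `max` records per trigger.
          latest.map { case (p, off) =>
            p -> math.min(off, start.partitionToOffsets.getOrElse(p, 0L) + max)
          }
        case None => latest
      }
      endPartitionOffsets = FakeOffset(limited)
      endPartitionOffsets
    }

    def getCustomMetrics(): FakeMetrics =
      FakeMetrics(fetchLatest(), endPartitionOffsets.partitionToOffsets)
  }

  def main(args: Array[String]): Unit = {
    val source = new FakeMicroBatchSource(() => Map(0 -> 100L, 1 -> 80L), Some(10L))
    source.latestOffset(FakeOffset(Map(0 -> 50L, 1 -> 70L)))
    // Prints FakeMetrics(Map(0 -> 100, 1 -> 80),Map(0 -> 60, 1 -> 80))
    println(source.getCustomMetrics())
  }
}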

sql/core/src/main/java/org/apache/spark/sql/sources/v2/BatchReadSupportProvider.java

Lines changed: 6 additions & 2 deletions
@@ -34,7 +34,10 @@ public interface BatchReadSupportProvider extends DataSourceV2 {
 
   /**
    * Creates a {@link BatchReadSupport} instance to load the data from this data source with a user
-   * specified schema.
+   * specified schema, which is called by Spark at the beginning of each batch query.
+   *
+   * Spark will call this method at the beginning of each batch query to create a
+   * {@link BatchReadSupport} instance.
    *
    * By default this method throws {@link UnsupportedOperationException}, implementations should
    * override this method to handle user specified schema.
@@ -48,7 +51,8 @@ default BatchReadSupport createBatchReadSupport(StructType schema, DataSourceOpt
   }
 
   /**
-   * Creates a {@link BatchReadSupport} instance to scan the data from this data source.
+   * Creates a {@link BatchReadSupport} instance to scan the data from this data source, which is
+   * called by Spark at the beginning of each batch query.
    *
    * @param options the options for the returned data source reader, which is an immutable
    *                case-insensitive string-to-string map.
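
Note: the contract being documented is that the provider's factory methods are entry points Spark invokes once at the start of each batch query, and that the schema variant rejects a user specified schema by default. A minimal sketch of that shape using hypothetical stand-in types (SimpleOptions, SimpleSchema, SimpleReadSupport), not the real Spark interfaces.

// Stand-ins for the provider contract described above; these are NOT the real Spark
// interfaces, just a sketch of the shape the Javadoc describes.
object ProviderSketch {
  trait SimpleReadSupport                                      // stands in for BatchReadSupport
  final case class SimpleOptions(entries: Map[String, String]) // stands in for DataSourceOptions
  final case class SimpleSchema(fieldNames: Seq[String])       // stands in for StructType

  trait SimpleBatchReadSupportProvider {
    // Called by the engine at the beginning of each batch query.
    def createBatchReadSupport(options: SimpleOptions): SimpleReadSupport

    // Schema variant: per the Javadoc, sources that do not handle a user specified
    // schema reject it, which is what this default does.
    def createBatchReadSupport(schema: SimpleSchema, options: SimpleOptions): SimpleReadSupport =
      throw new UnsupportedOperationException(
        s"${getClass.getName} does not support user specified schema")
  }

  // A source that relies on the default for the schema variant.
  class ExampleSource extends SimpleBatchReadSupportProvider {
    override def createBatchReadSupport(options: SimpleOptions): SimpleReadSupport =
      new SimpleReadSupport {} // a fresh instance per batch query
  }
}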

sql/core/src/main/java/org/apache/spark/sql/sources/v2/BatchWriteSupportProvider.java

Lines changed: 3 additions & 1 deletion
@@ -35,7 +35,9 @@
 public interface BatchWriteSupportProvider extends DataSourceV2 {
 
   /**
-   * Creates an optional {@link BatchWriteSupport} instance to save the data to this data source.
+   * Creates an optional {@link BatchWriteSupport} instance to save the data to this data source,
+   * which is called by Spark at the beginning of each batch query.
+   *
    * Data sources can return None if there is no writing needed to be done according to the save
    * mode.
    *

sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupportProvider.java

Lines changed: 3 additions & 2 deletions
@@ -34,7 +34,8 @@ public interface ContinuousReadSupportProvider extends DataSourceV2 {
 
   /**
    * Creates a {@link ContinuousReadSupport} instance to scan the data from this streaming data
-   * source with a user specified schema.
+   * source with a user specified schema, which is called by Spark at the beginning of each
+   * continuous streaming query.
    *
    * By default this method throws {@link UnsupportedOperationException}, implementations should
    * override this method to handle user specified schema.
@@ -55,7 +56,7 @@ default ContinuousReadSupport createContinuousReadSupport(
 
   /**
    * Creates a {@link ContinuousReadSupport} instance to scan the data from this streaming data
-   * source.
+   * source, which is called by Spark at the beginning of each continuous streaming query.
    *
    * @param checkpointLocation a path to Hadoop FS scratch space that can be used for failure
    *                           recovery. Readers for the same logical source in the same query

sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupportProvider.java

Lines changed: 3 additions & 2 deletions
@@ -34,7 +34,8 @@ public interface MicroBatchReadSupportProvider extends DataSourceV2 {
 
   /**
    * Creates a {@link MicroBatchReadSupport} instance to scan the data from this streaming data
-   * source with a user specified schema.
+   * source with a user specified schema, which is called by Spark at the beginning of each
+   * micro-batch streaming query.
    *
    * By default this method throws {@link UnsupportedOperationException}, implementations should
    * override this method to handle user specified schema.
@@ -55,7 +56,7 @@ default MicroBatchReadSupport createMicroBatchReadSupport(
 
   /**
    * Creates a {@link MicroBatchReadSupport} instance to scan the data from this streaming data
-   * source.
+   * source, which is called by Spark at the beginning of each micro-batch streaming query.
    *
    * @param checkpointLocation a path to Hadoop FS scratch space that can be used for failure
    *                           recovery. Readers for the same logical source in the same query

sql/core/src/main/java/org/apache/spark/sql/sources/v2/StreamingWriteSupportProvider.java

Lines changed: 2 additions & 1 deletion
@@ -34,7 +34,8 @@
 public interface StreamingWriteSupportProvider extends DataSourceV2, BaseStreamingSink {
 
   /**
-   * Creates a {@link StreamingWriteSupport} instance to save the data to this data source.
+   * Creates a {@link StreamingWriteSupport} instance to save the data to this data source, which is
+   * called by Spark at the beginning of each streaming query.
    *
    * @param queryId A unique string for the writing query. It's possible that there are many
    *                writing queries running at the same time, and the returned

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/BatchReadSupport.java

Lines changed: 8 additions & 5 deletions
@@ -24,17 +24,20 @@
  *
  * The execution engine will get an instance of this interface from a data source provider
  * (e.g. {@link org.apache.spark.sql.sources.v2.BatchReadSupportProvider}) at the start of a batch
- * query, then call {@link #newScanConfigBuilder()} to create an instance of {@link ScanConfig}. The
- * {@link ScanConfigBuilder} can apply operator pushdown and keep the pushdown result in
+ * query, then call {@link #newScanConfigBuilder()} and create an instance of {@link ScanConfig}.
+ * The {@link ScanConfigBuilder} can apply operator pushdown and keep the pushdown result in
  * {@link ScanConfig}. The {@link ScanConfig} will be used to create input partitions and reader
- * factory to scan data from the data source.
+ * factory to scan data from the data source with a Spark job.
  */
 @InterfaceStability.Evolving
 public interface BatchReadSupport extends ReadSupport {
 
   /**
-   * Returns a builder of {@link ScanConfig}. The builder can take some query specific information
-   * to do operators pushdown, and keep these information in the created {@link ScanConfig}.
+   * Returns a builder of {@link ScanConfig}. Spark will call this method and create a
+   * {@link ScanConfig} for each data scanning job.
+   *
+   * The builder can take some query specific information to do operators pushdown, and keep these
+   * information in the created {@link ScanConfig}.
    *
    * This is the first step of the data scan. All other methods in {@link BatchReadSupport} needs
    * to take {@link ScanConfig} as an input.
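
Note: as the class doc describes, the engine asks for one ScanConfig per scan and then passes that same config into the later planning methods. A hedged sketch of that call sequence with stand-in traits; createReaderFactory is an assumed name for the "reader factory" step mentioned in the prose, not necessarily the real method.

// Stand-in traits (not the real Spark API) sketching the call sequence from the class doc.
object BatchScanFlowSketch {
  trait FakeScanConfig
  trait FakeScanConfigBuilder { def build(): FakeScanConfig }
  trait FakeInputPartition
  trait FakeReaderFactory

  trait FakeBatchReadSupport {
    def newScanConfigBuilder(): FakeScanConfigBuilder
    def planInputPartitions(config: FakeScanConfig): Array[FakeInputPartition]
    // Assumed name for the "reader factory" step mentioned in the prose.
    def createReaderFactory(config: FakeScanConfig): FakeReaderFactory
  }

  // Engine-side pseudo-flow for one batch scan.
  def runBatchScan(readSupport: FakeBatchReadSupport): Unit = {
    val config = readSupport.newScanConfigBuilder().build()  // step 1: pushdown ends up in config
    val partitions = readSupport.planInputPartitions(config) // step 2: same config flows back in
    val factory = readSupport.createReaderFactory(config)    // step 3: factory used by the Spark job
    // ... launch a Spark job that reads each partition through the factory ...
  }
}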

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ScanConfig.java

Lines changed: 3 additions & 3 deletions
@@ -21,9 +21,9 @@
 import org.apache.spark.sql.types.StructType;
 
 /**
- * An interface that carries query specific information for the data scan, like operator pushdown
- * information and streaming query offsets. This is defined as an empty interface, and data sources
- * should define their own {@link ScanConfig} classes.
+ * An interface that carries query specific information for the data scanning job, like operator
+ * pushdown information and streaming query offsets. This is defined as an empty interface, and data
+ * sources should define their own {@link ScanConfig} classes.
  *
  * For APIs that take a {@link ScanConfig} as input, like
  * {@link ReadSupport#planInputPartitions(ScanConfig)},
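
Note: since ScanConfig is an empty marker interface, a source typically defines its own immutable config class carrying the pushdown results and, for streaming, the offsets (the Kafka diff above casts to SimpleStreamingScanConfig for exactly this reason). An illustrative, hypothetical example of such a source-defined config.

// Hypothetical source-defined ScanConfig: an immutable value object holding whatever the
// builder decided during pushdown, plus streaming offsets. Field names are illustrative.
object ScanConfigSketch {
  trait ScanConfigLike // stands in for the (empty) ScanConfig marker interface

  final case class MyScanConfig(
      readColumns: Seq[String],             // columns that survived column pruning
      pushedFilters: Seq[String],           // filters the source agreed to evaluate itself
      startOffsets: Option[Map[Int, Long]], // streaming only: per-partition start offsets
      endOffsets: Option[Map[Int, Long]])   // streaming only: per-partition end offsets
    extends ScanConfigLike
}

Methods that receive the marker interface downcast back to the concrete class, which is what the Kafka code above does with config.asInstanceOf[SimpleStreamingScanConfig].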

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/ContinuousReadSupport.java

Lines changed: 9 additions & 7 deletions
@@ -29,20 +29,22 @@
  *
  * The execution engine will get an instance of this interface from a data source provider
  * (e.g. {@link org.apache.spark.sql.sources.v2.ContinuousReadSupportProvider}) at the start of a
- * streaming query, then call {@link #newScanConfigBuilder(Offset)} to create an instance of
+ * streaming query, then call {@link #newScanConfigBuilder(Offset)} and create an instance of
  * {@link ScanConfig} for the duration of the streaming query or until
  * {@link #needsReconfiguration(ScanConfig)} is true. The {@link ScanConfig} will be used to create
- * input partitions and reader factory to scan data for its duration. At the end {@link #stop()}
- * will be called when the streaming execution is completed. Note that a single query may have
- * multiple executions due to restart or failure recovery.
+ * input partitions and reader factory to scan data with a Spark job for its duration. At the end
+ * {@link #stop()} will be called when the streaming execution is completed. Note that a single
+ * query may have multiple executions due to restart or failure recovery.
  */
 @InterfaceStability.Evolving
 public interface ContinuousReadSupport extends StreamingReadSupport, BaseStreamingSource {
 
   /**
-   * Returns a builder of {@link ScanConfig}. The builder can take some query specific information
-   * to do operators pushdown, streaming offsets, etc., and keep these information in the
-   * created {@link ScanConfig}.
+   * Returns a builder of {@link ScanConfig}. Spark will call this method and create a
+   * {@link ScanConfig} for each data scanning job.
+   *
+   * The builder can take some query specific information to do operators pushdown, store streaming
+   * offsets, etc., and keep these information in the created {@link ScanConfig}.
    *
    * This is the first step of the data scan. All other methods in {@link ContinuousReadSupport}
    * needs to take {@link ScanConfig} as an input.
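
Note: the lifecycle spelled out in the class doc is that one ScanConfig is built at the start and reused until needsReconfiguration(ScanConfig) returns true, and stop() is called once when the streaming execution completes. A hypothetical engine-side loop with stand-in types, purely to make that ordering concrete; this is not Spark's actual scheduler code.

// Hypothetical engine-side loop for the continuous lifecycle; types and the loop itself
// are illustrative stand-ins, not Spark internals.
object ContinuousFlowSketch {
  trait FakeOffset
  trait FakeScanConfig
  trait FakeScanConfigBuilder { def build(): FakeScanConfig }

  trait FakeContinuousReadSupport {
    def newScanConfigBuilder(start: FakeOffset): FakeScanConfigBuilder
    def needsReconfiguration(config: FakeScanConfig): Boolean
    def stop(): Unit
  }

  def runContinuousQuery(
      readSupport: FakeContinuousReadSupport,
      initialStart: FakeOffset,
      latestCommitted: () => FakeOffset, // where the scan would restart from
      keepRunning: () => Boolean): Unit = {
    var start = initialStart
    while (keepRunning()) {
      // One ScanConfig is built and reused for the duration of the long-running scan ...
      val config = readSupport.newScanConfigBuilder(start).build()
      // ... until the source reports that it needs reconfiguration (or the query ends).
      while (keepRunning() && !readSupport.needsReconfiguration(config)) {
        Thread.sleep(100) // placeholder for monitoring the long-running Spark job
      }
      start = latestCommitted()
    }
    readSupport.stop() // called once, when the streaming execution completes
  }
}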

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/streaming/MicroBatchReadSupport.java

Lines changed: 10 additions & 8 deletions
@@ -27,19 +27,21 @@
  *
  * The execution engine will get an instance of this interface from a data source provider
  * (e.g. {@link org.apache.spark.sql.sources.v2.MicroBatchReadSupportProvider}) at the start of a
- * streaming query, then call {@link #newScanConfigBuilder(Offset, Offset)} to create an instance of
- * {@link ScanConfig} for each micro-batch. The {@link ScanConfig} will be used to create input
- * partitions and reader factory to scan a micro-batch. At the end {@link #stop()} will be called
- * when the streaming execution is completed. Note that a single query may have multiple executions
- * due to restart or failure recovery.
+ * streaming query, then call {@link #newScanConfigBuilder(Offset, Offset)} and create an instance
+ * of {@link ScanConfig} for each micro-batch. The {@link ScanConfig} will be used to create input
+ * partitions and reader factory to scan a micro-batch with a Spark job. At the end {@link #stop()}
+ * will be called when the streaming execution is completed. Note that a single query may have
+ * multiple executions due to restart or failure recovery.
  */
 @InterfaceStability.Evolving
 public interface MicroBatchReadSupport extends StreamingReadSupport, BaseStreamingSource {
 
   /**
-   * Returns a builder of {@link ScanConfig}. The builder can take some query specific information
-   * to do operators pushdown, take streaming offsets, etc., and keep these information in the
-   * created {@link ScanConfig}.
+   * Returns a builder of {@link ScanConfig}. Spark will call this method and create a
+   * {@link ScanConfig} for each data scanning job.
+   *
+   * The builder can take some query specific information to do operators pushdown, store streaming
+   * offsets, etc., and keep these information in the created {@link ScanConfig}.
    *
    * This is the first step of the data scan. All other methods in {@link MicroBatchReadSupport}
    * needs to take {@link ScanConfig} as an input.
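
Note: for micro-batch sources the builder is created per micro-batch from a (start, end) offset pair, and the offsets it captures are what end up in the ScanConfig. An illustrative sketch with stand-in types; the pruneColumns hook is hypothetical and only stands in for the "query specific information" the builder might keep.

// Stand-in types sketching the per-micro-batch builder contract; pruneColumns is a
// hypothetical pushdown hook, not a real Spark method.
object MicroBatchBuilderSketch {
  final case class FakeOffset(partitionToOffsets: Map[Int, Long])
  final case class FakeScanConfig(
      start: FakeOffset,
      end: FakeOffset,
      readColumns: Seq[String])

  class FakeScanConfigBuilder(start: FakeOffset, end: FakeOffset) {
    private var columns: Seq[String] = Seq("*")

    // Query specific information gathered before build(); kept in the resulting config.
    def pruneColumns(required: Seq[String]): this.type = { columns = required; this }

    def build(): FakeScanConfig = FakeScanConfig(start, end, columns)
  }

  // What a MicroBatchReadSupport-like source does: Spark asks for a new builder
  // (and therefore a new ScanConfig) for every micro-batch's (start, end) offset range.
  def newScanConfigBuilder(start: FakeOffset, end: FakeOffset): FakeScanConfigBuilder =
    new FakeScanConfigBuilder(start, end)
}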
