Commit d2c86f4

document improvement
1 parent a1301f5 commit d2c86f4

File tree

14 files changed: +89 −46 lines changed

sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2.java

Lines changed: 4 additions & 1 deletion
@@ -17,12 +17,15 @@

 package org.apache.spark.sql.sources.v2;

+import org.apache.spark.annotation.InterfaceStability;
+
 /**
  * The base interface for data source v2. Implementations must have a public, no arguments
  * constructor.
  *
  * Note that this is an empty interface, data source implementations should mix-in at least one of
- * the plug-in interfaces like `ReadSupport`. Otherwise it's just a dummy data source which is
+ * the plug-in interfaces like {@link ReadSupport}. Otherwise it's just a dummy data source which is
  * un-readable/writable.
  */
+@InterfaceStability.Evolving
 public interface DataSourceV2 {}
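
As the Javadoc notes, DataSourceV2 by itself is an empty marker interface, so a source only becomes readable by mixing in ReadSupport (or ReadSupportWithSchema). A minimal sketch follows; EmptySource is a hypothetical name, and the readSchema()/createReadTasks() signatures are inferred from the {@link} references added elsewhere in this commit.

import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.DataSourceV2Options;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.StructType;

public class EmptySource implements DataSourceV2, ReadSupport {

  // Data source v2 implementations must have a public, no-arguments constructor.
  public EmptySource() {}

  @Override
  public DataSourceV2Reader createReader(DataSourceV2Options options) {
    return new DataSourceV2Reader() {
      @Override
      public StructType readSchema() {
        // assumed signature: report the schema of the data to be scanned
        return new StructType().add("value", "string");
      }

      @Override
      public List<ReadTask<Row>> createReadTasks() {
        return Collections.emptyList(); // an empty, zero-partition scan
      }
    };
  }
}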

sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java

Lines changed: 3 additions & 0 deletions
@@ -22,10 +22,13 @@
 import java.util.Map;
 import java.util.Optional;

+import org.apache.spark.annotation.InterfaceStability;
+
 /**
  * An immutable string-to-string map in which keys are case-insensitive. This is used to represent
  * data source options.
  */
+@InterfaceStability.Evolving
 public class DataSourceV2Options {
   private final Map<String, String> keyLowerCasedMap;
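
A short sketch of how the case-insensitive lookup is expected to behave. The Map-accepting constructor and the Optional-returning get(...) are assumptions, based on the java.util.Optional import and the keyLowerCasedMap field visible in this hunk.

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.sql.sources.v2.DataSourceV2Options;

public class OptionsExample {
  public static void main(String[] args) {
    Map<String, String> raw = new HashMap<>();
    raw.put("PATH", "/tmp/data"); // keys are stored lower-cased internally

    DataSourceV2Options options = new DataSourceV2Options(raw); // assumed constructor

    // Lookups are case-insensitive; absent keys yield Optional.empty(), not null.
    System.out.println(options.get("path").orElse("<missing>"));    // prints /tmp/data
    System.out.println(options.get("missing").orElse("<missing>")); // prints <missing>
  }
}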

sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java

Lines changed: 5 additions & 3 deletions
@@ -17,16 +17,18 @@

 package org.apache.spark.sql.sources.v2;

+import org.apache.spark.annotation.InterfaceStability;
 import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;

 /**
- * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide data reading
- * ability and scan the data from the data source.
+ * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to
+ * provide data reading ability and scan the data from the data source.
  */
+@InterfaceStability.Evolving
 public interface ReadSupport {

   /**
-   * Creates a `DataSourceV2Reader` to scan the data for this data source.
+   * Creates a {@link DataSourceV2Reader} to scan the data from this data source.
    *
    * @param options the options for this data source reader, which is an immutable case-insensitive
    *                string-to-string map.
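
From the user side, Spark instantiates the class named in format() through its no-arg constructor and, because it mixes in ReadSupport, calls createReader(options). A hedged sketch, assuming the DataFrameReader integration for v2 sources and reusing the hypothetical EmptySource from the earlier sketch:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadSupportUsage {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("ReadSupportUsage")
        .getOrCreate();

    // Every .option(...) entry is delivered to createReader as one immutable,
    // case-insensitive DataSourceV2Options map.
    Dataset<Row> df = spark.read()
        .format("com.example.EmptySource") // hypothetical source from the sketch above
        .option("path", "/tmp/data")
        .load();

    df.show();
    spark.stop();
  }
}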

sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java

Lines changed: 8 additions & 6 deletions
@@ -17,21 +17,23 @@

 package org.apache.spark.sql.sources.v2;

+import org.apache.spark.annotation.InterfaceStability;
 import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
 import org.apache.spark.sql.types.StructType;

 /**
- * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide data reading
- * ability and scan the data from the data source.
+ * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to
+ * provide data reading ability and scan the data from the data source.
  *
- * This is a variant of `ReadSupport` that accepts user-specified schema when reading data. A data
- * source can implement both `ReadSupport` and `ReadSupportWithSchema` if it supports both schema
- * inference and user-specified schema.
+ * This is a variant of {@link ReadSupport} that accepts user-specified schema when reading data.
+ * A data source can implement both {@link ReadSupport} and {@link ReadSupportWithSchema} if it
+ * supports both schema inference and user-specified schema.
  */
+@InterfaceStability.Evolving
 public interface ReadSupportWithSchema {

   /**
-   * Create a `DataSourceV2Reader` to scan the data for this data source.
+   * Create a {@link DataSourceV2Reader} to scan the data from this data source.
    *
    * @param schema the full schema of this data source reader. Full schema usually maps to the
    *               physical schema of the underlying storage of this data source reader, e.g.
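
A sketch of a source that supports both schema inference and a user-specified schema by mixing in both interfaces, as the Javadoc above allows. FlexibleSource is hypothetical, and the anonymous reader's methods are the assumed DataSourceV2Reader signatures:

import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.DataSourceV2Options;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.StructType;

public class FlexibleSource implements DataSourceV2, ReadSupport, ReadSupportWithSchema {

  @Override // chosen when the user does not supply a schema: infer one
  public DataSourceV2Reader createReader(DataSourceV2Options options) {
    return reader(new StructType().add("value", "string")); // "inferred" schema
  }

  @Override // chosen when the user calls DataFrameReader.schema(...)
  public DataSourceV2Reader createReader(StructType schema, DataSourceV2Options options) {
    return reader(schema); // honor the user-specified schema as the full schema
  }

  private DataSourceV2Reader reader(StructType schema) {
    return new DataSourceV2Reader() {
      @Override
      public StructType readSchema() { return schema; }

      @Override
      public List<ReadTask<Row>> createReadTasks() {
        return Collections.emptyList();
      }
    };
  }
}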

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java

Lines changed: 5 additions & 1 deletion
@@ -19,9 +19,13 @@

 import java.io.Closeable;

+import org.apache.spark.annotation.InterfaceStability;
+
 /**
- * A data reader returned by a read task and is responsible for outputting data for a RDD partition.
+ * A data reader returned by {@link ReadTask#createReader()} and is responsible for outputting data
+ * for a RDD partition.
  */
+@InterfaceStability.Evolving
 public interface DataReader<T> extends Closeable {

   /**
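
A sketch of the executor-side half of the contract. The next()/get() iteration methods are assumptions in line with the Iterator analogy in ReadTask's Javadoc; close() comes from Closeable. RangeDataReader is a hypothetical reader that emits the integers [start, end) as single-column rows:

import java.io.IOException;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;

public class RangeDataReader implements DataReader<Row> {
  private final int end;
  private int current;

  public RangeDataReader(int start, int end) {
    this.current = start - 1; // positioned before the first row until next() is called
    this.end = end;
  }

  @Override
  public boolean next() { // advance; false once this partition is exhausted
    current += 1;
    return current < end;
  }

  @Override
  public Row get() { // return the row the reader is currently positioned on
    return RowFactory.create(current);
  }

  @Override
  public void close() throws IOException {
    // release connections or file handles here; nothing to do for an in-memory reader
  }
}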

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java

Lines changed: 12 additions & 5 deletions
@@ -19,26 +19,33 @@

 import java.util.List;

+import org.apache.spark.annotation.InterfaceStability;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.sources.v2.DataSourceV2Options;
+import org.apache.spark.sql.sources.v2.ReadSupport;
+import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
 import org.apache.spark.sql.types.StructType;

 /**
- * A data source reader that can mix in various query optimization interfaces and implement these
- * optimizations. The actual scan logic should be delegated to `ReadTask`s that are returned by
- * this data source reader.
+ * A data source reader that is returned by
+ * {@link ReadSupport#createReader(DataSourceV2Options)} or
+ * {@link ReadSupportWithSchema#createReader(StructType, DataSourceV2Options)}.
+ * It can mix in various query optimization interfaces to speed up the data scan. The actual scan
+ * logic should be delegated to {@link ReadTask}s that are returned by {@link #createReadTasks()}.
  *
  * There are mainly 3 kinds of query optimizations:
  *   1. Operators push-down. E.g., filter push-down, required columns push-down(aka column
  *      pruning), etc. These push-down interfaces are named like `SupportsPushDownXXX`.
  *   2. Information Reporting. E.g., statistics reporting, ordering reporting, etc. These
  *      reporting interfaces are named like `SupportsReportingXXX`.
- *   3. Special scan. E.g, columnar scan, unsafe row scan, etc. Note that a data source reader can
- *      implement at most one special scan. These scan interfaces are named like `SupportsScanXXX`.
+ *   3. Special scans. E.g, columnar scan, unsafe row scan, etc. These scan interfaces are named
+ *      like `SupportsScanXXX`.
  *
  * Spark first applies all operator push-down optimizations that this data source supports. Then
  * Spark collects information this data source reported for further optimizations. Finally Spark
  * issues the scan request and does the actual data reading.
  */
+@InterfaceStability.Evolving
 public interface DataSourceV2Reader {

   /**
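
A sketch of the delegation this Javadoc describes: the reader plans the scan on the driver, and each ReadTask is shipped to an executor, where it creates the DataReader for one partition. RangeReader and RangeReadTask are hypothetical names; RangeDataReader is the sketch under DataReader.java above. readSchema() is an assumed signature.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.StructType;

public class RangeReader implements DataSourceV2Reader {

  @Override
  public StructType readSchema() {
    return new StructType().add("i", "int");
  }

  @Override
  public List<ReadTask<Row>> createReadTasks() {
    // One ReadTask per RDD partition: here [0, 5) and [5, 10).
    return Arrays.<ReadTask<Row>>asList(new RangeReadTask(0, 5), new RangeReadTask(5, 10));
  }
}

// Serializable because Spark ships each task to an executor before running it.
class RangeReadTask implements ReadTask<Row> {
  private final int start;
  private final int end;

  RangeReadTask(int start, int end) {
    this.start = start;
    this.end = end;
  }

  @Override
  public DataReader<Row> createReader() {
    return new RangeDataReader(start, end); // created on the executor
  }
}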

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java

Lines changed: 6 additions & 2 deletions
@@ -19,13 +19,17 @@

 import java.io.Serializable;

+import org.apache.spark.annotation.InterfaceStability;
+
 /**
- * A read task returned by a data source reader and is responsible to create the data reader.
- * The relationship between `ReadTask` and `DataReader` is similar to `Iterable` and `Iterator`.
+ * A read task returned by {@link DataSourceV2Reader#createReadTasks()} and is responsible for
+ * creating the actual data reader. The relationship between {@link ReadTask} and {@link DataReader}
+ * is similar to the relationship between {@link Iterable} and {@link java.util.Iterator}.
  *
  * Note that, the read task will be serialized and sent to executors, then the data reader will be
  * created on executors and do the actual reading.
  */
+@InterfaceStability.Evolving
 public interface ReadTask<T> extends Serializable {

   /**
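
The serialization note above is the main design constraint on ReadTask: the task should carry only Serializable state, while non-serializable resources (connections, file handles) belong in the DataReader it creates on the executor. A hedged sketch; preferredLocations() is an assumption (a locality-hint method this API exposes in some versions), and the HDFS-flavored fields are purely illustrative:

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;

public class BlockReadTask implements ReadTask<Row> {
  // Only Serializable state here: Strings, primitives, arrays.
  private final String path;
  private final long offset;
  private final long length;
  private final String[] hosts; // machines that hold this block locally

  public BlockReadTask(String path, long offset, long length, String[] hosts) {
    this.path = path;
    this.offset = offset;
    this.length = length;
    this.hosts = hosts;
  }

  // Assumed API: lets Spark schedule this task close to the data.
  public String[] preferredLocations() {
    return hosts;
  }

  @Override
  public DataReader<Row> createReader() {
    // Runs on the executor; a real task would open path at [offset, offset + length) here.
    return new DataReader<Row>() {
      @Override public boolean next() { return false; } // empty stand-in partition
      @Override public Row get() { return RowFactory.create(); }
      @Override public void close() { }
    };
  }
}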

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Statistics.java

Lines changed: 4 additions & 1 deletion
@@ -19,10 +19,13 @@

 import java.util.OptionalLong;

+import org.apache.spark.annotation.InterfaceStability;
+
 /**
  * An interface to represent statistics for a data source, which is returned by
- * `SupportsReportStatistics`.
+ * {@link SupportsReportStatistics#getStatistics()}.
  */
+@InterfaceStability.Evolving
 public interface Statistics {
   OptionalLong sizeInBytes();
   OptionalLong numRows();
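
The two OptionalLong getters let a source report only the statistics it actually knows. A reader exposes them by mixing in SupportsReportStatistics and returning an instance from getStatistics(), per the reference above. KnownSizeStatistics is a hypothetical carrier:

import java.util.OptionalLong;

import org.apache.spark.sql.sources.v2.reader.Statistics;

public class KnownSizeStatistics implements Statistics {
  private final long sizeInBytes;

  public KnownSizeStatistics(long sizeInBytes) {
    this.sizeInBytes = sizeInBytes;
  }

  @Override
  public OptionalLong sizeInBytes() {
    return OptionalLong.of(sizeInBytes); // known: e.g. total size of the files on disk
  }

  @Override
  public OptionalLong numRows() {
    return OptionalLong.empty(); // unknown: Spark falls back to its own estimates
  }
}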

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java

Lines changed: 8 additions & 5 deletions
@@ -22,13 +22,16 @@
 import org.apache.spark.sql.catalyst.expressions.Expression;

 /**
- * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to push down
- * arbitrary expressions as predicates to the data source. This is an experimental and unstable
- * interface as `Expression` is not public and may get changed in future Spark versions.
+ * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this
+ * interface to push down arbitrary expressions as predicates to the data source.
+ * This is an experimental and unstable interface as {@link Expression} is not public and may get
+ * changed in the future Spark versions.
  *
- * Note that, if users implement both this interface and `SupportsPushDownFilters`, Spark will
- * ignore `SupportsPushDownFilters` and only process this interface.
+ * Note that, if data source readers implement both this interface and
+ * {@link SupportsPushDownFilters}, Spark will ignore {@link SupportsPushDownFilters} and only
+ * process this interface.
  */
+@InterfaceStability.Evolving
 @Experimental
 @InterfaceStability.Unstable
 public interface SupportsPushDownCatalystFilters {

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java

Lines changed: 7 additions & 4 deletions
@@ -17,15 +17,18 @@

 package org.apache.spark.sql.sources.v2.reader;

+import org.apache.spark.annotation.InterfaceStability;
 import org.apache.spark.sql.sources.Filter;

 /**
- * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to push down
- * filters to the data source and reduce the size of the data to be read.
+ * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this
+ * interface to push down filters to the data source and reduce the size of the data to be read.
  *
- * Note that, if users implement both this interface and `SupportsPushDownCatalystFilters`, Spark
- * will ignore this interface and only process `SupportsPushDownCatalystFilters`.
+ * Note that, if data source readers implement both this interface and
+ * {@link SupportsPushDownCatalystFilters}, Spark will ignore this interface and only process
+ * {@link SupportsPushDownCatalystFilters}.
  */
+@InterfaceStability.Evolving
 public interface SupportsPushDownFilters {

   /**
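
A sketch of the split this interface implies: the source keeps the filters it can evaluate during the scan and hands the rest back for Spark to re-apply. The pushFilters(Filter[]) signature (returning the filters the source could NOT handle) is an assumption, since the method itself is outside this hunk; FilteringReader accepts only GreaterThan as an example, and readSchema()/createReadTasks() are the assumed reader signatures:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.GreaterThan;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters;
import org.apache.spark.sql.types.StructType;

public class FilteringReader implements DataSourceV2Reader, SupportsPushDownFilters {
  private final List<Filter> pushed = new ArrayList<>();

  // Assumed signature: return the filters this source could NOT handle, so that
  // Spark still evaluates them after the scan.
  public Filter[] pushFilters(Filter[] filters) {
    List<Filter> unsupported = new ArrayList<>();
    for (Filter f : filters) {
      if (f instanceof GreaterThan) {
        pushed.add(f);      // evaluated inside the scan, shrinking the data read
      } else {
        unsupported.add(f); // Spark applies this one itself
      }
    }
    return unsupported.toArray(new Filter[0]);
  }

  @Override
  public StructType readSchema() {
    return new StructType().add("i", "int");
  }

  @Override
  public List<ReadTask<Row>> createReadTasks() {
    // A real implementation would use `pushed` here to prune partitions or rows.
    return Collections.emptyList();
  }
}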
