
Commit 84df37e

update ConfigSupport

1 parent ec5723c commit 84df37e

4 files changed: +55 -11 lines changed

sql/core/src/main/java/org/apache/spark/sql/sources/v2/ConfigSupport.java

Lines changed: 18 additions & 1 deletion

@@ -18,9 +18,9 @@
 package org.apache.spark.sql.sources.v2;
 
 import org.apache.spark.annotation.InterfaceStability;
-import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
 
 import java.util.List;
+import java.util.Map;
 
 /**
  * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to
@@ -32,6 +32,23 @@ public interface ConfigSupport {
   /**
    * Create a list of key-prefixes, all session configs that match at least one of the prefixes
    * will be propagated to the data source options.
+   * If the returned list is empty, no session config will be propagated.
    */
   List<String> getConfigPrefixes();
+
+  /**
+   * Create a mapping from session config names to data source option names. If a propagated
+   * session config's key doesn't exist in this mapping, the "spark.sql.${source}" prefix will
+   * be trimmed. For example, if the data source name is "parquet", perform the following config
+   * key mapping by default:
+   * "spark.sql.parquet.int96AsTimestamp" -> "int96AsTimestamp",
+   * "spark.sql.parquet.compression.codec" -> "compression.codec",
+   * "spark.sql.columnNameOfCorruptRecord" -> "columnNameOfCorruptRecord".
+   *
+   * If the mapping is specified, for example, the returned map contains an entry
+   * ("spark.sql.columnNameOfCorruptRecord" -> "colNameCorrupt"), then the session config
+   * "spark.sql.columnNameOfCorruptRecord" will be converted to "colNameCorrupt" in
+   * [[DataSourceV2Options]].
+   */
+  Map<String, String> getConfigMapping();
 }
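
For illustration only (not part of this commit), a data source might wire up the new interface roughly as follows; the source name "myformat" and the config keys below are hypothetical:

import java.util
import java.util.{Arrays, List => JList}

import org.apache.spark.sql.sources.v2.{ConfigSupport, DataSourceV2}

// Hypothetical source: asks for all "spark.sql.myformat.*" session configs plus
// spark.sql.columnNameOfCorruptRecord, and renames the latter explicitly.
// Keys without an explicit mapping fall back to the default
// "spark.sql.${source}" prefix trimming described in the Javadoc above.
class MyFormatSource extends DataSourceV2 with ConfigSupport {

  override def getConfigPrefixes: JList[String] =
    Arrays.asList("spark.sql.myformat", "spark.sql.columnNameOfCorruptRecord")

  override def getConfigMapping: util.Map[String, String] = {
    val mapping = new util.HashMap[String, String]()
    mapping.put("spark.sql.columnNameOfCorruptRecord", "colNameCorrupt")
    mapping
  }
}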

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 1 addition & 1 deletion

@@ -190,7 +190,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       val dataSource = cls.newInstance()
       val options = dataSource match {
         case cs: ConfigSupport =>
-          val confs = withSessionConfig(cs, sparkSession.sessionState.conf)
+          val confs = withSessionConfig(cs, source, sparkSession.sessionState.conf)
           new DataSourceV2Options((confs ++ extraOptions).asJava)
         case _ =>
           new DataSourceV2Options(extraOptions.asJava)
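
A rough usage sketch of the propagation path (assuming `spark` is an active SparkSession and the hypothetical source above is resolvable under the short name "myformat"; the config values are made up):

// With ConfigSupport implemented, DataFrameReader copies matching session
// configs into DataSourceV2Options before handing them to the source.
spark.conf.set("spark.sql.myformat.fetchSize", "1000")
spark.conf.set("spark.sql.columnNameOfCorruptRecord", "_corrupt")

// The source would see options roughly like:
//   "fetchSize"      -> "1000"      (default "spark.sql.myformat." prefix trimmed)
//   "colNameCorrupt" -> "_corrupt"  (renamed via getConfigMapping)
val df = spark.read.format("myformat").load("/tmp/some/path")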

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ConfigSupport.scala

Lines changed: 24 additions & 4 deletions

@@ -17,30 +17,50 @@
 
 package org.apache.spark.sql.execution.datasources.v2
 
+import java.util.regex.Pattern
+
 import scala.collection.JavaConverters._
 import scala.collection.immutable
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.v2.ConfigSupport
 
-private[sql] object DataSourceV2ConfigSupport {
+private[sql] object DataSourceV2ConfigSupport extends Logging {
 
   /**
-   * Helper method to filter session configs with config key that matches at least one of the given
-   * prefixes.
+   * Helper method to propagate session configs with config key that matches at least one of the
+   * given prefixes to the corresponding data source options.
    *
-   * @param cs the config key-prefixes that should be filtered.
+   * @param cs the session config propagate help class
+   * @param source the data source format
    * @param conf the session conf
    * @return an immutable map that contains all the session configs that should be propagated to
    *         the data source.
    */
   def withSessionConfig(
       cs: ConfigSupport,
+      source: String,
       conf: SQLConf): immutable.Map[String, String] = {
     val prefixes = cs.getConfigPrefixes
     require(prefixes != null, "The config key-prefixes cann't be null.")
+    val mapping = cs.getConfigMapping.asScala
+
+    val pattern = Pattern.compile(s"spark\\.sql(\\.$source)?\\.(.*)")
     conf.getAllConfs.filterKeys { confKey =>
       prefixes.asScala.exists(confKey.startsWith(_))
+    }.map{ entry =>
+      val newKey = mapping.get(entry._1).getOrElse {
+        val m = pattern.matcher(entry._1)
+        if (m.matches()) {
+          m.group(2)
+        } else {
+          // Unable to recognize the session config key.
+          logWarning(s"Unrecognizable session config name ${entry._1}.")
+          entry._1
+        }
+      }
+      (newKey, entry._2)
     }
   }
 }
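
A minimal standalone sketch of the default key-trimming behaviour implemented above, assuming source = "parquet" (the keys are only examples):

import java.util.regex.Pattern

// Same pattern as withSessionConfig builds for source = "parquet".
val pattern = Pattern.compile("spark\\.sql(\\.parquet)?\\.(.*)")

def trimKey(key: String): String = {
  val m = pattern.matcher(key)
  if (m.matches()) m.group(2) else key
}

// trimKey("spark.sql.parquet.mergeSchema")
//   == "mergeSchema"                                  (source prefix trimmed)
// trimKey("spark.sql.sources.parallelPartitionDiscovery.threshold")
//   == "sources.parallelPartitionDiscovery.threshold" (only "spark.sql." trimmed)
// trimKey("some.other.key")
//   == "some.other.key"                               (left as-is; the real code also logs a warning)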

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala

Lines changed: 12 additions & 5 deletions

@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.sources.v2
 
+import java.util
 import java.util.{ArrayList, List => JList}
 
 import test.org.apache.spark.sql.sources.v2._
@@ -47,16 +48,16 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
 
   test("simple implementation with config support") {
     withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false",
-      SQLConf.PARQUET_INT96_AS_TIMESTAMP.key -> "true",
+      SQLConf.PARQUET_COMPRESSION.key -> "uncompressed",
       SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD.key -> "32",
       SQLConf.PARALLEL_PARTITION_DISCOVERY_PARALLELISM.key -> "10000") {
       val cs = classOf[DataSourceV2WithConfig].newInstance().asInstanceOf[ConfigSupport]
-      val confs = DataSourceV2ConfigSupport.withSessionConfig(cs, SQLConf.get)
+      val confs = DataSourceV2ConfigSupport.withSessionConfig(cs, "parquet", SQLConf.get)
       assert(confs.size == 3)
-      assert(confs.keySet.filter(_.startsWith("spark.sql.parquet")).size == 2)
-      assert(confs.keySet.filter(
-        _.startsWith("spark.sql.sources.parallelPartitionDiscovery.threshold")).size == 1)
+      assert(confs.keySet.filter(_.startsWith("spark.sql.parquet")).size == 0)
       assert(confs.keySet.filter(_.startsWith("not.exist.prefix")).size == 0)
+      assert(confs.keySet.contains("compressionCodec"))
+      assert(confs.keySet.contains("sources.parallelPartitionDiscovery.threshold"))
     }
   }
 
@@ -203,6 +204,12 @@ class DataSourceV2WithConfig extends SimpleDataSourceV2 with ConfigSupport {
       "spark.sql.parquet",
       "spark.sql.sources.parallelPartitionDiscovery.threshold")
   }
+
+  override def getConfigMapping: util.Map[String, String] = {
+    val configMap = new util.HashMap[String, String]()
+    configMap.put("spark.sql.parquet.compression.codec", "compressionCodec")
+    configMap
+  }
 }
 
 class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport {
