
Commit a1519d4

fix json schema inference
1 parent ba46703 commit a1519d4

File tree

3 files changed, +15 -5 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ import org.apache.spark.internal.config._
 import org.apache.spark.network.util.ByteUnit
 import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator
+import org.apache.spark.util.Utils
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // This file defines the configuration options for Spark SQL.

sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala

Lines changed: 4 additions & 0 deletions

@@ -105,6 +105,10 @@ object SQLExecution {
     }
   }
 
+  /**
+   * Wrap an action with specified SQL configs. These configs will be propagated to the executor
+   * side via job local properties.
+   */
   def withSQLConfPropagated[T](sparkSession: SparkSession)(body: => T): T = {
     val sc = sparkSession.sparkContext
     // Set all the specified SQL configs to local properties, so that they can be available at
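The new doc comment names the mechanism: the session's SQL configs are copied into job local properties, so code running on executors can observe them. A minimal sketch of how a caller might exercise this, assuming SQLExecution is reachable from user code (it lives in org.apache.spark.sql.execution); the config key, job, and object name below are illustrative, and exactly which configs get mirrored into local properties is an assumption, not something this hunk shows:

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.SQLExecution

object ConfPropagationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("conf-propagation-sketch")
      .getOrCreate()
    // An explicitly set SQL config that the wrapper should carry to executors.
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    SQLExecution.withSQLConfPropagated(spark) {
      // Jobs triggered inside the wrapped body run with the session's SQL
      // configs attached as job local properties, readable via TaskContext.
      val seen = spark.sparkContext.parallelize(0 until 2, 2).map { _ =>
        Option(TaskContext.get().getLocalProperty("spark.sql.session.timeZone"))
          .getOrElse("unset")
      }.collect()
      println(seen.mkString(", ")) // under the assumption above: UTC, UTC
    }

    spark.stop()
  }
}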

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonInferSchema.scala

Lines changed: 10 additions & 5 deletions

@@ -45,8 +45,9 @@ private[sql] object JsonInferSchema {
     val parseMode = configOptions.parseMode
     val columnNameOfCorruptRecord = configOptions.columnNameOfCorruptRecord
 
-    // perform schema inference on each row and merge afterwards
-    val rootType = json.mapPartitions { iter =>
+    // In each RDD partition, perform schema inference on each row and merge afterwards.
+    val typeMerger = compatibleRootType(columnNameOfCorruptRecord, parseMode)
+    val mergedTypesFromPartitions = json.mapPartitions { iter =>
       val factory = new JsonFactory()
       configOptions.setJacksonOptions(factory)
       iter.flatMap { row =>
@@ -66,9 +67,13 @@
                 s"Parse Mode: ${FailFastMode.name}.", e)
           }
         }
-      }
-    }.fold(StructType(Nil))(
-      compatibleRootType(columnNameOfCorruptRecord, parseMode))
+      }.reduceOption(typeMerger).toIterator
+    }
+
+    // Here we get RDD local iterator then fold, instead of calling `RDD.fold` directly, because
+    // `RDD.fold` will run the fold function in DAGScheduler event loop thread, which may not have
+    // active SparkSession and `SQLConf.get` may point to the wrong configs.
+    val rootType = mergedTypesFromPartitions.toLocalIterator.fold(StructType(Nil))(typeMerger)
 
     canonicalizeType(rootType) match {
       case Some(st: StructType) => st
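The added comment is the heart of the fix: `RDD.fold` merges partition results on the DAGScheduler event loop thread, where there is no active SparkSession and `SQLConf.get` can resolve to the wrong configs, so the final merge is moved onto the calling thread via `toLocalIterator`. A self-contained sketch of the same pattern, with a toy merge function standing in for the commit's `compatibleRootType` (the names here, like unionMerger and LocalFoldSketch, are illustrative, not from the commit):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

object LocalFoldSketch {
  // Toy stand-in for compatibleRootType: union the fields of two structs.
  val unionMerger: (DataType, DataType) => DataType = {
    case (s1: StructType, s2: StructType) =>
      StructType((s1.fields ++ s2.fields).distinct)
    case (t, _) => t
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("local-fold-sketch")
      .getOrCreate()

    // One inferred type per input row, spread over two partitions.
    val perRow = spark.sparkContext.parallelize(Seq[DataType](
      StructType(Seq(StructField("a", LongType))),
      StructType(Seq(StructField("b", StringType)))
    ), numSlices = 2)

    // Merge within each partition on the executors; empty partitions
    // contribute no element, thanks to reduceOption.
    val mergedPerPartition = perRow.mapPartitions { iter =>
      iter.reduceOption(unionMerger).toIterator
    }

    // Pull at most one result per partition back to this thread and fold
    // here, instead of calling RDD.fold, so the merge function never runs
    // inside the scheduler's event loop thread.
    val rootType =
      mergedPerPartition.toLocalIterator.fold(StructType(Nil): DataType)(unionMerger)
    println(rootType) // a StructType containing fields a and b

    spark.stop()
  }
}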
