
Commit 6ff6d4f

Adding spark.sql.csv.parser.columnPruning.enabled
1 parent f89eeb7 commit 6ff6d4f

File tree: 5 files changed, +32 −19 lines

docs/sql-programming-guide.md

Lines changed: 1 addition & 1 deletion
@@ -1814,7 +1814,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
 - In version 2.3 and earlier, `to_utc_timestamp` and `from_utc_timestamp` respect the timezone in the input timestamp string, which breaks the assumption that the input timestamp is in a specific timezone. Therefore, these two functions can return unexpected results. In version 2.4 and later, this problem has been fixed. `to_utc_timestamp` and `from_utc_timestamp` will return null if the input timestamp string contains a timezone. As an example, `from_utc_timestamp('2000-10-10 00:00:00', 'GMT+1')` will return `2000-10-10 01:00:00` in both Spark 2.3 and 2.4. However, `from_utc_timestamp('2000-10-10 00:00:00+00:00', 'GMT+1')`, assuming a local timezone of GMT+8, will return `2000-10-10 09:00:00` in Spark 2.3 but `null` in 2.4. If you do not care about this problem and want to retain the previous behavior to keep your queries unchanged, you can set `spark.sql.function.rejectTimezoneInString` to false. This option will be removed in Spark 3.0 and should only be used as a temporary workaround.
 - In version 2.3 and earlier, Spark converts Parquet Hive tables by default but ignores table properties like `TBLPROPERTIES (parquet.compression 'NONE')`. The same happens for ORC Hive table properties like `TBLPROPERTIES (orc.compress 'NONE')` when `spark.sql.hive.convertMetastoreOrc=true`. Since Spark 2.4, Spark respects Parquet/ORC specific table properties while converting Parquet/ORC Hive tables. As an example, `CREATE TABLE t(id int) STORED AS PARQUET TBLPROPERTIES (parquet.compression 'NONE')` would generate Snappy parquet files during insertion in Spark 2.3, while in Spark 2.4 the result would be uncompressed parquet files.
 - Since Spark 2.0, Spark converts Parquet Hive tables by default for better performance. Since Spark 2.4, Spark converts ORC Hive tables by default, too. This means Spark uses its own ORC support by default instead of Hive SerDe. As an example, `CREATE TABLE t(id int) STORED AS ORC` would be handled with Hive SerDe in Spark 2.3, while in Spark 2.4 it would be converted into Spark's ORC data source table and ORC vectorization would be applied. Setting `spark.sql.hive.convertMetastoreOrc` to `false` restores the previous behavior.
-- In version 2.3 and earlier, a CSV row is considered malformed if at least one column value in the row is malformed. The CSV parser drops such rows in DROPMALFORMED mode or reports an error in FAILFAST mode. Since Spark 2.4, a CSV row is considered malformed only when it contains malformed column values requested from the CSV datasource; other values can be ignored. As an example, a CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selecting the id column yields a row with the single column value 1234, but in Spark 2.3 and earlier the result is empty in DROPMALFORMED mode. To restore the previous behavior, all column values must be requested. This example demonstrates how to achieve that with a filter in Scala: `spark.read.option("header", true).option("mode", "dropmalformed").csv("a.csv").filter(_ => true).select("id")`.
+- In version 2.3 and earlier, a CSV row is considered malformed if at least one column value in the row is malformed. The CSV parser drops such rows in DROPMALFORMED mode or reports an error in FAILFAST mode. Since Spark 2.4, a CSV row is considered malformed only when it contains malformed column values requested from the CSV datasource; other values can be ignored. As an example, a CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selecting the id column yields a row with the single column value 1234, but in Spark 2.3 and earlier the result is empty in DROPMALFORMED mode. To restore the previous behavior, set `spark.sql.csv.parser.columnPruning.enabled` to `false`.

 ## Upgrading From Spark SQL 2.2 to 2.3
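
To make the documented example concrete, here is a minimal Scala sketch of the new behavior and of the switch back. It assumes an active `SparkSession` named `spark` and a local file `a.csv` containing the "id,name" header and the single row "1234", as in the note above.

// Spark 2.4 default: column pruning is enabled, so only the requested "id"
// column is parsed; the incomplete row survives DROPMALFORMED mode.
val ids = spark.read
  .option("header", true)
  .option("mode", "dropmalformed")
  .csv("a.csv")
  .select("id")
ids.show()  // expected per the note above: a single row with id = 1234

// Restore the Spark 2.3 behavior: all columns are parsed and validated again,
// so the incomplete row is dropped by DROPMALFORMED mode.
spark.conf.set("spark.sql.csv.parser.columnPruning.enabled", "false")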

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 7 additions & 0 deletions
@@ -1266,6 +1266,13 @@ object SQLConf {
   object Replaced {
     val MAPREDUCE_JOB_REDUCES = "mapreduce.job.reduces"
   }
+
+  val CSV_PARSER_COLUMN_PRUNING = buildConf("spark.sql.csv.parser.columnPruning.enabled")
+    .internal()
+    .doc("If it is set to true, column names of the requested schema are passed to CSV parser. " +
+      "Other column values can be ignored during parsing even if they are malformed.")
+    .booleanConf
+    .createWithDefault(true)
 }

 /**
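
A small sketch of how driver-side code can read the new entry; this mirrors the typed access used by `CSVOptions` in the next file, and the default of `true` comes from `createWithDefault` above.

import org.apache.spark.sql.internal.SQLConf

// Typed access: returns the Boolean value for the active session,
// falling back to the declared default (true) when the conf is unset.
val pruningEnabled: Boolean = SQLConf.get.getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)

// Users toggle it like any other SQL conf, for example:
//   spark.conf.set("spark.sql.csv.parser.columnPruning.enabled", "false")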

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,7 @@ import org.apache.commons.lang3.time.FastDateFormat

 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.internal.SQLConf

 class CSVOptions(
     @transient val parameters: CaseInsensitiveMap[String],
@@ -80,6 +81,8 @@ class CSVOptions(
     }
   }

+  private[csv] val columnPruning = SQLConf.get.getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)
+
   val delimiter = CSVUtils.toChar(
     parameters.getOrElse("sep", parameters.getOrElse("delimiter", ",")))
   val parseMode: ParseMode =

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala

Lines changed: 12 additions & 11 deletions
@@ -34,10 +34,10 @@ import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

 class UnivocityParser(
-    schema: StructType,
+    dataSchema: StructType,
     requiredSchema: StructType,
     val options: CSVOptions) extends Logging {
-  require(requiredSchema.toSet.subsetOf(schema.toSet),
+  require(requiredSchema.toSet.subsetOf(dataSchema.toSet),
     "requiredSchema should be the subset of schema.")

   def this(schema: StructType, options: CSVOptions) = this(schema, schema, options)
@@ -47,14 +47,15 @@ class UnivocityParser(

   private val tokenizer = {
     val parserSetting = options.asParserSettings
-    if (requiredSchema.length < schema.length) {
-      val tokenIndexArr = requiredSchema.map(f => java.lang.Integer.valueOf(schema.indexOf(f)))
+    if (options.columnPruning && requiredSchema.length < dataSchema.length) {
+      val tokenIndexArr = requiredSchema.map(f => java.lang.Integer.valueOf(dataSchema.indexOf(f)))
       parserSetting.selectIndexes(tokenIndexArr: _*)
     }
     new CsvParser(parserSetting)
   }
+  private val schema = if (options.columnPruning) requiredSchema else dataSchema

-  private val row = new GenericInternalRow(requiredSchema.length)
+  private val row = new GenericInternalRow(schema.length)

   // Retrieve the raw record string.
   private def getCurrentInput: UTF8String = {
@@ -81,7 +82,7 @@ class UnivocityParser(
   //
   // output row - ["A", 2]
   private val valueConverters: Array[ValueConverter] = {
-    requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
+    schema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
   }

   /**
@@ -189,14 +190,14 @@ class UnivocityParser(
   def parse(input: String): InternalRow = convert(tokenizer.parseLine(input))

   private def convert(tokens: Array[String]): InternalRow = {
-    if (tokens.length != requiredSchema.length) {
+    if (tokens.length != schema.length) {
       // If the number of tokens doesn't match the schema, we should treat it as a malformed record.
       // However, we still have chance to parse some of the tokens, by adding extra null tokens in
       // the tail if the number is smaller, or by dropping extra tokens if the number is larger.
-      val checkedTokens = if (requiredSchema.length > tokens.length) {
-        tokens ++ new Array[String](requiredSchema.length - tokens.length)
+      val checkedTokens = if (schema.length > tokens.length) {
+        tokens ++ new Array[String](schema.length - tokens.length)
       } else {
-        tokens.take(requiredSchema.length)
+        tokens.take(schema.length)
       }
       def getPartialResult(): Option[InternalRow] = {
         try {
@@ -214,7 +215,7 @@ class UnivocityParser(
     } else {
       try {
         var i = 0
-        while (i < requiredSchema.length) {
+        while (i < schema.length) {
           row(i) = valueConverters(i).apply(tokens(i))
           i += 1
         }
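
The pruning itself is delegated to the underlying univocity parser: when the flag is on and fewer columns are required, the requested field indexes are passed to `selectIndexes`, so unselected columns are never materialized and cannot mark the row as malformed. A standalone sketch of that parser-level behavior, using the univocity-parsers API directly outside Spark:

import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}

// Keep only field 0 ("id"); field 1 ("name") is skipped by the parser.
val settings = new CsvParserSettings()
settings.selectIndexes(java.lang.Integer.valueOf(0))

val parser = new CsvParser(settings)
val tokens = parser.parseLine("1234")
// tokens should contain just the selected column ("1234"), matching the
// one-column required schema, so no malformed value is ever seen for "name".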

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 9 additions & 7 deletions
@@ -260,14 +260,16 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   }

   test("test for DROPMALFORMED parsing mode") {
-    Seq(false, true).foreach { multiLine =>
-      val cars = spark.read
-        .format("csv")
-        .option("multiLine", multiLine)
-        .options(Map("header" -> "true", "mode" -> "dropmalformed"))
-        .load(testFile(carsFile))
+    withSQLConf(SQLConf.CSV_PARSER_COLUMN_PRUNING.key -> "false") {
+      Seq(false, true).foreach { multiLine =>
+        val cars = spark.read
+          .format("csv")
+          .option("multiLine", multiLine)
+          .options(Map("header" -> "true", "mode" -> "dropmalformed"))
+          .load(testFile(carsFile))

-      assert(cars.collect().size === 2)
+        assert(cars.select("year").collect().size === 2)
+      }
     }
   }
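
The existing test pins the flag to `false` because it relies on the old all-columns semantics of DROPMALFORMED. A hypothetical companion test, not part of this commit, could cover the new default behavior using the documented "id,name" example and the suite's existing helpers (`withTempPath`, `checkAnswer`):

  test("DROPMALFORMED with column pruning keeps rows whose requested columns are valid") {
    withTempPath { path =>
      // The header declares two columns, but the only data row carries a single value.
      java.nio.file.Files.write(
        path.toPath,
        java.util.Arrays.asList("id,name", "1234"),
        java.nio.charset.StandardCharsets.UTF_8)

      val ids = spark.read
        .option("header", true)
        .option("mode", "dropmalformed")
        .csv(path.getCanonicalPath)
        .select("id")

      // With the default columnPruning=true, only "id" is parsed, so the row survives.
      checkAnswer(ids, Row("1234"))
    }
  }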
