diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index c3d1dd444b50..87e6aebf0e67 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -190,7 +190,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.5.9.jar
+univocity-parsers-2.6.3.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 290867035f91..5ef07b83c7f4 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -191,7 +191,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.5.9.jar
+univocity-parsers-2.6.3.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index 97ad65a4096c..57c874c58fe5 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -211,7 +211,7 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
token-provider-1.0.1.jar
-univocity-parsers-2.5.9.jar
+univocity-parsers-2.6.3.jar
validation-api-1.1.0.Final.jar
woodstox-core-5.0.3.jar
xbean-asm5-shaded-4.4.jar
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index ef41837f89d6..f270c70fbfcf 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -38,7 +38,7 @@
<groupId>com.univocity</groupId>
<artifactId>univocity-parsers</artifactId>
- <version>2.5.9</version>
+ <version>2.6.3</version>
<type>jar</type>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 2ec0fc605a84..d9f5e9257995 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -164,7 +164,7 @@ class CSVOptions(
writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite)
writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite)
writerSettings.setNullValue(nullValue)
- writerSettings.setEmptyValue(nullValue)
+ writerSettings.setEmptyValue("\"\"")
writerSettings.setSkipEmptyLines(true)
writerSettings.setQuoteAllFields(quoteAll)
writerSettings.setQuoteEscapingEnabled(escapeQuotes)
@@ -185,6 +185,7 @@ class CSVOptions(
settings.setInputBufferSize(inputBufferSize)
settings.setMaxColumns(maxColumns)
settings.setNullValue(nullValue)
+ settings.setEmptyValue("")
settings.setMaxCharsPerColumn(maxCharsPerColumn)
settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER)
settings
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
new file mode 100644
index 000000000000..d442ba7e59c6
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.csv
+
+import java.io.File
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Column, Row, SparkSession}
+import org.apache.spark.sql.functions.lit
+import org.apache.spark.sql.types._
+import org.apache.spark.util.{Benchmark, Utils}
+
+/**
+ * Benchmark to measure CSV read/write performance.
+ * To run this:
+ * spark-submit --class <this class> --jars <spark sql test jar>
+ */
+object CSVBenchmarks {
+  val conf = new SparkConf()
+
+  // Local session with a single worker thread, shared by every benchmark case below.
+  val spark = SparkSession.builder
+    .master("local[1]")
+    .appName("benchmark-csv-datasource")
+    .config(conf)
+    .getOrCreate()
+  import spark.implicits._
+
+  /** Hands `f` a unique temporary path that does not exist yet and removes it afterwards. */
+  def withTempPath(f: File => Unit): Unit = {
+    val tmp = Utils.createTempDir()
+    tmp.delete() // keep only the unique name, so that `f` can create the directory itself
+    try f(tmp) finally Utils.deleteRecursively(tmp)
+  }
+
+  /**
+   * Writes `rowsNum` CSV records, each a single string column holding 10000 comma-separated
+   * double-quoted integers, and benchmarks reading them back (`numIters` is passed to the case).
+   */
+  def quotedValuesBenchmark(rowsNum: Int, numIters: Int): Unit = {
+    val benchmark = new Benchmark("Parsing quoted values", rowsNum)
+    withTempPath { dir =>
+      val location = dir.getAbsolutePath
+      val line = (0 until 10000).map(i => s""""$i"""").mkString(",")
+      spark.range(rowsNum).map(_ => line).write.option("header", true).csv(location)
+      val schema = new StructType().add("value", StringType)
+      val ds = spark.read.option("header", true).schema(schema).csv(location)
+      // NOTE(review): the always-true filter presumably keeps count() from skipping parsing.
+      benchmark.addCase("One quoted string", numIters) { _ =>
+        ds.filter((_: Row) => true).count()
+      }
+
+      /*
+      Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+
+      Parsing quoted values: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+      --------------------------------------------------------------------------------------------
+      One quoted string 30273 / 30549 0.0 605451.2 1.0X
+      */
+      benchmark.run()
+    }
+  }
+
+  def main(args: Array[String]): Unit =
+    quotedValuesBenchmark(rowsNum = 50 * 1000, numIters = 3)
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 461abdd96d3f..07e6c74b14d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1322,4 +1322,50 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
val sampled = spark.read.option("inferSchema", true).option("samplingRatio", 1.0).csv(ds)
assert(sampled.count() == ds.count())
}
+
+  test("SPARK-17916: An empty string should not be coerced to null when nullValue is passed.") {
+    val litNull: String = null
+    val columns = Seq("id", "name")
+    val input = Seq(
+      (1, "John Doe"),
+      (2, ""),
+      (3, "-"),
+      (4, litNull)
+    ).toDF(columns: _*)
+
+    // Writes `input` as CSV into a fresh temporary path and reads it back with the same
+    // schema, passing `opts` to both the writer and the reader, then checks that the rows
+    // read back are equal to `rows`.
+    def checkRoundTrip(opts: Map[String, String], rows: Seq[(Int, String)]): Unit = {
+      withTempPath { dir =>
+        val location = dir.getAbsolutePath
+        input.write.options(opts).csv(location)
+        val computed = spark.read.options(opts).schema(input.schema).csv(location)
+        checkAnswer(computed, rows.toDF(columns: _*))
+      }
+    }
+
+    // New behavior: when `nullValue` is set to anything but an empty string literal, an
+    // empty string is no longer coerced to null. With `nullValue` set to "-", both the "-"
+    // value and the null value are expected to be read back as null.
+    checkRoundTrip(
+      opts = Map("nullValue" -> "-"),
+      rows = Seq(
+        (1, "John Doe"),
+        (2, ""),
+        (3, litNull),
+        (4, litNull)
+      ))
+
+    // Old behavior is kept: when `nullValue` is not passed, an empty string is still
+    // coerced to null, and "-" is read back as an ordinary value.
+    checkRoundTrip(
+      opts = Map.empty,
+      rows = Seq(
+        (1, "John Doe"),
+        (2, litNull),
+        (3, "-"),
+        (4, litNull)
+      ))
+  }
}