From 457a21aad16c05b84268e416259f7aa332b0fc43 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 8 May 2018 23:23:56 +0200
Subject: [PATCH 1/6] Getting tests from the PR #20068

---
 .../execution/datasources/csv/CSVSuite.scala  | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 461abdd96d3f..07e6c74b14d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1322,4 +1322,50 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     val sampled = spark.read.option("inferSchema", true).option("samplingRatio", 1.0).csv(ds)
     assert(sampled.count() == ds.count())
   }
+
+  test("SPARK-17916: An empty string should not be coerced to null when nullValue is passed.") {
+    val litNull: String = null
+    val df = Seq(
+      (1, "John Doe"),
+      (2, ""),
+      (3, "-"),
+      (4, litNull)
+    ).toDF("id", "name")
+
+    // Checks for new behavior where an empty string is not coerced to null when `nullValue` is
+    // set to anything but an empty string literal.
+    withTempPath { path =>
+      df.write
+        .option("nullValue", "-")
+        .csv(path.getAbsolutePath)
+      val computed = spark.read
+        .option("nullValue", "-")
+        .schema(df.schema)
+        .csv(path.getAbsolutePath)
+      val expected = Seq(
+        (1, "John Doe"),
+        (2, ""),
+        (3, litNull),
+        (4, litNull)
+      ).toDF("id", "name")
+
+      checkAnswer(computed, expected)
+    }
+    // Keeps the old behavior where empty string us coerced to nullValue is not passed.
+    withTempPath { path =>
+      df.write
+        .csv(path.getAbsolutePath)
+      val computed = spark.read
+        .schema(df.schema)
+        .csv(path.getAbsolutePath)
+      val expected = Seq(
+        (1, "John Doe"),
+        (2, litNull),
+        (3, "-"),
+        (4, litNull)
+      ).toDF("id", "name")
+
+      checkAnswer(computed, expected)
+    }
+  }
 }

From cd78b12595c6fcbb71819416e0dd515a6bc82d91 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 8 May 2018 23:24:37 +0200
Subject: [PATCH 2/6] Bump versions of uniVocity parser to 2.6.3

---
 sql/core/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index ef41837f89d6..f270c70fbfcf 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -38,7 +38,7 @@
     <dependency>
       <groupId>com.univocity</groupId>
       <artifactId>univocity-parsers</artifactId>
-      <version>2.5.9</version>
+      <version>2.6.3</version>
       <type>jar</type>
     </dependency>
     <dependency>

From 598ba2da9ee6b014713f3ad41fc382590dcc7b37 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 8 May 2018 23:25:09 +0200
Subject: [PATCH 3/6] Set values of empty strings in read and in write

---
 .../spark/sql/execution/datasources/csv/CSVOptions.scala       | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 2ec0fc605a84..d9f5e9257995 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -164,7 +164,7 @@ class CSVOptions(
     writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite)
     writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite)
     writerSettings.setNullValue(nullValue)
-    writerSettings.setEmptyValue(nullValue)
+    writerSettings.setEmptyValue("\"\"")
     writerSettings.setSkipEmptyLines(true)
     writerSettings.setQuoteAllFields(quoteAll)
     writerSettings.setQuoteEscapingEnabled(escapeQuotes)
@@ -185,6 +185,7 @@ class CSVOptions(
     settings.setInputBufferSize(inputBufferSize)
     settings.setMaxColumns(maxColumns)
     settings.setNullValue(nullValue)
+    settings.setEmptyValue("")
     settings.setMaxCharsPerColumn(maxCharsPerColumn)
     settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER)
     settings

From db1834534e058046f2ddc6a7681d0b42b6b402c5 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 8 May 2018 23:55:21 +0200
Subject: [PATCH 4/6] Bumping versions of uniVocity parser in manifest files

---
 dev/deps/spark-deps-hadoop-2.6 | 2 +-
 dev/deps/spark-deps-hadoop-2.7 | 2 +-
 dev/deps/spark-deps-hadoop-3.1 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index c3d1dd444b50..87e6aebf0e67 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -190,7 +190,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.5.9.jar
+univocity-parsers-2.6.3.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 290867035f91..5ef07b83c7f4 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -191,7 +191,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.5.9.jar
+univocity-parsers-2.6.3.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index 97ad65a4096c..57c874c58fe5 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -211,7 +211,7 @@ stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
 token-provider-1.0.1.jar
-univocity-parsers-2.5.9.jar
+univocity-parsers-2.6.3.jar
 validation-api-1.1.0.Final.jar
 woodstox-core-5.0.3.jar
 xbean-asm5-shaded-4.4.jar

From 137a525a35a841cba726c489ca589762481472b0 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sat, 12 May 2018 23:05:23 +0200
Subject: [PATCH 5/6] Benchmarks for uniVocity v2.5.9

---
 .../datasources/csv/CSVBenchmarks.scala       | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
new file mode 100644
index 000000000000..4dd5ff7c693f
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.csv
+
+import java.io.File
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Column, Row, SparkSession}
+import org.apache.spark.sql.functions.lit
+import org.apache.spark.sql.types._
+import org.apache.spark.util.{Benchmark, Utils}
+
+/**
+ * Benchmark to measure CSV read/write performance.
+ * To run this:
+ *  spark-submit --class <this class> --jars <spark sql test jar>
+ */
+object CSVBenchmarks {
+  val conf = new SparkConf()
+
+  val spark = SparkSession.builder
+    .master("local[1]")
+    .appName("benchmark-csv-datasource")
+    .config(conf)
+    .getOrCreate()
+  import spark.implicits._
+
+  def withTempPath(f: File => Unit): Unit = {
+    val path = Utils.createTempDir()
+    path.delete()
+    try f(path) finally Utils.deleteRecursively(path)
+  }
+
+  def quotedValuesBenchmark(rowsNum: Int, numIters: Int): Unit = {
+    val benchmark = new Benchmark(s"Parsing quoted values", rowsNum)
+
+    withTempPath { path =>
+      val str = (0 until 10000).map(i => s""""$i"""").mkString(",")
+
+      spark.range(rowsNum)
+        .map(_ => str)
+        .write.option("header", true)
+        .csv(path.getAbsolutePath)
+
+      val schema = new StructType().add("value", StringType)
+      val ds = spark.read.option("header", true).schema(schema).csv(path.getAbsolutePath)
+
+      benchmark.addCase(s"One quoted string", numIters) { _ =>
+        ds.filter((_: Row) => true).count()
+      }
+
+      /*
+      Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+
+      Parsing quoted values:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+      --------------------------------------------------------------------------------------------
+      One quoted string                       38884 / 39065          0.0      777683.0       1.0X
+      */
+      benchmark.run()
+    }
+  }
+
+  def main(args: Array[String]): Unit = {
+    quotedValuesBenchmark(rowsNum = 50 * 1000, numIters = 3)
+  }
+}

From f3a0072b82c23566d4010d977172578f04f51bff Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sat, 12 May 2018 23:06:17 +0200
Subject: [PATCH 6/6] Benchmarks for uniVocity v2.6.3

---
 .../spark/sql/execution/datasources/csv/CSVBenchmarks.scala     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
index 4dd5ff7c693f..d442ba7e59c6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala
@@ -68,7 +68,7 @@ object CSVBenchmarks {
 
       Parsing quoted values:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
       --------------------------------------------------------------------------------------------
-      One quoted string                       38884 / 39065          0.0      777683.0       1.0X
+      One quoted string                       30273 / 30549          0.0      605451.2       1.0X
       */
       benchmark.run()
     }