From f317891c07ad8329f7b4a106bb1b55b33f21a076 Mon Sep 17 00:00:00 2001 From: Justin Uang Date: Thu, 20 Sep 2018 16:41:35 -0400 Subject: [PATCH 1/5] Fix multiline crlf --- .../apache/spark/sql/catalyst/csv/CSVOptions.scala | 1 + .../spark/sql/catalyst/csv/UnivocityParser.scala | 1 + sql/core/src/test/resources/test-data/cars-crlf.csv | 7 +++++++ .../sql/execution/datasources/csv/CSVSuite.scala | 12 ++++++++++++ 4 files changed, 21 insertions(+) create mode 100644 sql/core/src/test/resources/test-data/cars-crlf.csv diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 3e25d820e9941..de90967a8acc3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -212,6 +212,7 @@ class CSVOptions( settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) + settings.setLineSeparatorDetectionEnabled(true) settings } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index 46ed58ed92830..01838cd15db00 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -71,6 +71,7 @@ class UnivocityParser( if (parsedSchema.length < dataSchema.length) { parserSetting.selectIndexes(tokenIndexArr: _*) } + new CsvParser(parserSetting) } diff --git a/sql/core/src/test/resources/test-data/cars-crlf.csv b/sql/core/src/test/resources/test-data/cars-crlf.csv new file mode 100644 index 0000000000000..d018d08ebc6fc --- /dev/null +++ b/sql/core/src/test/resources/test-data/cars-crlf.csv @@ -0,0 +1,7 @@ + +year,make,model,comment,blank +"2012","Tesla","S","No comment", + +1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index d59035b716cf0..d43efc8776b00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -52,6 +52,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te private val carsNullFile = "test-data/cars-null.csv" private val carsEmptyValueFile = "test-data/cars-empty-value.csv" private val carsBlankColName = "test-data/cars-blank-column-name.csv" + private val carsCrlf = "test-data/cars-crlf.csv" private val emptyFile = "test-data/empty.csv" private val commentsFile = "test-data/comments.csv" private val disableCommentsFile = "test-data/disable_comments.csv" @@ -220,6 +221,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te } } + test("crlf line separators in multiline mode") { + val cars = spark + .read + .format("csv") + .option("multiLine", "true") + .option("header", "true") + .load(testFile(carsCrlf)) + + verifyCars(cars, withHeader = true) + } + test("test aliases sep and encoding for delimiter and charset") { // scalastyle:off val cars = spark From 05c2fcbede4c8c9980b0b879ba3a1275c7764d21 Mon Sep 17 00:00:00 2001 From: Justin Uang Date: Thu, 20 Sep 2018 16:54:42 -0400 Subject: [PATCH 2/5] remove unnecessary line --- .../org/apache/spark/sql/catalyst/csv/UnivocityParser.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index 01838cd15db00..46ed58ed92830 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -71,7 +71,6 @@ class UnivocityParser( if (parsedSchema.length < dataSchema.length) { parserSetting.selectIndexes(tokenIndexArr: _*) } - new CsvParser(parserSetting) } From aedfbd7774ec5544d02760e85aea6e6fd973e23e Mon Sep 17 00:00:00 2001 From: Justin Uang Date: Mon, 24 Sep 2018 10:51:52 -0400 Subject: [PATCH 3/5] Only turn on line separator detection on multiline mode --- .../org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index de90967a8acc3..a17ea715572dd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -212,7 +212,11 @@ class CSVOptions( settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) - settings.setLineSeparatorDetectionEnabled(true) + + if (multiLine) { + settings.setLineSeparatorDetectionEnabled(true) + } + settings } } From 2a2e65e855f2e15176ad341ce66be73f5de01c24 Mon Sep 17 00:00:00 2001 From: Justin Uang Date: Tue, 25 Sep 2018 12:30:40 -0400 Subject: [PATCH 4/5] simplify setting line detection --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index a17ea715572dd..e099a92cd9a3b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -212,10 +212,7 @@ class CSVOptions( settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) - - if (multiLine) { - settings.setLineSeparatorDetectionEnabled(true) - } + settings.setLineSeparatorDetectionEnabled(multiLine) settings } From 040047b696c58496ea3da274fa2c58166d31b100 Mon Sep 17 00:00:00 2001 From: Justin Uang Date: Tue, 2 Oct 2018 14:01:11 -0400 Subject: [PATCH 5/5] address cr --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index e099a92cd9a3b..cdaaa172e8367 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -212,7 +212,7 @@ class CSVOptions( settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) - settings.setLineSeparatorDetectionEnabled(multiLine) + settings.setLineSeparatorDetectionEnabled(multiLine == true) settings }