From 3cd91b1d07d3cb2451045913d0c1e27226a67816 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 15:23:34 -0800 Subject: [PATCH 1/4] added test case --- .../test/resources/json-tests/utf16WithBOM.json | Bin 0 -> 170 bytes .../execution/datasources/json/JsonSuite.scala | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 sql/core/src/test/resources/json-tests/utf16WithBOM.json diff --git a/sql/core/src/test/resources/json-tests/utf16WithBOM.json b/sql/core/src/test/resources/json-tests/utf16WithBOM.json new file mode 100644 index 0000000000000000000000000000000000000000..65e7e2f72948103b198017d2553f501efe850da0 GIT binary patch literal 170 zcmezWubM%LA&nuEp@^ZFp@hMYA(0`MAr&ZQ1;ow_89?z&po|iO4ub-a%mK<{s&fL0 if=oyOs;Fh)W#D4KXQ~TBK0_%(Isr3-fU5G5OauU86dZN{ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 8c8d41ebf115a..c84ebcd3c9af2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -2063,4 +2063,17 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { ) } } + + test("json in UTF-16 with BOM") { + val fileName = "json-tests/utf16WithBOM.json" + val testFile = Thread.currentThread().getContextClassLoader.getResource(fileName).toString + val schema = new StructType().add("firstName", StringType).add("lastName", StringType) + val jsonDF = spark.read.schema(schema) + .option("mode", "DROPMALFORMED") + .json(testFile) + + checkAnswer(jsonDF, Seq( + Row("Chris", "Baird"), Row("Doug", "Rood") + )) + } } From d4015d0c3c9b5cae0309bd6b9486b4990c7f4479 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 15:24:38 -0800 Subject: [PATCH 2/4] Revert "[SPARK-23094] Fix invalid character handling in JsonDataSource" This reverts commit e01919e834d301e13adc8919932796ebae900576. --- .../catalyst/json/CreateJacksonParser.scala | 5 ++- .../sources/JsonHadoopFsRelationSuite.scala | 34 ------------------- 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala index b1672e7e2fca2..025a388aacaa5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala @@ -40,11 +40,10 @@ private[sql] object CreateJacksonParser extends Serializable { } def text(jsonFactory: JsonFactory, record: Text): JsonParser = { - val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength) - jsonFactory.createParser(new InputStreamReader(bain, "UTF-8")) + jsonFactory.createParser(record.getBytes, 0, record.getLength) } def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = { - jsonFactory.createParser(new InputStreamReader(record, "UTF-8")) + jsonFactory.createParser(record) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index 27f398ebf301a..49be30435ad2f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -28,8 +28,6 @@ import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" - private val badJson = "\u0000\u0000\u0000A\u0001AAA" - // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false @@ -107,36 +105,4 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } - - test("invalid json with leading nulls - from file (multiLine=true)") { - import testImplicits._ - withTempDir { tempDir => - val path = tempDir.getAbsolutePath - Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) - val expected = s"""$badJson\n{"a":1}\n""" - val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) - val df = - spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path) - checkAnswer(df, Row(null, expected)) - } - } - - test("invalid json with leading nulls - from file (multiLine=false)") { - import testImplicits._ - withTempDir { tempDir => - val path = tempDir.getAbsolutePath - Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) - val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) - val df = - spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path) - checkAnswer(df, Seq(Row(1, null), Row(null, badJson))) - } - } - - test("invalid json with leading nulls - from dataset") { - import testImplicits._ - checkAnswer( - spark.read.json(Seq(badJson).toDS()), - Row(badJson)) - } } From 86c88ae9ce02ff25b6b4c8cdac4fe73cb1b65b8b Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 21:42:46 -0800 Subject: [PATCH 3/4] remove the test case --- .../sql/execution/datasources/json/JsonSuite.scala | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index c84ebcd3c9af2..8c8d41ebf115a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -2063,17 +2063,4 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { ) } } - - test("json in UTF-16 with BOM") { - val fileName = "json-tests/utf16WithBOM.json" - val testFile = Thread.currentThread().getContextClassLoader.getResource(fileName).toString - val schema = new StructType().add("firstName", StringType).add("lastName", StringType) - val jsonDF = spark.read.schema(schema) - .option("mode", "DROPMALFORMED") - .json(testFile) - - checkAnswer(jsonDF, Seq( - Row("Chris", "Baird"), Row("Doug", "Rood") - )) - } } From 0bb86c6e45967de2fe92980422b5a3ccd83fb15f Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 14 Feb 2018 23:18:05 -0800 Subject: [PATCH 4/4] remove the file --- .../test/resources/json-tests/utf16WithBOM.json | Bin 170 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 sql/core/src/test/resources/json-tests/utf16WithBOM.json diff --git a/sql/core/src/test/resources/json-tests/utf16WithBOM.json b/sql/core/src/test/resources/json-tests/utf16WithBOM.json deleted file mode 100644 index 65e7e2f72948103b198017d2553f501efe850da0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 170 zcmezWubM%LA&nuEp@^ZFp@hMYA(0`MAr&ZQ1;ow_89?z&po|iO4ub-a%mK<{s&fL0 if=oyOs;Fh)W#D4KXQ~TBK0_%(Isr3-fU5G5OauU86dZN{