From 27776778b01c17cbb94f32a41eab858bb438b3e5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 17 Nov 2015 13:08:32 +0800 Subject: [PATCH 01/10] Make allowNonNumericNumbers option work. --- .../execution/datasources/json/JSONOptions.scala | 2 +- .../execution/datasources/json/JacksonParser.scala | 13 +------------ .../datasources/json/JsonParsingOptionsSuite.scala | 6 ++---- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala index c132ead20e7d..8c88957ca8e1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala @@ -59,6 +59,6 @@ object JSONOptions { allowNumericLeadingZeros = parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false), allowNonNumericNumbers = - parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true) + parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(false) ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index bfa140504105..0bfc7635ed79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -100,18 +100,7 @@ object JacksonParser { parser.getFloatValue case (VALUE_STRING, FloatType) => - // Special case handling for NaN and Infinity. - val value = parser.getText - val lowerCaseValue = value.toLowerCase() - if (lowerCaseValue.equals("nan") || - lowerCaseValue.equals("infinity") || - lowerCaseValue.equals("-infinity") || - lowerCaseValue.equals("inf") || - lowerCaseValue.equals("-inf")) { - value.toFloat - } else { - sys.error(s"Cannot parse $value as FloatType.") - } + parser.getFloatValue case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, DoubleType) => parser.getDoubleValue diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 4cc0a3a9585d..522670395152 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -93,9 +93,7 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { assert(df.first().getLong(0) == 18) } - // The following two tests are not really working - need to look into Jackson's - // JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. 
- ignore("allowNonNumericNumbers off") { + test("allowNonNumericNumbers off") { val str = """{"age": NaN}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.json(rdd) @@ -103,7 +101,7 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { assert(df.schema.head.name == "_corrupt_record") } - ignore("allowNonNumericNumbers on") { + test("allowNonNumericNumbers on") { val str = """{"age": NaN}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.option("allowNonNumericNumbers", "true").json(rdd) From b2a835dd141f3457e08deb921da3b4fa19a5efd4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 18 Nov 2015 00:43:11 +0800 Subject: [PATCH 02/10] Not to quote non numeric numbers when enabling allowNonNumericNumbers. --- .../datasources/json/JSONOptions.scala | 4 +-- .../datasources/json/JSONRelation.scala | 12 ++++++--- .../datasources/json/JacksonParser.scala | 13 +-------- .../json/JsonParsingOptionsSuite.scala | 27 ++++++++++++------- 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala index 8c88957ca8e1..0332b8243cf0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala @@ -31,7 +31,7 @@ case class JSONOptions( allowUnquotedFieldNames: Boolean = false, allowSingleQuotes: Boolean = true, allowNumericLeadingZeros: Boolean = false, - allowNonNumericNumbers: Boolean = false) { + allowNonNumericNumbers: Boolean = true) { /** Sets config options on a Jackson [[JsonFactory]]. 
*/ def setJacksonOptions(factory: JsonFactory): Unit = { @@ -59,6 +59,6 @@ object JSONOptions { allowNumericLeadingZeros = parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false), allowNonNumericNumbers = - parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(false) + parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true) ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index 3e61ba35bea8..b7f9592f7cd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.json import java.io.CharArrayWriter -import com.fasterxml.jackson.core.JsonFactory +import com.fasterxml.jackson.core.{JsonGenerator, JsonFactory} import com.google.common.base.Objects import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{LongWritable, NullWritable, Text} @@ -161,12 +161,13 @@ private[sql] class JSONRelation( } override def prepareJobForWrite(job: Job): OutputWriterFactory = { + val quoteNonNumeric = !options.allowNonNumericNumbers new OutputWriterFactory { override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new JsonOutputWriter(path, dataSchema, context) + new JsonOutputWriter(path, dataSchema, context, quoteNonNumeric) } } } @@ -175,12 +176,15 @@ private[sql] class JSONRelation( private[json] class JsonOutputWriter( path: String, dataSchema: StructType, - context: TaskAttemptContext) + context: TaskAttemptContext, + quoteNonNumeric: Boolean) extends OutputWriter with SparkHadoopMapRedUtil with Logging { private[this] val writer = new CharArrayWriter() // create the Generator without separator inserted between 2 records - private[this] val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + private[this] val factory = new JsonFactory() + factory.configure(JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS, quoteNonNumeric) + private[this] val gen = factory.createGenerator(writer).setRootValueSeparator(null) private[this] val result = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index 0bfc7635ed79..eea80a331261 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -106,18 +106,7 @@ object JacksonParser { parser.getDoubleValue case (VALUE_STRING, DoubleType) => - // Special case handling for NaN and Infinity. 
- val value = parser.getText - val lowerCaseValue = value.toLowerCase() - if (lowerCaseValue.equals("nan") || - lowerCaseValue.equals("infinity") || - lowerCaseValue.equals("-infinity") || - lowerCaseValue.equals("inf") || - lowerCaseValue.equals("-inf")) { - value.toDouble - } else { - sys.error(s"Cannot parse $value as DoubleType.") - } + parser.getDoubleValue case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, dt: DecimalType) => Decimal(parser.getDecimalValue, dt.precision, dt.scale) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 522670395152..094a77d631f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -94,19 +94,28 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { } test("allowNonNumericNumbers off") { - val str = """{"age": NaN}""" - val rdd = sqlContext.sparkContext.parallelize(Seq(str)) - val df = sqlContext.read.json(rdd) + val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", + """{"age": -Infinity}""") - assert(df.schema.head.name == "_corrupt_record") + testCases.foreach { str => + val rdd = sqlContext.sparkContext.parallelize(Seq(str)) + val df = sqlContext.read.option("allowNonNumericNumbers", "false").json(rdd) + + assert(df.schema.head.name == "_corrupt_record") + } } test("allowNonNumericNumbers on") { - val str = """{"age": NaN}""" - val rdd = sqlContext.sparkContext.parallelize(Seq(str)) - val df = sqlContext.read.option("allowNonNumericNumbers", "true").json(rdd) + val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", + """{"age": -Infinity}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity) - assert(df.schema.head.name == "age") - assert(df.first().getDouble(0).isNaN) + testCases.zipWithIndex.foreach { case (str, idx) => + val rdd = sqlContext.sparkContext.parallelize(Seq(str)) + val df = sqlContext.read.json(rdd) + + assert(df.schema.head.name == "age") + assert(tests(idx)(df.first().getDouble(0))) + } } } From 186fa5e2f9bcc8c66bbe51c5d03e3a8f681d051a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 20 Nov 2015 16:19:01 +0800 Subject: [PATCH 03/10] Deal with quoted non-numeric number. --- .../datasources/json/InferSchema.scala | 16 +++++- .../datasources/json/JSONRelation.scala | 12 ++--- .../datasources/json/JacksonParser.scala | 51 +++++++++++++------ .../json/JsonParsingOptionsSuite.scala | 6 ++- .../datasources/json/JsonSuite.scala | 2 +- 5 files changed, 59 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index 922fd5b21167..f566f853f7f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -92,7 +92,21 @@ private[json] object InferSchema { // record fields' types have been combined. NullType - case VALUE_STRING => StringType + case VALUE_STRING => + // When JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS is enabled, + // we need to do special handling for quoted non-numeric numbers. 
+ if (configOptions.allowNonNumericNumbers) { + val value = parser.getText + val lowerCaseValue = value.toLowerCase() + if (lowerCaseValue.equals("nan") || + lowerCaseValue.equals("infinity") || + lowerCaseValue.equals("-infinity") || + lowerCaseValue.equals("inf") || + lowerCaseValue.equals("-inf")) { + return DoubleType + } + } + StringType case START_OBJECT => val builder = Seq.newBuilder[StructField] while (nextUntil(parser, END_OBJECT)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index b7f9592f7cd5..3e61ba35bea8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.json import java.io.CharArrayWriter -import com.fasterxml.jackson.core.{JsonGenerator, JsonFactory} +import com.fasterxml.jackson.core.JsonFactory import com.google.common.base.Objects import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{LongWritable, NullWritable, Text} @@ -161,13 +161,12 @@ private[sql] class JSONRelation( } override def prepareJobForWrite(job: Job): OutputWriterFactory = { - val quoteNonNumeric = !options.allowNonNumericNumbers new OutputWriterFactory { override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new JsonOutputWriter(path, dataSchema, context, quoteNonNumeric) + new JsonOutputWriter(path, dataSchema, context) } } } @@ -176,15 +175,12 @@ private[sql] class JSONRelation( private[json] class JsonOutputWriter( path: String, dataSchema: StructType, - context: TaskAttemptContext, - quoteNonNumeric: Boolean) + context: TaskAttemptContext) extends OutputWriter with SparkHadoopMapRedUtil with Logging { private[this] val writer = new CharArrayWriter() // create the Generator without separator inserted between 2 records - private[this] val factory = new JsonFactory() - factory.configure(JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS, quoteNonNumeric) - private[this] val gen = factory.createGenerator(writer).setRootValueSeparator(null) + private[this] val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) private[this] val result = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index eea80a331261..1813fbcd4e30 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -50,7 +50,8 @@ object JacksonParser { def convertField( factory: JsonFactory, parser: JsonParser, - schema: DataType): Any = { + schema: DataType, + configOptions: JSONOptions): Any = { import com.fasterxml.jackson.core.JsonToken._ (parser.getCurrentToken, schema) match { case (null | VALUE_NULL, _) => @@ -58,7 +59,7 @@ object JacksonParser { case (FIELD_NAME, _) => parser.nextToken() - convertField(factory, parser, schema) + convertField(factory, parser, schema, configOptions) case (VALUE_STRING, StringType) => UTF8String.fromString(parser.getText) @@ -106,7 +107,22 @@ object JacksonParser { 
parser.getDoubleValue case (VALUE_STRING, DoubleType) => - parser.getDoubleValue + // Special case handling for quoted non-numeric numbers. + if (configOptions.allowNonNumericNumbers) { + val value = parser.getText + val lowerCaseValue = value.toLowerCase() + if (lowerCaseValue.equals("nan") || + lowerCaseValue.equals("infinity") || + lowerCaseValue.equals("-infinity") || + lowerCaseValue.equals("inf") || + lowerCaseValue.equals("-inf")) { + value.toDouble + } else { + sys.error(s"Cannot parse $value as DoubleType.") + } + } else { + parser.getDoubleValue + } case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, dt: DecimalType) => Decimal(parser.getDecimalValue, dt.precision, dt.scale) @@ -130,26 +146,26 @@ object JacksonParser { false case (START_OBJECT, st: StructType) => - convertObject(factory, parser, st) + convertObject(factory, parser, st, configOptions) case (START_ARRAY, st: StructType) => // SPARK-3308: support reading top level JSON arrays and take every element // in such an array as a row - convertArray(factory, parser, st) + convertArray(factory, parser, st, configOptions) case (START_ARRAY, ArrayType(st, _)) => - convertArray(factory, parser, st) + convertArray(factory, parser, st, configOptions) case (START_OBJECT, ArrayType(st, _)) => // the business end of SPARK-3308: // when an object is found but an array is requested just wrap it in a list - convertField(factory, parser, st) :: Nil + convertField(factory, parser, st, configOptions) :: Nil case (START_OBJECT, MapType(StringType, kt, _)) => - convertMap(factory, parser, kt) + convertMap(factory, parser, kt, configOptions) case (_, udt: UserDefinedType[_]) => - convertField(factory, parser, udt.sqlType) + convertField(factory, parser, udt.sqlType, configOptions) case (token, dataType) => sys.error(s"Failed to parse a value for data type $dataType (current token: $token).") @@ -164,12 +180,13 @@ object JacksonParser { private def convertObject( factory: JsonFactory, parser: JsonParser, - schema: StructType): InternalRow = { + schema: StructType, + configOptions: JSONOptions): InternalRow = { val row = new GenericMutableRow(schema.length) while (nextUntil(parser, JsonToken.END_OBJECT)) { schema.getFieldIndex(parser.getCurrentName) match { case Some(index) => - row.update(index, convertField(factory, parser, schema(index).dataType)) + row.update(index, convertField(factory, parser, schema(index).dataType, configOptions)) case None => parser.skipChildren() @@ -185,12 +202,13 @@ object JacksonParser { private def convertMap( factory: JsonFactory, parser: JsonParser, - valueType: DataType): MapData = { + valueType: DataType, + configOptions: JSONOptions): MapData = { val keys = ArrayBuffer.empty[UTF8String] val values = ArrayBuffer.empty[Any] while (nextUntil(parser, JsonToken.END_OBJECT)) { keys += UTF8String.fromString(parser.getCurrentName) - values += convertField(factory, parser, valueType) + values += convertField(factory, parser, valueType, configOptions) } ArrayBasedMapData(keys.toArray, values.toArray) } @@ -198,10 +216,11 @@ object JacksonParser { private def convertArray( factory: JsonFactory, parser: JsonParser, - elementType: DataType): ArrayData = { + elementType: DataType, + configOptions: JSONOptions): ArrayData = { val values = ArrayBuffer.empty[Any] while (nextUntil(parser, JsonToken.END_ARRAY)) { - values += convertField(factory, parser, elementType) + values += convertField(factory, parser, elementType, configOptions) } new GenericArrayData(values.toArray) @@ -235,7 +254,7 @@ object JacksonParser { 
Utils.tryWithResource(factory.createParser(record)) { parser => parser.nextToken() - convertField(factory, parser, schema) match { + convertField(factory, parser, schema, configOptions) match { case null => failedRecord(record) case row: InternalRow => row :: Nil case array: ArrayData => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 094a77d631f8..b25abe5f298b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -107,8 +107,10 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowNonNumericNumbers on") { val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", - """{"age": -Infinity}""") - val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity) + """{"age": -Infinity}""", """{"age": "NaN"}""", """{"age": "Infinity"}""", + """{"age": "-Infinity"}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity, + _.isNaN, _.isPosInfinity, _.isNegInfinity) testCases.zipWithIndex.foreach { case (str, idx) => val rdd = sqlContext.sparkContext.parallelize(Seq(str)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 6042b1178aff..6282c2a59ab5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -54,7 +54,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { Utils.tryWithResource(factory.createParser(writer.toString)) { parser => parser.nextToken() - JacksonParser.convertField(factory, parser, dataType) + JacksonParser.convertField(factory, parser, dataType, JSONOptions()) } } From 6d90b2464501fa69c4475b3f756f4062c2863d2f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 27 Nov 2015 17:32:29 +0800 Subject: [PATCH 04/10] For comments. --- .../datasources/json/JacksonParser.scala | 17 ++++++++++++++++- .../json/JsonParsingOptionsSuite.scala | 12 +++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index 1813fbcd4e30..aa2e34b7ae81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -101,7 +101,22 @@ object JacksonParser { parser.getFloatValue case (VALUE_STRING, FloatType) => - parser.getFloatValue + // Special case handling for quoted non-numeric numbers. 
+          if (configOptions.allowNonNumericNumbers) {
+            val value = parser.getText
+            val lowerCaseValue = value.toLowerCase()
+            if (lowerCaseValue.equals("nan") ||
+              lowerCaseValue.equals("infinity") ||
+              lowerCaseValue.equals("-infinity") ||
+              lowerCaseValue.equals("inf") ||
+              lowerCaseValue.equals("-inf")) {
+              value.toFloat
+            } else {
+              sys.error(s"Cannot parse $value as FloatType.")
+            }
+          } else {
+            parser.getFloatValue
+          }

       case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, DoubleType) =>
         parser.getDoubleValue
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
index b25abe5f298b..13557b349be4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
@@ -94,7 +94,7 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
   }

   test("allowNonNumericNumbers off") {
-    val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
+    var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
       """{"age": -Infinity}""")

     testCases.foreach { str =>
@@ -103,6 +103,16 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {

       assert(df.schema.head.name == "_corrupt_record")
     }
+
+    testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""",
+      """{"age": "-Infinity"}""")
+
+    testCases.foreach { str =>
+      val rdd = sqlContext.sparkContext.parallelize(Seq(str))
+      val df = sqlContext.read.option("allowNonNumericNumbers", "false").json(rdd)
+
+      assert(df.schema.head.name == "age")
+    }
   }

   test("allowNonNumericNumbers on") {

From 6f668c3f18fd5486ec96be19b95a08be1f4cda39 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Mon, 16 May 2016 07:52:38 +0000
Subject: [PATCH 05/10] Update doc.

---
 python/pyspark/sql/readwriter.py                              | 3 +++
 .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index c98aef1a0e69..4e1e0c8d4296 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -193,6 +193,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
             set, it uses the default value, ``true``.
         :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is
             set, it uses the default value, ``false``.
+        :param allowNonNumericNumbers: allows using non-numeric numbers such as "NaN", "Infinity",
+            "-Infinity", "INF", "-INF", which are converted to floating point numbers. If None is
+            set, it uses the default value, ``true``.
         :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character
             using backslash quoting mechanism. If None is set, it uses the default value,
             ``false``.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index e1a64dfc5e7b..8da10c9c7abb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -293,6 +293,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    *
    *
  • `allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers (e.g. 00012)
  • `allowNonNumericNumbers` (default `true`): allows using non-numeric numbers such as "NaN",
    "Infinity", "-Infinity", "INF", "-INF", which are converted to floating point numbers.
  • `allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
    character using backslash quoting mechanism
  • `mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records From 1cfd1dceba9777e2c3323597d2fc15b0f37db1c9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 16 May 2016 08:02:21 +0000 Subject: [PATCH 06/10] Fix dependency test. --- dev/deps/spark-deps-hadoop-2.2 | 11 ++++++----- dev/deps/spark-deps-hadoop-2.3 | 11 ++++++----- dev/deps/spark-deps-hadoop-2.4 | 11 ++++++----- dev/deps/spark-deps-hadoop-2.6 | 11 ++++++----- dev/deps/spark-deps-hadoop-2.7 | 11 ++++++----- 5 files changed, 30 insertions(+), 25 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index c3be6b2fee99..2a5c97da1c6d 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -72,12 +72,13 @@ hk2-utils-2.4.0-b34.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar janino-2.7.8.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar @@ -127,7 +128,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.3.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 61ed4c0889b8..aaf532734401 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -74,12 +74,13 @@ hk2-utils-2.4.0-b34.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar javassist-3.18.1-GA.jar @@ -134,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.3.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index fb014921765f..ff3589c1f757 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -74,12 +74,13 @@ hk2-utils-2.4.0-b34.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar javassist-3.18.1-GA.jar @@ -134,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.3.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 0baf4e84fff0..fcb14b9ff8f8 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -80,13 +80,14 @@ htrace-core-3.0.4.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar 
-jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar @@ -142,7 +143,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.3.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 8be218cd68d9..e6e46d6eb1a8 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -80,13 +80,14 @@ htrace-core-3.1.0-incubating.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar @@ -143,7 +144,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.3.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar From 4fca52c68ae15eb792dab08cea0dc725c3437397 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 18 May 2016 13:50:23 +0000 Subject: [PATCH 07/10] Address comment. Revert change of InferSchema. --- .../sql/execution/datasources/json/InferSchema.scala | 11 +---------- .../datasources/json/JsonParsingOptionsSuite.scala | 8 +++++--- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index 372318890949..579b036417d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -114,16 +114,7 @@ private[sql] object InferSchema { // record fields' types have been combined. NullType - case VALUE_STRING => - // If there is only one row, the following non-numeric numbers will be incorrectly - // recognized as StringType. 
- val value = parser.getText - if (value.equals("NaN") || - value.equals("Infinity") || - value.equals("-Infinity")) { - return DoubleType - } - return StringType + case VALUE_STRING => StringType case START_OBJECT => val builder = Array.newBuilder[StructField] while (nextUntil(parser, END_OBJECT)) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index f1a40898026c..76d7d52edc61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.json import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** * Test cases for various [[JSONOptions]]. @@ -107,10 +108,11 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { // quoted non-numeric numbers should still work even allowNonNumericNumbers is off. testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "-Infinity"}""") val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity) + val schema = StructType(StructField("age", DoubleType, true) :: Nil) testCases.zipWithIndex.foreach { case (str, idx) => val rdd = spark.sparkContext.parallelize(Seq(str)) - val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd) + val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd) assert(df.schema.head.name == "age") assert(tests(idx)(df.first().getDouble(0))) @@ -124,10 +126,10 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, _.isNegInfinity, _.isPosInfinity, _.isNegInfinity) - + val schema = StructType(StructField("age", DoubleType, true) :: Nil) testCases.zipWithIndex.foreach { case (str, idx) => val rdd = spark.sparkContext.parallelize(Seq(str)) - val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd) + val df = spark.read.option("allowNonNumericNumbers", "true").schema(schema).json(rdd) assert(df.schema.head.name == "age") assert(tests(idx)(df.first().getDouble(0))) From 0e473fc1bdfd9b6e8e076d2e69cfe6cf53de8898 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 18 May 2016 15:05:54 +0000 Subject: [PATCH 08/10] Support +INF, -INF. 
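
Jackson's JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS also accepts the bare
tokens +INF and -INF (exercised by the tests in this series), but the
quoted-string branch in convertField converts through value.toFloat /
value.toDouble, and Java's parseFloat/parseDouble only understand "NaN",
"Infinity", "+Infinity" and "-Infinity", so the INF spellings have to be
mapped explicitly. A minimal sketch of the intended double case
(illustrative only: parseQuotedDouble is a hypothetical helper; in the patch
the logic sits inline in JacksonParser.convertField):

    // Sketch of the quoted-token mapping this patch introduces.
    def parseQuotedDouble(value: String): Double =
      if (value == "NaN" || value == "Infinity" || value == "-Infinity") {
        value.toDouble           // handled by java.lang.Double.parseDouble
      } else if (value == "+INF") {
        Double.PositiveInfinity  // not handled by parseDouble, mapped by hand
      } else if (value == "-INF") {
        Double.NegativeInfinity
      } else {
        throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
      }
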
--- .../sql/execution/datasources/json/JacksonParser.scala | 8 ++++++++ .../datasources/json/JsonParsingOptionsSuite.scala | 6 ++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index 2fad59920789..cff2065debfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -133,6 +133,10 @@ object JacksonParser extends Logging { value.equals("Infinity") || value.equals("-Infinity")) { value.toFloat + } else if (value.equals("+INF")) { + Float.PositiveInfinity + } else if (value.equals("-INF")) { + Float.NegativeInfinity } else { throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.") } @@ -147,6 +151,10 @@ object JacksonParser extends Logging { value.equals("Infinity") || value.equals("-Infinity")) { value.toDouble + } else if (value.equals("+INF")) { + Double.PositiveInfinity + } else if (value.equals("-INF")) { + Double.NegativeInfinity } else { throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 76d7d52edc61..894175e7e1c4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -106,8 +106,10 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { } // quoted non-numeric numbers should still work even allowNonNumericNumbers is off. - testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "-Infinity"}""") - val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity) + testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "-Infinity"}""", + """{"age": "+INF"}""", """{"age": "-INF"}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity, + _.isPosInfinity, _.isNegInfinity) val schema = StructType(StructField("age", DoubleType, true) :: Nil) testCases.zipWithIndex.foreach { case (str, idx) => From 025edeabe0128f6ef6e2e83a0f852c34847ff820 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 22 May 2016 09:45:46 +0000 Subject: [PATCH 09/10] Manage paranamer up to 2.8. 
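
Bumping Jackson to 2.7.3 pulled jackson-module-paranamer into the dependency
tree, and transitive resolution then settled on the old paranamer 2.3, as the
spark-deps manifests in the earlier dependency-test commit show. Managing
paranamer to 2.8 in the parent pom keeps every module on a single version
that is recent enough for the 2.7.x Jackson modules. The resolved version can
be double-checked with the standard Maven dependency plugin, e.g.
`build/mvn dependency:tree -Dincludes=com.thoughtworks.paranamer:paranamer`.
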
--- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- pom.xml | 6 ++++++ 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 2a5c97da1c6d..054561ff5169 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -128,7 +128,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index aaf532734401..7e60a313ae8f 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -135,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index ff3589c1f757..70d33b4f4812 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -135,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index fcb14b9ff8f8..a80f6bc2a406 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -143,7 +143,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index e6e46d6eb1a8..c0b53f73cd49 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -144,7 +144,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.3.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/pom.xml b/pom.xml index 051762764abe..bce553d45de3 100644 --- a/pom.xml +++ b/pom.xml @@ -180,6 +180,7 @@ 4.5.2-1 1.1 2.52.0 + 2.8 ${java.home} @@ -1821,6 +1822,11 @@ antlr4-runtime ${antlr4.version} + + com.thoughtworks.paranamer + paranamer + ${paranamer.version} + From af1e3a12ab140e886b89f11e35e606af7b15e7ee Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 23 May 2016 03:30:05 +0000 Subject: [PATCH 10/10] Support INF and +Infinity when allowNonNumericNumbers is off. 
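
With this change the quoted spellings "NaN", "Infinity", "+Infinity",
"-Infinity", "INF", "+INF" and "-INF" convert to Float/Double whether
allowNonNumericNumbers is on or off, while the bare (unquoted) tokens still
require the option to be on. A short usage sketch mirroring the updated suite
(assuming the imports already present in JsonParsingOptionsSuite):

    val schema = StructType(StructField("age", DoubleType, true) :: Nil)
    val rdd = spark.sparkContext.parallelize(Seq("""{"age": "+Infinity"}"""))
    // With an explicit DoubleType schema, the quoted token converts even
    // when the Jackson non-numeric-numbers feature is disabled.
    val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd)
    assert(df.first().getDouble(0).isPosInfinity)
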
--- .../datasources/json/JacksonParser.scala | 6 +++-- .../json/JsonParsingOptionsSuite.scala | 23 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index cff2065debfa..cafca323187d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -131,9 +131,10 @@ object JacksonParser extends Logging { val value = parser.getText if (value.equals("NaN") || value.equals("Infinity") || + value.equals("+Infinity") || value.equals("-Infinity")) { value.toFloat - } else if (value.equals("+INF")) { + } else if (value.equals("+INF") || value.equals("INF")) { Float.PositiveInfinity } else if (value.equals("-INF")) { Float.NegativeInfinity @@ -149,9 +150,10 @@ object JacksonParser extends Logging { val value = parser.getText if (value.equals("NaN") || value.equals("Infinity") || + value.equals("+Infinity") || value.equals("-Infinity")) { value.toDouble - } else if (value.equals("+INF")) { + } else if (value.equals("+INF") || value.equals("INF")) { Double.PositiveInfinity } else if (value.equals("-INF")) { Double.NegativeInfinity diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 894175e7e1c4..2aab955c1ecb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -97,7 +97,8 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowNonNumericNumbers off") { // non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off. var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", - """{"age": -Infinity}""", """{"age": +INF}""", """{"age": -INF}""") + """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": INF}""", + """{"age": +INF}""", """{"age": -INF}""") testCases.foreach { str => val rdd = spark.sparkContext.parallelize(Seq(str)) val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd) @@ -106,10 +107,11 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { } // quoted non-numeric numbers should still work even allowNonNumericNumbers is off. 
- testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "-Infinity"}""", - """{"age": "+INF"}""", """{"age": "-INF"}""") - val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity, - _.isPosInfinity, _.isNegInfinity) + testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "+Infinity"}""", + """{"age": "-Infinity"}""", """{"age": "INF"}""", """{"age": "+INF"}""", + """{"age": "-INF"}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, + _.isNegInfinity, _.isPosInfinity, _.isPosInfinity, _.isNegInfinity) val schema = StructType(StructField("age", DoubleType, true) :: Nil) testCases.zipWithIndex.foreach { case (str, idx) => @@ -123,11 +125,12 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { test("allowNonNumericNumbers on") { val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", - """{"age": -Infinity}""", """{"age": +INF}""", """{"age": -INF}""", """{"age": "NaN"}""", - """{"age": "Infinity"}""", """{"age": "-Infinity"}""") - val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity, - _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, _.isNegInfinity, - _.isPosInfinity, _.isNegInfinity) + """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": +INF}""", + """{"age": -INF}""", """{"age": "NaN"}""", """{"age": "Infinity"}""", + """{"age": "-Infinity"}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, + _.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, + _.isNegInfinity, _.isPosInfinity, _.isNegInfinity) val schema = StructType(StructField("age", DoubleType, true) :: Nil) testCases.zipWithIndex.foreach { case (str, idx) => val rdd = spark.sparkContext.parallelize(Seq(str))