diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index c3be6b2fee99..054561ff5169 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -72,12 +72,13 @@ hk2-utils-2.4.0-b34.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar janino-2.7.8.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar @@ -127,7 +128,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 61ed4c0889b8..7e60a313ae8f 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -74,12 +74,13 @@ hk2-utils-2.4.0-b34.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar javassist-3.18.1-GA.jar @@ -134,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index fb014921765f..70d33b4f4812 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ 
b/dev/deps/spark-deps-hadoop-2.4 @@ -74,12 +74,13 @@ hk2-utils-2.4.0-b34.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar javassist-3.18.1-GA.jar @@ -134,7 +135,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 0baf4e84fff0..a80f6bc2a406 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -80,13 +80,14 @@ htrace-core-3.0.4.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar -jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar @@ -142,7 +143,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 8be218cd68d9..c0b53f73cd49 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -80,13 +80,14 @@ htrace-core-3.1.0-incubating.jar httpclient-4.5.2.jar httpcore-4.4.4.jar ivy-2.4.0.jar -jackson-annotations-2.5.3.jar 
-jackson-core-2.5.3.jar +jackson-annotations-2.7.3.jar +jackson-core-2.7.3.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.5.3.jar +jackson-databind-2.7.3.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.11-2.5.3.jar +jackson-module-paranamer-2.7.3.jar +jackson-module-scala_2.11-2.7.3.jar jackson-xc-1.9.13.jar janino-2.7.8.jar java-xmlbuilder-1.0.jar @@ -143,7 +144,7 @@ objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.7.0.jar parquet-common-1.7.0.jar parquet-encoding-1.7.0.jar diff --git a/pom.xml b/pom.xml index 40d9bf5ccccb..bce553d45de3 100644 --- a/pom.xml +++ b/pom.xml @@ -161,7 +161,7 @@ ${scala.version} org.scala-lang 1.9.13 - 2.5.3 + 2.7.3 1.1.2.4 1.1.2 1.2.0-incubating @@ -180,6 +180,7 @@ 4.5.2-1 1.1 2.52.0 + 2.8 ${java.home} @@ -1821,6 +1822,11 @@ antlr4-runtime ${antlr4.version} + + com.thoughtworks.paranamer + paranamer + ${paranamer.version} + diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index c98aef1a0e69..4e1e0c8d4296 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -193,6 +193,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, set, it uses the default value, ``true``. :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is set, it uses the default value, ``false``. + :param allowNonNumericNumbers: allows using non-numeric numbers such as "NaN", "Infinity", + "-Infinity", "INF", "-INF", which are converted to floating + point numbers. If None is set, it uses the default value, ``true``. :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character using backslash quoting mechanism. If None is set, it uses the default value, ``false``.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e1a64dfc5e7b..8da10c9c7abb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -293,6 +293,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * *
  • `allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers * (e.g. 00012)
  • + *
  • `allowNonNumericNumbers` (default `true`): allows using non-numeric numbers such as "NaN", + * "Infinity", "-Infinity", "INF", "-INF", which are converted to floating point numbers.
  • *
  • `allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all * character using backslash quoting mechanism
  • *
  • `mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index aeee2600a19e..cafca323187d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -129,13 +129,15 @@ object JacksonParser extends Logging { case (VALUE_STRING, FloatType) => // Special case handling for NaN and Infinity. val value = parser.getText - val lowerCaseValue = value.toLowerCase() - if (lowerCaseValue.equals("nan") || - lowerCaseValue.equals("infinity") || - lowerCaseValue.equals("-infinity") || - lowerCaseValue.equals("inf") || - lowerCaseValue.equals("-inf")) { + if (value.equals("NaN") || + value.equals("Infinity") || + value.equals("+Infinity") || + value.equals("-Infinity")) { value.toFloat + } else if (value.equals("+INF") || value.equals("INF")) { + Float.PositiveInfinity + } else if (value.equals("-INF")) { + Float.NegativeInfinity } else { throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.") } @@ -146,13 +148,15 @@ object JacksonParser extends Logging { case (VALUE_STRING, DoubleType) => // Special case handling for NaN and Infinity. 
val value = parser.getText - val lowerCaseValue = value.toLowerCase() - if (lowerCaseValue.equals("nan") || - lowerCaseValue.equals("infinity") || - lowerCaseValue.equals("-infinity") || - lowerCaseValue.equals("inf") || - lowerCaseValue.equals("-inf")) { + if (value.equals("NaN") || + value.equals("Infinity") || + value.equals("+Infinity") || + value.equals("-Infinity")) { value.toDouble + } else if (value.equals("+INF") || value.equals("INF")) { + Double.PositiveInfinity + } else if (value.equals("-INF")) { + Double.NegativeInfinity } else { throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index c31dffedbdf6..2aab955c1ecb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.json import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** * Test cases for various [[JSONOptions]]. @@ -93,23 +94,51 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { assert(df.first().getLong(0) == 18) } - // The following two tests are not really working - need to look into Jackson's - // JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. - ignore("allowNonNumericNumbers off") { - val str = """{"age": NaN}""" - val rdd = spark.sparkContext.parallelize(Seq(str)) - val df = spark.read.json(rdd) - - assert(df.schema.head.name == "_corrupt_record") + test("allowNonNumericNumbers off") { + // non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off. 
+ var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", + """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": INF}""", + """{"age": +INF}""", """{"age": -INF}""") + testCases.foreach { str => + val rdd = spark.sparkContext.parallelize(Seq(str)) + val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd) + + assert(df.schema.head.name == "_corrupt_record") + } + + // quoted non-numeric numbers should still work even allowNonNumericNumbers is off. + testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "+Infinity"}""", + """{"age": "-Infinity"}""", """{"age": "INF"}""", """{"age": "+INF"}""", + """{"age": "-INF"}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, + _.isNegInfinity, _.isPosInfinity, _.isPosInfinity, _.isNegInfinity) + val schema = StructType(StructField("age", DoubleType, true) :: Nil) + + testCases.zipWithIndex.foreach { case (str, idx) => + val rdd = spark.sparkContext.parallelize(Seq(str)) + val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd) + + assert(df.schema.head.name == "age") + assert(tests(idx)(df.first().getDouble(0))) + } } - ignore("allowNonNumericNumbers on") { - val str = """{"age": NaN}""" - val rdd = spark.sparkContext.parallelize(Seq(str)) - val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd) - - assert(df.schema.head.name == "age") - assert(df.first().getDouble(0).isNaN) + test("allowNonNumericNumbers on") { + val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", + """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": +INF}""", + """{"age": -INF}""", """{"age": "NaN"}""", """{"age": "Infinity"}""", + """{"age": "-Infinity"}""") + val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, + _.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, + _.isNegInfinity, _.isPosInfinity, 
_.isNegInfinity) + val schema = StructType(StructField("age", DoubleType, true) :: Nil) + testCases.zipWithIndex.foreach { case (str, idx) => + val rdd = spark.sparkContext.parallelize(Seq(str)) + val df = spark.read.option("allowNonNumericNumbers", "true").schema(schema).json(rdd) + + assert(df.schema.head.name == "age") + assert(tests(idx)(df.first().getDouble(0))) + } } test("allowBackslashEscapingAnyCharacter off") {