
Commit ad71433

Handle more cases.
1 parent d774bfe commit ad71433

4 files changed (+72, -23 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala

Lines changed: 33 additions & 4 deletions
@@ -61,7 +61,10 @@ private[json] object InferSchema {
             StructType(Seq(StructField(columnNameOfCorruptRecords, StringType)))
         }
       }
-    }.treeAggregate[DataType](StructType(Seq()))(compatibleRootType, compatibleRootType)
+    }.treeAggregate[DataType](
+      StructType(Seq()))(
+      compatibleRootType(columnNameOfCorruptRecords),
+      compatibleRootType(columnNameOfCorruptRecords))
 
     canonicalizeType(rootType) match {
       case Some(st: StructType) => st
@@ -170,12 +173,38 @@ private[json] object InferSchema {
     case other => Some(other)
   }
 
+  private def withCorruptField(
+      struct: StructType,
+      columnNameOfCorruptRecords: String): StructType = {
+    if (!struct.fieldNames.contains(columnNameOfCorruptRecords)) {
+      // If this given struct does not have a column used for corrupt records,
+      // add this field.
+      struct.add(columnNameOfCorruptRecords, StringType, nullable = true)
+    } else {
+      // Otherwise, just return this struct.
+      struct
+    }
+  }
+
   /**
    * Remove top-level ArrayType wrappers and merge the remaining schemas
    */
-  private def compatibleRootType: (DataType, DataType) => DataType = {
-    case (ArrayType(ty1, _), ty2) => compatibleRootType(ty1, ty2)
-    case (ty1, ArrayType(ty2, _)) => compatibleRootType(ty1, ty2)
+  private def compatibleRootType(
+      columnNameOfCorruptRecords: String): (DataType, DataType) => DataType = {
+    // Since we support array of json objects at the top level,
+    // we need to check the element type and find the root level data type.
+    case (ArrayType(ty1, _), ty2) => compatibleRootType(columnNameOfCorruptRecords)(ty1, ty2)
+    case (ty1, ArrayType(ty2, _)) => compatibleRootType(columnNameOfCorruptRecords)(ty1, ty2)
+    // If we see any other data type at the root level, we get records that cannot be
+    // parsed. So, we use the struct as the data type and add the corrupt field to the schema.
+    case (struct: StructType, NullType) => struct
+    case (NullType, struct: StructType) => struct
+    case (struct: StructType, o) if !o.isInstanceOf[StructType] =>
+      withCorruptField(struct, columnNameOfCorruptRecords)
+    case (o, struct: StructType) if !o.isInstanceOf[StructType] =>
+      withCorruptField(struct, columnNameOfCorruptRecords)
+    // If we get anything else, we call compatibleType.
+    // Usually, when we reach here, ty1 and ty2 are two StructTypes.
     case (ty1, ty2) => compatibleType(ty1, ty2)
   }
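
To make the new merge rule concrete, here is a small self-contained sketch, not the Spark source, that mirrors the behaviour added above: when a StructType meets a non-struct, non-null root type (for example the LongType inferred from a bare 42 record), the corrupt-record column is appended. The object and method names below are illustrative, and StringType stands in for the full compatibleType fallback.

    import org.apache.spark.sql.types._

    // Minimal sketch of the root-type merge rule; not the Spark implementation.
    object RootTypeMergeSketch {
      private def withCorruptField(struct: StructType, name: String): StructType =
        if (struct.fieldNames.contains(name)) struct
        else struct.add(name, StringType, nullable = true)

      def mergeRoots(corruptColumn: String)(t1: DataType, t2: DataType): DataType =
        (t1, t2) match {
          case (struct: StructType, NullType) => struct
          case (NullType, struct: StructType) => struct
          case (struct: StructType, o) if !o.isInstanceOf[StructType] =>
            withCorruptField(struct, corruptColumn)
          case (o, struct: StructType) if !o.isInstanceOf[StructType] =>
            withCorruptField(struct, corruptColumn)
          case _ => StringType // placeholder for the full compatibleType(t1, t2) merge
        }

      def main(args: Array[String]): Unit = {
        val record = StructType(StructField("dummy", StringType) :: Nil)
        // Merging with the root type of a bare literal such as 42 (LongType)
        // yields a struct that also carries the corrupt-record column.
        println(mergeRoots("_corrupt_record")(record, LongType))
        // prints something like:
        // StructType(StructField(dummy,StringType,true), StructField(_corrupt_record,StringType,true))
      }
    }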

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala

Lines changed: 13 additions & 4 deletions
@@ -31,6 +31,8 @@ import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
 
+private[json] class SparkSQLJsonProcessingException(msg: String) extends Exception(msg)
+
 object JacksonParser {
 
   def parse(
@@ -110,7 +112,7 @@ object JacksonParser {
         lowerCaseValue.equals("-inf")) {
         value.toFloat
       } else {
-        sys.error(s"Cannot parse $value as FloatType.")
+        throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.")
       }
 
     case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, DoubleType) =>
@@ -127,7 +129,7 @@ object JacksonParser {
         lowerCaseValue.equals("-inf")) {
         value.toDouble
       } else {
-        sys.error(s"Cannot parse $value as DoubleType.")
+        throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
       }
 
     case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, dt: DecimalType) =>
@@ -174,7 +176,11 @@ object JacksonParser {
       convertField(factory, parser, udt.sqlType)
 
     case (token, dataType) =>
-      sys.error(s"Failed to parse a value for data type $dataType (current token: $token).")
+      // We cannot parse this token based on the given data type. So, we throw a
+      // SparkSQLJsonProcessingException and this exception will be caught by
+      // parseJson method.
+      throw new SparkSQLJsonProcessingException(
+        s"Failed to parse a value for data type $dataType (current token: $token).")
   }
 }
 
@@ -266,12 +272,15 @@ object JacksonParser {
             } else {
               array.toArray[InternalRow](schema)
             }
-          case _ => failedRecord(record)
+          case _ =>
+            failedRecord(record)
         }
       }
     } catch {
       case _: JsonProcessingException =>
         failedRecord(record)
+      case _: SparkSQLJsonProcessingException =>
+        failedRecord(record)
     }
   }
 }
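
The reason for replacing sys.error: it throws a bare RuntimeException, which the existing catch block (limited to Jackson's JsonProcessingException) would not intercept, so an unparseable value failed the task instead of landing in the corrupt-record column. Below is a rough standalone sketch of that catch-and-route shape, with purely illustrative names; it is not the Spark code itself.

    // Standalone illustration of the pattern: a dedicated exception type marks
    // "this record is malformed", and the caller converts it into a failed/corrupt
    // record instead of letting the whole job fail.
    class SparkSQLJsonProcessingException(msg: String) extends Exception(msg)

    object CorruptRecordRoutingSketch {
      // Stand-in for convertField: only integer records are considered well-formed.
      def convert(record: String): Long =
        try record.trim.toLong
        catch {
          case _: NumberFormatException =>
            throw new SparkSQLJsonProcessingException(s"Failed to parse a value: $record")
        }

      // Stand-in for parseJson's per-record handling: route failures to the left side.
      def parse(record: String): Either[String, Long] =
        try Right(convert(record))
        catch {
          case _: SparkSQLJsonProcessingException => Left(record)
        }

      def main(args: Array[String]): Unit = {
        println(parse("42"))            // Right(42)
        println(parse("not a number"))  // Left(not a number)
      }
    }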

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 24 additions & 14 deletions
@@ -1435,21 +1435,31 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
       val schema = StructType(
         StructField("_unparsed", StringType, true) ::
         StructField("dummy", StringType, true) :: Nil)
-      val jsonDF = sqlContext.read.schema(schema).json(additionalCorruptRecords)
-      jsonDF.registerTempTable("jsonTable")
-
-      // In HiveContext, backticks should be used to access columns starting with a underscore.
-      checkAnswer(
-        sql(
-          """
-            |SELECT dummy, _unparsed
-            |FROM jsonTable
-          """.stripMargin),
-        Row("test", null) ::
-          Row(null, """42""") ::
-          Row(null, """ ","ian":"test"}""") :: Nil
-      )
 
+      {
+        // We need to make sure we can infer the schema.
+        val jsonDF = sqlContext.read.json(additionalCorruptRecords)
+        assert(jsonDF.schema === schema)
+      }
+
+      {
+        val jsonDF = sqlContext.read.schema(schema).json(additionalCorruptRecords)
+        jsonDF.registerTempTable("jsonTable")
+
+        // In HiveContext, backticks should be used to access columns starting with a underscore.
+        checkAnswer(
+          sql(
+            """
+              |SELECT dummy, _unparsed
+              |FROM jsonTable
+            """.stripMargin),
+          Row("test", null) ::
+            Row(null, """[1,2,3]""") ::
+            Row(null, """":"test", "a":1}""") ::
+            Row(null, """42""") ::
+            Row(null, """ ","ian":"test"}""") :: Nil
+        )
+      }
     }
   }

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala

Lines changed: 2 additions & 1 deletion
@@ -191,6 +191,8 @@ private[json] trait TestJsonData {
   def additionalCorruptRecords: RDD[String] =
     sqlContext.sparkContext.parallelize(
       """{"dummy":"test"}""" ::
+      """[1,2,3]""" ::
+      """":"test", "a":1}""" ::
       """42""" ::
       """ ","ian":"test"}""" :: Nil)
 
@@ -203,7 +205,6 @@ private[json] trait TestJsonData {
       """{"b": [{"c": {}}]}""" ::
       """]""" :: Nil)
 
-
   lazy val singleRow: RDD[String] = sqlContext.sparkContext.parallelize("""{"a":123}""" :: Nil)
 
   def empty: RDD[String] = sqlContext.sparkContext.parallelize(Seq[String]())
