Skip to content

Commit a3fef2c

Browse files
yhuai authored and liancheng committed
[SPARK-6052][SQL] In JSON schema inference, we should always set containsNull of an ArrayType to true
Always set `containsNull = true` when inferring the schema of JSON datasets. If we set `containsNull` based on the records we scanned, we may miss arrays with null values when we do sampling. Also, because future data can have arrays with null values, if we convert JSON data to Parquet, always setting `containsNull = true` is a more robust way to go. JIRA: https://issues.apache.org/jira/browse/SPARK-6052 Author: Yin Huai <[email protected]> Closes #4806 from yhuai/jsonArrayContainsNull and squashes the following commits: 05eab9d [Yin Huai] Change containsNull to true. (cherry picked from commit 3efd8bb) Signed-off-by: Cheng Lian <[email protected]>
1 parent c59871c commit a3fef2c

File tree

2 files changed

+23
-24
lines changed

2 files changed

+23
-24
lines changed

sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -199,13 +199,12 @@ private[sql] object JsonRDD extends Logging {
199199
* type conflicts.
200200
*/
201201
private def typeOfArray(l: Seq[Any]): ArrayType = {
202-
val containsNull = l.exists(v => v == null)
203202
val elements = l.flatMap(v => Option(v))
204203
if (elements.isEmpty) {
205204
// If this JSON array is empty, we use NullType as a placeholder.
206205
// If this array is not empty in other JSON objects, we can resolve
207206
// the type after we have passed through all JSON objects.
208-
ArrayType(NullType, containsNull)
207+
ArrayType(NullType, containsNull = true)
209208
} else {
210209
val elementType = elements.map {
211210
e => e match {
@@ -217,7 +216,7 @@ private[sql] object JsonRDD extends Logging {
217216
}
218217
}.reduce((type1: DataType, type2: DataType) => compatibleType(type1, type2))
219218

220-
ArrayType(elementType, containsNull)
219+
ArrayType(elementType, containsNull = true)
221220
}
222221
}
223222

@@ -245,15 +244,15 @@ private[sql] object JsonRDD extends Logging {
245244
// The value associated with the key is an array.
246245
// Handle inner structs of an array.
247246
def buildKeyPathForInnerStructs(v: Any, t: DataType): Seq[(String, DataType)] = t match {
248-
case ArrayType(e: StructType, containsNull) => {
247+
case ArrayType(e: StructType, _) => {
249248
// The elements of this arrays are structs.
250249
v.asInstanceOf[Seq[Map[String, Any]]].flatMap(Option(_)).flatMap {
251250
element => allKeysWithValueTypes(element)
252251
}.map {
253252
case (k, t) => (s"$key.$k", t)
254253
}
255254
}
256-
case ArrayType(t1, containsNull) =>
255+
case ArrayType(t1, _) =>
257256
v.asInstanceOf[Seq[Any]].flatMap(Option(_)).flatMap {
258257
element => buildKeyPathForInnerStructs(element, t1)
259258
}

sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -248,26 +248,26 @@ class JsonSuite extends QueryTest {
248248
val jsonDF = jsonRDD(complexFieldAndType1)
249249

250250
val expectedSchema = StructType(
251-
StructField("arrayOfArray1", ArrayType(ArrayType(StringType, false), false), true) ::
252-
StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, false), false), true) ::
253-
StructField("arrayOfBigInteger", ArrayType(DecimalType.Unlimited, false), true) ::
254-
StructField("arrayOfBoolean", ArrayType(BooleanType, false), true) ::
255-
StructField("arrayOfDouble", ArrayType(DoubleType, false), true) ::
256-
StructField("arrayOfInteger", ArrayType(LongType, false), true) ::
257-
StructField("arrayOfLong", ArrayType(LongType, false), true) ::
251+
StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) ::
252+
StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, true), true), true) ::
253+
StructField("arrayOfBigInteger", ArrayType(DecimalType.Unlimited, true), true) ::
254+
StructField("arrayOfBoolean", ArrayType(BooleanType, true), true) ::
255+
StructField("arrayOfDouble", ArrayType(DoubleType, true), true) ::
256+
StructField("arrayOfInteger", ArrayType(LongType, true), true) ::
257+
StructField("arrayOfLong", ArrayType(LongType, true), true) ::
258258
StructField("arrayOfNull", ArrayType(StringType, true), true) ::
259-
StructField("arrayOfString", ArrayType(StringType, false), true) ::
259+
StructField("arrayOfString", ArrayType(StringType, true), true) ::
260260
StructField("arrayOfStruct", ArrayType(
261261
StructType(
262262
StructField("field1", BooleanType, true) ::
263263
StructField("field2", StringType, true) ::
264-
StructField("field3", StringType, true) :: Nil), false), true) ::
264+
StructField("field3", StringType, true) :: Nil), true), true) ::
265265
StructField("struct", StructType(
266266
StructField("field1", BooleanType, true) ::
267267
StructField("field2", DecimalType.Unlimited, true) :: Nil), true) ::
268268
StructField("structWithArrayFields", StructType(
269-
StructField("field1", ArrayType(LongType, false), true) ::
270-
StructField("field2", ArrayType(StringType, false), true) :: Nil), true) :: Nil)
269+
StructField("field1", ArrayType(LongType, true), true) ::
270+
StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil)
271271

272272
assert(expectedSchema === jsonDF.schema)
273273

@@ -487,7 +487,7 @@ class JsonSuite extends QueryTest {
487487
val jsonDF = jsonRDD(complexFieldValueTypeConflict)
488488

489489
val expectedSchema = StructType(
490-
StructField("array", ArrayType(LongType, false), true) ::
490+
StructField("array", ArrayType(LongType, true), true) ::
491491
StructField("num_struct", StringType, true) ::
492492
StructField("str_array", StringType, true) ::
493493
StructField("struct", StructType(
@@ -513,8 +513,8 @@ class JsonSuite extends QueryTest {
513513
val expectedSchema = StructType(
514514
StructField("array1", ArrayType(StringType, true), true) ::
515515
StructField("array2", ArrayType(StructType(
516-
StructField("field", LongType, true) :: Nil), false), true) ::
517-
StructField("array3", ArrayType(StringType, false), true) :: Nil)
516+
StructField("field", LongType, true) :: Nil), true), true) ::
517+
StructField("array3", ArrayType(StringType, true), true) :: Nil)
518518

519519
assert(expectedSchema === jsonDF.schema)
520520

@@ -541,7 +541,7 @@ class JsonSuite extends QueryTest {
541541
val expectedSchema = StructType(
542542
StructField("a", BooleanType, true) ::
543543
StructField("b", LongType, true) ::
544-
StructField("c", ArrayType(LongType, false), true) ::
544+
StructField("c", ArrayType(LongType, true), true) ::
545545
StructField("d", StructType(
546546
StructField("field", BooleanType, true) :: Nil), true) ::
547547
StructField("e", StringType, true) :: Nil)
@@ -835,15 +835,15 @@ class JsonSuite extends QueryTest {
835835

836836
val schema = StructType(
837837
StructField("field1",
838-
ArrayType(ArrayType(ArrayType(ArrayType(StringType, false), false), true), false), true) ::
838+
ArrayType(ArrayType(ArrayType(ArrayType(StringType, true), true), true), true), true) ::
839839
StructField("field2",
840840
ArrayType(ArrayType(
841-
StructType(StructField("Test", LongType, true) :: Nil), false), true), true) ::
841+
StructType(StructField("Test", LongType, true) :: Nil), true), true), true) ::
842842
StructField("field3",
843843
ArrayType(ArrayType(
844-
StructType(StructField("Test", StringType, true) :: Nil), true), false), true) ::
844+
StructType(StructField("Test", StringType, true) :: Nil), true), true), true) ::
845845
StructField("field4",
846-
ArrayType(ArrayType(ArrayType(LongType, false), true), false), true) :: Nil)
846+
ArrayType(ArrayType(ArrayType(LongType, true), true), true), true) :: Nil)
847847

848848
assert(schema === jsonDF.schema)
849849

0 commit comments

Comments
 (0)