Skip to content

Commit 6f009a2

Browse files
committed
More tests and comments
1 parent b6f4526 commit 6f009a2

File tree

3 files changed

+76
-6
lines changed

3 files changed

+76
-6
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ private[parquet] object CatalystReadSupport {
102102

103103
val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata"
104104

105+
/**
106+
* Tailors `parquetSchema` according to `catalystSchema` by removing column paths that don't exist
107+
* in `catalystSchema`, and adding those that only exist in `catalystSchema`.
108+
*/
105109
def clipParquetSchema(parquetSchema: MessageType, catalystSchema: StructType): MessageType = {
106110
val clippedGroup = clipParquetType(parquetSchema.asGroupType(), catalystSchema).asGroupType()
107111
Types.buildMessage().addFields(clippedGroup.getFields.asScala: _*).named("root")
@@ -131,6 +135,8 @@ private[parquet] object CatalystReadSupport {
131135
}
132136

133137
private def clipParquetListType(parquetList: GroupType, elementType: DataType): Type = {
138+
assert(!isPrimitiveCatalystType(elementType))
139+
134140
// Unannotated repeated group, list element type is just the group itself. Clip it.
135141
if (parquetList.getOriginalType == null && parquetList.isRepetition(Repetition.REPEATED)) {
136142
clipParquetType(parquetList, elementType)
@@ -175,11 +181,13 @@ private[parquet] object CatalystReadSupport {
175181

176182
private def clipParquetMapType(
177183
parquetMap: GroupType, keyType: DataType, valueType: DataType): GroupType = {
184+
assert(!isPrimitiveCatalystType(valueType))
185+
178186
val repeatedGroup = parquetMap.getType(0).asGroupType()
179187
val parquetKeyType = repeatedGroup.getType(0)
180188
val parquetValueType = repeatedGroup.getType(1)
181189

182-
val clippedRepeatedGrouop =
190+
val clippedRepeatedGroup =
183191
Types
184192
.repeatedGroup()
185193
.as(repeatedGroup.getOriginalType)
@@ -190,7 +198,7 @@ private[parquet] object CatalystReadSupport {
190198
Types
191199
.buildGroup(parquetMap.getRepetition)
192200
.as(parquetMap.getOriginalType)
193-
.addField(clippedRepeatedGrouop)
201+
.addField(clippedRepeatedGroup)
194202
.named(parquetMap.getName)
195203
}
196204

@@ -206,6 +214,11 @@ private[parquet] object CatalystReadSupport {
206214
}
207215
}
208216

217+
// Here we can't use builder methods defined in `Types` to construct the `GroupType` and have to
218+
// resort to this deprecated constructor. The reason is that, `tailoredFields` can be empty,
219+
// and `Types` builder methods don't allow constructing empty group types. For example, query
220+
// `SELECT COUNT(1) FROM t` requests zero columns.
221+
// TODO Refactor method signature to return a list of fields instead of a `GroupType`
209222
new GroupType(
210223
parquetRecord.getRepetition,
211224
parquetRecord.getName,

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
230230
}
231231
}
232232

233-
test("SPARK-10301") {
233+
test("SPARK-10301 Clipping nested structs in requested schema") {
234234
withTempPath { dir =>
235235
val path = dir.getCanonicalPath
236236
val df = sqlContext
@@ -240,13 +240,70 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
240240

241241
df.write.mode("append").parquet(path)
242242

243+
val userDefinedSchema = new StructType()
244+
.add("s", new StructType().add("a", LongType, nullable = true), nullable = true)
245+
246+
checkAnswer(
247+
sqlContext.read.schema(userDefinedSchema).parquet(path),
248+
Row(Row(0)))
249+
}
250+
251+
withTempPath { dir =>
252+
val path = dir.getCanonicalPath
253+
254+
val df1 = sqlContext
255+
.range(1)
256+
.selectExpr("NAMED_STRUCT('a', id, 'b', id) AS s")
257+
.coalesce(1)
258+
259+
val df2 = sqlContext
260+
.range(1, 2)
261+
.selectExpr("NAMED_STRUCT('b', id, 'c', id) AS s")
262+
.coalesce(1)
263+
264+
df1.write.parquet(path)
265+
df2.write.mode(SaveMode.Append).parquet(path)
266+
267+
val userDefinedSchema = new StructType()
268+
.add("s",
269+
new StructType()
270+
.add("a", LongType, nullable = true)
271+
.add("c", LongType, nullable = true),
272+
nullable = true)
273+
274+
checkAnswer(
275+
sqlContext.read.schema(userDefinedSchema).parquet(path),
276+
Seq(
277+
Row(Row(0, null)),
278+
Row(Row(null, 1))))
279+
}
280+
281+
withTempPath { dir =>
282+
val path = dir.getCanonicalPath
283+
284+
val df = sqlContext
285+
.range(1)
286+
.selectExpr("NAMED_STRUCT('a', ARRAY(NAMED_STRUCT('b', id, 'c', id))) AS s")
287+
.coalesce(1)
288+
289+
df.write.parquet(path)
290+
243291
val userDefinedSchema = new StructType()
244292
.add("s",
245293
new StructType()
246-
.add("a", LongType, nullable = true),
294+
.add(
295+
"a",
296+
ArrayType(
297+
new StructType()
298+
.add("b", LongType, nullable = true)
299+
.add("d", StringType, nullable = true),
300+
containsNull = true),
301+
nullable = true),
247302
nullable = true)
248303

249-
checkAnswer(sqlContext.read.schema(userDefinedSchema).parquet(path), Row(Row(0)))
304+
checkAnswer(
305+
sqlContext.read.schema(userDefinedSchema).parquet(path),
306+
Row(Row(Seq(Row(0, null)))))
250307
}
251308
}
252309
}

sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
package org.apache.spark.sql.test
1919

20-
import org.apache.spark.sql.{Column, ColumnName, SQLContext}
20+
import org.apache.spark.sql.{ColumnName, SQLContext}
2121

2222

2323
/**

0 commit comments

Comments
 (0)