@@ -25,9 +25,10 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.parquet.column.Dictionary
 import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
-import org.apache.parquet.schema.OriginalType.LIST
+import org.apache.parquet.schema.OriginalType.{LIST, INT_32, UTF8}
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE
 import org.apache.parquet.schema.Type.Repetition
-import org.apache.parquet.schema.{GroupType, PrimitiveType, Type}
+import org.apache.parquet.schema.{GroupType, MessageType, PrimitiveType, Type}
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -88,12 +89,54 @@ private[parquet] class CatalystPrimitiveConverter(val updater: ParentContainerUp
 }
 
 /**
- * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[InternalRow]]s.
- * Since any Parquet record is also a struct, this converter can also be used as root converter.
+ * A [[CatalystRowConverter]] is used to convert Parquet records into Catalyst [[InternalRow]]s.
+ * Since a Catalyst `StructType` also maps to a Parquet record, this converter can be used as the
+ * root converter. Take the following Parquet type as an example:
+ * {{{
+ *   message root {
+ *     required int32 f1;
+ *     optional group f2 {
+ *       required double f21;
+ *       optional binary f22 (utf8);
+ *     }
+ *   }
+ * }}}
+ * 5 converters will be created:
+ *
+ * - a root [[CatalystRowConverter]] for [[MessageType]] `root`, which contains:
+ *   - a [[CatalystPrimitiveConverter]] for required [[INT_32]] field `f1`, and
+ *   - a nested [[CatalystRowConverter]] for optional [[GroupType]] `f2`, which contains:
+ *     - a [[CatalystPrimitiveConverter]] for required [[DOUBLE]] field `f21`, and
+ *     - a [[CatalystStringConverter]] for optional [[UTF8]] string field `f22`
  *
  * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have
  * any "parent" container.
  *
+ * @note Constructor argument [[parquetType]] refers to requested fields of the actual schema of the
+ *       Parquet file being read, while constructor argument [[catalystType]] refers to requested
+ *       fields of the global schema. The key difference is that, in case of schema merging,
+ *       [[parquetType]] can be a subset of [[catalystType]]. For example, it's possible to have
+ *       the following [[catalystType]]:
+ *       {{{
+ *       new StructType()
+ *         .add("f1", IntegerType, nullable = false)
+ *         .add("f2", StringType, nullable = true)
+ *         .add("f3", new StructType()
+ *           .add("f31", DoubleType, nullable = false)
+ *           .add("f32", IntegerType, nullable = true)
+ *           .add("f33", StringType, nullable = true), nullable = false)
+ *       }}}
+ *       and the following [[parquetType]] (`f2` and `f32` are missing):
+ *       {{{
+ *       message root {
+ *         required int32 f1;
+ *         required group f3 {
+ *           required double f31;
+ *           optional binary f33 (utf8);
+ *         }
+ *       }
+ *       }}}
+ *
  * @param parquetType Parquet schema of Parquet records
  * @param catalystType Spark SQL schema that corresponds to the Parquet record type
  * @param updater An updater which propagates converted field values to the parent container
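
Side note on the `@note` above: the subset relationship between `parquetType` and `catalystType` can be reproduced with the public `org.apache.spark.sql.types` API alone. The standalone sketch below is illustrative only and not part of this commit; `SchemaSubsetSketch` and `parquetFieldNames` are made-up names, and only top-level fields are inspected, since nested fields such as `f32` are padded recursively by nested row converters.

// Hypothetical standalone sketch: find which top-level fields of the global
// (merged) Catalyst schema are missing from one particular Parquet file.
import org.apache.spark.sql.types._

object SchemaSubsetSketch extends App {
  // The global Catalyst schema from the doc comment above.
  val catalystType = new StructType()
    .add("f1", IntegerType, nullable = false)
    .add("f2", StringType, nullable = true)
    .add("f3", new StructType()
      .add("f31", DoubleType, nullable = false)
      .add("f32", IntegerType, nullable = true)
      .add("f33", StringType, nullable = true), nullable = false)

  // Top-level fields physically present in the Parquet file being read.
  val parquetFieldNames = Set("f1", "f3")

  // StructType is a Seq[StructField], so standard collection ops apply.
  val missingFields = catalystType.filterNot(f => parquetFieldNames.contains(f.name))
  println(missingFields.map(_.name).mkString(", "))  // prints: f2
}
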
@@ -126,7 +169,24 @@ private[parquet] class CatalystRowConverter(
 
   // Converters for each field.
   private val fieldConverters: Array[Converter with HasParentContainerUpdater] = {
-    parquetType.getFields.zip(catalystType).zipWithIndex.map {
+    // In case of schema merging, `parquetType` can be a subset of `catalystType`. We need to pad
+    // those missing fields and create converters for them, although values of these fields are
+    // always null.
+    val paddedParquetFields = {
+      val parquetFields = parquetType.getFields
+      val parquetFieldNames = parquetFields.map(_.getName).toSet
+      val missingFields = catalystType.filterNot(f => parquetFieldNames.contains(f.name))
+
+      // We don't need to worry about feature flag arguments like `assumeBinaryIsString` when
+      // creating the schema converter here, since values of missing fields are always null.
+      val toParquet = new CatalystSchemaConverter()
+
+      (parquetFields ++ missingFields.map(toParquet.convertField)).sortBy { f =>
+        catalystType.indexWhere(_.name == f.getName)
+      }
+    }
+
+    paddedParquetFields.zip(catalystType).zipWithIndex.map {
       case ((parquetFieldType, catalystField), ordinal) =>
         // Converted field value should be set to the `ordinal`-th cell of `currentRow`
         newConverter(parquetFieldType, catalystField.dataType, new RowUpdater(currentRow, ordinal))
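
To see what the pad-and-reorder step in this hunk computes, here is a hypothetical, self-contained sketch using plain field names in place of Parquet `Type`s; `PaddingSketch` and all values are made up for illustration.

// Hypothetical sketch of the padding logic: append the missing field names,
// then sort everything back into the global (Catalyst) field order, mirroring
// `(parquetFields ++ missingFields.map(...)).sortBy { ... indexWhere ... }`.
object PaddingSketch extends App {
  val catalystFieldNames = Seq("f1", "f2", "f3")  // global schema order
  val parquetFieldNames  = Seq("f3", "f1")        // fields present in one file

  val missing = catalystFieldNames.filterNot(parquetFieldNames.contains)
  val padded  = (parquetFieldNames ++ missing).sortBy(n => catalystFieldNames.indexOf(n))

  println(padded)  // List(f1, f2, f3): f2 is padded in, global order restored
}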