@@ -43,10 +43,13 @@ private[parquet] object ParquetTypesConverter extends Logging {
   def isPrimitiveType(ctype: DataType): Boolean =
     classOf[PrimitiveType] isAssignableFrom ctype.getClass
 
-  def toPrimitiveDataType(parquetType: ParquetPrimitiveType): DataType =
+  def toPrimitiveDataType(
+      parquetType: ParquetPrimitiveType,
+      binaryAsString: Boolean): DataType =
     parquetType.getPrimitiveTypeName match {
       case ParquetPrimitiveTypeName.BINARY
-        if parquetType.getOriginalType == ParquetOriginalType.UTF8 => StringType
+        if (parquetType.getOriginalType == ParquetOriginalType.UTF8 ||
+          binaryAsString) => StringType
       case ParquetPrimitiveTypeName.BINARY => BinaryType
       case ParquetPrimitiveTypeName.BOOLEAN => BooleanType
       case ParquetPrimitiveTypeName.DOUBLE => DoubleType
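
The hunk above widens the condition under which a Parquet BINARY column surfaces as a Catalyst string: with the new binaryAsString parameter, BINARY maps to StringType either when the schema carries a UTF8 annotation or when the caller forces string interpretation; only un-annotated BINARY columns with the flag off remain BinaryType. A minimal, dependency-free sketch of just that decision, with illustrative stand-in names rather than Spark's actual converter:

    // Stand-ins for the Catalyst types named in the hunk above.
    sealed trait CatalystType
    case object StringType extends CatalystType
    case object BinaryType extends CatalystType

    // isUtf8Annotated stands in for `getOriginalType == ParquetOriginalType.UTF8`.
    def binaryColumnType(isUtf8Annotated: Boolean, binaryAsString: Boolean): CatalystType =
      if (isUtf8Annotated || binaryAsString) StringType else BinaryType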
@@ -85,7 +88,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
    * @param parquetType The type to convert.
    * @return The corresponding Catalyst type.
    */
-  def toDataType(parquetType: ParquetType): DataType = {
+  def toDataType(parquetType: ParquetType, isBinaryAsString: Boolean): DataType = {
     def correspondsToMap(groupType: ParquetGroupType): Boolean = {
       if (groupType.getFieldCount != 1 || groupType.getFields.apply(0).isPrimitive) {
         false
@@ -107,7 +110,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
     }
 
     if (parquetType.isPrimitive) {
-      toPrimitiveDataType(parquetType.asPrimitiveType)
+      toPrimitiveDataType(parquetType.asPrimitiveType, isBinaryAsString)
     } else {
       val groupType = parquetType.asGroupType()
       parquetType.getOriginalType match {
@@ -116,7 +119,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
         case ParquetOriginalType.LIST => { // TODO: check enums!
           assert(groupType.getFieldCount == 1)
           val field = groupType.getFields.apply(0)
-          ArrayType(toDataType(field), containsNull = false)
+          ArrayType(toDataType(field, isBinaryAsString), containsNull = false)
         }
         case ParquetOriginalType.MAP => {
           assert(
@@ -126,9 +129,9 @@ private[parquet] object ParquetTypesConverter extends Logging {
           assert(
             keyValueGroup.getFieldCount == 2,
             "Parquet Map type malformatted: nested group should have 2 (key, value) fields!")
-          val keyType = toDataType(keyValueGroup.getFields.apply(0))
+          val keyType = toDataType(keyValueGroup.getFields.apply(0), isBinaryAsString)
           assert(keyValueGroup.getFields.apply(0).getRepetition == Repetition.REQUIRED)
-          val valueType = toDataType(keyValueGroup.getFields.apply(1))
+          val valueType = toDataType(keyValueGroup.getFields.apply(1), isBinaryAsString)
           assert(keyValueGroup.getFields.apply(1).getRepetition == Repetition.REQUIRED)
           // TODO: set valueContainsNull explicitly instead of assuming valueContainsNull
           // is true here.
@@ -138,22 +141,22 @@ private[parquet] object ParquetTypesConverter extends Logging {
         // Note: the order of these checks is important!
         if (correspondsToMap(groupType)) { // MapType
           val keyValueGroup = groupType.getFields.apply(0).asGroupType()
-          val keyType = toDataType(keyValueGroup.getFields.apply(0))
+          val keyType = toDataType(keyValueGroup.getFields.apply(0), isBinaryAsString)
           assert(keyValueGroup.getFields.apply(0).getRepetition == Repetition.REQUIRED)
-          val valueType = toDataType(keyValueGroup.getFields.apply(1))
+          val valueType = toDataType(keyValueGroup.getFields.apply(1), isBinaryAsString)
           assert(keyValueGroup.getFields.apply(1).getRepetition == Repetition.REQUIRED)
           // TODO: set valueContainsNull explicitly instead of assuming valueContainsNull
           // is true here.
           MapType(keyType, valueType)
         } else if (correspondsToArray(groupType)) { // ArrayType
-          val elementType = toDataType(groupType.getFields.apply(0))
+          val elementType = toDataType(groupType.getFields.apply(0), isBinaryAsString)
           ArrayType(elementType, containsNull = false)
         } else { // everything else: StructType
           val fields = groupType
             .getFields
             .map(ptype => new StructField(
               ptype.getName,
-              toDataType(ptype),
+              toDataType(ptype, isBinaryAsString),
               ptype.getRepetition != Repetition.REQUIRED))
           StructType(fields)
         }
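
The pattern throughout these toDataType hunks is plumbing rather than new logic: isBinaryAsString rides along unchanged on every recursive call (list elements, map keys and values, struct fields), so a BINARY leaf anywhere in a nested schema honors the setting. A toy sketch of that threading, using a hypothetical miniature schema rather than Parquet's types:

    // Leaf(isBinary, isUtf8) models a primitive column; Group models nesting.
    sealed trait Schema
    case class Leaf(isBinary: Boolean, isUtf8: Boolean) extends Schema
    case class Group(children: List[Schema]) extends Schema

    def convert(schema: Schema, binaryAsString: Boolean): String = schema match {
      case Leaf(true, utf8) if utf8 || binaryAsString => "StringType"
      case Leaf(true, _) => "BinaryType"
      case Leaf(false, _) => "SomeOtherType"
      // The flag is passed through every recursive call, exactly as in the diff.
      case Group(children) =>
        children.map(convert(_, binaryAsString)).mkString("StructType(", ", ", ")")
    }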
@@ -276,15 +279,15 @@ private[parquet] object ParquetTypesConverter extends Logging {
     }
   }
 
-  def convertToAttributes(parquetSchema: ParquetType): Seq[Attribute] = {
+  def convertToAttributes(parquetSchema: ParquetType, isBinaryAsString: Boolean): Seq[Attribute] = {
     parquetSchema
       .asGroupType()
       .getFields
       .map(
         field =>
           new AttributeReference(
             field.getName,
-            toDataType(field),
+            toDataType(field, isBinaryAsString),
             field.getRepetition != Repetition.REQUIRED)())
   }
 
@@ -404,7 +407,10 @@ private[parquet] object ParquetTypesConverter extends Logging {
    * @param conf The Hadoop configuration to use.
    * @return A list of attributes that make up the schema.
    */
-  def readSchemaFromFile(origPath: Path, conf: Option[Configuration]): Seq[Attribute] = {
+  def readSchemaFromFile(
+      origPath: Path,
+      conf: Option[Configuration],
+      isBinaryAsString: Boolean): Seq[Attribute] = {
     val keyValueMetadata: java.util.Map[String, String] =
       readMetaData(origPath, conf)
         .getFileMetaData
@@ -413,7 +419,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
       convertFromString(keyValueMetadata.get(RowReadSupport.SPARK_METADATA_KEY))
     } else {
       val attributes = convertToAttributes(
-        readMetaData(origPath, conf).getFileMetaData.getSchema)
+        readMetaData(origPath, conf).getFileMetaData.getSchema, isBinaryAsString)
       log.info(s"Falling back to schema conversion from Parquet types; result: $attributes")
       attributes
     }
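
Callers of readSchemaFromFile and convertToAttributes now have to supply the flag explicitly; in user code the switch is exposed through the spark.sql.parquet.binaryAsString configuration key associated with this change. A caller-side sketch against the Spark 1.x API of this era (the file path is a placeholder, and the snippet is illustrative rather than part of the diff):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    val sc = new SparkContext(new SparkConf().setAppName("demo").setMaster("local"))
    val sqlContext = new SQLContext(sc)

    // Treat un-annotated Parquet BINARY columns as strings, e.g. for files
    // written by tools that never set the UTF8 annotation.
    sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
    val rows = sqlContext.parquetFile("/path/to/data.parquet")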