@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.parquet
2020import java .lang .{Boolean => JBoolean , Double => JDouble , Float => JFloat , Long => JLong }
2121import java .math .{BigDecimal => JBigDecimal }
2222import java .sql .{Date , Timestamp }
23+ import java .util .Locale
2324
2425import scala .collection .JavaConverters .asScalaBufferConverter
2526
@@ -31,7 +32,7 @@ import org.apache.parquet.schema.OriginalType._
3132import org .apache .parquet .schema .PrimitiveType .PrimitiveTypeName
3233import org .apache .parquet .schema .PrimitiveType .PrimitiveTypeName ._
3334
34- import org .apache .spark .sql .catalyst .util .DateTimeUtils
35+ import org .apache .spark .sql .catalyst .util .{ CaseInsensitiveMap , DateTimeUtils }
3536import org .apache .spark .sql .catalyst .util .DateTimeUtils .SQLDate
3637import org .apache .spark .sql .sources
3738import org .apache .spark .unsafe .types .UTF8String
@@ -350,25 +351,46 @@ private[parquet] class ParquetFilters(
350351 }
351352
352353 /**
353- * Returns a map from name of the column to the data type, if predicate push down applies.
354+ * Returns a pair (nameMap, typeMap) built from the primitive root fields of the given schema.
355+ * nameMap maps a field name to the name as written in the schema; typeMap maps it to its ParquetSchemaType.
354356 */
355- private def getFieldMap (dataType : MessageType ): Map [String , ParquetSchemaType ] = dataType match {
356- case m : MessageType =>
357- // Here we don't flatten the fields in the nested schema but just look up through
358- // root fields. Currently, accessing to nested fields does not push down filters
359- // and it does not support to create filters for them.
360- m.getFields.asScala.filter(_.isPrimitive).map(_.asPrimitiveType()).map { f =>
357+ private def getFieldMaps (dataType : MessageType , caseSensitive : Boolean )
358+ : (Map [String , String ], Map [String , ParquetSchemaType ]) = {
359+ // Only root-level primitive fields are looked up; fields of nested schemas are not
360+ // flattened. Currently, filters on nested fields are not pushed down, and creating
361+ // filters for them is not supported.
362+ val primitiveFields = dataType.getFields.asScala.filter(_.isPrimitive).map(_.asPrimitiveType())
363+ if (caseSensitive) {
364+ val nameMap = primitiveFields.map { f =>
365+ f.getName -> f.getName
366+ }.toMap
367+ val typeMap = primitiveFields.map { f =>
361368 f.getName -> ParquetSchemaType (
362369 f.getOriginalType, f.getPrimitiveTypeName, f.getTypeLength, f.getDecimalMetadata)
363370 }.toMap
364- case _ => Map .empty[String , ParquetSchemaType ]
371+ (nameMap, typeMap)
372+ } else {
373+ // Case-insensitive mode: fields are grouped by lower-cased name and only names matching exactly
374+ // one field are kept. Ambiguous names (more than one field matched) are dropped, so pushdown is
375+ // skipped for them; per the original note they trigger an exception when reading. See: SPARK-25132.
376+ val dedupFields = primitiveFields.map { f =>
377+ f.getName -> ParquetSchemaType (
378+ f.getOriginalType, f.getPrimitiveTypeName, f.getTypeLength, f.getDecimalMetadata)
379+ }.groupBy(_._1.toLowerCase(Locale .ROOT )).filter(_._2.size == 1 ).mapValues(_.head)
380+ val nameMap = CaseInsensitiveMap (dedupFields.mapValues(_._1))
381+ val typeMap = CaseInsensitiveMap (dedupFields.mapValues(_._2))
382+ (nameMap, typeMap)
383+ }
365384 }
366385
367386 /**
368387 * Converts data sources filters to Parquet filter predicates.
369388 */
370- def createFilter (schema : MessageType , predicate : sources.Filter ): Option [FilterPredicate ] = {
371- val nameToType = getFieldMap(schema)
389+ def createFilter (
390+ schema : MessageType ,
391+ predicate : sources.Filter ,
392+ caseSensitive : Boolean = true ): Option [FilterPredicate ] = {
393+ val (nameMap, typeMap) = getFieldMaps(schema, caseSensitive)
372394
373395 // For decimal types, the filter value's scale must match the scale in the file.
374396 // If it does not match, pushing the filter down would cause data corruption.
@@ -381,7 +403,7 @@ private[parquet] class ParquetFilters(
381403 // Parquet's type in the given file should be matched to the value's type
382404 // in the pushed filter in order to push down the filter to Parquet.
383405 def valueCanMakeFilterOn (name : String , value : Any ): Boolean = {
384- value == null || (nameToType (name) match {
406+ value == null || (typeMap (name) match {
385407 case ParquetBooleanType => value.isInstanceOf [JBoolean ]
386408 case ParquetByteType | ParquetShortType | ParquetIntegerType => value.isInstanceOf [Number ]
387409 case ParquetLongType => value.isInstanceOf [JLong ]
@@ -408,7 +430,7 @@ private[parquet] class ParquetFilters(
408430 // filters for the column having dots in the names. Thus, we do not push down such filters.
409431 // See SPARK-20364.
410432 def canMakeFilterOn (name : String , value : Any ): Boolean = {
411- nameToType .contains(name) && ! name.contains(" ." ) && valueCanMakeFilterOn(name, value)
433+ typeMap .contains(name) && ! name.contains(" ." ) && valueCanMakeFilterOn(name, value)
412434 }
413435
414436 // NOTE:
@@ -428,29 +450,29 @@ private[parquet] class ParquetFilters(
428450
429451 predicate match {
430452 case sources.IsNull (name) if canMakeFilterOn(name, null ) =>
431- makeEq.lift(nameToType (name)).map(_(name, null ))
453+ makeEq.lift(typeMap (name)).map(_(nameMap( name) , null ))
432454 case sources.IsNotNull (name) if canMakeFilterOn(name, null ) =>
433- makeNotEq.lift(nameToType (name)).map(_(name, null ))
455+ makeNotEq.lift(typeMap (name)).map(_(nameMap( name) , null ))
434456
435457 case sources.EqualTo (name, value) if canMakeFilterOn(name, value) =>
436- makeEq.lift(nameToType (name)).map(_(name, value))
458+ makeEq.lift(typeMap (name)).map(_(nameMap( name) , value))
437459 case sources.Not (sources.EqualTo (name, value)) if canMakeFilterOn(name, value) =>
438- makeNotEq.lift(nameToType (name)).map(_(name, value))
460+ makeNotEq.lift(typeMap (name)).map(_(nameMap( name) , value))
439461
440462 case sources.EqualNullSafe (name, value) if canMakeFilterOn(name, value) =>
441- makeEq.lift(nameToType (name)).map(_(name, value))
463+ makeEq.lift(typeMap (name)).map(_(nameMap( name) , value))
442464 case sources.Not (sources.EqualNullSafe (name, value)) if canMakeFilterOn(name, value) =>
443- makeNotEq.lift(nameToType (name)).map(_(name, value))
465+ makeNotEq.lift(typeMap (name)).map(_(nameMap( name) , value))
444466
445467 case sources.LessThan (name, value) if canMakeFilterOn(name, value) =>
446- makeLt.lift(nameToType (name)).map(_(name, value))
468+ makeLt.lift(typeMap (name)).map(_(nameMap( name) , value))
447469 case sources.LessThanOrEqual (name, value) if canMakeFilterOn(name, value) =>
448- makeLtEq.lift(nameToType (name)).map(_(name, value))
470+ makeLtEq.lift(typeMap (name)).map(_(nameMap( name) , value))
449471
450472 case sources.GreaterThan (name, value) if canMakeFilterOn(name, value) =>
451- makeGt.lift(nameToType (name)).map(_(name, value))
473+ makeGt.lift(typeMap (name)).map(_(nameMap( name) , value))
452474 case sources.GreaterThanOrEqual (name, value) if canMakeFilterOn(name, value) =>
453- makeGtEq.lift(nameToType (name)).map(_(name, value))
475+ makeGtEq.lift(typeMap (name)).map(_(nameMap( name) , value))
454476
455477 case sources.And (lhs, rhs) =>
456478 // At here, it is not safe to just convert one side if we do not understand the
@@ -477,7 +499,7 @@ private[parquet] class ParquetFilters(
477499 case sources.In (name, values) if canMakeFilterOn(name, values.head)
478500 && values.distinct.length <= pushDownInFilterThreshold =>
479501 values.distinct.flatMap { v =>
480- makeEq.lift(nameToType (name)).map(_(name, v))
502+ makeEq.lift(typeMap (name)).map(_(nameMap( name) , v))
481503 }.reduceLeftOption(FilterApi .or)
482504
483505 case sources.StringStartsWith (name, prefix)
0 commit comments