@@ -22,6 +22,8 @@ import java.io.Serializable
2222import org .apache .parquet .filter2 .predicate ._
2323import org .apache .parquet .filter2 .predicate .FilterApi ._
2424import org .apache .parquet .io .api .Binary
25+ import org .apache .parquet .schema .OriginalType
26+ import org .apache .parquet .schema .PrimitiveType .PrimitiveTypeName
2527
2628import org .apache .spark .sql .sources
2729import org .apache .spark .sql .types ._
@@ -51,15 +53,18 @@ private[sql] object ParquetFilters {
5153 case DoubleType =>
5254 (n : String , v : Any ) => FilterApi .eq(doubleColumn(n), v.asInstanceOf [java.lang.Double ])
5355
56+ // See https://issues.apache.org/jira/browse/SPARK-11153
57+ /*
5458 // Binary.fromString and Binary.fromByteArray don't accept null values
5559 case StringType =>
5660 (n: String, v: Any) => FilterApi.eq(
5761 binaryColumn(n),
58- Option (v).map(s => Binary .fromString (s.asInstanceOf [String ])).orNull)
62+ Option(v).map(s => Binary.fromByteArray (s.asInstanceOf[String].getBytes("utf-8") )).orNull)
5963 case BinaryType =>
6064 (n: String, v: Any) => FilterApi.eq(
6165 binaryColumn(n),
62- Option (v).map(b => Binary .fromReusedByteArray(v.asInstanceOf [Array [Byte ]])).orNull)
66+ Option(v).map(b => Binary.fromByteArray(v.asInstanceOf[Array[Byte]])).orNull)
67+ */
6368 }
6469
6570 private val makeNotEq : PartialFunction [DataType , (String , Any ) => FilterPredicate ] = {
@@ -74,14 +79,17 @@ private[sql] object ParquetFilters {
7479 case DoubleType =>
7580 (n : String , v : Any ) => FilterApi .notEq(doubleColumn(n), v.asInstanceOf [java.lang.Double ])
7681
82+ // See https://issues.apache.org/jira/browse/SPARK-11153
83+ /*
7784 case StringType =>
7885 (n: String, v: Any) => FilterApi.notEq(
7986 binaryColumn(n),
80- Option (v).map(s => Binary .fromString (s.asInstanceOf [String ])).orNull)
87+ Option(v).map(s => Binary.fromByteArray (s.asInstanceOf[String].getBytes("utf-8") )).orNull)
8188 case BinaryType =>
8289 (n: String, v: Any) => FilterApi.notEq(
8390 binaryColumn(n),
84- Option (v).map(b => Binary .fromReusedByteArray(v.asInstanceOf [Array [Byte ]])).orNull)
91+ Option(v).map(b => Binary.fromByteArray(v.asInstanceOf[Array[Byte]])).orNull)
92+ */
8593 }
8694
8795 private val makeLt : PartialFunction [DataType , (String , Any ) => FilterPredicate ] = {
@@ -94,13 +102,16 @@ private[sql] object ParquetFilters {
94102 case DoubleType =>
95103 (n : String , v : Any ) => FilterApi .lt(doubleColumn(n), v.asInstanceOf [java.lang.Double ])
96104
105+ // See https://issues.apache.org/jira/browse/SPARK-11153
106+ /*
97107 case StringType =>
98108 (n: String, v: Any) =>
99109 FilterApi.lt(binaryColumn(n),
100- Binary .fromString (v.asInstanceOf [String ]))
110+ Binary.fromByteArray (v.asInstanceOf[String].getBytes("utf-8") ))
101111 case BinaryType =>
102112 (n: String, v: Any) =>
103- FilterApi .lt(binaryColumn(n), Binary .fromReusedByteArray(v.asInstanceOf [Array [Byte ]]))
113+ FilterApi.lt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]]))
114+ */
104115 }
105116
106117 private val makeLtEq : PartialFunction [DataType , (String , Any ) => FilterPredicate ] = {
@@ -113,13 +124,16 @@ private[sql] object ParquetFilters {
113124 case DoubleType =>
114125 (n : String , v : Any ) => FilterApi .ltEq(doubleColumn(n), v.asInstanceOf [java.lang.Double ])
115126
127+ // See https://issues.apache.org/jira/browse/SPARK-11153
128+ /*
116129 case StringType =>
117130 (n: String, v: Any) =>
118131 FilterApi.ltEq(binaryColumn(n),
119- Binary .fromString (v.asInstanceOf [String ]))
132+ Binary.fromByteArray (v.asInstanceOf[String].getBytes("utf-8") ))
120133 case BinaryType =>
121134 (n: String, v: Any) =>
122- FilterApi .ltEq(binaryColumn(n), Binary .fromReusedByteArray(v.asInstanceOf [Array [Byte ]]))
135+ FilterApi.ltEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]]))
136+ */
123137 }
124138
125139 private val makeGt : PartialFunction [DataType , (String , Any ) => FilterPredicate ] = {
@@ -133,13 +147,15 @@ private[sql] object ParquetFilters {
133147 (n : String , v : Any ) => FilterApi .gt(doubleColumn(n), v.asInstanceOf [java.lang.Double ])
134148
135149 // See https://issues.apache.org/jira/browse/SPARK-11153
150+ /*
136151 case StringType =>
137152 (n: String, v: Any) =>
138153 FilterApi.gt(binaryColumn(n),
139- Binary .fromString (v.asInstanceOf [String ]))
154+ Binary.fromByteArray (v.asInstanceOf[String].getBytes("utf-8") ))
140155 case BinaryType =>
141156 (n: String, v: Any) =>
142- FilterApi .gt(binaryColumn(n), Binary .fromReusedByteArray(v.asInstanceOf [Array [Byte ]]))
157+ FilterApi.gt(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]]))
158+ */
143159 }
144160
145161 private val makeGtEq : PartialFunction [DataType , (String , Any ) => FilterPredicate ] = {
@@ -152,13 +168,16 @@ private[sql] object ParquetFilters {
152168 case DoubleType =>
153169 (n : String , v : Any ) => FilterApi .gtEq(doubleColumn(n), v.asInstanceOf [java.lang.Double ])
154170
171+ // See https://issues.apache.org/jira/browse/SPARK-11153
172+ /*
155173 case StringType =>
156174 (n: String, v: Any) =>
157175 FilterApi.gtEq(binaryColumn(n),
158- Binary .fromString (v.asInstanceOf [String ]))
176+ Binary.fromByteArray (v.asInstanceOf[String].getBytes("utf-8") ))
159177 case BinaryType =>
160178 (n: String, v: Any) =>
161- FilterApi .gtEq(binaryColumn(n), Binary .fromReusedByteArray(v.asInstanceOf [Array [Byte ]]))
179+ FilterApi.gtEq(binaryColumn(n), Binary.fromByteArray(v.asInstanceOf[Array[Byte]]))
180+ */
162181 }
163182
164183 private val makeInSet : PartialFunction [DataType , (String , Set [Any ]) => FilterPredicate ] = {
@@ -175,14 +194,17 @@ private[sql] object ParquetFilters {
175194 (n : String , v : Set [Any ]) =>
176195 FilterApi .userDefined(doubleColumn(n), SetInFilter (v.asInstanceOf [Set [java.lang.Double ]]))
177196
197+ // See https://issues.apache.org/jira/browse/SPARK-11153
198+ /*
178199 case StringType =>
179200 (n: String, v: Set[Any]) =>
180201 FilterApi.userDefined(binaryColumn(n),
181- SetInFilter (v.map(s => Binary .fromString (s.asInstanceOf [String ]))))
202+ SetInFilter(v.map(s => Binary.fromByteArray (s.asInstanceOf[String].getBytes("utf-8") ))))
182203 case BinaryType =>
183204 (n: String, v: Set[Any]) =>
184205 FilterApi.userDefined(binaryColumn(n),
185- SetInFilter (v.map(e => Binary .fromReusedByteArray(e.asInstanceOf [Array [Byte ]]))))
206+ SetInFilter(v.map(e => Binary.fromByteArray(e.asInstanceOf[Array[Byte]]))))
207+ */
186208 }
187209
188210 /**
@@ -206,6 +228,8 @@ private[sql] object ParquetFilters {
206228 def createFilter (schema : StructType , predicate : sources.Filter ): Option [FilterPredicate ] = {
207229 val dataTypeOf = getFieldMap(schema).toMap
208230
231+ relaxParquetValidTypeMap
232+
209233 // NOTE:
210234 //
211235 // For any comparison operator `cmp`, both `a cmp NULL` and `NULL cmp a` evaluate to `NULL`,
@@ -275,4 +299,35 @@ private[sql] object ParquetFilters {
275299 case _ => None
276300 }
277301 }
302+
303+ // !! HACK ALERT !!
304+ //
305+ // This lazy val is a workaround for PARQUET-201, and should be removed once we upgrade to
306+ // parquet-mr 1.8.1 or higher versions.
307+ //
308+ // In Parquet, not all types of columns can be used for filter push-down optimization. The set
309+ // of valid column types is controlled by `ValidTypeMap`. Unfortunately, in parquet-mr 1.7.0 and
310+ // prior versions, the limitation is too strict, and doesn't allow `BINARY (ENUM)` columns to be
311+ // pushed down.
312+ //
313+ // This restriction is problematic for Spark SQL, because Spark SQL doesn't have a type that maps
314+ // to Parquet original type `ENUM` directly, and always converts `ENUM` to `StringType`. Thus,
315+ // a predicate involving a `ENUM` field can be pushed-down as a string column, which is perfectly
316+ // legal except that it fails the `ValidTypeMap` check.
317+ //
318+ // Here we add `BINARY (ENUM)` into `ValidTypeMap` lazily via reflection to workaround this issue.
319+ private lazy val relaxParquetValidTypeMap : Unit = {
320+ val constructor = Class
321+ .forName(classOf [ValidTypeMap ].getCanonicalName + " $FullTypeDescriptor" )
322+ .getDeclaredConstructor(classOf [PrimitiveTypeName ], classOf [OriginalType ])
323+
324+ constructor.setAccessible(true )
325+ val enumTypeDescriptor = constructor
326+ .newInstance(PrimitiveTypeName .BINARY , OriginalType .ENUM )
327+ .asInstanceOf [AnyRef ]
328+
329+ val addMethod = classOf [ValidTypeMap ].getDeclaredMethods.find(_.getName == " add" ).get
330+ addMethod.setAccessible(true )
331+ addMethod.invoke(null , classOf [Binary ], enumTypeDescriptor)
332+ }
278333}
0 commit comments