@@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.parquet
 
 import java.io.File
 
+import org.apache.parquet.column.{Encoding, ParquetProperties}
 import org.apache.hadoop.fs.Path
 import org.apache.parquet.format.converter.ParquetMetadataConverter
 import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}
@@ -31,6 +32,7 @@ import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf}
 import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.types.DecimalType.{ByteDecimal, IntDecimal, LongDecimal, ShortDecimal}
 
 class ParquetTypeWideningSuite
     extends QueryTest
@@ -121,6 +123,19 @@ class ParquetTypeWideningSuite
     if (dictionaryEnabled && !DecimalType.isByteArrayDecimalType(dataType)) {
       assertAllParquetFilesDictionaryEncoded(dir)
     }
+
+    // Check which encoding was used when writing Parquet V2 files.
+    val isParquetV2 = spark.conf.getOption(ParquetOutputFormat.WRITER_VERSION)
+      .contains(ParquetProperties.WriterVersion.PARQUET_2_0.toString)
+    if (isParquetV2) {
+      if (dictionaryEnabled) {
+        assertParquetV2Encoding(dir, Encoding.PLAIN)
+      } else if (DecimalType.is64BitDecimalType(dataType)) {
+        assertParquetV2Encoding(dir, Encoding.DELTA_BINARY_PACKED)
+      } else if (DecimalType.isByteArrayDecimalType(dataType)) {
+        assertParquetV2Encoding(dir, Encoding.DELTA_BYTE_ARRAY)
+      }
+    }
     df
   }
 
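The `isParquetV2` branch above only fires when the session is configured to write Parquet V2 files. A minimal sketch of how a test would force that, assuming the suite's standard `withSQLConf` helper, the `spark` session and a target directory `dir` from the surrounding code (illustrative only, not part of the change):

```scala
// Sketch: force the Parquet V2 writer so the encoding assertions above are exercised.
// ParquetOutputFormat.WRITER_VERSION and WriterVersion.PARQUET_2_0 are the same
// constants the isParquetV2 check compares against.
withSQLConf(
  ParquetOutputFormat.WRITER_VERSION ->
    ParquetProperties.WriterVersion.PARQUET_2_0.toString) {
  spark.range(10).toDF("a").write.parquet(dir.getAbsolutePath)
}
```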
@@ -145,6 +160,27 @@ class ParquetTypeWideningSuite
     }
   }
 
+  /**
+   * Asserts that all parquet files in the given directory have all their columns encoded with the
+   * given encoding.
+   */
+  private def assertParquetV2Encoding(dir: File, expected_encoding: Encoding): Unit = {
+    dir.listFiles(_.getName.endsWith(".parquet")).foreach { file =>
+      val parquetMetadata = ParquetFileReader.readFooter(
+        spark.sessionState.newHadoopConf(),
+        new Path(dir.toString, file.getName),
+        ParquetMetadataConverter.NO_FILTER)
+      parquetMetadata.getBlocks.forEach { block =>
+        block.getColumns.forEach { col =>
+          assert(
+            col.getEncodings.contains(expected_encoding),
+            s"Expected column '${col.getPath.toDotString}' to use encoding $expected_encoding " +
+              s"but found ${col.getEncodings}.")
+        }
+      }
+    }
+  }
+
   for {
     (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
       (Seq("1", "2", Short.MinValue.toString), ShortType, IntegerType),
@@ -157,24 +193,77 @@ class ParquetTypeWideningSuite
       (Seq("2020-01-01", "2020-01-02", "1312-02-27"), DateType, TimestampNTZType)
     )
   }
-  test(s"parquet widening conversion $fromType -> $toType") {
-    checkAllParquetReaders(values, fromType, toType, expectError = false)
-  }
+  test(s"parquet widening conversion $fromType -> $toType") {
+    checkAllParquetReaders(values, fromType, toType, expectError = false)
+  }
+
+  for {
+    (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
+      (Seq("1", Byte.MaxValue.toString), ByteType, IntDecimal),
+      (Seq("1", Byte.MaxValue.toString), ByteType, LongDecimal),
+      (Seq("1", Short.MaxValue.toString), ShortType, IntDecimal),
+      (Seq("1", Short.MaxValue.toString), ShortType, LongDecimal),
+      (Seq("1", Short.MaxValue.toString), ShortType, DecimalType(DecimalType.MAX_PRECISION, 0)),
+      (Seq("1", Int.MaxValue.toString), IntegerType, IntDecimal),
+      (Seq("1", Int.MaxValue.toString), IntegerType, LongDecimal),
+      (Seq("1", Int.MaxValue.toString), IntegerType, DecimalType(DecimalType.MAX_PRECISION, 0)),
+      (Seq("1", Long.MaxValue.toString), LongType, LongDecimal),
+      (Seq("1", Long.MaxValue.toString), LongType, DecimalType(DecimalType.MAX_PRECISION, 0)),
+      (Seq("1", Byte.MaxValue.toString), ByteType, DecimalType(IntDecimal.precision + 1, 1)),
+      (Seq("1", Short.MaxValue.toString), ShortType, DecimalType(IntDecimal.precision + 1, 1)),
+      (Seq("1", Int.MaxValue.toString), IntegerType, DecimalType(IntDecimal.precision + 1, 1)),
+      (Seq("1", Long.MaxValue.toString), LongType, DecimalType(LongDecimal.precision + 1, 1))
+    )
+  }
+  test(s"parquet widening conversion $fromType -> $toType") {
+    checkAllParquetReaders(values, fromType, toType, expectError = false)
+  }
 
   for {
     (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
       (Seq("1", "2", Int.MinValue.toString), LongType, IntegerType),
       (Seq("1.23", "10.34"), DoubleType, FloatType),
       (Seq("1.23", "10.34"), FloatType, LongType),
+      (Seq("1", "10"), LongType, DoubleType),
       (Seq("1", "10"), LongType, DateType),
       (Seq("1", "10"), IntegerType, TimestampType),
       (Seq("1", "10"), IntegerType, TimestampNTZType),
       (Seq("2020-01-01", "2020-01-02", "1312-02-27"), DateType, TimestampType)
     )
   }
-  test(s"unsupported parquet conversion $fromType -> $toType") {
-    checkAllParquetReaders(values, fromType, toType, expectError = true)
-  }
+  test(s"unsupported parquet conversion $fromType -> $toType") {
+    checkAllParquetReaders(values, fromType, toType, expectError = true)
+  }
+
+  for {
+    (values: Seq[String], fromType: DataType, toType: DecimalType) <- Seq(
+      // Parquet stores byte, short and int values as INT32, which then requires using a decimal
+      // that can hold at least 4-byte integers.
+      (Seq("1", "2"), ByteType, DecimalType(1, 0)),
+      (Seq("1", "2"), ByteType, ByteDecimal),
+      (Seq("1", "2"), ShortType, ByteDecimal),
+      (Seq("1", "2"), ShortType, ShortDecimal),
+      (Seq("1", "2"), IntegerType, ShortDecimal),
+      (Seq("1", "2"), ByteType, DecimalType(ByteDecimal.precision + 1, 1)),
+      (Seq("1", "2"), ShortType, DecimalType(ShortDecimal.precision + 1, 1)),
+      (Seq("1", "2"), LongType, IntDecimal),
+      (Seq("1", "2"), ByteType, DecimalType(ByteDecimal.precision - 1, 0)),
+      (Seq("1", "2"), ShortType, DecimalType(ShortDecimal.precision - 1, 0)),
+      (Seq("1", "2"), IntegerType, DecimalType(IntDecimal.precision - 1, 0)),
+      (Seq("1", "2"), LongType, DecimalType(LongDecimal.precision - 1, 0)),
+      (Seq("1", "2"), ByteType, DecimalType(ByteDecimal.precision, 1)),
+      (Seq("1", "2"), ShortType, DecimalType(ShortDecimal.precision, 1)),
+      (Seq("1", "2"), IntegerType, DecimalType(IntDecimal.precision, 1)),
+      (Seq("1", "2"), LongType, DecimalType(LongDecimal.precision, 1))
+    )
+  }
+  test(s"unsupported parquet conversion $fromType -> $toType") {
+    checkAllParquetReaders(values, fromType, toType,
+      expectError =
+        // parquet-mr allows reading decimals into a smaller precision decimal type without
+        // checking for overflows. See test below checking for the overflow case in parquet-mr.
+        spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean)
+  }
 
   for {
     (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
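The comments in this hunk compress two behaviours that drive `expectError`: byte/short/int columns are physically stored as Parquet INT32, so the target decimal must be able to hold any 4-byte integer, and parquet-mr (unlike the vectorized reader) reads into a narrower decimal precision without an overflow check. A rough, hypothetical illustration of the parquet-mr side, assuming the suite's `withTempDir`/`withSQLConf` helpers:

```scala
// Illustrative sketch only. A ShortType column (stored as INT32) is read back with
// ByteDecimal, i.e. DECIMAL(3, 0), which cannot hold 32767. With the vectorized
// reader this conversion is expected to fail; with parquet-mr (vectorized reader
// disabled) the expectError logic above expects no error to be raised.
withTempDir { dir =>
  spark.range(1).selectExpr("CAST(32767 AS SHORT) AS a")
    .write.parquet(dir.getAbsolutePath)
  withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
    spark.read.schema("a DECIMAL(3, 0)").parquet(dir.getAbsolutePath).collect()
  }
}
```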
@@ -201,17 +290,17 @@ class ParquetTypeWideningSuite
       Seq(5 -> 7, 5 -> 10, 5 -> 20, 10 -> 12, 10 -> 20, 20 -> 22) ++
         Seq(7 -> 5, 10 -> 5, 20 -> 5, 12 -> 10, 20 -> 10, 22 -> 20)
   }
-  test(
-    s"parquet decimal precision change Decimal($fromPrecision, 2) -> Decimal($toPrecision, 2)") {
-    checkAllParquetReaders(
-      values = Seq("1.23", "10.34"),
-      fromType = DecimalType(fromPrecision, 2),
-      toType = DecimalType(toPrecision, 2),
-      expectError = fromPrecision > toPrecision &&
-        // parquet-mr allows reading decimals into a smaller precision decimal type without
-        // checking for overflows. See test below checking for the overflow case in parquet-mr.
-        spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean)
-  }
+  test(
+    s"parquet decimal precision change Decimal($fromPrecision, 2) -> Decimal($toPrecision, 2)") {
+    checkAllParquetReaders(
+      values = Seq("1.23", "10.34"),
+      fromType = DecimalType(fromPrecision, 2),
+      toType = DecimalType(toPrecision, 2),
+      expectError = fromPrecision > toPrecision &&
+        // parquet-mr allows reading decimals into a smaller precision decimal type without
+        // checking for overflows. See test below checking for the overflow case in parquet-mr.
+        spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean)
+  }
 
   for {
     ((fromPrecision, fromScale), (toPrecision, toScale)) <-
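`checkAllParquetReaders` itself is defined outside this diff; from the call sites here, the pattern it exercises is writing the values with `fromType` and re-reading the same files with the `toType` schema under each reader configuration. A minimal sketch of that round trip under stated assumptions (the suite's `withTempDir`/`withSQLConf` helpers, with INT widened to BIGINT as a stand-in for the conversions above):

```scala
// Hypothetical sketch of the write-narrow / read-wide round trip these tests build on.
withTempDir { dir =>
  // Write with the narrower type (INT)...
  spark.range(3).selectExpr("CAST(id AS INT) AS a").write.parquet(dir.getAbsolutePath)
  Seq("true", "false").foreach { vectorized =>
    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) {
      // ...and read back with the wider type (BIGINT) under both Parquet readers.
      val values = spark.read.schema("a BIGINT").parquet(dir.getAbsolutePath)
        .collect().map(_.getLong(0)).sorted
      assert(values.sameElements(Array(0L, 1L, 2L)))
    }
  }
}
```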