
Commit cac9b1d

rdblue authored and cloud-fan committed
[SPARK-23972][BUILD][SQL] Update Parquet to 1.10.0.
## What changes were proposed in this pull request?

This updates Parquet to 1.10.0 and updates the vectorized path for buffer management changes. Parquet 1.10.0 uses ByteBufferInputStream instead of byte arrays in encoders. This allows Parquet to break allocations into smaller chunks that are better for garbage collection.

## How was this patch tested?

Existing Parquet tests. Running in production at Netflix for about 3 months.

Author: Ryan Blue <[email protected]>

Closes #21070 from rdblue/SPARK-23972-update-parquet-to-1.10.0.
1 parent 7e73502 commit cac9b1d
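
The buffer-management change described above is easiest to see in how a page's bytes are handed to a reader. The sketch below is illustrative only and is not part of this commit; it assumes parquet-common 1.10.0 on the classpath, and the class name and payload are made up.

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;

// Hypothetical sketch, not Spark or Parquet source: contrasts the two buffer models.
public class PageBufferSketch {
  public static void main(String[] args) throws IOException {
    // A page assembled from several small chunks, standing in for a data page's bytes.
    BytesInput page = BytesInput.concat(
        BytesInput.from("repetition".getBytes(StandardCharsets.UTF_8)),
        BytesInput.from("definition".getBytes(StandardCharsets.UTF_8)),
        BytesInput.from("values".getBytes(StandardCharsets.UTF_8)));

    // Parquet 1.8.x consumers forced one contiguous copy of the whole page.
    byte[] contiguous = page.toByteArray();

    // Parquet 1.10.0 consumers read from a ByteBufferInputStream that can sit directly
    // on the smaller chunks, so no single large array has to be allocated.
    ByteBufferInputStream in = page.toInputStream();

    System.out.println(contiguous.length + " bytes copied vs " + in.available() + " streamable");
  }
}
```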

File tree

13 files changed: +241 / -198 lines


dev/deps/spark-deps-hadoop-2.6

Lines changed: 6 additions & 6 deletions
@@ -162,13 +162,13 @@ orc-mapreduce-1.4.3-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.8.2.jar
-parquet-common-1.8.2.jar
-parquet-encoding-1.8.2.jar
-parquet-format-2.3.1.jar
-parquet-hadoop-1.8.2.jar
+parquet-column-1.10.0.jar
+parquet-common-1.10.0.jar
+parquet-encoding-1.10.0.jar
+parquet-format-2.4.0.jar
+parquet-hadoop-1.10.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.2.jar
+parquet-jackson-1.10.0.jar
 protobuf-java-2.5.0.jar
 py4j-0.10.6.jar
 pyrolite-4.13.jar

dev/deps/spark-deps-hadoop-2.7

Lines changed: 6 additions & 6 deletions
@@ -163,13 +163,13 @@ orc-mapreduce-1.4.3-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.8.2.jar
-parquet-common-1.8.2.jar
-parquet-encoding-1.8.2.jar
-parquet-format-2.3.1.jar
-parquet-hadoop-1.8.2.jar
+parquet-column-1.10.0.jar
+parquet-common-1.10.0.jar
+parquet-encoding-1.10.0.jar
+parquet-format-2.4.0.jar
+parquet-hadoop-1.10.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.2.jar
+parquet-jackson-1.10.0.jar
 protobuf-java-2.5.0.jar
 py4j-0.10.6.jar
 pyrolite-4.13.jar

dev/deps/spark-deps-hadoop-3.1

Lines changed: 6 additions & 6 deletions
@@ -181,13 +181,13 @@ orc-mapreduce-1.4.3-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.8.2.jar
-parquet-common-1.8.2.jar
-parquet-encoding-1.8.2.jar
-parquet-format-2.3.1.jar
-parquet-hadoop-1.8.2.jar
+parquet-column-1.10.0.jar
+parquet-common-1.10.0.jar
+parquet-encoding-1.10.0.jar
+parquet-format-2.4.0.jar
+parquet-hadoop-1.10.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.2.jar
+parquet-jackson-1.10.0.jar
 protobuf-java-2.5.0.jar
 py4j-0.10.6.jar
 pyrolite-4.13.jar

docs/sql-programming-guide.md

Lines changed: 1 addition & 1 deletion
@@ -964,7 +964,7 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
     Sets the compression codec used when writing Parquet files. If either `compression` or
     `parquet.compression` is specified in the table-specific options/properties, the precedence would be
     `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`. Acceptable values include:
-    none, uncompressed, snappy, gzip, lzo.
+    none, uncompressed, snappy, gzip, lzo, brotli, lz4, zstd.
   </td>
 </tr>
 <tr>
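
The new codec names plug into the existing configuration. Below is a minimal sketch (not from this commit) of selecting zstd, assuming a local SparkSession and a made-up output path; depending on the environment, the underlying codec libraries (for example Hadoop's zstd support or a Brotli codec jar) may also need to be available at runtime.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Illustrative sketch only: writing Parquet with one of the newly accepted codecs.
public class ParquetZstdExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("parquet-zstd-example")
        .master("local[*]")
        .getOrCreate();

    // Session-wide default codec for Parquet writes.
    spark.conf().set("spark.sql.parquet.compression.codec", "zstd");

    Dataset<Row> df = spark.range(1000).toDF("id");

    // Per-write override via the `compression` option takes precedence.
    df.write().option("compression", "zstd").parquet("/tmp/example-zstd-parquet");

    spark.stop();
  }
}
```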

pom.xml

Lines changed: 7 additions & 1 deletion
@@ -129,7 +129,7 @@
     <!-- Version used for internal directory structure -->
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
-    <parquet.version>1.8.2</parquet.version>
+    <parquet.version>1.10.0</parquet.version>
     <orc.version>1.4.3</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
@@ -1778,6 +1778,12 @@
         <artifactId>parquet-hadoop</artifactId>
         <version>${parquet.version}</version>
         <scope>${parquet.deps.scope}</scope>
+        <exclusions>
+          <exclusion>
+            <groupId>commons-pool</groupId>
+            <artifactId>commons-pool</artifactId>
+          </exclusion>
+        </exclusions>
       </dependency>
       <dependency>
         <groupId>org.apache.parquet</groupId>

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ object SQLConf {
         "snappy, gzip, lzo.")
       .stringConf
       .transform(_.toLowerCase(Locale.ROOT))
-      .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo"))
+      .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo", "lz4", "brotli", "zstd"))
       .createWithDefault("snappy")
 
   val PARQUET_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.filterPushdown")

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java

Lines changed: 1 addition & 1 deletion
@@ -293,7 +293,7 @@ protected static IntIterator createRLEIterator(
       return new RLEIntIterator(
         new RunLengthBitPackingHybridDecoder(
           BytesUtils.getWidthFromMaxInt(maxLevel),
-          new ByteArrayInputStream(bytes.toByteArray())));
+          bytes.toInputStream()));
     } catch (IOException e) {
       throw new IOException("could not read levels in page for col " + descriptor, e);
     }

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java

Lines changed: 21 additions & 18 deletions
@@ -21,6 +21,8 @@
 import java.util.Arrays;
 import java.util.TimeZone;
 
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.bytes.BytesInput;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Dictionary;
@@ -388,15 +390,16 @@ private void decodeDictionaryIds(
    * is guaranteed that num is smaller than the number of values left in the current page.
    */
 
-  private void readBooleanBatch(int rowId, int num, WritableColumnVector column) {
+  private void readBooleanBatch(int rowId, int num, WritableColumnVector column)
+      throws IOException {
     if (column.dataType() != DataTypes.BooleanType) {
       throw constructConvertNotSupportedException(descriptor, column);
     }
     defColumn.readBooleans(
         num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
   }
 
-  private void readIntBatch(int rowId, int num, WritableColumnVector column) {
+  private void readIntBatch(int rowId, int num, WritableColumnVector column) throws IOException {
     // This is where we implement support for the valid type conversions.
     // TODO: implement remaining type conversions
     if (column.dataType() == DataTypes.IntegerType || column.dataType() == DataTypes.DateType ||
@@ -414,7 +417,7 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) {
     }
   }
 
-  private void readLongBatch(int rowId, int num, WritableColumnVector column) {
+  private void readLongBatch(int rowId, int num, WritableColumnVector column) throws IOException {
     // This is where we implement support for the valid type conversions.
     if (column.dataType() == DataTypes.LongType ||
         DecimalType.is64BitDecimalType(column.dataType()) ||
@@ -434,7 +437,7 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) {
     }
   }
 
-  private void readFloatBatch(int rowId, int num, WritableColumnVector column) {
+  private void readFloatBatch(int rowId, int num, WritableColumnVector column) throws IOException {
     // This is where we implement support for the valid type conversions.
     // TODO: support implicit cast to double?
     if (column.dataType() == DataTypes.FloatType) {
@@ -445,7 +448,7 @@ private void readFloatBatch(int rowId, int num, WritableColumnVector column) {
     }
   }
 
-  private void readDoubleBatch(int rowId, int num, WritableColumnVector column) {
+  private void readDoubleBatch(int rowId, int num, WritableColumnVector column) throws IOException {
     // This is where we implement support for the valid type conversions.
     // TODO: implement remaining type conversions
     if (column.dataType() == DataTypes.DoubleType) {
@@ -456,7 +459,7 @@ private void readDoubleBatch(int rowId, int num, WritableColumnVector column) {
     }
   }
 
-  private void readBinaryBatch(int rowId, int num, WritableColumnVector column) {
+  private void readBinaryBatch(int rowId, int num, WritableColumnVector column) throws IOException {
     // This is where we implement support for the valid type conversions.
     // TODO: implement remaining type conversions
     VectorizedValuesReader data = (VectorizedValuesReader) dataColumn;
@@ -556,7 +559,7 @@ public Void visit(DataPageV2 dataPageV2) {
       });
   }
 
-  private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset) throws IOException {
+  private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in) throws IOException {
     this.endOfPageValueCount = valuesRead + pageValueCount;
     if (dataEncoding.usesDictionary()) {
       this.dataColumn = null;
@@ -581,7 +584,7 @@ private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset) thr
     }
 
     try {
-      dataColumn.initFromPage(pageValueCount, bytes, offset);
+      dataColumn.initFromPage(pageValueCount, in);
     } catch (IOException e) {
       throw new IOException("could not read page in col " + descriptor, e);
     }
@@ -602,12 +605,11 @@ private void readPageV1(DataPageV1 page) throws IOException {
     this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
     this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
     try {
-      byte[] bytes = page.getBytes().toByteArray();
-      rlReader.initFromPage(pageValueCount, bytes, 0);
-      int next = rlReader.getNextOffset();
-      dlReader.initFromPage(pageValueCount, bytes, next);
-      next = dlReader.getNextOffset();
-      initDataReader(page.getValueEncoding(), bytes, next);
+      BytesInput bytes = page.getBytes();
+      ByteBufferInputStream in = bytes.toInputStream();
+      rlReader.initFromPage(pageValueCount, in);
+      dlReader.initFromPage(pageValueCount, in);
+      initDataReader(page.getValueEncoding(), in);
     } catch (IOException e) {
       throw new IOException("could not read page " + page + " in col " + descriptor, e);
     }
@@ -619,12 +621,13 @@ private void readPageV2(DataPageV2 page) throws IOException {
         page.getRepetitionLevels(), descriptor);
 
     int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
-    this.defColumn = new VectorizedRleValuesReader(bitWidth);
+    // do not read the length from the stream. v2 pages handle dividing the page bytes.
+    this.defColumn = new VectorizedRleValuesReader(bitWidth, false);
     this.definitionLevelColumn = new ValuesReaderIntIterator(this.defColumn);
-    this.defColumn.initFromBuffer(
-        this.pageValueCount, page.getDefinitionLevels().toByteArray());
+    this.defColumn.initFromPage(
+        this.pageValueCount, page.getDefinitionLevels().toInputStream());
     try {
-      initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
+      initDataReader(page.getDataEncoding(), page.getData().toInputStream());
     } catch (IOException e) {
       throw new IOException("could not read page " + page + " in col " + descriptor, e);
    }
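
A side effect visible in the readPageV1 hunk above is that the offset bookkeeping (rlReader.getNextOffset(), dlReader.getNextOffset()) disappears: each reader consumes its slice of one shared stream and leaves the position where the next reader should start. The toy sketch below is illustrative only and not Spark source; it assumes parquet-common 1.10.0, and the chunk sizes are invented.

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;

// Toy illustration of sequential consumers sharing one stream; not Spark source.
public class SharedStreamHandoff {
  public static void main(String[] args) throws IOException {
    byte[] page = "RRRRDDDDVVVVVVVV".getBytes(StandardCharsets.UTF_8);
    ByteBufferInputStream in = BytesInput.from(page).toInputStream();

    byte[] repetitionLevels = new byte[4];   // pretend repetition levels occupy 4 bytes
    in.read(repetitionLevels);

    byte[] definitionLevels = new byte[4];   // definition levels follow at the current position
    in.read(definitionLevels);

    // No getNextOffset() arithmetic: the data reader simply starts where the stream is now.
    System.out.println("bytes left for the data reader: " + in.available());
  }
}
```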
