From d947a4cbb53751ee9687ac13e1d79b71b6e9e91d Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Mon, 12 Dec 2016 13:26:34 -0800
Subject: [PATCH 1/2] [SPARK-18860][SQL] Update Parquet to 1.9.0

---
 dev/deps/spark-deps-hadoop-2.2                        | 10 +++++-----
 dev/deps/spark-deps-hadoop-2.3                        | 10 +++++-----
 dev/deps/spark-deps-hadoop-2.4                        | 10 +++++-----
 dev/deps/spark-deps-hadoop-2.6                        | 10 +++++-----
 dev/deps/spark-deps-hadoop-2.7                        | 10 +++++-----
 pom.xml                                               |  2 +-
 .../parquet/VectorizedPlainValuesReader.java          |  5 +++++
 .../parquet/VectorizedRleValuesReader.java            |  7 +++++++
 .../datasources/parquet/ParquetSchemaConverter.scala  | 12 ------------
 .../datasources/parquet/ParquetSchemaSuite.scala      |  2 +-
 10 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index afbdae0554535..d5cd7e1c36f59 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -129,13 +129,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.3.jar
-parquet-column-1.8.1.jar
-parquet-common-1.8.1.jar
-parquet-encoding-1.8.1.jar
+parquet-column-1.9.0.jar
+parquet-common-1.9.0.jar
+parquet-encoding-1.9.0.jar
 parquet-format-2.3.0-incubating.jar
-parquet-hadoop-1.8.1.jar
+parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.1.jar
+parquet-jackson-1.9.0.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index adf3863f6718c..cd702dbae3604 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -136,13 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.3.jar
-parquet-column-1.8.1.jar
-parquet-common-1.8.1.jar
-parquet-encoding-1.8.1.jar
+parquet-column-1.9.0.jar
+parquet-common-1.9.0.jar
+parquet-encoding-1.9.0.jar
 parquet-format-2.3.0-incubating.jar
-parquet-hadoop-1.8.1.jar
+parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.1.jar
+parquet-jackson-1.9.0.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 88e6b3fca08a4..a0f69c0015b97 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -136,13 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.3.jar
-parquet-column-1.8.1.jar
-parquet-common-1.8.1.jar
-parquet-encoding-1.8.1.jar
+parquet-column-1.9.0.jar
+parquet-common-1.9.0.jar
+parquet-encoding-1.9.0.jar
 parquet-format-2.3.0-incubating.jar
-parquet-hadoop-1.8.1.jar
+parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.1.jar
+parquet-jackson-1.9.0.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 15c5d9f205f25..7b153cc15cbb3 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -144,13 +144,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.3.jar
-parquet-column-1.8.1.jar
-parquet-common-1.8.1.jar
-parquet-encoding-1.8.1.jar
+parquet-column-1.9.0.jar
+parquet-common-1.9.0.jar
+parquet-encoding-1.9.0.jar
 parquet-format-2.3.0-incubating.jar
-parquet-hadoop-1.8.1.jar
+parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.1.jar
+parquet-jackson-1.9.0.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 77fb5370d98ba..13fde2d314571 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -145,13 +145,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.3.jar
-parquet-column-1.8.1.jar
-parquet-common-1.8.1.jar
-parquet-encoding-1.8.1.jar
+parquet-column-1.9.0.jar
+parquet-common-1.9.0.jar
+parquet-encoding-1.9.0.jar
 parquet-format-2.3.0-incubating.jar
-parquet-hadoop-1.8.1.jar
+parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.1.jar
+parquet-jackson-1.9.0.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar
diff --git a/pom.xml b/pom.xml
index 4f12085d044f8..aafb0113a0f82 100644
--- a/pom.xml
+++ b/pom.xml
@@ -134,7 +134,7 @@
     1.2.1
     10.12.1.1
-    1.8.1
+    1.9.0
     1.6.0
     9.2.16.v20160414
     3.1.0
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java
index 98018b7f48bd8..0fa1f118fb5a1 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java
@@ -41,6 +41,11 @@ public class VectorizedPlainValuesReader extends ValuesReader implements Vectori
   public VectorizedPlainValuesReader() {
   }
 
+  @Override
+  public void initFromPage(int valueCount, ByteBuffer page, int offset) {
+    throw new UnsupportedOperationException();
+  }
+
   @Override
   public void initFromPage(int valueCount, byte[] bytes, int offset) throws IOException {
     this.buffer = bytes;
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java
index 62157389013bb..865785bd35f50 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java
@@ -27,6 +27,8 @@
 
 import org.apache.spark.sql.execution.vectorized.ColumnVector;
 
+import java.nio.ByteBuffer;
+
 /**
  * A values reader for Parquet's run-length encoded data. This is based off of the version in
  * parquet-mr with these changes:
@@ -80,6 +82,11 @@ public VectorizedRleValuesReader(int bitWidth) {
     init(bitWidth);
   }
 
+  @Override
+  public void initFromPage(int valueCount, ByteBuffer page, int offset) {
+    throw new UnsupportedOperationException();
+  }
+
   @Override
   public void initFromPage(int valueCount, byte[] page, int start) {
     this.offset = start;
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala
index b4f36ce3752c0..fe18494847803 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala
@@ -546,21 +546,9 @@ private[parquet] class ParquetSchemaConverter(
 private[parquet] object ParquetSchemaConverter {
   val SPARK_PARQUET_SCHEMA_NAME = "spark_schema"
 
-  // !! HACK ALERT !!
-  //
-  // PARQUET-363 & PARQUET-278: parquet-mr 1.8.1 doesn't allow constructing empty GroupType,
-  // which prevents us to avoid selecting any columns for queries like `SELECT COUNT(*) FROM t`.
-  // This issue has been fixed in parquet-mr 1.8.2-SNAPSHOT.
-  //
-  // To workaround this problem, here we first construct a `MessageType` with a single dummy
-  // field, and then remove the field to obtain an empty `MessageType`.
-  //
-  // TODO Reverts this change after upgrading parquet-mr to 1.8.2+
   val EMPTY_MESSAGE = Types
     .buildMessage()
-    .required(PrimitiveType.PrimitiveTypeName.INT32).named("dummy")
     .named(ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME)
-  EMPTY_MESSAGE.getFields.clear()
 
   def checkFieldName(name: String): Unit = {
     // ,;{}()\n\t= and space are special characters in Parquet schema
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index 8a980a7eb538f..a6a3084f6ab27 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -1424,7 +1424,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
 
     catalystSchema = new StructType(),
 
-    expectedSchema = ParquetSchemaConverter.EMPTY_MESSAGE)
+    expectedSchema = "message root {}")
 
   testSchemaClipping(
     "disjoint field sets",

From 177d447aa15264a79a4390e76e020809b5a6ae71 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Wed, 14 Dec 2016 09:55:56 -0800
Subject: [PATCH 2/2] Fix dependency and test case.

---
 dev/deps/spark-deps-hadoop-2.2                                  | 2 +-
 dev/deps/spark-deps-hadoop-2.3                                  | 2 +-
 dev/deps/spark-deps-hadoop-2.4                                  | 2 +-
 dev/deps/spark-deps-hadoop-2.6                                  | 2 +-
 dev/deps/spark-deps-hadoop-2.7                                  | 2 +-
 .../sql/execution/datasources/parquet/ParquetSchemaSuite.scala  | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index d5cd7e1c36f59..d6c4214a6ba78 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -132,7 +132,7 @@ paranamer-2.3.jar
 parquet-column-1.9.0.jar
 parquet-common-1.9.0.jar
 parquet-encoding-1.9.0.jar
-parquet-format-2.3.0-incubating.jar
+parquet-format-2.3.1.jar
 parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
 parquet-jackson-1.9.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index cd702dbae3604..7a34b45c37bb4 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -139,7 +139,7 @@ paranamer-2.3.jar
 parquet-column-1.9.0.jar
 parquet-common-1.9.0.jar
 parquet-encoding-1.9.0.jar
-parquet-format-2.3.0-incubating.jar
+parquet-format-2.3.1.jar
 parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
 parquet-jackson-1.9.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index a0f69c0015b97..53f66cf45211b 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -139,7 +139,7 @@ paranamer-2.3.jar
 parquet-column-1.9.0.jar
 parquet-common-1.9.0.jar
 parquet-encoding-1.9.0.jar
-parquet-format-2.3.0-incubating.jar
+parquet-format-2.3.1.jar
 parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
 parquet-jackson-1.9.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 7b153cc15cbb3..383b5f02787d5 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -147,7 +147,7 @@ paranamer-2.3.jar
 parquet-column-1.9.0.jar
 parquet-common-1.9.0.jar
 parquet-encoding-1.9.0.jar
-parquet-format-2.3.0-incubating.jar
+parquet-format-2.3.1.jar
 parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
 parquet-jackson-1.9.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 13fde2d314571..895203dfbbe27 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -148,7 +148,7 @@ paranamer-2.3.jar
 parquet-column-1.9.0.jar
 parquet-common-1.9.0.jar
 parquet-encoding-1.9.0.jar
-parquet-format-2.3.0-incubating.jar
+parquet-format-2.3.1.jar
 parquet-hadoop-1.9.0.jar
 parquet-hadoop-bundle-1.6.0.jar
 parquet-jackson-1.9.0.jar
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index a6a3084f6ab27..8a980a7eb538f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -1424,7 +1424,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
 
     catalystSchema = new StructType(),
 
-    expectedSchema = "message root {}")
+    expectedSchema = ParquetSchemaConverter.EMPTY_MESSAGE)
 
   testSchemaClipping(
     "disjoint field sets",
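Reviewer sketch (not part of the patches above): the ParquetSchemaConverter change assumes that parquet-mr 1.9.0, with PARQUET-363/PARQUET-278 resolved, lets an empty MessageType be constructed directly, so the dummy-field-then-clear workaround can go away. A minimal Scala illustration of that simplified construction follows; it assumes parquet-column 1.9.0 is on the classpath, and the object name and printed output are illustrative, not taken from the patch.

import org.apache.parquet.schema.Types

object EmptyMessageSketch {
  // Build an empty Parquet MessageType directly, mirroring the patched
  // EMPTY_MESSAGE definition: no dummy INT32 field, no getFields.clear().
  val emptyMessage = Types
    .buildMessage()
    .named("spark_schema")

  def main(args: Array[String]): Unit = {
    // Prints the schema string, expected to look like: message spark_schema {}
    println(emptyMessage)
  }
}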