From 7a8b41cff37ab767c4784b87c62166955ab291ec Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 13 Jan 2022 13:36:15 -0800 Subject: [PATCH 01/20] [SPARK-37974][SQL] Vectorized implementation of DeltaLengthByteArray reader --- .../VectorizedDeltaLengthByteArrayReader.java | 104 +++++++++++++++ .../parquet/VectorizedValuesReader.java | 21 +++ ...uetDeltaLengthByteArrayEncodingSuite.scala | 121 ++++++++++++++++++ 3 files changed, 246 insertions(+) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java new file mode 100644 index 0000000000000..c1962554e041b --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.datasources.parquet; + +import static org.apache.spark.sql.types.DataTypes.IntegerType; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.spark.memory.MemoryMode; +import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; + +/** + * An implementation of the Parquet DELTA_LENGTH_BYTE_ARRAY decoder that supports the vectorized + * interface. + */ +public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase implements + VectorizedValuesReader { + + private final MemoryMode memoryMode; + private int valueCount; + private final VectorizedDeltaBinaryPackedReader lengthReader = + new VectorizedDeltaBinaryPackedReader(); + private ByteBufferInputStream in; + private WritableColumnVector lengthsVector; + private int currentRow = 0; + + VectorizedDeltaLengthByteArrayReader(MemoryMode memoryMode) { + this.memoryMode = memoryMode; + } + + @Override + public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { + this.valueCount = valueCount; + if (memoryMode == MemoryMode.OFF_HEAP) { + lengthsVector = new OffHeapColumnVector(valueCount, IntegerType); + } else { + lengthsVector = new OnHeapColumnVector(valueCount, IntegerType); + } + lengthReader.initFromPage(valueCount, in); + lengthReader.readIntegers(valueCount, lengthsVector, 0); + this.in = in.remainingStream(); + } + + @Override + public void readBinary(int total, WritableColumnVector c, int rowId) { + if (total == 0) { + return; + } + ByteBuffer buffer; + ByteBufferOutputWriter outputWriter; + if (memoryMode == MemoryMode.OFF_HEAP) { + outputWriter = ByteBufferOutputWriter::copyWriteByteBuffer; + } else { + 
outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer; + } + int length; + for (int i = 0; i < total; i++) { + length = lengthsVector.getInt(rowId + i); + try { + buffer = in.slice(length); + } catch (EOFException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes"); + } + outputWriter.write(c, rowId + i, buffer, length); + currentRow++; + } + } + + @Override + public void skipBinary(int total) { + if (total == 0) { + return; + } + int length; + for (int i = 0; i < total; i++) { + length = lengthsVector.getInt(currentRow + i); + int remaining = length; + while (remaining > 0) { + remaining -= in.skip(length); + } + } + currentRow += total; + } + +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index 7ddece068e099..ca065f9cd012f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet; +import java.nio.ByteBuffer; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.parquet.io.api.Binary; @@ -86,4 +87,24 @@ interface IntegerOutputWriter { void write(WritableColumnVector outputColumnVector, int rowId, long val); } + @FunctionalInterface + interface ByteBufferOutputWriter { + void write(WritableColumnVector c, int rowId, ByteBuffer val, int length); + + static void writeArrayByteBuffer(WritableColumnVector c, int rowId, ByteBuffer val, + int length) { + c.putByteArray(rowId, + val.array(), + val.arrayOffset() + val.position(), + length); + } + + static void copyWriteByteBuffer(WritableColumnVector c, int rowId, ByteBuffer val, int length) { + byte[] bytes = new byte[length]; + 
val.get(bytes); + c.putByteArray(rowId, bytes); + } + + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala new file mode 100644 index 0000000000000..3cb62d6500f4e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.parquet.bytes.{ByteBufferInputStream, DirectByteBufferAllocator} +import org.apache.parquet.column.values.Utils +import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter +import org.apache.parquet.io.api.Binary + +import org.apache.spark.memory.MemoryMode +import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StringType} + +/** + * Read tests for vectorized Delta length byte array reader. 
+ * Translated from
+ * org.apache.parquet.column.values.delta.TestDeltaLengthByteArray
+ */
+class ParquetDeltaLengthByteArrayEncodingSuite
+  extends ParquetCompatibilityTest
+  with SharedSparkSession {
+  val values: Array[String] = Array("parquet", "hadoop", "mapreduce")
+  var writer: DeltaLengthByteArrayValuesWriter = _
+  var reader: VectorizedDeltaLengthByteArrayReader = _
+  private var writableColumnVector: WritableColumnVector = _
+
+  // Every test gets a fresh writer/reader pair so that no encoder or decoder state leaks from
+  // one test into the next.
+  protected override def beforeEach(): Unit = {
+    writer =
+      new DeltaLengthByteArrayValuesWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator)
+    reader = new VectorizedDeltaLengthByteArrayReader(MemoryMode.OFF_HEAP)
+    // Chain to the per-test hook of the parent traits; beforeAll is the once-per-suite hook and
+    // must not be re-run before every test.
+    super.beforeEach()
+  }
+
+  test("test serialization") {
+    writeData(writer, values)
+    readAndValidate(reader, writer.getBytes.toInputStream, values.length, values)
+  }
+
+  test("random strings") {
+    val values = Utils.getRandomStringSamples(1000, 32)
+    writeData(writer, values)
+    readAndValidate(reader, writer.getBytes.toInputStream, values.length, values)
+  }
+
+  test("skip with random strings") {
+    val values = Utils.getRandomStringSamples(1000, 32)
+    writeData(writer, values)
+    reader.initFromPage(values.length, writer.getBytes.toInputStream)
+    writableColumnVector = new OnHeapColumnVector(values.length, StringType)
+    // First pass: alternate between reading one value and skipping one value.
+    var i = 0
+    while (i < values.length) {
+      reader.readBinary(1, writableColumnVector, i)
+      assert(utf8Bytes(values(i)) sameElements writableColumnVector.getBinary(i))
+      reader.skipBinary(1)
+      i += 2
+    }
+    // Second pass on a fresh reader: read one value, then skip half of what is left.
+    reader = new VectorizedDeltaLengthByteArrayReader(MemoryMode.OFF_HEAP)
+    reader.initFromPage(values.length, writer.getBytes.toInputStream)
+    writableColumnVector = new OnHeapColumnVector(values.length, StringType)
+    var skipCount = 0
+    i = 0
+    while (i < values.length) {
+      skipCount = (values.length - i) / 2
+      reader.readBinary(1, writableColumnVector, i)
+      assert(utf8Bytes(values(i)) sameElements writableColumnVector.getBinary(i))
+      reader.skipBinary(skipCount)
+      i += skipCount + 1
+    }
+  }
+
+  // Read the lengths from the beginning of the buffer and compare with the lengths of the values
+  test("test lengths") {
+    val reader = new VectorizedDeltaBinaryPackedReader
+    writeData(writer, values)
+    val length = values.length
+    writableColumnVector = new OnHeapColumnVector(length, IntegerType)
+    reader.initFromPage(length, writer.getBytes.toInputStream)
+    reader.readIntegers(length, writableColumnVector, 0)
+    for (i <- 0 until length) {
+      // The encoded length is a byte count, so compare against the encoded size rather than
+      // the number of chars in the string.
+      assert(utf8Bytes(values(i)).length == writableColumnVector.getInt(i))
+    }
+  }
+
+  // Binary.fromString encodes strings as UTF-8, so the expected bytes must be produced with
+  // UTF-8 as well instead of the platform default charset.
+  private def utf8Bytes(s: String): Array[Byte] =
+    s.getBytes(java.nio.charset.StandardCharsets.UTF_8)
+
+  // Encodes every string of `values` with the given writer.
+  private def writeData(writer: DeltaLengthByteArrayValuesWriter, values: Array[String]): Unit = {
+    for (i <- values.indices) {
+      writer.writeBytes(Binary.fromString(values(i)))
+    }
+  }
+
+  // Decodes `length` values from `is` in a single readBinary call and checks each one against
+  // `expectedValues`.
+  private def readAndValidate(
+      reader: VectorizedDeltaLengthByteArrayReader,
+      is: ByteBufferInputStream,
+      length: Int,
+      expectedValues: Array[String]): Unit = {
+
+    writableColumnVector = new OnHeapColumnVector(length, StringType)
+
+    reader.initFromPage(length, is)
+    reader.readBinary(length, writableColumnVector, 0)
+
+    for (i <- 0 until length) {
+      assert(utf8Bytes(expectedValues(i)) sameElements writableColumnVector.getBinary(i))
+    }
+  }
+
+}
From 2c737944ef6a7850fcc34a72f5756c8fd1ee2eef Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 13 Jan 2022 17:20:50 -0800 Subject: [PATCH 02/20] [SPARK-37974][SQL] Vectorized implementation of DeltaByteArray reader --- .../DataSourceReadBenchmark-jdk11-results.txt | 424 ++++++++-------- .../DataSourceReadBenchmark-jdk17-results.txt | 470 +++++++++--------- .../DataSourceReadBenchmark-results.txt | 470 +++++++++--------- .../SpecificParquetRecordReaderBase.java | 9 + .../parquet/VectorizedColumnReader.java | 26 +- .../VectorizedDeltaBinaryPackedReader.java | 5 + .../VectorizedDeltaByteArrayReader.java | 120 ++++- .../VectorizedDeltaLengthByteArrayReader.java | 2 +- .../VectorizedParquetRecordReader.java | 5 +- .../parquet/VectorizedValuesReader.java | 2 + .../ParquetDeltaByteArrayEncodingSuite.scala | 145 ++++++ 
.../parquet/ParquetEncodingSuite.scala | 19 +- 12 files changed, 983 insertions(+), 714 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt index 25c43d8273df8..b3a894f8ad397 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9636 9771 191 1.6 612.6 1.0X -SQL Json 7960 8227 378 2.0 506.1 1.2X -SQL Parquet Vectorized: DataPageV1 113 129 12 139.7 7.2 85.6X -SQL Parquet Vectorized: DataPageV2 84 93 12 186.6 5.4 114.3X -SQL Parquet MR: DataPageV1 1466 1470 6 10.7 93.2 6.6X -SQL Parquet MR: DataPageV2 1334 1347 18 11.8 84.8 7.2X -SQL ORC Vectorized 163 197 27 96.3 10.4 59.0X -SQL ORC MR 1554 1558 6 10.1 98.8 6.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 10153 10161 12 1.5 645.5 1.0X +SQL Json 8463 8512 69 1.9 538.0 1.2X +SQL Parquet Vectorized: DataPageV1 131 149 14 120.0 8.3 77.5X +SQL Parquet Vectorized: DataPageV2 98 112 15 161.2 6.2 104.0X +SQL Parquet MR: DataPageV1 1968 1968 0 8.0 125.1 5.2X +SQL Parquet MR: DataPageV2 1735 1739 6 9.1 110.3 5.9X +SQL ORC Vectorized 164 198 41 96.0 10.4 62.0X +SQL ORC MR 1572 1581 12 10.0 100.0 6.5X + +OpenJDK 64-Bit Server VM 
11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 94 103 13 167.1 6.0 1.0X -ParquetReader Vectorized: DataPageV2 77 86 11 204.3 4.9 1.2X -ParquetReader Vectorized -> Row: DataPageV1 44 47 4 357.0 2.8 2.1X -ParquetReader Vectorized -> Row: DataPageV2 35 37 3 445.2 2.2 2.7X +ParquetReader Vectorized: DataPageV1 102 107 14 154.6 6.5 1.0X +ParquetReader Vectorized: DataPageV2 83 88 10 189.1 5.3 1.2X +ParquetReader Vectorized -> Row: DataPageV1 57 59 3 275.7 3.6 1.8X +ParquetReader Vectorized -> Row: DataPageV2 38 40 3 416.3 2.4 2.7X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11479 11919 622 1.4 729.8 1.0X -SQL Json 9894 9922 39 1.6 629.1 1.2X -SQL Parquet Vectorized: DataPageV1 123 156 30 128.3 7.8 93.6X -SQL Parquet Vectorized: DataPageV2 126 138 19 125.2 8.0 91.4X -SQL Parquet MR: DataPageV1 1986 2500 726 7.9 126.3 5.8X -SQL Parquet MR: DataPageV2 1810 1898 126 8.7 115.1 6.3X -SQL ORC Vectorized 174 210 30 90.5 11.0 66.1X -SQL ORC MR 1645 1652 9 9.6 104.6 7.0X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 12263 12285 31 1.3 779.6 1.0X +SQL Json 9495 9501 9 1.7 603.6 1.3X +SQL Parquet Vectorized: DataPageV1 162 175 10 97.1 10.3 75.7X +SQL Parquet Vectorized: DataPageV2 161 172 12 97.9 10.2 76.4X +SQL Parquet MR: DataPageV1 2074 2105 44 7.6 131.9 5.9X +SQL Parquet MR: 
DataPageV2 1974 1981 9 8.0 125.5 6.2X +SQL ORC Vectorized 187 218 30 84.3 11.9 65.7X +SQL ORC MR 1529 1553 34 10.3 97.2 8.0X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 166 177 14 94.9 10.5 1.0X -ParquetReader Vectorized: DataPageV2 165 172 11 95.3 10.5 1.0X -ParquetReader Vectorized -> Row: DataPageV1 95 100 5 165.7 6.0 1.7X -ParquetReader Vectorized -> Row: DataPageV2 85 89 6 186.0 5.4 2.0X +ParquetReader Vectorized: DataPageV1 205 214 12 76.8 13.0 1.0X +ParquetReader Vectorized: DataPageV2 204 211 10 77.2 13.0 1.0X +ParquetReader Vectorized -> Row: DataPageV1 122 132 24 128.7 7.8 1.7X +ParquetReader Vectorized -> Row: DataPageV2 122 126 6 128.4 7.8 1.7X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12176 12646 664 1.3 774.1 1.0X -SQL Json 9696 9729 46 1.6 616.5 1.3X -SQL Parquet Vectorized: DataPageV1 151 201 33 103.9 9.6 80.4X -SQL Parquet Vectorized: DataPageV2 216 235 15 72.7 13.8 56.3X -SQL Parquet MR: DataPageV1 1915 2017 145 8.2 121.8 6.4X -SQL Parquet MR: DataPageV2 1954 1978 33 8.0 124.3 6.2X -SQL ORC Vectorized 197 235 25 79.7 12.6 61.7X -SQL ORC MR 1769 1829 85 8.9 112.5 6.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 12946 12955 12 1.2 823.1 1.0X +SQL Json 9871 9876 8 1.6 627.6 1.3X +SQL Parquet Vectorized: DataPageV1 
157 200 34 100.0 10.0 82.3X +SQL Parquet Vectorized: DataPageV2 229 242 14 68.8 14.5 56.7X +SQL Parquet MR: DataPageV1 2388 2389 2 6.6 151.8 5.4X +SQL Parquet MR: DataPageV2 2080 2087 10 7.6 132.2 6.2X +SQL ORC Vectorized 240 285 23 65.6 15.2 54.0X +SQL ORC MR 1699 1732 46 9.3 108.0 7.6X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 230 237 12 68.5 14.6 1.0X -ParquetReader Vectorized: DataPageV2 293 298 9 53.6 18.7 0.8X -ParquetReader Vectorized -> Row: DataPageV1 215 265 23 73.2 13.7 1.1X -ParquetReader Vectorized -> Row: DataPageV2 279 301 32 56.3 17.8 0.8X +ParquetReader Vectorized: DataPageV1 242 245 5 65.0 15.4 1.0X +ParquetReader Vectorized: DataPageV2 309 314 9 50.9 19.7 0.8X +ParquetReader Vectorized -> Row: DataPageV1 227 268 18 69.3 14.4 1.1X +ParquetReader Vectorized -> Row: DataPageV2 294 312 25 53.5 18.7 0.8X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13069 13409 482 1.2 830.9 1.0X -SQL Json 10599 10621 32 1.5 673.9 1.2X -SQL Parquet Vectorized: DataPageV1 142 177 34 110.6 9.0 91.9X -SQL Parquet Vectorized: DataPageV2 313 359 28 50.2 19.9 41.7X -SQL Parquet MR: DataPageV1 1979 2044 92 7.9 125.8 6.6X -SQL Parquet MR: DataPageV2 1958 2030 101 8.0 124.5 6.7X -SQL ORC Vectorized 277 303 21 56.7 17.6 47.1X -SQL ORC MR 1692 1782 128 9.3 107.6 7.7X - -OpenJDK 
64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 14331 14347 22 1.1 911.2 1.0X +SQL Json 10406 10434 40 1.5 661.6 1.4X +SQL Parquet Vectorized: DataPageV1 153 196 41 102.7 9.7 93.6X +SQL Parquet Vectorized: DataPageV2 378 415 30 41.6 24.0 37.9X +SQL Parquet MR: DataPageV1 2439 2446 11 6.4 155.1 5.9X +SQL Parquet MR: DataPageV2 2181 2188 10 7.2 138.7 6.6X +SQL ORC Vectorized 320 346 25 49.2 20.3 44.8X +SQL ORC MR 1851 1853 3 8.5 117.7 7.7X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 253 269 18 62.1 16.1 1.0X -ParquetReader Vectorized: DataPageV2 1197 1199 3 13.1 76.1 0.2X -ParquetReader Vectorized -> Row: DataPageV1 273 361 110 57.7 17.3 0.9X -ParquetReader Vectorized -> Row: DataPageV2 379 438 37 41.5 24.1 0.7X +ParquetReader Vectorized: DataPageV1 258 262 9 60.9 16.4 1.0X +ParquetReader Vectorized: DataPageV2 481 484 3 32.7 30.6 0.5X +ParquetReader Vectorized -> Row: DataPageV1 250 275 26 62.9 15.9 1.0X +ParquetReader Vectorized -> Row: DataPageV2 475 502 27 33.1 30.2 0.5X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17143 17467 458 0.9 1089.9 1.0X -SQL Json 11507 12198 977 1.4 731.6 1.5X -SQL Parquet Vectorized: DataPageV1 238 253 19 66.0 15.2 71.9X -SQL Parquet Vectorized: DataPageV2 502 567 48 31.3 31.9 34.1X -SQL Parquet MR: DataPageV1 2333 2335 3 
6.7 148.4 7.3X -SQL Parquet MR: DataPageV2 1948 1972 34 8.1 123.8 8.8X -SQL ORC Vectorized 389 408 20 40.5 24.7 44.1X -SQL ORC MR 1726 1817 128 9.1 109.7 9.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 18450 18451 2 0.9 1173.0 1.0X +SQL Json 12553 12562 13 1.3 798.1 1.5X +SQL Parquet Vectorized: DataPageV1 259 272 12 60.8 16.5 71.3X +SQL Parquet Vectorized: DataPageV2 534 566 22 29.4 34.0 34.5X +SQL Parquet MR: DataPageV1 2529 2537 11 6.2 160.8 7.3X +SQL Parquet MR: DataPageV2 2331 2334 4 6.7 148.2 7.9X +SQL ORC Vectorized 424 460 36 37.1 27.0 43.5X +SQL ORC MR 2009 2023 20 7.8 127.7 9.2X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 289 340 43 54.4 18.4 1.0X -ParquetReader Vectorized: DataPageV2 572 609 27 27.5 36.4 0.5X -ParquetReader Vectorized -> Row: DataPageV1 329 353 48 47.8 20.9 0.9X -ParquetReader Vectorized -> Row: DataPageV2 639 654 18 24.6 40.6 0.5X +ParquetReader Vectorized: DataPageV1 322 338 21 48.9 20.5 1.0X +ParquetReader Vectorized: DataPageV2 674 683 12 23.3 42.9 0.5X +ParquetReader Vectorized -> Row: DataPageV1 352 358 9 44.7 22.4 0.9X +ParquetReader Vectorized -> Row: DataPageV2 628 660 22 25.0 39.9 0.5X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13721 13812 129 1.1 872.4 1.0X -SQL Json 12147 17632 2196 1.3 772.3 
1.1X -SQL Parquet Vectorized: DataPageV1 138 164 25 113.9 8.8 99.4X -SQL Parquet Vectorized: DataPageV2 151 180 26 104.4 9.6 91.1X -SQL Parquet MR: DataPageV1 2006 2078 101 7.8 127.6 6.8X -SQL Parquet MR: DataPageV2 2038 2040 2 7.7 129.6 6.7X -SQL ORC Vectorized 465 475 10 33.8 29.6 29.5X -SQL ORC MR 1814 1860 64 8.7 115.4 7.6X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 14806 14816 14 1.1 941.3 1.0X +SQL Json 11968 11969 1 1.3 760.9 1.2X +SQL Parquet Vectorized: DataPageV1 150 184 26 105.0 9.5 98.8X +SQL Parquet Vectorized: DataPageV2 147 183 32 107.2 9.3 100.9X +SQL Parquet MR: DataPageV1 2338 2352 19 6.7 148.7 6.3X +SQL Parquet MR: DataPageV2 2221 2267 65 7.1 141.2 6.7X +SQL ORC Vectorized 475 494 29 33.1 30.2 31.1X +SQL ORC MR 1967 1978 16 8.0 125.1 7.5X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 275 404 187 57.2 17.5 1.0X -ParquetReader Vectorized: DataPageV2 275 287 12 57.2 17.5 1.0X -ParquetReader Vectorized -> Row: DataPageV1 227 265 24 69.2 14.4 1.2X -ParquetReader Vectorized -> Row: DataPageV2 228 259 28 69.1 14.5 1.2X +ParquetReader Vectorized: DataPageV1 236 241 8 66.7 15.0 1.0X +ParquetReader Vectorized: DataPageV2 237 241 9 66.3 15.1 1.0X +ParquetReader Vectorized -> Row: DataPageV1 218 244 25 72.1 13.9 1.1X +ParquetReader Vectorized -> Row: DataPageV2 218 251 21 72.2 13.8 1.1X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17269 17620 496 0.9 1097.9 1.0X -SQL Json 15636 15952 447 1.0 994.1 1.1X -SQL Parquet Vectorized: DataPageV1 238 267 18 66.0 15.1 72.5X -SQL Parquet Vectorized: DataPageV2 222 260 21 70.9 14.1 77.9X -SQL Parquet MR: DataPageV1 2418 2457 56 6.5 153.7 7.1X -SQL Parquet MR: DataPageV2 2194 2207 18 7.2 139.5 7.9X -SQL ORC Vectorized 519 528 14 30.3 33.0 33.3X -SQL ORC MR 1760 1770 14 8.9 111.9 9.8X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 19335 19346 15 0.8 1229.3 1.0X +SQL Json 16112 16121 13 1.0 1024.4 1.2X +SQL Parquet Vectorized: DataPageV1 257 278 29 61.1 16.4 75.1X +SQL Parquet Vectorized: DataPageV2 258 268 9 60.9 16.4 74.9X +SQL Parquet MR: DataPageV1 2542 2557 20 6.2 161.6 7.6X +SQL Parquet MR: DataPageV2 2416 2439 32 6.5 153.6 8.0X +SQL ORC Vectorized 593 605 18 26.5 37.7 32.6X +SQL ORC MR 2134 2141 11 7.4 135.7 9.1X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 284 305 30 55.3 18.1 1.0X -ParquetReader Vectorized: DataPageV2 286 286 1 55.1 18.2 1.0X -ParquetReader Vectorized -> Row: DataPageV1 325 337 16 48.4 20.6 0.9X -ParquetReader Vectorized -> Row: DataPageV2 346 361 16 45.5 22.0 0.8X +ParquetReader Vectorized: DataPageV1 322 346 24 48.8 20.5 1.0X +ParquetReader Vectorized: DataPageV2 326 326 1 48.3 20.7 1.0X +ParquetReader Vectorized -> Row: DataPageV1 350 359 9 44.9 22.3 0.9X +ParquetReader Vectorized -> Row: DataPageV2 348 358 10 45.2 22.1 0.9X ================================================================================================ Int and 
String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12428 12714 405 0.8 1185.2 1.0X -SQL Json 11088 11251 231 0.9 1057.4 1.1X -SQL Parquet Vectorized: DataPageV1 1990 1997 10 5.3 189.8 6.2X -SQL Parquet Vectorized: DataPageV2 2551 2618 95 4.1 243.3 4.9X -SQL Parquet MR: DataPageV1 3903 3913 15 2.7 372.2 3.2X -SQL Parquet MR: DataPageV2 3734 3920 263 2.8 356.1 3.3X -SQL ORC Vectorized 2153 2155 3 4.9 205.3 5.8X -SQL ORC MR 3485 3549 91 3.0 332.4 3.6X +SQL CSV 13899 14000 142 0.8 1325.5 1.0X +SQL Json 11275 11289 20 0.9 1075.3 1.2X +SQL Parquet Vectorized: DataPageV1 2092 2107 21 5.0 199.5 6.6X +SQL Parquet Vectorized: DataPageV2 3073 3074 2 3.4 293.0 4.5X +SQL Parquet MR: DataPageV1 4192 4212 29 2.5 399.8 3.3X +SQL Parquet MR: DataPageV2 4133 4194 87 2.5 394.1 3.4X +SQL ORC Vectorized 2218 2219 1 4.7 211.5 6.3X +SQL ORC MR 3767 3776 12 2.8 359.3 3.7X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7116 7167 72 1.5 678.7 1.0X -SQL Json 6700 6741 58 1.6 639.0 1.1X 
-SQL Parquet Vectorized: DataPageV1 526 556 36 19.9 50.1 13.5X -SQL Parquet Vectorized: DataPageV2 518 533 15 20.2 49.4 13.7X -SQL Parquet MR: DataPageV1 1504 1656 216 7.0 143.4 4.7X -SQL Parquet MR: DataPageV2 1676 1676 1 6.3 159.8 4.2X -SQL ORC Vectorized 497 518 20 21.1 47.4 14.3X -SQL ORC MR 1657 1787 183 6.3 158.1 4.3X +SQL CSV 7367 7387 28 1.4 702.6 1.0X +SQL Json 6817 6817 0 1.5 650.1 1.1X +SQL Parquet Vectorized: DataPageV1 602 618 15 17.4 57.5 12.2X +SQL Parquet Vectorized: DataPageV2 599 610 15 17.5 57.1 12.3X +SQL Parquet MR: DataPageV1 1888 1936 68 5.6 180.0 3.9X +SQL Parquet MR: DataPageV2 2000 2018 25 5.2 190.7 3.7X +SQL ORC Vectorized 527 545 22 19.9 50.2 14.0X +SQL ORC MR 1916 1927 16 5.5 182.7 3.8X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 18247 18411 232 0.9 1160.1 1.0X -Data column - Json 10860 11264 571 1.4 690.5 1.7X -Data column - Parquet Vectorized: DataPageV1 223 274 26 70.6 14.2 81.9X -Data column - Parquet Vectorized: DataPageV2 537 559 23 29.3 34.1 34.0X -Data column - Parquet MR: DataPageV1 2411 2517 150 6.5 153.3 7.6X -Data column - Parquet MR: DataPageV2 2299 2356 81 6.8 146.2 7.9X -Data column - ORC Vectorized 417 433 11 37.7 26.5 43.8X -Data column - ORC MR 2107 2178 101 7.5 134.0 8.7X -Partition column - CSV 6090 6186 136 2.6 387.2 3.0X -Partition column - Json 9479 9603 176 1.7 602.7 1.9X -Partition column - Parquet Vectorized: DataPageV1 49 69 28 
322.0 3.1 373.6X -Partition column - Parquet Vectorized: DataPageV2 49 63 23 322.1 3.1 373.7X -Partition column - Parquet MR: DataPageV1 1200 1225 36 13.1 76.3 15.2X -Partition column - Parquet MR: DataPageV2 1199 1240 57 13.1 76.3 15.2X -Partition column - ORC Vectorized 53 77 26 295.0 3.4 342.2X -Partition column - ORC MR 1287 1346 83 12.2 81.8 14.2X -Both columns - CSV 17671 18140 663 0.9 1123.5 1.0X -Both columns - Json 11675 12167 696 1.3 742.3 1.6X -Both columns - Parquet Vectorized: DataPageV1 298 303 9 52.9 18.9 61.3X -Both columns - Parquet Vectorized: DataPageV2 541 580 36 29.1 34.4 33.7X -Both columns - Parquet MR: DataPageV1 2448 2491 60 6.4 155.6 7.5X -Both columns - Parquet MR: DataPageV2 2303 2352 69 6.8 146.4 7.9X -Both columns - ORC Vectorized 385 406 25 40.9 24.5 47.4X -Both columns - ORC MR 2118 2202 120 7.4 134.6 8.6X +Data column - CSV 19239 19257 25 0.8 1223.2 1.0X +Data column - Json 12387 12393 8 1.3 787.6 1.6X +Data column - Parquet Vectorized: DataPageV1 227 269 25 69.2 14.5 84.6X +Data column - Parquet Vectorized: DataPageV2 612 651 28 25.7 38.9 31.4X +Data column - Parquet MR: DataPageV1 2989 3016 39 5.3 190.0 6.4X +Data column - Parquet MR: DataPageV2 2750 2754 5 5.7 174.8 7.0X +Data column - ORC Vectorized 426 467 33 37.0 27.1 45.2X +Data column - ORC MR 2513 2538 35 6.3 159.8 7.7X +Partition column - CSV 6623 6627 5 2.4 421.1 2.9X +Partition column - Json 10234 10235 2 1.5 650.7 1.9X +Partition column - Parquet Vectorized: DataPageV1 56 73 19 279.8 3.6 342.2X +Partition column - Parquet Vectorized: DataPageV2 57 72 19 278.3 3.6 340.4X +Partition column - Parquet MR: DataPageV1 1392 1417 36 11.3 88.5 13.8X +Partition column - Parquet MR: DataPageV2 1390 1416 37 11.3 88.4 13.8X +Partition column - ORC Vectorized 56 89 36 283.2 3.5 346.4X +Partition column - ORC MR 1578 1581 4 10.0 100.4 12.2X +Both columns - CSV 19178 19181 4 0.8 1219.3 1.0X +Both columns - Json 13104 13105 1 1.2 833.1 1.5X +Both columns - Parquet Vectorized: DataPageV1 
314 338 21 50.2 19.9 61.4X +Both columns - Parquet Vectorized: DataPageV2 708 741 54 22.2 45.0 27.2X +Both columns - Parquet MR: DataPageV1 3083 3105 31 5.1 196.0 6.2X +Both columns - Parquet MR: DataPageV2 2897 2901 6 5.4 184.2 6.6X +Both columns - ORC Vectorized 456 504 39 34.5 29.0 42.1X +Both columns - ORC MR 2594 2597 4 6.1 164.9 7.4X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7966 12723 2892 1.3 759.7 1.0X -SQL Json 9897 10008 157 1.1 943.9 0.8X -SQL Parquet Vectorized: DataPageV1 1176 1264 125 8.9 112.1 6.8X -SQL Parquet Vectorized: DataPageV2 2224 2326 144 4.7 212.1 3.6X -SQL Parquet MR: DataPageV1 3431 3483 73 3.1 327.2 2.3X -SQL Parquet MR: DataPageV2 3845 4043 280 2.7 366.7 2.1X -ParquetReader Vectorized: DataPageV1 1055 1056 2 9.9 100.6 7.6X -ParquetReader Vectorized: DataPageV2 2093 2119 37 5.0 199.6 3.8X -SQL ORC Vectorized 1129 1217 125 9.3 107.7 7.1X -SQL ORC MR 2931 2982 72 3.6 279.5 2.7X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 9165 9257 130 1.1 874.1 1.0X +SQL Json 10230 10234 7 1.0 975.6 0.9X +SQL Parquet Vectorized: DataPageV1 1275 1315 56 8.2 121.6 7.2X +SQL Parquet Vectorized: DataPageV2 2406 2407 0 4.4 229.5 3.8X +SQL Parquet MR: DataPageV1 4005 4009 7 2.6 381.9 2.3X +SQL Parquet MR: DataPageV2 4358 4366 12 2.4 415.6 2.1X +ParquetReader Vectorized: DataPageV1 985 995 13 10.6 94.0 9.3X +ParquetReader Vectorized: DataPageV2 2039 2061 
32 5.1 194.4 4.5X +SQL ORC Vectorized 1048 1072 34 10.0 99.9 8.7X +SQL ORC MR 3179 3196 24 3.3 303.2 2.9X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6338 6508 240 1.7 604.4 1.0X -SQL Json 7149 7247 138 1.5 681.8 0.9X -SQL Parquet Vectorized: DataPageV1 937 984 45 11.2 89.3 6.8X -SQL Parquet Vectorized: DataPageV2 1582 1608 37 6.6 150.9 4.0X -SQL Parquet MR: DataPageV1 2525 2721 277 4.2 240.8 2.5X -SQL Parquet MR: DataPageV2 2969 2974 7 3.5 283.1 2.1X -ParquetReader Vectorized: DataPageV1 933 940 12 11.2 88.9 6.8X -ParquetReader Vectorized: DataPageV2 1535 1549 20 6.8 146.4 4.1X -SQL ORC Vectorized 1144 1204 86 9.2 109.1 5.5X -SQL ORC MR 2816 2822 8 3.7 268.6 2.3X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 6749 6750 2 1.6 643.6 1.0X +SQL Json 7960 7967 10 1.3 759.1 0.8X +SQL Parquet Vectorized: DataPageV1 1078 1105 37 9.7 102.8 6.3X +SQL Parquet Vectorized: DataPageV2 1939 1941 3 5.4 184.9 3.5X +SQL Parquet MR: DataPageV1 3090 3099 13 3.4 294.7 2.2X +SQL Parquet MR: DataPageV2 3274 3286 17 3.2 312.3 2.1X +ParquetReader Vectorized: DataPageV1 1058 1067 13 9.9 100.9 6.4X +ParquetReader Vectorized: DataPageV2 1847 1848 2 5.7 176.2 3.7X +SQL ORC Vectorized 1307 1307 0 8.0 124.6 5.2X +SQL ORC MR 3078 3122 62 3.4 293.6 2.2X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4443 4504 86 2.4 423.7 1.0X -SQL Json 4528 4563 49 2.3 431.8 1.0X -SQL Parquet 
Vectorized: DataPageV1 213 233 15 49.2 20.3 20.8X -SQL Parquet Vectorized: DataPageV2 267 294 22 39.3 25.4 16.7X -SQL Parquet MR: DataPageV1 1691 1700 13 6.2 161.2 2.6X -SQL Parquet MR: DataPageV2 1515 1565 70 6.9 144.5 2.9X -ParquetReader Vectorized: DataPageV1 228 231 2 46.0 21.7 19.5X -ParquetReader Vectorized: DataPageV2 285 296 9 36.8 27.1 15.6X -SQL ORC Vectorized 369 425 82 28.4 35.2 12.1X -SQL ORC MR 1457 1463 9 7.2 138.9 3.0X +SQL CSV 4818 4824 8 2.2 459.5 1.0X +SQL Json 4853 4878 35 2.2 462.8 1.0X +SQL Parquet Vectorized: DataPageV1 255 264 6 41.1 24.3 18.9X +SQL Parquet Vectorized: DataPageV2 711 716 4 14.7 67.8 6.8X +SQL Parquet MR: DataPageV1 2024 2024 1 5.2 193.0 2.4X +SQL Parquet MR: DataPageV2 1920 1922 3 5.5 183.1 2.5X +ParquetReader Vectorized: DataPageV1 272 275 2 38.5 26.0 17.7X +ParquetReader Vectorized: DataPageV2 719 726 6 14.6 68.5 6.7X +SQL ORC Vectorized 478 523 52 21.9 45.6 10.1X +SQL ORC MR 1772 1775 5 5.9 169.0 2.7X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2374 2377 5 0.4 2264.2 1.0X -SQL Json 2693 2726 46 0.4 2568.5 0.9X -SQL Parquet Vectorized: DataPageV1 44 62 16 23.8 42.0 54.0X -SQL Parquet Vectorized: DataPageV2 63 81 21 16.5 60.5 37.5X -SQL Parquet MR: DataPageV1 173 198 27 6.1 164.6 13.8X -SQL Parquet MR: DataPageV2 161 193 30 6.5 153.5 14.8X -SQL ORC Vectorized 53 71 18 19.9 50.2 45.1X -SQL ORC MR 149 182 34 7.0 142.3 15.9X - -OpenJDK 
64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 2157 2171 20 0.5 2056.9 1.0X +SQL Json 2929 2931 3 0.4 2793.5 0.7X +SQL Parquet Vectorized: DataPageV1 46 63 17 22.6 44.2 46.6X +SQL Parquet Vectorized: DataPageV2 68 82 17 15.4 64.8 31.7X +SQL Parquet MR: DataPageV1 204 224 27 5.1 194.4 10.6X +SQL Parquet MR: DataPageV2 188 209 33 5.6 179.6 11.5X +SQL ORC Vectorized 57 73 20 18.4 54.3 37.9X +SQL ORC MR 172 191 20 6.1 163.8 12.6X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5149 5193 62 0.2 4910.9 1.0X -SQL Json 10556 10891 475 0.1 10066.5 0.5X -SQL Parquet Vectorized: DataPageV1 64 96 28 16.3 61.3 80.1X -SQL Parquet Vectorized: DataPageV2 83 106 22 12.6 79.1 62.0X -SQL Parquet MR: DataPageV1 196 232 25 5.3 187.4 26.2X -SQL Parquet MR: DataPageV2 184 221 28 5.7 175.1 28.0X -SQL ORC Vectorized 74 98 31 14.1 70.8 69.3X -SQL ORC MR 182 214 38 5.8 173.9 28.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 5271 5277 8 0.2 5027.2 1.0X +SQL Json 11690 11705 21 0.1 11148.8 0.5X +SQL Parquet Vectorized: DataPageV1 62 89 22 17.0 58.7 85.6X +SQL Parquet Vectorized: DataPageV2 83 104 28 12.7 79.0 63.6X +SQL Parquet MR: DataPageV1 222 239 26 4.7 211.3 23.8X +SQL Parquet MR: DataPageV2 207 244 49 5.1 197.1 25.5X +SQL ORC Vectorized 70 91 24 15.1 66.4 75.7X +SQL ORC MR 187 200 21 5.6 178.1 28.2X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-SQL CSV 9077 9107 43 0.1 8656.2 1.0X -SQL Json 20131 20886 1067 0.1 19198.5 0.5X -SQL Parquet Vectorized: DataPageV1 93 124 26 11.3 88.8 97.5X -SQL Parquet Vectorized: DataPageV2 103 128 29 10.2 98.5 87.9X -SQL Parquet MR: DataPageV1 218 257 35 4.8 207.6 41.7X -SQL Parquet MR: DataPageV2 213 255 29 4.9 202.7 42.7X -SQL ORC Vectorized 80 95 20 13.0 76.6 112.9X -SQL ORC MR 187 207 20 5.6 178.0 48.6X +SQL CSV 9264 9267 4 0.1 8835.3 1.0X +SQL Json 21871 22402 751 0.0 20857.9 0.4X +SQL Parquet Vectorized: DataPageV1 90 127 29 11.7 85.8 102.9X +SQL Parquet Vectorized: DataPageV2 112 134 20 9.3 107.1 82.5X +SQL Parquet MR: DataPageV1 255 281 26 4.1 243.4 36.3X +SQL Parquet MR: DataPageV2 241 273 32 4.3 230.3 38.4X +SQL ORC Vectorized 85 99 21 12.3 81.3 108.7X +SQL ORC MR 205 220 21 5.1 195.7 45.1X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt index ecba57c0c3cc3..976ab2f166b23 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15972 16369 561 1.0 1015.5 1.0X -SQL Json 9543 9580 54 1.6 606.7 1.7X -SQL Parquet Vectorized: DataPageV1 115 144 19 136.3 7.3 138.4X -SQL Parquet Vectorized: DataPageV2 95 109 15 165.1 6.1 167.6X -SQL Parquet MR: DataPageV1 2098 2119 30 7.5 133.4 7.6X -SQL Parquet MR: 
DataPageV2 2007 2012 6 7.8 127.6 8.0X -SQL ORC Vectorized 211 225 16 74.5 13.4 75.7X -SQL ORC MR 2077 2103 36 7.6 132.1 7.7X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 9960 10062 144 1.6 633.2 1.0X +SQL Json 7971 8037 92 2.0 506.8 1.2X +SQL Parquet Vectorized: DataPageV1 116 138 14 136.0 7.4 86.1X +SQL Parquet Vectorized: DataPageV2 87 104 14 180.1 5.6 114.1X +SQL Parquet MR: DataPageV1 1708 1712 5 9.2 108.6 5.8X +SQL Parquet MR: DataPageV2 1554 1555 1 10.1 98.8 6.4X +SQL ORC Vectorized 174 182 7 90.3 11.1 57.2X +SQL ORC MR 1510 1512 2 10.4 96.0 6.6X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 43 47 2 369.4 2.7 1.0X -ParquetReader Vectorized: DataPageV2 30 34 2 518.5 1.9 1.4X -ParquetReader Vectorized -> Row: DataPageV1 47 50 2 333.6 3.0 0.9X -ParquetReader Vectorized -> Row: DataPageV2 31 35 2 504.8 2.0 1.4X +ParquetReader Vectorized: DataPageV1 61 62 3 258.8 3.9 1.0X +ParquetReader Vectorized: DataPageV2 44 56 24 356.2 2.8 1.4X +ParquetReader Vectorized -> Row: DataPageV1 50 52 2 312.0 3.2 1.2X +ParquetReader Vectorized -> Row: DataPageV2 32 33 1 494.9 2.0 1.9X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17468 17543 105 0.9 1110.6 1.0X -SQL Json 
11059 11065 8 1.4 703.1 1.6X -SQL Parquet Vectorized: DataPageV1 128 142 15 123.1 8.1 136.7X -SQL Parquet Vectorized: DataPageV2 126 141 8 125.2 8.0 139.1X -SQL Parquet MR: DataPageV1 2305 2331 36 6.8 146.5 7.6X -SQL Parquet MR: DataPageV2 2075 2095 28 7.6 131.9 8.4X -SQL ORC Vectorized 172 191 16 91.5 10.9 101.6X -SQL ORC MR 1777 1796 26 8.8 113.0 9.8X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 11898 11909 15 1.3 756.5 1.0X +SQL Json 8991 9265 388 1.7 571.6 1.3X +SQL Parquet Vectorized: DataPageV1 116 125 9 135.2 7.4 102.3X +SQL Parquet Vectorized: DataPageV2 118 125 8 133.0 7.5 100.6X +SQL Parquet MR: DataPageV1 1965 1982 24 8.0 124.9 6.1X +SQL Parquet MR: DataPageV2 1830 1836 8 8.6 116.4 6.5X +SQL ORC Vectorized 167 175 8 94.3 10.6 71.3X +SQL ORC MR 1471 1472 1 10.7 93.5 8.1X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 72 77 5 219.4 4.6 1.0X -ParquetReader Vectorized: DataPageV2 72 77 3 217.9 4.6 1.0X -ParquetReader Vectorized -> Row: DataPageV1 76 83 6 206.6 4.8 0.9X -ParquetReader Vectorized -> Row: DataPageV2 75 80 3 210.3 4.8 1.0X +ParquetReader Vectorized: DataPageV1 118 120 2 133.0 7.5 1.0X +ParquetReader Vectorized: DataPageV2 118 120 4 133.2 7.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 71 73 3 220.1 4.5 1.7X +ParquetReader Vectorized -> Row: DataPageV2 72 74 2 217.7 4.6 1.6X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18330 18332 3 0.9 1165.4 1.0X -SQL Json 11383 11429 66 1.4 723.7 1.6X -SQL Parquet Vectorized: DataPageV1 179 197 13 88.0 11.4 102.5X -SQL Parquet Vectorized: DataPageV2 239 263 18 65.7 15.2 76.6X -SQL Parquet MR: DataPageV1 2552 2567 21 6.2 162.3 7.2X -SQL Parquet MR: DataPageV2 2389 2436 67 6.6 151.9 7.7X -SQL ORC Vectorized 246 263 14 64.0 15.6 74.6X -SQL ORC MR 1965 2002 52 8.0 124.9 9.3X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 12575 12596 29 1.3 799.5 1.0X +SQL Json 9500 9751 355 1.7 604.0 1.3X +SQL Parquet Vectorized: DataPageV1 152 162 11 103.8 9.6 83.0X +SQL Parquet Vectorized: DataPageV2 206 214 6 76.3 13.1 61.0X +SQL Parquet MR: DataPageV1 2150 2170 28 7.3 136.7 5.8X +SQL Parquet MR: DataPageV2 1972 1981 12 8.0 125.4 6.4X +SQL ORC Vectorized 208 217 7 75.5 13.2 60.4X +SQL ORC MR 1626 1638 18 9.7 103.4 7.7X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 253 263 11 62.2 16.1 1.0X -ParquetReader Vectorized: DataPageV2 306 317 7 51.4 19.4 0.8X -ParquetReader Vectorized -> Row: DataPageV1 246 250 4 64.0 15.6 1.0X -ParquetReader Vectorized -> Row: DataPageV2 316 321 4 49.8 20.1 0.8X +ParquetReader Vectorized: DataPageV1 223 226 2 70.6 14.2 1.0X +ParquetReader Vectorized: DataPageV2 281 388 190 56.1 17.8 0.8X +ParquetReader Vectorized -> Row: DataPageV1 207 210 2 75.9 13.2 1.1X +ParquetReader Vectorized -> Row: DataPageV2 262 269 9 60.0 16.7 0.8X 
-OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19573 19822 352 0.8 1244.4 1.0X -SQL Json 12141 12217 107 1.3 771.9 1.6X -SQL Parquet Vectorized: DataPageV1 192 222 28 81.8 12.2 101.8X -SQL Parquet Vectorized: DataPageV2 345 373 24 45.6 21.9 56.7X -SQL Parquet MR: DataPageV1 2736 2741 7 5.7 173.9 7.2X -SQL Parquet MR: DataPageV2 2467 2536 97 6.4 156.9 7.9X -SQL ORC Vectorized 332 356 20 47.4 21.1 59.0X -SQL ORC MR 2188 2193 7 7.2 139.1 8.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 13952 13964 18 1.1 887.0 1.0X +SQL Json 9848 9848 1 1.6 626.1 1.4X +SQL Parquet Vectorized: DataPageV1 146 153 5 107.4 9.3 95.3X +SQL Parquet Vectorized: DataPageV2 294 309 19 53.5 18.7 47.4X +SQL Parquet MR: DataPageV1 2129 2131 3 7.4 135.4 6.6X +SQL Parquet MR: DataPageV2 2033 2042 13 7.7 129.3 6.9X +SQL ORC Vectorized 239 244 4 65.8 15.2 58.3X +SQL ORC MR 1650 1654 6 9.5 104.9 8.5X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 291 295 4 54.1 18.5 1.0X -ParquetReader Vectorized: DataPageV2 493 518 39 31.9 31.3 0.6X -ParquetReader Vectorized -> Row: DataPageV1 300 306 8 52.5 19.1 1.0X -ParquetReader Vectorized -> Row: DataPageV2 471 483 11 33.4 30.0 0.6X +ParquetReader Vectorized: 
DataPageV1 236 240 2 66.7 15.0 1.0X +ParquetReader Vectorized: DataPageV2 404 423 31 38.9 25.7 0.6X +ParquetReader Vectorized -> Row: DataPageV1 232 237 7 67.7 14.8 1.0X +ParquetReader Vectorized -> Row: DataPageV2 375 381 5 41.9 23.9 0.6X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 24692 24718 37 0.6 1569.9 1.0X -SQL Json 14839 14875 50 1.1 943.5 1.7X -SQL Parquet Vectorized: DataPageV1 295 316 29 53.3 18.7 83.7X -SQL Parquet Vectorized: DataPageV2 477 505 24 32.9 30.4 51.7X -SQL Parquet MR: DataPageV1 2841 2981 197 5.5 180.6 8.7X -SQL Parquet MR: DataPageV2 2616 2632 23 6.0 166.3 9.4X -SQL ORC Vectorized 388 403 11 40.5 24.7 63.6X -SQL ORC MR 2274 2372 138 6.9 144.6 10.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 18407 18894 690 0.9 1170.3 1.0X +SQL Json 12745 12771 37 1.2 810.3 1.4X +SQL Parquet Vectorized: DataPageV1 246 255 10 64.0 15.6 74.8X +SQL Parquet Vectorized: DataPageV2 559 592 38 28.1 35.5 32.9X +SQL Parquet MR: DataPageV1 2379 2387 11 6.6 151.3 7.7X +SQL Parquet MR: DataPageV2 2188 2196 12 7.2 139.1 8.4X +SQL ORC Vectorized 368 406 51 42.7 23.4 50.0X +SQL ORC MR 1980 2003 32 7.9 125.9 9.3X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 
376 387 9 41.9 23.9 1.0X -ParquetReader Vectorized: DataPageV2 585 591 6 26.9 37.2 0.6X -ParquetReader Vectorized -> Row: DataPageV1 377 387 9 41.8 23.9 1.0X -ParquetReader Vectorized -> Row: DataPageV2 576 586 10 27.3 36.6 0.7X +ParquetReader Vectorized: DataPageV1 313 315 2 50.3 19.9 1.0X +ParquetReader Vectorized: DataPageV2 617 623 6 25.5 39.2 0.5X +ParquetReader Vectorized -> Row: DataPageV1 276 307 20 57.0 17.5 1.1X +ParquetReader Vectorized -> Row: DataPageV2 590 599 8 26.6 37.5 0.5X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20566 20651 119 0.8 1307.6 1.0X -SQL Json 14337 14409 101 1.1 911.5 1.4X -SQL Parquet Vectorized: DataPageV1 154 167 8 101.9 9.8 133.2X -SQL Parquet Vectorized: DataPageV2 157 178 14 99.9 10.0 130.6X -SQL Parquet MR: DataPageV1 2730 2730 1 5.8 173.5 7.5X -SQL Parquet MR: DataPageV2 2459 2491 45 6.4 156.3 8.4X -SQL ORC Vectorized 479 501 15 32.9 30.4 43.0X -SQL ORC MR 2293 2343 71 6.9 145.8 9.0X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 14449 14465 22 1.1 918.7 1.0X +SQL Json 11933 11944 15 1.3 758.7 1.2X +SQL Parquet Vectorized: DataPageV1 145 174 48 108.5 9.2 99.7X +SQL Parquet Vectorized: DataPageV2 178 186 13 88.5 11.3 81.3X +SQL Parquet MR: DataPageV1 2134 2158 34 7.4 135.7 6.8X +SQL Parquet MR: DataPageV2 2014 2026 17 7.8 128.0 7.2X +SQL ORC Vectorized 442 452 18 35.6 28.1 32.7X +SQL ORC MR 1941 1944 5 8.1 123.4 7.4X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single 
FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 272 283 9 57.9 17.3 1.0X -ParquetReader Vectorized: DataPageV2 250 288 27 62.9 15.9 1.1X -ParquetReader Vectorized -> Row: DataPageV1 291 301 6 54.1 18.5 0.9X -ParquetReader Vectorized -> Row: DataPageV2 293 305 14 53.6 18.6 0.9X +ParquetReader Vectorized: DataPageV1 239 244 5 65.9 15.2 1.0X +ParquetReader Vectorized: DataPageV2 236 301 117 66.6 15.0 1.0X +ParquetReader Vectorized -> Row: DataPageV1 217 237 14 72.6 13.8 1.1X +ParquetReader Vectorized -> Row: DataPageV2 236 238 2 66.6 15.0 1.0X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 25753 25874 171 0.6 1637.3 1.0X -SQL Json 19097 19391 416 0.8 1214.2 1.3X -SQL Parquet Vectorized: DataPageV1 273 288 11 57.6 17.4 94.3X -SQL Parquet Vectorized: DataPageV2 240 277 25 65.5 15.3 107.3X -SQL Parquet MR: DataPageV1 2969 3042 103 5.3 188.8 8.7X -SQL Parquet MR: DataPageV2 2692 2747 78 5.8 171.1 9.6X -SQL ORC Vectorized 601 626 20 26.2 38.2 42.8X -SQL ORC MR 2458 2467 13 6.4 156.3 10.5X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 19310 19577 378 0.8 1227.7 1.0X +SQL Json 16132 16146 20 1.0 1025.7 1.2X +SQL Parquet Vectorized: DataPageV1 246 262 18 64.0 15.6 78.5X +SQL Parquet Vectorized: DataPageV2 216 255 21 72.7 13.8 89.2X +SQL Parquet MR: DataPageV1 2379 3134 1068 6.6 151.3 8.1X +SQL Parquet MR: 
DataPageV2 6344 6376 46 2.5 403.3 3.0X +SQL ORC Vectorized 484 525 55 32.5 30.8 39.9X +SQL ORC MR 1998 1998 0 7.9 127.0 9.7X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 354 363 7 44.4 22.5 1.0X -ParquetReader Vectorized: DataPageV2 345 359 12 45.5 22.0 1.0X -ParquetReader Vectorized -> Row: DataPageV1 337 345 8 46.7 21.4 1.1X -ParquetReader Vectorized -> Row: DataPageV2 335 364 21 46.9 21.3 1.1X +ParquetReader Vectorized: DataPageV1 294 308 12 53.5 18.7 1.0X +ParquetReader Vectorized: DataPageV2 316 324 9 49.8 20.1 0.9X +ParquetReader Vectorized -> Row: DataPageV1 289 321 17 54.5 18.4 1.0X +ParquetReader Vectorized -> Row: DataPageV2 317 319 2 49.6 20.2 0.9X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18074 18101 37 0.6 1723.7 1.0X -SQL Json 13211 13214 5 0.8 1259.9 1.4X -SQL Parquet Vectorized: DataPageV1 2249 2286 53 4.7 214.5 8.0X -SQL Parquet Vectorized: DataPageV2 2804 2818 20 3.7 267.4 6.4X -SQL Parquet MR: DataPageV1 4708 4779 100 2.2 449.0 3.8X -SQL Parquet MR: DataPageV2 4868 5046 251 2.2 464.3 3.7X -SQL ORC 
Vectorized 2145 2160 20 4.9 204.6 8.4X -SQL ORC MR 4180 4308 182 2.5 398.6 4.3X +SQL CSV 13320 13335 21 0.8 1270.3 1.0X +SQL Json 11087 11093 8 0.9 1057.3 1.2X +SQL Parquet Vectorized: DataPageV1 2098 2115 24 5.0 200.1 6.3X +SQL Parquet Vectorized: DataPageV2 2843 2961 167 3.7 271.1 4.7X +SQL Parquet MR: DataPageV1 4169 4201 45 2.5 397.6 3.2X +SQL Parquet MR: DataPageV2 4397 4414 24 2.4 419.4 3.0X +SQL ORC Vectorized 2864 4244 1951 3.7 273.2 4.7X +SQL ORC MR 3544 3549 7 3.0 338.0 3.8X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11320 11376 78 0.9 1079.6 1.0X -SQL Json 7593 7664 101 1.4 724.1 1.5X -SQL Parquet Vectorized: DataPageV1 633 639 9 16.6 60.3 17.9X -SQL Parquet Vectorized: DataPageV2 621 644 20 16.9 59.2 18.2X -SQL Parquet MR: DataPageV1 2111 2157 65 5.0 201.3 5.4X -SQL Parquet MR: DataPageV2 2018 2064 65 5.2 192.4 5.6X -SQL ORC Vectorized 505 540 36 20.8 48.2 22.4X -SQL ORC MR 2302 2360 82 4.6 219.5 4.9X +SQL CSV 7305 7308 4 1.4 696.7 1.0X +SQL Json 6621 6622 1 1.6 631.4 1.1X +SQL Parquet Vectorized: DataPageV1 608 612 4 17.2 58.0 12.0X +SQL Parquet Vectorized: DataPageV2 611 618 7 17.2 58.2 12.0X +SQL Parquet MR: DataPageV1 1706 1710 5 6.1 162.7 4.3X +SQL Parquet MR: DataPageV2 1640 1653 19 6.4 156.4 4.5X +SQL ORC Vectorized 501 504 3 20.9 47.7 14.6X +SQL ORC MR 1909 1926 24 5.5 182.1 3.8X 
================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 24867 25261 556 0.6 1581.0 1.0X -Data column - Json 13937 13987 70 1.1 886.1 1.8X -Data column - Parquet Vectorized: DataPageV1 252 264 8 62.3 16.0 98.5X -Data column - Parquet Vectorized: DataPageV2 547 560 13 28.8 34.7 45.5X -Data column - Parquet MR: DataPageV1 3492 3509 25 4.5 222.0 7.1X -Data column - Parquet MR: DataPageV2 3148 3208 84 5.0 200.2 7.9X -Data column - ORC Vectorized 493 512 21 31.9 31.3 50.5X -Data column - ORC MR 2925 2943 26 5.4 185.9 8.5X -Partition column - CSV 7847 7851 5 2.0 498.9 3.2X -Partition column - Json 11759 11908 210 1.3 747.6 2.1X -Partition column - Parquet Vectorized: DataPageV1 60 67 7 262.3 3.8 414.7X -Partition column - Parquet Vectorized: DataPageV2 57 65 9 274.2 3.6 433.5X -Partition column - Parquet MR: DataPageV1 1762 1768 8 8.9 112.1 14.1X -Partition column - Parquet MR: DataPageV2 1742 1783 59 9.0 110.7 14.3X -Partition column - ORC Vectorized 59 71 7 265.6 3.8 419.9X -Partition column - ORC MR 1743 1764 29 9.0 110.8 14.3X -Both columns - CSV 25859 25924 92 0.6 1644.1 1.0X -Both columns - Json 14693 14764 101 1.1 934.2 1.7X -Both columns - Parquet Vectorized: DataPageV1 341 395 66 46.2 21.7 73.0X -Both columns - Parquet Vectorized: DataPageV2 624 643 13 25.2 39.7 39.9X -Both columns - Parquet MR: DataPageV1 3541 3611 99 4.4 225.2 7.0X -Both columns - 
Parquet MR: DataPageV2 3279 3301 32 4.8 208.4 7.6X -Both columns - ORC Vectorized 434 483 40 36.2 27.6 57.3X -Both columns - ORC MR 2946 2964 26 5.3 187.3 8.4X +Data column - CSV 18479 18682 288 0.9 1174.8 1.0X +Data column - Json 12533 12541 11 1.3 796.8 1.5X +Data column - Parquet Vectorized: DataPageV1 272 303 35 57.8 17.3 67.9X +Data column - Parquet Vectorized: DataPageV2 629 653 18 25.0 40.0 29.4X +Data column - Parquet MR: DataPageV1 2777 2782 7 5.7 176.5 6.7X +Data column - Parquet MR: DataPageV2 2581 2603 31 6.1 164.1 7.2X +Data column - ORC Vectorized 418 440 27 37.6 26.6 44.2X +Data column - ORC MR 2297 2332 50 6.8 146.0 8.0X +Partition column - CSV 6496 6569 104 2.4 413.0 2.8X +Partition column - Json 10072 10077 7 1.6 640.3 1.8X +Partition column - Parquet Vectorized: DataPageV1 56 63 5 282.4 3.5 331.7X +Partition column - Parquet Vectorized: DataPageV2 54 58 6 290.0 3.4 340.6X +Partition column - Parquet MR: DataPageV1 1356 1360 6 11.6 86.2 13.6X +Partition column - Parquet MR: DataPageV2 1347 1348 2 11.7 85.6 13.7X +Partition column - ORC Vectorized 55 60 6 284.3 3.5 334.0X +Partition column - ORC MR 1458 1468 15 10.8 92.7 12.7X +Both columns - CSV 19228 20030 1133 0.8 1222.5 1.0X +Both columns - Json 13465 13516 71 1.2 856.1 1.4X +Both columns - Parquet Vectorized: DataPageV1 267 269 2 59.0 17.0 69.3X +Both columns - Parquet Vectorized: DataPageV2 659 691 29 23.9 41.9 28.1X +Both columns - Parquet MR: DataPageV1 2819 2850 44 5.6 179.2 6.6X +Both columns - Parquet MR: DataPageV2 2584 2585 1 6.1 164.3 7.1X +Both columns - ORC Vectorized 389 406 23 40.4 24.7 47.5X +Both columns - ORC MR 2490 2504 19 6.3 158.3 7.4X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit 
Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13698 13783 121 0.8 1306.3 1.0X -SQL Json 11030 11144 161 1.0 1051.9 1.2X -SQL Parquet Vectorized: DataPageV1 1695 1699 7 6.2 161.6 8.1X -SQL Parquet Vectorized: DataPageV2 2740 2744 5 3.8 261.3 5.0X -SQL Parquet MR: DataPageV1 4547 4594 66 2.3 433.7 3.0X -SQL Parquet MR: DataPageV2 5382 5455 103 1.9 513.3 2.5X -ParquetReader Vectorized: DataPageV1 1238 1238 0 8.5 118.0 11.1X -ParquetReader Vectorized: DataPageV2 2312 2325 19 4.5 220.5 5.9X -SQL ORC Vectorized 1134 1147 18 9.2 108.2 12.1X -SQL ORC MR 3966 4015 69 2.6 378.2 3.5X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 8769 8915 207 1.2 836.2 1.0X +SQL Json 9987 9998 16 1.0 952.4 0.9X +SQL Parquet Vectorized: DataPageV1 1362 1365 4 7.7 129.9 6.4X +SQL Parquet Vectorized: DataPageV2 2118 2124 8 5.0 202.0 4.1X +SQL Parquet MR: DataPageV1 3631 4788 1637 2.9 346.3 2.4X +SQL Parquet MR: DataPageV2 4213 4227 19 2.5 401.8 2.1X +ParquetReader Vectorized: DataPageV1 1003 1012 13 10.5 95.6 8.7X +ParquetReader Vectorized: DataPageV2 1789 1797 12 5.9 170.6 4.9X +SQL ORC Vectorized 980 1029 70 10.7 93.5 8.9X +SQL ORC MR 3057 3060 4 3.4 291.5 2.9X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10613 10658 64 1.0 1012.1 1.0X -SQL Json 8973 8996 33 1.2 855.7 1.2X -SQL Parquet Vectorized: DataPageV1 1208 1221 18 8.7 115.2 8.8X -SQL Parquet 
Vectorized: DataPageV2 1949 1950 1 5.4 185.9 5.4X -SQL Parquet MR: DataPageV1 3701 3716 21 2.8 353.0 2.9X -SQL Parquet MR: DataPageV2 4150 4192 60 2.5 395.8 2.6X -ParquetReader Vectorized: DataPageV1 1191 1192 1 8.8 113.6 8.9X -ParquetReader Vectorized: DataPageV2 1874 1917 61 5.6 178.7 5.7X -SQL ORC Vectorized 1338 1365 38 7.8 127.6 7.9X -SQL ORC MR 3659 3674 21 2.9 349.0 2.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 6508 6509 1 1.6 620.7 1.0X +SQL Json 7775 7790 22 1.3 741.5 0.8X +SQL Parquet Vectorized: DataPageV1 988 992 4 10.6 94.2 6.6X +SQL Parquet Vectorized: DataPageV2 1605 1612 10 6.5 153.1 4.1X +SQL Parquet MR: DataPageV1 2837 2840 4 3.7 270.6 2.3X +SQL Parquet MR: DataPageV2 3077 3082 7 3.4 293.5 2.1X +ParquetReader Vectorized: DataPageV1 930 951 30 11.3 88.6 7.0X +ParquetReader Vectorized: DataPageV2 1511 1516 6 6.9 144.1 4.3X +SQL ORC Vectorized 1209 1227 25 8.7 115.3 5.4X +SQL ORC MR 2889 2895 9 3.6 275.5 2.3X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8714 8809 134 1.2 831.0 1.0X -SQL Json 5801 5819 25 1.8 553.2 1.5X -SQL Parquet Vectorized: DataPageV1 297 316 11 35.3 28.3 29.3X -SQL Parquet Vectorized: DataPageV2 363 382 12 28.9 34.6 24.0X -SQL Parquet MR: DataPageV1 2350 2366 22 4.5 224.1 3.7X -SQL Parquet MR: DataPageV2 2132 2183 73 4.9 203.3 4.1X -ParquetReader Vectorized: DataPageV1 296 310 13 35.4 28.2 29.4X -ParquetReader Vectorized: DataPageV2 368 372 3 28.5 35.1 23.7X -SQL ORC Vectorized 474 487 10 22.1 45.2 18.4X -SQL ORC MR 2025 2031 9 5.2 193.1 4.3X +SQL CSV 4587 4604 24 2.3 437.5 1.0X +SQL Json 4677 4679 3 2.2 446.0 1.0X +SQL Parquet Vectorized: DataPageV1 235 
239 5 44.7 22.4 19.5X +SQL Parquet Vectorized: DataPageV2 620 622 2 16.9 59.1 7.4X +SQL Parquet MR: DataPageV1 1804 1812 11 5.8 172.1 2.5X +SQL Parquet MR: DataPageV2 1605 1659 76 6.5 153.1 2.9X +ParquetReader Vectorized: DataPageV1 235 237 2 44.7 22.4 19.6X +ParquetReader Vectorized: DataPageV2 613 617 3 17.1 58.5 7.5X +SQL ORC Vectorized 387 391 2 27.1 36.9 11.8X +SQL ORC MR 1632 1635 4 6.4 155.7 2.8X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2677 2687 14 0.4 2553.2 1.0X -SQL Json 3581 3588 10 0.3 3414.8 0.7X -SQL Parquet Vectorized: DataPageV1 52 59 7 20.2 49.6 51.5X -SQL Parquet Vectorized: DataPageV2 68 75 7 15.4 65.0 39.3X -SQL Parquet MR: DataPageV1 245 257 9 4.3 233.6 10.9X -SQL Parquet MR: DataPageV2 224 237 8 4.7 213.7 11.9X -SQL ORC Vectorized 64 70 5 16.3 61.3 41.7X -SQL ORC MR 208 216 8 5.0 198.2 12.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 2056 2068 17 0.5 1960.9 1.0X +SQL Json 3193 3242 70 0.3 3045.3 0.6X +SQL Parquet Vectorized: DataPageV1 45 50 7 23.1 43.3 45.3X +SQL Parquet Vectorized: DataPageV2 66 72 5 15.8 63.4 30.9X +SQL Parquet MR: DataPageV1 192 197 6 5.4 183.5 10.7X +SQL Parquet MR: DataPageV2 180 186 7 5.8 171.3 11.4X +SQL ORC Vectorized 55 60 6 19.2 52.1 37.6X +SQL ORC MR 164 169 7 6.4 156.2 12.6X + +OpenJDK 64-Bit Server 
VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5753 5771 25 0.2 5486.7 1.0X -SQL Json 13801 13851 71 0.1 13161.9 0.4X -SQL Parquet Vectorized: DataPageV1 75 83 9 14.1 71.1 77.2X -SQL Parquet Vectorized: DataPageV2 84 93 7 12.4 80.6 68.1X -SQL Parquet MR: DataPageV1 269 280 7 3.9 256.5 21.4X -SQL Parquet MR: DataPageV2 251 258 8 4.2 238.9 23.0X -SQL ORC Vectorized 82 88 6 12.8 78.3 70.1X -SQL ORC MR 223 239 8 4.7 213.0 25.8X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 4292 4319 38 0.2 4093.3 1.0X +SQL Json 12613 12769 221 0.1 12028.9 0.3X +SQL Parquet Vectorized: DataPageV1 59 78 23 17.6 56.7 72.2X +SQL Parquet Vectorized: DataPageV2 82 87 6 12.8 78.1 52.4X +SQL Parquet MR: DataPageV1 214 219 6 4.9 204.0 20.1X +SQL Parquet MR: DataPageV2 197 204 6 5.3 188.3 21.7X +SQL ORC Vectorized 70 77 8 14.9 67.0 61.1X +SQL ORC MR 186 200 12 5.6 177.3 23.1X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9487 9503 24 0.1 9047.1 1.0X -SQL Json 26109 26240 186 0.0 24899.2 0.4X -SQL Parquet Vectorized: DataPageV1 100 110 10 10.4 95.8 94.5X -SQL Parquet Vectorized: DataPageV2 113 119 6 9.3 107.3 84.3X -SQL Parquet MR: DataPageV1 280 296 11 3.7 267.2 33.9X -SQL Parquet MR: DataPageV2 281 321 68 3.7 268.0 33.8X -SQL ORC Vectorized 92 101 8 11.4 87.5 103.4X -SQL ORC MR 228 245 10 4.6 217.7 41.6X +SQL CSV 7312 7322 14 0.1 6973.3 1.0X +SQL 
Json 24046 24366 452 0.0 22932.0 0.3X +SQL Parquet Vectorized: DataPageV1 86 94 7 12.1 82.4 84.6X +SQL Parquet Vectorized: DataPageV2 109 113 7 9.7 103.5 67.4X +SQL Parquet MR: DataPageV1 245 257 12 4.3 233.6 29.9X +SQL Parquet MR: DataPageV2 228 234 8 4.6 217.1 32.1X +SQL ORC Vectorized 86 91 6 12.2 82.1 84.9X +SQL ORC MR 199 209 12 5.3 190.2 36.7X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index 6a2b6bfb4a0a8..492fae832aa58 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11570 12144 812 1.4 735.6 1.0X -SQL Json 7542 7568 37 2.1 479.5 1.5X -SQL Parquet Vectorized: DataPageV1 129 144 16 121.9 8.2 89.7X -SQL Parquet Vectorized: DataPageV2 92 106 20 170.3 5.9 125.2X -SQL Parquet MR: DataPageV1 1416 1419 3 11.1 90.0 8.2X -SQL Parquet MR: DataPageV2 1281 1359 110 12.3 81.4 9.0X -SQL ORC Vectorized 161 176 10 97.4 10.3 71.6X -SQL ORC MR 1525 1545 29 10.3 96.9 7.6X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 11370 11607 334 1.4 722.9 1.0X +SQL Json 7646 7670 34 2.1 486.1 1.5X +SQL Parquet Vectorized: DataPageV1 90 102 10 174.8 5.7 126.4X +SQL Parquet Vectorized: DataPageV2 71 79 10 222.5 4.5 160.9X +SQL Parquet MR: DataPageV1 1514 
1517 4 10.4 96.3 7.5X +SQL Parquet MR: DataPageV2 1369 1372 5 11.5 87.0 8.3X +SQL ORC Vectorized 152 163 12 103.3 9.7 74.7X +SQL ORC MR 1382 1402 29 11.4 87.8 8.2X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 111 118 6 142.3 7.0 1.0X -ParquetReader Vectorized: DataPageV2 116 117 2 135.7 7.4 1.0X -ParquetReader Vectorized -> Row: DataPageV1 48 49 1 324.9 3.1 2.3X -ParquetReader Vectorized -> Row: DataPageV2 39 39 1 405.8 2.5 2.9X +ParquetReader Vectorized: DataPageV1 91 93 3 172.7 5.8 1.0X +ParquetReader Vectorized: DataPageV2 84 85 3 188.1 5.3 1.1X +ParquetReader Vectorized -> Row: DataPageV1 38 39 1 408.6 2.4 2.4X +ParquetReader Vectorized -> Row: DataPageV2 31 31 1 509.7 2.0 3.0X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13807 14535 1030 1.1 877.8 1.0X -SQL Json 8079 8094 21 1.9 513.6 1.7X -SQL Parquet Vectorized: DataPageV1 139 152 12 113.0 8.9 99.2X -SQL Parquet Vectorized: DataPageV2 140 147 5 112.5 8.9 98.7X -SQL Parquet MR: DataPageV1 1637 1741 148 9.6 104.1 8.4X -SQL Parquet MR: DataPageV2 1522 1636 161 10.3 96.8 9.1X -SQL ORC Vectorized 147 160 10 106.9 9.4 93.8X -SQL ORC MR 1542 1545 4 10.2 98.1 9.0X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 
2.60GHz +SQL CSV 13939 13970 44 1.1 886.2 1.0X +SQL Json 8771 8792 29 1.8 557.7 1.6X +SQL Parquet Vectorized: DataPageV1 112 116 9 140.7 7.1 124.7X +SQL Parquet Vectorized: DataPageV2 110 115 7 143.1 7.0 126.8X +SQL Parquet MR: DataPageV1 1694 1698 7 9.3 107.7 8.2X +SQL Parquet MR: DataPageV2 1556 1565 12 10.1 98.9 9.0X +SQL ORC Vectorized 122 125 3 128.6 7.8 114.0X +SQL ORC MR 1353 1353 0 11.6 86.0 10.3X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 166 171 8 94.7 10.6 1.0X -ParquetReader Vectorized: DataPageV2 166 169 4 94.7 10.6 1.0X -ParquetReader Vectorized -> Row: DataPageV1 156 157 2 100.7 9.9 1.1X -ParquetReader Vectorized -> Row: DataPageV2 156 157 2 100.7 9.9 1.1X +ParquetReader Vectorized: DataPageV1 169 171 2 93.1 10.7 1.0X +ParquetReader Vectorized: DataPageV2 168 170 3 93.4 10.7 1.0X +ParquetReader Vectorized -> Row: DataPageV1 137 139 2 114.5 8.7 1.2X +ParquetReader Vectorized -> Row: DataPageV2 138 140 2 114.1 8.8 1.2X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15327 15421 133 1.0 974.5 1.0X -SQL Json 8564 8799 332 1.8 544.5 1.8X -SQL Parquet Vectorized: DataPageV1 202 219 11 77.8 12.8 75.8X -SQL Parquet Vectorized: DataPageV2 203 210 8 77.7 12.9 75.7X -SQL Parquet MR: DataPageV1 1874 2004 183 8.4 119.2 
8.2X -SQL Parquet MR: DataPageV2 1606 1709 146 9.8 102.1 9.5X -SQL ORC Vectorized 167 179 10 94.1 10.6 91.7X -SQL ORC MR 1404 1408 6 11.2 89.3 10.9X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 14823 14843 28 1.1 942.4 1.0X +SQL Json 9345 9346 2 1.7 594.1 1.6X +SQL Parquet Vectorized: DataPageV1 172 189 48 91.4 10.9 86.2X +SQL Parquet Vectorized: DataPageV2 175 179 6 89.9 11.1 84.7X +SQL Parquet MR: DataPageV1 1962 1971 13 8.0 124.7 7.6X +SQL Parquet MR: DataPageV2 1680 1684 6 9.4 106.8 8.8X +SQL ORC Vectorized 158 161 4 99.4 10.1 93.7X +SQL ORC MR 1449 1456 10 10.9 92.1 10.2X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 222 236 13 70.7 14.1 1.0X -ParquetReader Vectorized: DataPageV2 259 268 14 60.8 16.5 0.9X -ParquetReader Vectorized -> Row: DataPageV1 228 248 11 68.9 14.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 264 293 13 59.5 16.8 0.8X +ParquetReader Vectorized: DataPageV1 212 217 4 74.1 13.5 1.0X +ParquetReader Vectorized: DataPageV2 245 249 3 64.1 15.6 0.9X +ParquetReader Vectorized -> Row: DataPageV1 214 215 2 73.7 13.6 1.0X +ParquetReader Vectorized -> Row: DataPageV2 245 248 2 64.1 15.6 0.9X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-SQL CSV 17479 17651 243 0.9 1111.3 1.0X -SQL Json 9565 9582 25 1.6 608.1 1.8X -SQL Parquet Vectorized: DataPageV1 152 159 8 103.2 9.7 114.7X -SQL Parquet Vectorized: DataPageV2 290 308 18 54.2 18.4 60.3X -SQL Parquet MR: DataPageV1 1861 1980 169 8.5 118.3 9.4X -SQL Parquet MR: DataPageV2 1647 1748 142 9.5 104.7 10.6X -SQL ORC Vectorized 230 251 12 68.3 14.6 75.9X -SQL ORC MR 1645 1648 3 9.6 104.6 10.6X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 16272 16326 76 1.0 1034.5 1.0X +SQL Json 9900 9906 8 1.6 629.4 1.6X +SQL Parquet Vectorized: DataPageV1 134 138 6 117.0 8.5 121.1X +SQL Parquet Vectorized: DataPageV2 249 254 6 63.1 15.9 65.2X +SQL Parquet MR: DataPageV1 2040 2042 2 7.7 129.7 8.0X +SQL Parquet MR: DataPageV2 1777 1786 13 8.9 113.0 9.2X +SQL ORC Vectorized 216 219 3 72.9 13.7 75.4X +SQL ORC MR 1550 1553 4 10.1 98.6 10.5X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 208 213 9 75.7 13.2 1.0X -ParquetReader Vectorized: DataPageV2 355 382 14 44.3 22.6 0.6X -ParquetReader Vectorized -> Row: DataPageV1 212 233 8 74.1 13.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 350 353 7 45.0 22.2 0.6X +ParquetReader Vectorized: DataPageV1 191 209 42 82.1 12.2 1.0X +ParquetReader Vectorized: DataPageV2 321 323 4 49.1 20.4 0.6X +ParquetReader Vectorized -> Row: DataPageV1 193 195 4 81.5 12.3 1.0X +ParquetReader Vectorized -> Row: DataPageV2 321 326 8 49.1 20.4 0.6X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 21825 21944 169 0.7 1387.6 1.0X -SQL Json 11877 11927 71 1.3 755.1 1.8X -SQL Parquet Vectorized: DataPageV1 229 242 18 68.8 14.5 95.5X -SQL Parquet Vectorized: DataPageV2 435 452 23 36.1 27.7 50.1X -SQL Parquet MR: DataPageV1 2050 2184 190 7.7 130.3 10.6X -SQL Parquet MR: DataPageV2 1829 1927 138 8.6 116.3 11.9X -SQL ORC Vectorized 287 308 14 54.8 18.3 76.0X -SQL ORC MR 1579 1603 34 10.0 100.4 13.8X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 22520 22525 7 0.7 1431.8 1.0X +SQL Json 12602 12602 1 1.2 801.2 1.8X +SQL Parquet Vectorized: DataPageV1 187 208 55 84.2 11.9 120.6X +SQL Parquet Vectorized: DataPageV2 396 404 9 39.7 25.2 56.8X +SQL Parquet MR: DataPageV1 2218 2232 20 7.1 141.0 10.2X +SQL Parquet MR: DataPageV2 1796 1802 9 8.8 114.2 12.5X +SQL ORC Vectorized 276 279 7 57.1 17.5 81.7X +SQL ORC MR 1621 1630 12 9.7 103.0 13.9X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 299 341 86 52.6 19.0 1.0X -ParquetReader Vectorized: DataPageV2 551 607 110 28.5 35.1 0.5X -ParquetReader Vectorized -> Row: DataPageV1 341 344 4 46.2 21.7 0.9X -ParquetReader Vectorized -> Row: DataPageV2 508 557 33 31.0 32.3 0.6X +ParquetReader Vectorized: DataPageV1 249 253 6 63.1 15.9 1.0X +ParquetReader Vectorized: DataPageV2 469 474 6 33.5 29.8 0.5X +ParquetReader Vectorized -> Row: DataPageV1 250 256 9 62.8 
15.9 1.0X +ParquetReader Vectorized -> Row: DataPageV2 470 474 9 33.5 29.9 0.5X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17585 17926 482 0.9 1118.0 1.0X -SQL Json 11927 12180 357 1.3 758.3 1.5X -SQL Parquet Vectorized: DataPageV1 150 161 11 104.6 9.6 116.9X -SQL Parquet Vectorized: DataPageV2 150 160 8 104.7 9.5 117.1X -SQL Parquet MR: DataPageV1 1830 1867 52 8.6 116.4 9.6X -SQL Parquet MR: DataPageV2 1715 1828 160 9.2 109.1 10.3X -SQL ORC Vectorized 328 358 15 48.0 20.8 53.6X -SQL ORC MR 1584 1687 145 9.9 100.7 11.1X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 17796 17814 25 0.9 1131.5 1.0X +SQL Json 12118 12124 8 1.3 770.5 1.5X +SQL Parquet Vectorized: DataPageV1 131 137 8 119.7 8.4 135.5X +SQL Parquet Vectorized: DataPageV2 131 135 7 119.6 8.4 135.4X +SQL Parquet MR: DataPageV1 1978 1982 7 8.0 125.8 9.0X +SQL Parquet MR: DataPageV2 1830 1844 21 8.6 116.3 9.7X +SQL ORC Vectorized 318 326 10 49.5 20.2 56.0X +SQL ORC MR 1617 1621 5 9.7 102.8 11.0X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 207 211 8 76.0 13.2 1.0X -ParquetReader Vectorized: DataPageV2 207 220 11 75.8 13.2 1.0X -ParquetReader Vectorized -> Row: DataPageV1 208 214 9 75.7 
13.2 1.0X -ParquetReader Vectorized -> Row: DataPageV2 208 213 9 75.6 13.2 1.0X +ParquetReader Vectorized: DataPageV1 212 215 3 74.1 13.5 1.0X +ParquetReader Vectorized: DataPageV2 213 215 3 73.7 13.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 221 223 2 71.0 14.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 222 224 3 70.9 14.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 22569 22614 63 0.7 1434.9 1.0X -SQL Json 15590 15600 15 1.0 991.2 1.4X -SQL Parquet Vectorized: DataPageV1 225 241 17 69.9 14.3 100.3X -SQL Parquet Vectorized: DataPageV2 219 236 13 72.0 13.9 103.3X -SQL Parquet MR: DataPageV1 2013 2109 136 7.8 128.0 11.2X -SQL Parquet MR: DataPageV2 1850 1967 165 8.5 117.6 12.2X -SQL ORC Vectorized 396 416 25 39.7 25.2 56.9X -SQL ORC MR 1707 1763 79 9.2 108.5 13.2X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 23606 23611 7 0.7 1500.8 1.0X +SQL Json 16860 16868 11 0.9 1072.0 1.4X +SQL Parquet Vectorized: DataPageV1 186 192 8 84.4 11.9 126.6X +SQL Parquet Vectorized: DataPageV2 186 193 8 84.4 11.9 126.6X +SQL Parquet MR: DataPageV1 2246 2264 26 7.0 142.8 10.5X +SQL Parquet MR: DataPageV2 2091 2098 10 7.5 132.9 11.3X +SQL ORC Vectorized 407 416 11 38.7 25.8 58.1X +SQL ORC MR 1740 1740 1 9.0 110.6 13.6X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 280 298 13 56.2 17.8 1.0X -ParquetReader Vectorized: DataPageV2 278 300 21 56.6 17.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 280 299 13 56.2 17.8 1.0X -ParquetReader Vectorized -> Row: DataPageV2 304 307 4 51.8 19.3 0.9X +ParquetReader Vectorized: DataPageV1 249 256 7 63.1 15.8 1.0X +ParquetReader Vectorized: DataPageV2 248 252 4 63.4 15.8 1.0X +ParquetReader Vectorized -> Row: DataPageV1 247 251 6 63.7 15.7 1.0X +ParquetReader Vectorized -> Row: DataPageV2 248 251 3 63.3 15.8 1.0X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15548 16002 641 0.7 1482.8 1.0X -SQL Json 10801 11108 434 1.0 1030.1 1.4X -SQL Parquet Vectorized: DataPageV1 1858 1966 152 5.6 177.2 8.4X -SQL Parquet Vectorized: DataPageV2 2342 2466 175 4.5 223.4 6.6X -SQL Parquet MR: DataPageV1 3873 3908 49 2.7 369.4 4.0X -SQL Parquet MR: DataPageV2 3764 3869 148 2.8 358.9 4.1X -SQL ORC Vectorized 2018 2020 3 5.2 192.5 7.7X -SQL ORC MR 3247 3450 287 3.2 309.7 4.8X +SQL CSV 15769 15778 12 0.7 1503.8 1.0X +SQL Json 11597 11610 18 0.9 1105.9 1.4X +SQL Parquet Vectorized: DataPageV1 2119 2121 3 4.9 202.1 7.4X +SQL Parquet Vectorized: DataPageV2 3026 3027 1 3.5 288.6 5.2X +SQL Parquet MR: DataPageV1 3980 3993 18 2.6 379.6 4.0X +SQL 
Parquet MR: DataPageV2 3899 3901 3 2.7 371.8 4.0X +SQL ORC Vectorized 2075 2084 13 5.1 197.9 7.6X +SQL ORC MR 3736 3739 5 2.8 356.3 4.2X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8028 8337 436 1.3 765.6 1.0X -SQL Json 6362 6488 178 1.6 606.7 1.3X -SQL Parquet Vectorized: DataPageV1 642 673 51 16.3 61.3 12.5X -SQL Parquet Vectorized: DataPageV2 646 678 40 16.2 61.6 12.4X -SQL Parquet MR: DataPageV1 1504 1604 141 7.0 143.5 5.3X -SQL Parquet MR: DataPageV2 1645 1646 1 6.4 156.9 4.9X -SQL ORC Vectorized 386 415 25 27.2 36.8 20.8X -SQL ORC MR 1704 1730 37 6.2 162.5 4.7X +SQL CSV 8221 8223 4 1.3 784.0 1.0X +SQL Json 6938 6941 4 1.5 661.7 1.2X +SQL Parquet Vectorized: DataPageV1 734 747 15 14.3 70.0 11.2X +SQL Parquet Vectorized: DataPageV2 734 736 2 14.3 70.0 11.2X +SQL Parquet MR: DataPageV1 1469 1469 0 7.1 140.1 5.6X +SQL Parquet MR: DataPageV2 1431 1432 1 7.3 136.5 5.7X +SQL ORC Vectorized 446 447 1 23.5 42.5 18.4X +SQL ORC MR 1807 1814 9 5.8 172.3 4.5X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) 
Platinum 8370C CPU @ 2.80GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 21472 21514 59 0.7 1365.2 1.0X -Data column - Json 11537 11606 97 1.4 733.5 1.9X -Data column - Parquet Vectorized: DataPageV1 238 256 11 66.1 15.1 90.2X -Data column - Parquet Vectorized: DataPageV2 482 507 17 32.6 30.6 44.6X -Data column - Parquet MR: DataPageV1 2213 2355 200 7.1 140.7 9.7X -Data column - Parquet MR: DataPageV2 2036 2163 179 7.7 129.4 10.5X -Data column - ORC Vectorized 289 310 20 54.4 18.4 74.3X -Data column - ORC MR 1898 1936 54 8.3 120.7 11.3X -Partition column - CSV 6307 6364 80 2.5 401.0 3.4X -Partition column - Json 9167 9253 121 1.7 582.8 2.3X -Partition column - Parquet Vectorized: DataPageV1 62 66 3 253.5 3.9 346.1X -Partition column - Parquet Vectorized: DataPageV2 61 65 2 259.2 3.9 353.8X -Partition column - Parquet MR: DataPageV1 1086 1088 3 14.5 69.0 19.8X -Partition column - Parquet MR: DataPageV2 1091 1146 78 14.4 69.4 19.7X -Partition column - ORC Vectorized 63 67 2 251.1 4.0 342.9X -Partition column - ORC MR 1173 1175 3 13.4 74.6 18.3X -Both columns - CSV 21458 22038 820 0.7 1364.3 1.0X -Both columns - Json 12697 12712 22 1.2 807.2 1.7X -Both columns - Parquet Vectorized: DataPageV1 275 288 10 57.2 17.5 78.0X -Both columns - Parquet Vectorized: DataPageV2 505 525 24 31.2 32.1 42.5X -Both columns - Parquet MR: DataPageV1 2541 2547 9 6.2 161.5 8.5X -Both columns - Parquet MR: DataPageV2 2059 2060 2 7.6 130.9 10.4X -Both columns - ORC Vectorized 326 349 16 48.3 20.7 66.0X -Both columns - ORC MR 2116 2151 50 7.4 134.5 10.1X +Data column - CSV 21873 21928 78 0.7 1390.6 1.0X +Data column - Json 12624 12665 58 1.2 802.6 1.7X +Data column - Parquet Vectorized: DataPageV1 172 178 7 91.5 10.9 127.3X +Data column - Parquet Vectorized: DataPageV2 423 427 6 37.2 26.9 51.7X +Data 
column - Parquet MR: DataPageV1 2301 2301 0 6.8 146.3 9.5X +Data column - Parquet MR: DataPageV2 2073 2075 3 7.6 131.8 10.6X +Data column - ORC Vectorized 279 283 6 56.4 17.7 78.4X +Data column - ORC MR 1871 1880 12 8.4 119.0 11.7X +Partition column - CSV 6626 6650 33 2.4 421.3 3.3X +Partition column - Json 9917 9921 5 1.6 630.5 2.2X +Partition column - Parquet Vectorized: DataPageV1 47 52 9 332.3 3.0 462.1X +Partition column - Parquet Vectorized: DataPageV2 47 51 13 333.2 3.0 463.4X +Partition column - Parquet MR: DataPageV1 1232 1245 19 12.8 78.3 17.8X +Partition column - Parquet MR: DataPageV2 1243 1243 0 12.7 79.0 17.6X +Partition column - ORC Vectorized 47 54 15 336.3 3.0 467.7X +Partition column - ORC MR 1210 1210 1 13.0 76.9 18.1X +Both columns - CSV 23691 23693 3 0.7 1506.3 0.9X +Both columns - Json 13496 13520 33 1.2 858.1 1.6X +Both columns - Parquet Vectorized: DataPageV1 198 203 6 79.5 12.6 110.6X +Both columns - Parquet Vectorized: DataPageV2 453 457 6 34.7 28.8 48.3X +Both columns - Parquet MR: DataPageV1 2500 2516 22 6.3 159.0 8.7X +Both columns - Parquet MR: DataPageV2 2098 2103 7 7.5 133.4 10.4X +Both columns - ORC Vectorized 310 314 6 50.7 19.7 70.6X +Both columns - ORC MR 1944 1955 15 8.1 123.6 11.3X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10074 10372 422 1.0 960.7 1.0X -SQL Json 10037 10147 156 1.0 957.2 1.0X -SQL Parquet 
Vectorized: DataPageV1 1192 1226 47 8.8 113.7 8.4X -SQL Parquet Vectorized: DataPageV2 2349 2423 105 4.5 224.0 4.3X -SQL Parquet MR: DataPageV1 2995 3114 168 3.5 285.6 3.4X -SQL Parquet MR: DataPageV2 3847 3900 75 2.7 366.9 2.6X -ParquetReader Vectorized: DataPageV1 888 918 51 11.8 84.7 11.3X -ParquetReader Vectorized: DataPageV2 2128 2159 43 4.9 203.0 4.7X -SQL ORC Vectorized 837 908 61 12.5 79.8 12.0X -SQL ORC MR 2792 2882 127 3.8 266.3 3.6X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 11439 11463 35 0.9 1090.9 1.0X +SQL Json 11155 11157 3 0.9 1063.8 1.0X +SQL Parquet Vectorized: DataPageV1 1254 1268 20 8.4 119.6 9.1X +SQL Parquet Vectorized: DataPageV2 2697 2698 2 3.9 257.2 4.2X +SQL Parquet MR: DataPageV1 3244 3248 5 3.2 309.4 3.5X +SQL Parquet MR: DataPageV2 4168 4172 6 2.5 397.5 2.7X +ParquetReader Vectorized: DataPageV1 925 930 6 11.3 88.3 12.4X +ParquetReader Vectorized: DataPageV2 2366 2367 1 4.4 225.7 4.8X +SQL ORC Vectorized 899 909 10 11.7 85.7 12.7X +SQL ORC MR 3040 3040 0 3.4 290.0 3.8X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7808 7810 3 1.3 744.6 1.0X -SQL Json 7434 7491 82 1.4 708.9 1.1X -SQL Parquet Vectorized: DataPageV1 1037 1044 10 10.1 98.9 7.5X -SQL Parquet Vectorized: DataPageV2 1528 1529 3 6.9 145.7 5.1X -SQL Parquet MR: DataPageV1 2300 2411 156 4.6 219.4 3.4X -SQL Parquet MR: DataPageV2 2637 2639 4 4.0 251.5 3.0X -ParquetReader Vectorized: DataPageV1 843 907 56 12.4 80.4 9.3X -ParquetReader Vectorized: DataPageV2 1424 1446 30 7.4 135.8 5.5X -SQL ORC Vectorized 1131 1132 1 9.3 107.8 6.9X -SQL ORC MR 2781 2856 106 3.8 265.3 2.8X - -OpenJDK 64-Bit Server VM 
1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 7298 7299 2 1.4 696.0 1.0X +SQL Json 7964 7967 4 1.3 759.5 0.9X +SQL Parquet Vectorized: DataPageV1 1018 1020 4 10.3 97.1 7.2X +SQL Parquet Vectorized: DataPageV2 1922 1924 3 5.5 183.3 3.8X +SQL Parquet MR: DataPageV1 2470 2472 4 4.2 235.5 3.0X +SQL Parquet MR: DataPageV2 3008 3011 4 3.5 286.9 2.4X +ParquetReader Vectorized: DataPageV1 938 946 6 11.2 89.5 7.8X +ParquetReader Vectorized: DataPageV2 1840 1841 2 5.7 175.5 4.0X +SQL ORC Vectorized 1271 1274 5 8.3 121.2 5.7X +SQL ORC MR 2990 3002 17 3.5 285.2 2.4X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5357 5538 255 2.0 510.9 1.0X -SQL Json 4354 4387 47 2.4 415.2 1.2X -SQL Parquet Vectorized: DataPageV1 212 226 15 49.5 20.2 25.3X -SQL Parquet Vectorized: DataPageV2 265 276 16 39.6 25.2 20.2X -SQL Parquet MR: DataPageV1 1575 1578 4 6.7 150.2 3.4X -SQL Parquet MR: DataPageV2 1624 1638 21 6.5 154.8 3.3X -ParquetReader Vectorized: DataPageV1 219 234 14 47.8 20.9 24.4X -ParquetReader Vectorized: DataPageV2 274 294 17 38.2 26.2 19.5X -SQL ORC Vectorized 370 393 12 28.4 35.3 14.5X -SQL ORC MR 1540 1545 7 6.8 146.9 3.5X +SQL CSV 4689 4700 15 2.2 447.2 1.0X +SQL Json 4272 4278 8 2.5 407.4 1.1X +SQL Parquet Vectorized: DataPageV1 212 215 5 49.4 20.2 22.1X +SQL Parquet Vectorized: DataPageV2 583 586 4 18.0 55.6 8.0X +SQL Parquet MR: DataPageV1 1474 1475 2 7.1 140.6 3.2X +SQL Parquet MR: DataPageV2 1467 1477 14 7.1 139.9 3.2X +ParquetReader Vectorized: DataPageV1 222 225 4 47.2 21.2 21.1X +ParquetReader Vectorized: DataPageV2 594 595 2 17.7 56.6 7.9X +SQL ORC Vectorized 393 396 4 26.7 37.5 11.9X +SQL ORC MR 1496 1512 22 7.0 142.7 3.1X 
================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2159 2212 74 0.5 2059.3 1.0X -SQL Json 2836 2896 84 0.4 2704.5 0.8X -SQL Parquet Vectorized: DataPageV1 54 59 9 19.5 51.4 40.1X -SQL Parquet Vectorized: DataPageV2 66 72 8 15.9 63.1 32.7X -SQL Parquet MR: DataPageV1 173 186 10 6.1 164.5 12.5X -SQL Parquet MR: DataPageV2 159 172 8 6.6 151.8 13.6X -SQL ORC Vectorized 54 60 10 19.2 52.0 39.6X -SQL ORC MR 150 161 7 7.0 143.3 14.4X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 2460 2475 21 0.4 2346.2 1.0X +SQL Json 2985 2986 0 0.4 2847.2 0.8X +SQL Parquet Vectorized: DataPageV1 37 39 4 28.1 35.6 65.9X +SQL Parquet Vectorized: DataPageV2 55 58 7 19.0 52.6 44.6X +SQL Parquet MR: DataPageV1 180 182 2 5.8 171.6 13.7X +SQL Parquet MR: DataPageV2 158 160 2 6.7 150.4 15.6X +SQL ORC Vectorized 45 48 9 23.2 43.1 54.4X +SQL ORC MR 148 149 2 7.1 140.7 16.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5877 5883 8 0.2 5605.0 1.0X -SQL Json 11474 11587 159 0.1 10942.9 0.5X 
-SQL Parquet Vectorized: DataPageV1 66 72 7 15.9 63.1 88.9X -SQL Parquet Vectorized: DataPageV2 83 90 8 12.6 79.4 70.6X -SQL Parquet MR: DataPageV1 191 201 9 5.5 182.6 30.7X -SQL Parquet MR: DataPageV2 179 187 9 5.9 170.3 32.9X -SQL ORC Vectorized 70 76 12 14.9 67.1 83.5X -SQL ORC MR 167 175 7 6.3 159.2 35.2X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 6459 6474 21 0.2 6159.6 1.0X +SQL Json 12049 12142 132 0.1 11490.4 0.5X +SQL Parquet Vectorized: DataPageV1 52 55 6 20.2 49.4 124.6X +SQL Parquet Vectorized: DataPageV2 69 72 7 15.3 65.3 94.3X +SQL Parquet MR: DataPageV1 196 197 1 5.4 186.6 33.0X +SQL Parquet MR: DataPageV2 177 179 2 5.9 168.3 36.6X +SQL ORC Vectorized 59 63 9 17.8 56.1 109.8X +SQL ORC MR 168 179 5 6.2 160.6 38.3X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9695 9965 382 0.1 9245.8 1.0X -SQL Json 22119 23566 2045 0.0 21094.6 0.4X -SQL Parquet Vectorized: DataPageV1 96 104 7 10.9 91.6 100.9X -SQL Parquet Vectorized: DataPageV2 113 121 8 9.3 107.8 85.8X -SQL Parquet MR: DataPageV1 227 243 9 4.6 216.2 42.8X -SQL Parquet MR: DataPageV2 210 225 12 5.0 200.2 46.2X -SQL ORC Vectorized 90 96 10 11.7 85.7 107.9X -SQL ORC MR 188 199 9 5.6 178.9 51.7X +SQL CSV 10783 10790 10 0.1 10283.3 1.0X +SQL Json 22031 22277 348 0.0 21010.5 0.5X +SQL Parquet Vectorized: DataPageV1 82 85 5 12.9 77.8 132.2X +SQL Parquet Vectorized: DataPageV2 99 104 7 10.5 94.9 108.4X +SQL Parquet MR: DataPageV1 229 233 3 4.6 218.6 47.0X +SQL Parquet MR: DataPageV2 203 211 10 5.2 194.0 53.0X +SQL ORC Vectorized 75 78 6 13.9 72.0 142.9X +SQL ORC MR 194 199 7 5.4 184.6 55.7X diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index 07e35c158c8cb..d937231f3730b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -29,6 +29,8 @@ import java.util.Set; import com.google.common.annotations.VisibleForTesting; +import org.apache.parquet.VersionParser; +import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.column.page.PageReadStore; import scala.Option; @@ -69,6 +71,7 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReader fileMetadata = fileReader.getFileMetaData().getKeyValueMetaData(); ReadSupport readSupport = getReadSupportInstance(getReadSupportClass(configuration)); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 57a307b1b7b6b..880a3b1514ed1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -21,6 +21,8 @@ import java.time.ZoneId; import java.util.PrimitiveIterator; +import org.apache.parquet.CorruptDeltaByteArrays; +import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.BytesUtils; @@ -28,6 +30,7 @@ import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.Encoding; 
import org.apache.parquet.column.page.*; +import org.apache.parquet.column.values.RequiresPreviousReader; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; @@ -35,6 +38,7 @@ import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; import org.apache.parquet.schema.PrimitiveType; +import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.types.Decimal; @@ -86,6 +90,8 @@ public class VectorizedColumnReader { private final ColumnDescriptor descriptor; private final LogicalTypeAnnotation logicalTypeAnnotation; private final String datetimeRebaseMode; + private final ParsedVersion writerVersion; + private final MemoryMode memoryMode; public VectorizedColumnReader( ColumnDescriptor descriptor, @@ -96,7 +102,9 @@ public VectorizedColumnReader( String datetimeRebaseMode, String datetimeRebaseTz, String int96RebaseMode, - String int96RebaseTz) throws IOException { + String int96RebaseTz, + ParsedVersion writerVersion, + MemoryMode memoryMode) throws IOException { this.descriptor = descriptor; this.pageReader = pageReader; this.readState = new ParquetReadState(descriptor.getMaxDefinitionLevel(), rowIndexes); @@ -129,6 +137,8 @@ public VectorizedColumnReader( this.datetimeRebaseMode = datetimeRebaseMode; assert "LEGACY".equals(int96RebaseMode) || "EXCEPTION".equals(int96RebaseMode) || "CORRECTED".equals(int96RebaseMode); + this.writerVersion = writerVersion; + this.memoryMode = memoryMode; } private boolean isLazyDecodingSupported(PrimitiveType.PrimitiveTypeName typeName) { @@ -174,7 +184,7 @@ void readBatch(int total, WritableColumnVector column) throws IOException { readState.resetForNewPage(pageValueCount, pageFirstRowIndex); } PrimitiveType.PrimitiveTypeName typeName = - descriptor.getPrimitiveType().getPrimitiveTypeName(); + 
descriptor.getPrimitiveType().getPrimitiveTypeName(); if (isCurrentPageDictionaryEncoded) { // Save starting offset in case we need to decode dictionary IDs. int startOffset = readState.offset; @@ -259,6 +269,7 @@ private void initDataReader( int pageValueCount, Encoding dataEncoding, ByteBufferInputStream in) throws IOException { + ValuesReader previousReader = this.dataColumn; if (dataEncoding.usesDictionary()) { this.dataColumn = null; if (dictionary == null) { @@ -283,6 +294,11 @@ private void initDataReader( } catch (IOException e) { throw new IOException("could not read page in col " + descriptor, e); } + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && + previousReader != null && previousReader instanceof RequiresPreviousReader) { + // previous reader can only be set if reading sequentially + ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); + } } private ValuesReader getValuesReader(Encoding encoding) { @@ -290,18 +306,18 @@ private ValuesReader getValuesReader(Encoding encoding) { case PLAIN: return new VectorizedPlainValuesReader(); case DELTA_BYTE_ARRAY: - return new VectorizedDeltaByteArrayReader(); + return new VectorizedDeltaByteArrayReader(memoryMode); case DELTA_BINARY_PACKED: return new VectorizedDeltaBinaryPackedReader(); case RLE: PrimitiveType.PrimitiveTypeName typeName = - this.descriptor.getPrimitiveType().getPrimitiveTypeName(); + this.descriptor.getPrimitiveType().getPrimitiveTypeName(); // RLE encoding only supports boolean type `Values`, and `bitwidth` is always 1. 
if (typeName == BOOLEAN) { return new VectorizedRleValuesReader(1); } else { throw new UnsupportedOperationException( - "RLE encoding is not supported for values of type: " + typeName); + "RLE encoding is not supported for values of type: " + typeName); } default: throw new UnsupportedOperationException("Unsupported encoding: " + encoding); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java index 62fb5f8c96bbf..a11a8ebc86b7a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java @@ -90,6 +90,7 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce Preconditions.checkArgument(miniSize % 8 == 0, "miniBlockSize must be multiple of 8, but it's " + miniSize); this.miniBlockSizeInValues = (int) miniSize; + // True value count. 
May be less than valueCount because of nulls this.totalValueCount = BytesUtils.readUnsignedVarInt(in); this.bitWidths = new int[miniBlockNumInABlock]; this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; @@ -97,6 +98,10 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce firstValue = BytesUtils.readZigZagVarLong(in); } + int getTotalValueCount() { + return totalValueCount; + } + @Override public byte readByte() { readValues(1, null, 0, (w, r, v) -> byteVal = (byte) v); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index 72b760d426eac..7794b4df4598b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -16,51 +16,133 @@ */ package org.apache.spark.sql.execution.datasources.parquet; +import static org.apache.spark.sql.types.DataTypes.BinaryType; +import static org.apache.spark.sql.types.DataTypes.IntegerType; + import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.RequiresPreviousReader; +import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.api.Binary; +import org.apache.spark.memory.MemoryMode; +import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import java.io.IOException; import java.nio.ByteBuffer; /** - * An implementation of the Parquet DELTA_BYTE_ARRAY decoder that supports the vectorized interface. 
+ * An implementation of the Parquet DELTA_BYTE_ARRAY decoder that supports the vectorized + * interface. */ -public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase { - private final DeltaByteArrayReader deltaByteArrayReader = new DeltaByteArrayReader(); +public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase + implements VectorizedValuesReader, RequiresPreviousReader { + + private final MemoryMode memoryMode; + private int valueCount; + private final VectorizedDeltaBinaryPackedReader prefixLengthReader = + new VectorizedDeltaBinaryPackedReader(); + private final VectorizedDeltaLengthByteArrayReader suffixReader; + private WritableColumnVector prefixLengthVector; + private WritableColumnVector suffixVector; + private byte[] previous = new byte[0]; + private int currentRow = 0; + + //temporary variable used by getBinary + Binary binaryVal; + + VectorizedDeltaByteArrayReader(MemoryMode memoryMode){ + this.memoryMode = memoryMode; + this.suffixReader = new VectorizedDeltaLengthByteArrayReader(memoryMode); + } @Override public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - deltaByteArrayReader.initFromPage(valueCount, in); + this.valueCount = valueCount; + if (memoryMode == MemoryMode.OFF_HEAP) { + prefixLengthVector = new OffHeapColumnVector(valueCount, IntegerType); + suffixVector = new OffHeapColumnVector(valueCount, BinaryType); + } else { + prefixLengthVector = new OnHeapColumnVector(valueCount, IntegerType); + suffixVector = new OnHeapColumnVector(valueCount, BinaryType); + } + prefixLengthReader.initFromPage(valueCount, in); + prefixLengthReader.readIntegers(prefixLengthReader.getTotalValueCount(), + prefixLengthVector, 0); + suffixReader.initFromPage(valueCount, in); + suffixReader.readBinary(valueCount, suffixVector, 0); } @Override public Binary readBinary(int len) { - return deltaByteArrayReader.readBytes(); + readValues(1, null, 0, + (w, r, v, l) -> + binaryVal = 
Binary.fromConstantByteArray(v.array(), v.arrayOffset() + v.position(), l)); + return binaryVal; } - @Override - public void readBinary(int total, WritableColumnVector c, int rowId) { + public void readValues(int total, WritableColumnVector c, int rowId, + ByteBufferOutputWriter outputWriter) { + if (total == 0) { + return; + } + for (int i = 0; i < total; i++) { - Binary binary = deltaByteArrayReader.readBytes(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - c.putByteArray(rowId + i, buffer.array(), buffer.arrayOffset() + buffer.position(), - binary.length()); + int prefixLength = prefixLengthVector.getInt(currentRow); + byte[] suffix = suffixVector.getBinary(currentRow); + // This does not copy bytes + int length = prefixLength + suffix.length; + + // NOTE: due to PARQUET-246, it is important that we + // respect prefixLength which was read from prefixLengthReader, + // even for the *first* value of a page. Even though the first + // value of the page should have an empty prefix, it may not + // because of PARQUET-246. 
+ + // We have to do this to materialize the output + if (prefixLength != 0) { + // We could do + // c.putByteArray(rowId + i, previous, 0, prefixLength); + // c.putByteArray(rowId+i, suffix, prefixLength, suffix.length); + // previous = c.getBinary(rowId+1); + // but it incurs the same cost of copying the values twice _and_ c.getBinary + // is a _slow_ byte by byte copy + // The following always uses the faster system arraycopy method + byte[] out = new byte[length]; + System.arraycopy(previous, 0, out, 0, prefixLength); + System.arraycopy(suffix, 0, out, prefixLength, suffix.length); + previous = out; } else { - byte[] bytes = new byte[binary.length()]; - buffer.get(bytes); - c.putByteArray(rowId + i, bytes); + previous = suffix; } + outputWriter.write(c, rowId + i, ByteBuffer.wrap(previous), previous.length); + currentRow++; } } @Override - public void skipBinary(int total) { - for (int i = 0; i < total; i++) { - deltaByteArrayReader.skip(); + public void readBinary(int total, WritableColumnVector c, int rowId) { + readValues(total, c, rowId, ByteBufferOutputWriter::writeArrayByteBuffer); + } + + /** + * There was a bug (PARQUET-246) in which DeltaByteArrayWriter's reset() method did not clear the + * previous value state that it tracks internally. This resulted in the first value of all pages + * (except for the first page) to be a delta from the last value of the previous page. In order to + * read corrupted files written with this bug, when reading a new page we need to recover the + * previous page's last value to use it (if needed) to read the first value. 
+ */ + public void setPreviousReader(ValuesReader reader) { + if (reader != null) { + this.previous = ((VectorizedDeltaByteArrayReader) reader).previous; } } + @Override + public void skipBinary(int total) { + // we have to read all the values so that we always have the correct 'previous' + // we just don't write it to the output vector + readValues(total, null, currentRow, ByteBufferOutputWriter::skipWrite); + } + } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index c1962554e041b..f4e157888f67c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -56,7 +56,7 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce lengthsVector = new OnHeapColumnVector(valueCount, IntegerType); } lengthReader.initFromPage(valueCount, in); - lengthReader.readIntegers(valueCount, lengthsVector, 0); + lengthReader.readIntegers(lengthReader.getTotalValueCount(), lengthsVector, 0); this.in = in.remainingStream(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 50056bf4073e9..84d6d025eaf09 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -367,7 +367,10 @@ private void checkEndOfRowGroup() throws IOException { datetimeRebaseMode, datetimeRebaseTz, int96RebaseMode, - int96RebaseTz); + 
int96RebaseTz, + writerVersion, + MEMORY_MODE + ); } totalCountLoadedSoFar += pages.getRowCount(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index ca065f9cd012f..8a2bd51ee7e48 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -105,6 +105,8 @@ static void copyWriteByteBuffer(WritableColumnVector c, int rowId, ByteBuffer va c.putByteArray(rowId, bytes); } + static void skipWrite(WritableColumnVector c, int rowId, ByteBuffer val, int length) { } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala new file mode 100644 index 0000000000000..a4e8cacd85b14 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.parquet.bytes.DirectByteBufferAllocator +import org.apache.parquet.column.values.Utils +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter + +import org.apache.spark.memory.MemoryMode +import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StringType} + +/** + * Read tests for vectorized Delta byte array reader. + * Translated from * org.apache.parquet.column.values.delta.TestDeltaByteArray + */ +class ParquetDeltaByteArrayEncodingSuite extends ParquetCompatibilityTest with SharedSparkSession { + val values: Array[String] = Array("parquet-mr", "parquet", "parquet-format"); + val randvalues: Array[String] = Utils.getRandomStringSamples(10000, 32) + + var writer: DeltaByteArrayWriter = _ + var reader: VectorizedDeltaByteArrayReader = _ + private var writableColumnVector: WritableColumnVector = _ + + protected override def beforeEach(): Unit = { + writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator) + reader = new VectorizedDeltaByteArrayReader(MemoryMode.ON_HEAP) + super.beforeAll() + } + + test("test Serialization") { + assertReadWrite(writer, reader, values) + } + + test("random strings") { + assertReadWrite(writer, reader, randvalues) + } + + test("random strings with skip") { + assertReadWriteWithSkip(writer, reader, randvalues) + } + + test("random strings with skipN") { + assertReadWriteWithSkipN(writer, reader, randvalues) + } + + test("test lengths") { + var reader = new VectorizedDeltaBinaryPackedReader + Utils.writeData(writer, values) + val data = writer.getBytes.toInputStream + val length = values.length + writableColumnVector = new OnHeapColumnVector(length, IntegerType) + 
reader.initFromPage(length, data) + reader.readIntegers(length, writableColumnVector, 0) + // test prefix lengths + assert(0 == writableColumnVector.getInt(0)) + assert(7 == writableColumnVector.getInt(1)) + assert(7 == writableColumnVector.getInt(2)) + + reader = new VectorizedDeltaBinaryPackedReader + writableColumnVector = new OnHeapColumnVector(length, IntegerType) + reader.initFromPage(length, data) + reader.readIntegers(length, writableColumnVector, 0) + // test suffix lengths + assert(10 == writableColumnVector.getInt(0)) + assert(0 == writableColumnVector.getInt(1)) + assert(7 == writableColumnVector.getInt(2)) + } + + private def assertReadWrite( + writer: DeltaByteArrayWriter, + reader: VectorizedDeltaByteArrayReader, + vals: Array[String]): Unit = { + Utils.writeData(writer, vals) + val length = vals.length + val is = writer.getBytes.toInputStream + + writableColumnVector = new OnHeapColumnVector(length, StringType) + + reader.initFromPage(length, is) + reader.readBinary(length, writableColumnVector, 0) + + for (i <- 0 until length) { + assert(vals(i).getBytes() sameElements writableColumnVector.getBinary(i)) + } + } + + private def assertReadWriteWithSkip( + writer: DeltaByteArrayWriter, + reader: VectorizedDeltaByteArrayReader, + vals: Array[String]): Unit = { + Utils.writeData(writer, vals) + val length = vals.length + val is = writer.getBytes.toInputStream + writableColumnVector = new OnHeapColumnVector(length, StringType) + reader.initFromPage(length, is) + var i = 0 + while ( { + i < vals.length + }) { + reader.readBinary(1, writableColumnVector, i) + assert(vals(i).getBytes() sameElements writableColumnVector.getBinary(i)) + reader.skipBinary(1) + i += 2 + } + } + + private def assertReadWriteWithSkipN( + writer: DeltaByteArrayWriter, + reader: VectorizedDeltaByteArrayReader, + vals: Array[String]): Unit = { + Utils.writeData(writer, vals) + val length = vals.length + val is = writer.getBytes.toInputStream + writableColumnVector = new 
OnHeapColumnVector(length, StringType) + reader.initFromPage(length, is) + var skipCount = 0 + var i = 0 + while ( { + i < vals.length + }) { + skipCount = (vals.length - i) / 2 + reader.readBinary(1, writableColumnVector, i) + assert(vals(i).getBytes() sameElements writableColumnVector.getBinary(i)) + reader.skipBinary(skipCount) + i += skipCount + 1 + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index f7100a53444aa..78da8eab27409 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -147,12 +147,19 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess withTempPath { dir => val path = s"${dir.getCanonicalPath}/test.parquet" - val data = (1 to 3).map { i => - ( i, i.toLong, i.toShort, Array[Byte](i.toByte), s"test_${i}", - DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + i)), - DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + i)), - Period.of(1, i, 0), Duration.ofMillis(i * 100), - new BigDecimal(java.lang.Long.toUnsignedString(i*100000)) + // Have more than 2 * 4096 records (so we have multiple tasks and each task + // reads at least twice from the reader). 
This will catch any issues with state + // maintained by the reader(s) + // Add at least one string with a null + val data = (1 to 8197).map { i => + ( i, + i.toLong, i.toShort, Array[Byte](i.toByte), + if (i % 2 == 1) s"test_${i}" else null, + DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + ((i % 9) + 1))), + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + (i % 10))), + Period.of(1, (i % 11 ) + 1, 0), + Duration.ofMillis( ((i % 9) + 1) * 100), + new BigDecimal(java.lang.Long.toUnsignedString(i * 100000)) ) } From 52df5170e8c70c2d32861674acfe5b8736322e4b Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Fri, 21 Jan 2022 18:14:36 -0800 Subject: [PATCH 03/20] Addressing review comments --- .../execution/datasources/parquet/VectorizedColumnReader.java | 2 +- .../datasources/parquet/VectorizedDeltaByteArrayReader.java | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 880a3b1514ed1..dd3a46b256ecb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -295,7 +295,7 @@ private void initDataReader( throw new IOException("could not read page in col " + descriptor, e); } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && - previousReader != null && previousReader instanceof RequiresPreviousReader) { + previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java 
b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index 7794b4df4598b..eb26dfa8b45f2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -39,7 +39,6 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase implements VectorizedValuesReader, RequiresPreviousReader { private final MemoryMode memoryMode; - private int valueCount; private final VectorizedDeltaBinaryPackedReader prefixLengthReader = new VectorizedDeltaBinaryPackedReader(); private final VectorizedDeltaLengthByteArrayReader suffixReader; @@ -49,7 +48,7 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase private int currentRow = 0; //temporary variable used by getBinary - Binary binaryVal; + private Binary binaryVal; VectorizedDeltaByteArrayReader(MemoryMode memoryMode){ this.memoryMode = memoryMode; @@ -58,7 +57,6 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase @Override public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - this.valueCount = valueCount; if (memoryMode == MemoryMode.OFF_HEAP) { prefixLengthVector = new OffHeapColumnVector(valueCount, IntegerType); suffixVector = new OffHeapColumnVector(valueCount, BinaryType); From 0011bab27f04318cb82a651ad688a64df56d8785 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Mon, 24 Jan 2022 15:52:12 -0800 Subject: [PATCH 04/20] More review comments addressed --- .../parquet/VectorizedDeltaByteArrayReader.java | 13 ++++++++----- .../datasources/parquet/ParquetEncodingSuite.scala | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java 
b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index eb26dfa8b45f2..e15a2b243e334 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -48,11 +48,16 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase private int currentRow = 0; //temporary variable used by getBinary - private Binary binaryVal; + private final WritableColumnVector binaryValVector; VectorizedDeltaByteArrayReader(MemoryMode memoryMode){ this.memoryMode = memoryMode; this.suffixReader = new VectorizedDeltaLengthByteArrayReader(memoryMode); + if (memoryMode == MemoryMode.OFF_HEAP) { + binaryValVector = new OffHeapColumnVector(1, BinaryType); + } else { + binaryValVector = new OnHeapColumnVector(1, BinaryType); + } } @Override @@ -73,10 +78,8 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce @Override public Binary readBinary(int len) { - readValues(1, null, 0, - (w, r, v, l) -> - binaryVal = Binary.fromConstantByteArray(v.array(), v.arrayOffset() + v.position(), l)); - return binaryVal; + readValues(1, binaryValVector, 0, ByteBufferOutputWriter::writeArrayByteBuffer); + return Binary.fromConstantByteArray(binaryValVector.getBinary(0)); } public void readValues(int total, WritableColumnVector c, int rowId, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 78da8eab27409..8ad660478d450 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -154,7 +154,7 @@ 
class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess val data = (1 to 8197).map { i => ( i, i.toLong, i.toShort, Array[Byte](i.toByte), - if (i % 2 == 1) s"test_${i}" else null, + if (i % 2 == 1) s"test_$i" else null, DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + ((i % 9) + 1))), DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + (i % 10))), Period.of(1, (i % 11 ) + 1, 0), From 50ed8159e6f8e64c6fbc98c9e24e31a063834a62 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 25 Jan 2022 10:45:35 -0800 Subject: [PATCH 05/20] One more review comment --- .../parquet/VectorizedDeltaLengthByteArrayReader.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index f4e157888f67c..3ca489659fce6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -36,7 +36,6 @@ public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase i VectorizedValuesReader { private final MemoryMode memoryMode; - private int valueCount; private final VectorizedDeltaBinaryPackedReader lengthReader = new VectorizedDeltaBinaryPackedReader(); private ByteBufferInputStream in; @@ -49,7 +48,6 @@ public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase i @Override public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - this.valueCount = valueCount; if (memoryMode == MemoryMode.OFF_HEAP) { lengthsVector = new OffHeapColumnVector(valueCount, IntegerType); } else { From 3dc340adc6351aed75095945df4a7ddd12014342 Mon Sep 17 00:00:00 2001 From: Parth 
Chandra Date: Tue, 25 Jan 2022 16:30:50 -0800 Subject: [PATCH 06/20] Updated JDK 8 benchmark --- .../DataSourceReadBenchmark-results.txt | 470 +++++++++--------- 1 file changed, 235 insertions(+), 235 deletions(-) diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index 492fae832aa58..fed5d5a84933c 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11370 11607 334 1.4 722.9 1.0X -SQL Json 7646 7670 34 2.1 486.1 1.5X -SQL Parquet Vectorized: DataPageV1 90 102 10 174.8 5.7 126.4X -SQL Parquet Vectorized: DataPageV2 71 79 10 222.5 4.5 160.9X -SQL Parquet MR: DataPageV1 1514 1517 4 10.4 96.3 7.5X -SQL Parquet MR: DataPageV2 1369 1372 5 11.5 87.0 8.3X -SQL ORC Vectorized 152 163 12 103.3 9.7 74.7X -SQL ORC MR 1382 1402 29 11.4 87.8 8.2X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 20534 20708 247 0.8 1305.5 1.0X +SQL Json 10166 10196 42 1.5 646.3 2.0X +SQL Parquet Vectorized: DataPageV1 148 185 22 106.5 9.4 139.0X +SQL Parquet Vectorized: DataPageV2 110 138 26 142.4 7.0 185.9X +SQL Parquet MR: DataPageV1 2098 2108 14 7.5 133.4 9.8X +SQL Parquet MR: DataPageV2 1865 1875 13 8.4 118.6 11.0X +SQL ORC Vectorized 199 209 8 79.2 12.6 103.3X +SQL ORC MR 
2047 2053 8 7.7 130.2 10.0X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 91 93 3 172.7 5.8 1.0X -ParquetReader Vectorized: DataPageV2 84 85 3 188.1 5.3 1.1X -ParquetReader Vectorized -> Row: DataPageV1 38 39 1 408.6 2.4 2.4X -ParquetReader Vectorized -> Row: DataPageV2 31 31 1 509.7 2.0 3.0X +ParquetReader Vectorized: DataPageV1 122 130 6 129.4 7.7 1.0X +ParquetReader Vectorized: DataPageV2 110 117 5 142.7 7.0 1.1X +ParquetReader Vectorized -> Row: DataPageV1 54 56 2 291.8 3.4 2.3X +ParquetReader Vectorized -> Row: DataPageV2 46 49 2 339.2 2.9 2.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13939 13970 44 1.1 886.2 1.0X -SQL Json 8771 8792 29 1.8 557.7 1.6X -SQL Parquet Vectorized: DataPageV1 112 116 9 140.7 7.1 124.7X -SQL Parquet Vectorized: DataPageV2 110 115 7 143.1 7.0 126.8X -SQL Parquet MR: DataPageV1 1694 1698 7 9.3 107.7 8.2X -SQL Parquet MR: DataPageV2 1556 1565 12 10.1 98.9 9.0X -SQL ORC Vectorized 122 125 3 128.6 7.8 114.0X -SQL ORC MR 1353 1353 0 11.6 86.0 10.3X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 22495 22582 124 0.7 1430.2 1.0X +SQL Json 11601 11684 118 1.4 737.5 1.9X +SQL Parquet Vectorized: DataPageV1 173 188 22 90.8 11.0 
129.9X +SQL Parquet Vectorized: DataPageV2 169 185 20 93.1 10.7 133.2X +SQL Parquet MR: DataPageV1 2408 2434 37 6.5 153.1 9.3X +SQL Parquet MR: DataPageV2 2218 2222 5 7.1 141.0 10.1X +SQL ORC Vectorized 161 172 10 97.4 10.3 139.3X +SQL ORC MR 1926 1949 33 8.2 122.4 11.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 169 171 2 93.1 10.7 1.0X -ParquetReader Vectorized: DataPageV2 168 170 3 93.4 10.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 137 139 2 114.5 8.7 1.2X -ParquetReader Vectorized -> Row: DataPageV2 138 140 2 114.1 8.8 1.2X +ParquetReader Vectorized: DataPageV1 220 229 6 71.4 14.0 1.0X +ParquetReader Vectorized: DataPageV2 223 228 4 70.7 14.1 1.0X +ParquetReader Vectorized -> Row: DataPageV1 203 212 12 77.3 12.9 1.1X +ParquetReader Vectorized -> Row: DataPageV2 201 208 5 78.1 12.8 1.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14823 14843 28 1.1 942.4 1.0X -SQL Json 9345 9346 2 1.7 594.1 1.6X -SQL Parquet Vectorized: DataPageV1 172 189 48 91.4 10.9 86.2X -SQL Parquet Vectorized: DataPageV2 175 179 6 89.9 11.1 84.7X -SQL Parquet MR: DataPageV1 1962 1971 13 8.0 124.7 7.6X -SQL Parquet MR: DataPageV2 1680 1684 6 9.4 106.8 8.8X -SQL ORC Vectorized 158 161 4 99.4 10.1 93.7X -SQL ORC MR 1449 1456 10 10.9 92.1 10.2X - 
-OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 23404 23570 234 0.7 1488.0 1.0X +SQL Json 12152 12199 67 1.3 772.6 1.9X +SQL Parquet Vectorized: DataPageV1 266 279 10 59.2 16.9 88.1X +SQL Parquet Vectorized: DataPageV2 265 275 8 59.3 16.9 88.2X +SQL Parquet MR: DataPageV1 2721 2762 58 5.8 173.0 8.6X +SQL Parquet MR: DataPageV2 2299 2326 38 6.8 146.2 10.2X +SQL ORC Vectorized 227 232 4 69.4 14.4 103.3X +SQL ORC MR 2020 2118 139 7.8 128.5 11.6X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 212 217 4 74.1 13.5 1.0X -ParquetReader Vectorized: DataPageV2 245 249 3 64.1 15.6 0.9X -ParquetReader Vectorized -> Row: DataPageV1 214 215 2 73.7 13.6 1.0X -ParquetReader Vectorized -> Row: DataPageV2 245 248 2 64.1 15.6 0.9X +ParquetReader Vectorized: DataPageV1 306 322 14 51.3 19.5 1.0X +ParquetReader Vectorized: DataPageV2 348 354 6 45.2 22.1 0.9X +ParquetReader Vectorized -> Row: DataPageV1 304 315 8 51.7 19.4 1.0X +ParquetReader Vectorized -> Row: DataPageV2 349 358 7 45.1 22.2 0.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 16272 16326 76 1.0 1034.5 1.0X -SQL Json 9900 9906 8 1.6 629.4 1.6X -SQL Parquet Vectorized: DataPageV1 134 138 6 117.0 8.5 121.1X -SQL Parquet 
Vectorized: DataPageV2 249 254 6 63.1 15.9 65.2X -SQL Parquet MR: DataPageV1 2040 2042 2 7.7 129.7 8.0X -SQL Parquet MR: DataPageV2 1777 1786 13 8.9 113.0 9.2X -SQL ORC Vectorized 216 219 3 72.9 13.7 75.4X -SQL ORC MR 1550 1553 4 10.1 98.6 10.5X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 25064 25080 22 0.6 1593.5 1.0X +SQL Json 12730 12829 140 1.2 809.3 2.0X +SQL Parquet Vectorized: DataPageV1 210 223 8 74.8 13.4 119.2X +SQL Parquet Vectorized: DataPageV2 374 394 11 42.0 23.8 66.9X +SQL Parquet MR: DataPageV1 2710 2757 66 5.8 172.3 9.2X +SQL Parquet MR: DataPageV2 2378 2385 9 6.6 151.2 10.5X +SQL ORC Vectorized 306 316 6 51.5 19.4 82.0X +SQL ORC MR 2199 2218 27 7.2 139.8 11.4X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 191 209 42 82.1 12.2 1.0X -ParquetReader Vectorized: DataPageV2 321 323 4 49.1 20.4 0.6X -ParquetReader Vectorized -> Row: DataPageV1 193 195 4 81.5 12.3 1.0X -ParquetReader Vectorized -> Row: DataPageV2 321 326 8 49.1 20.4 0.6X +ParquetReader Vectorized: DataPageV1 286 307 19 55.0 18.2 1.0X +ParquetReader Vectorized: DataPageV2 468 479 11 33.6 29.8 0.6X +ParquetReader Vectorized -> Row: DataPageV1 282 293 11 55.7 18.0 1.0X +ParquetReader Vectorized -> Row: DataPageV2 467 478 8 33.7 29.7 0.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 22520 22525 7 0.7 1431.8 1.0X -SQL Json 12602 12602 1 1.2 801.2 1.8X -SQL Parquet Vectorized: DataPageV1 187 208 55 84.2 11.9 120.6X -SQL Parquet Vectorized: DataPageV2 396 404 9 39.7 25.2 56.8X -SQL Parquet MR: DataPageV1 2218 2232 20 7.1 141.0 10.2X -SQL Parquet MR: DataPageV2 1796 1802 9 8.8 114.2 12.5X -SQL ORC Vectorized 276 279 7 57.1 17.5 81.7X -SQL ORC MR 1621 1630 12 9.7 103.0 13.9X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 32250 32360 155 0.5 2050.4 1.0X +SQL Json 16196 16367 241 1.0 1029.7 2.0X +SQL Parquet Vectorized: DataPageV1 311 325 14 50.7 19.7 103.9X +SQL Parquet Vectorized: DataPageV2 629 641 13 25.0 40.0 51.2X +SQL Parquet MR: DataPageV1 2893 2916 33 5.4 184.0 11.1X +SQL Parquet MR: DataPageV2 2506 2528 30 6.3 159.3 12.9X +SQL ORC Vectorized 388 403 9 40.5 24.7 83.1X +SQL ORC MR 2214 2232 25 7.1 140.8 14.6X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 249 253 6 63.1 15.9 1.0X -ParquetReader Vectorized: DataPageV2 469 474 6 33.5 29.8 0.5X -ParquetReader Vectorized -> Row: DataPageV1 250 256 9 62.8 15.9 1.0X -ParquetReader Vectorized -> Row: DataPageV2 470 474 9 33.5 29.9 0.5X +ParquetReader Vectorized: DataPageV1 373 398 15 42.2 23.7 1.0X +ParquetReader Vectorized: DataPageV2 709 715 6 22.2 45.1 0.5X +ParquetReader Vectorized -> Row: DataPageV1 379 388 8 41.5 24.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 691 704 11 22.8 44.0 0.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure 
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17796 17814 25 0.9 1131.5 1.0X -SQL Json 12118 12124 8 1.3 770.5 1.5X -SQL Parquet Vectorized: DataPageV1 131 137 8 119.7 8.4 135.5X -SQL Parquet Vectorized: DataPageV2 131 135 7 119.6 8.4 135.4X -SQL Parquet MR: DataPageV1 1978 1982 7 8.0 125.8 9.0X -SQL Parquet MR: DataPageV2 1830 1844 21 8.6 116.3 9.7X -SQL ORC Vectorized 318 326 10 49.5 20.2 56.0X -SQL ORC MR 1617 1621 5 9.7 102.8 11.0X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 26962 27080 167 0.6 1714.2 1.0X +SQL Json 15554 15738 261 1.0 988.9 1.7X +SQL Parquet Vectorized: DataPageV1 196 206 10 80.1 12.5 137.3X +SQL Parquet Vectorized: DataPageV2 191 202 11 82.3 12.1 141.2X +SQL Parquet MR: DataPageV1 2674 2698 35 5.9 170.0 10.1X +SQL Parquet MR: DataPageV2 2502 2512 14 6.3 159.1 10.8X +SQL ORC Vectorized 427 448 16 36.9 27.1 63.2X +SQL ORC MR 2271 2299 39 6.9 144.4 11.9X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 212 215 3 74.1 13.5 1.0X -ParquetReader Vectorized: DataPageV2 213 215 3 73.7 13.6 1.0X -ParquetReader Vectorized -> Row: DataPageV1 221 223 2 71.0 14.1 1.0X -ParquetReader Vectorized -> Row: DataPageV2 222 224 3 70.9 14.1 1.0X +ParquetReader Vectorized: DataPageV1 290 301 11 54.2 18.4 1.0X +ParquetReader 
Vectorized: DataPageV2 288 300 12 54.7 18.3 1.0X +ParquetReader Vectorized -> Row: DataPageV1 281 291 7 55.9 17.9 1.0X +ParquetReader Vectorized -> Row: DataPageV2 282 290 9 55.8 17.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 23606 23611 7 0.7 1500.8 1.0X -SQL Json 16860 16868 11 0.9 1072.0 1.4X -SQL Parquet Vectorized: DataPageV1 186 192 8 84.4 11.9 126.6X -SQL Parquet Vectorized: DataPageV2 186 193 8 84.4 11.9 126.6X -SQL Parquet MR: DataPageV1 2246 2264 26 7.0 142.8 10.5X -SQL Parquet MR: DataPageV2 2091 2098 10 7.5 132.9 11.3X -SQL ORC Vectorized 407 416 11 38.7 25.8 58.1X -SQL ORC MR 1740 1740 1 9.0 110.6 13.6X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 33265 33351 122 0.5 2114.9 1.0X +SQL Json 20959 21022 89 0.8 1332.5 1.6X +SQL Parquet Vectorized: DataPageV1 291 308 11 54.1 18.5 114.4X +SQL Parquet Vectorized: DataPageV2 291 299 7 54.1 18.5 114.4X +SQL Parquet MR: DataPageV1 2838 2892 76 5.5 180.4 11.7X +SQL Parquet MR: DataPageV2 2699 2700 0 5.8 171.6 12.3X +SQL ORC Vectorized 504 527 22 31.2 32.0 66.0X +SQL ORC MR 2355 2365 14 6.7 149.7 14.1X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 249 256 7 63.1 15.8 1.0X -ParquetReader 
Vectorized: DataPageV2 248 252 4 63.4 15.8 1.0X -ParquetReader Vectorized -> Row: DataPageV1 247 251 6 63.7 15.7 1.0X -ParquetReader Vectorized -> Row: DataPageV2 248 251 3 63.3 15.8 1.0X +ParquetReader Vectorized: DataPageV1 386 395 7 40.8 24.5 1.0X +ParquetReader Vectorized: DataPageV2 375 386 14 41.9 23.9 1.0X +ParquetReader Vectorized -> Row: DataPageV1 372 387 16 42.2 23.7 1.0X +ParquetReader Vectorized -> Row: DataPageV2 374 384 15 42.0 23.8 1.0X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15769 15778 12 0.7 1503.8 1.0X -SQL Json 11597 11610 18 0.9 1105.9 1.4X -SQL Parquet Vectorized: DataPageV1 2119 2121 3 4.9 202.1 7.4X -SQL Parquet Vectorized: DataPageV2 3026 3027 1 3.5 288.6 5.2X -SQL Parquet MR: DataPageV1 3980 3993 18 2.6 379.6 4.0X -SQL Parquet MR: DataPageV2 3899 3901 3 2.7 371.8 4.0X -SQL ORC Vectorized 2075 2084 13 5.1 197.9 7.6X -SQL ORC MR 3736 3739 5 2.8 356.3 4.2X +SQL CSV 23318 23395 110 0.4 2223.7 1.0X +SQL Json 14676 14727 72 0.7 1399.7 1.6X +SQL Parquet Vectorized: DataPageV1 2420 2445 35 4.3 230.8 9.6X +SQL Parquet Vectorized: DataPageV2 3734 3790 79 2.8 356.1 6.2X +SQL Parquet MR: DataPageV1 4865 4933 97 2.2 463.9 4.8X +SQL Parquet MR: DataPageV2 4998 5030 45 2.1 476.7 4.7X +SQL ORC Vectorized 2395 2444 69 4.4 228.4 9.7X +SQL ORC MR 4666 4669 5 2.2 444.9 5.0X 
================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8221 8223 4 1.3 784.0 1.0X -SQL Json 6938 6941 4 1.5 661.7 1.2X -SQL Parquet Vectorized: DataPageV1 734 747 15 14.3 70.0 11.2X -SQL Parquet Vectorized: DataPageV2 734 736 2 14.3 70.0 11.2X -SQL Parquet MR: DataPageV1 1469 1469 0 7.1 140.1 5.6X -SQL Parquet MR: DataPageV2 1431 1432 1 7.3 136.5 5.7X -SQL ORC Vectorized 446 447 1 23.5 42.5 18.4X -SQL ORC MR 1807 1814 9 5.8 172.3 4.5X +SQL CSV 14158 14278 170 0.7 1350.2 1.0X +SQL Json 8620 8768 209 1.2 822.1 1.6X +SQL Parquet Vectorized: DataPageV1 856 868 10 12.2 81.6 16.5X +SQL Parquet Vectorized: DataPageV2 876 881 5 12.0 83.5 16.2X +SQL Parquet MR: DataPageV1 2207 2228 29 4.8 210.5 6.4X +SQL Parquet MR: DataPageV2 2130 2139 13 4.9 203.2 6.6X +SQL ORC Vectorized 559 569 15 18.8 53.3 25.3X +SQL ORC MR 2375 2388 18 4.4 226.5 6.0X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 21873 21928 78 0.7 1390.6 1.0X -Data column - Json 12624 12665 58 1.2 802.6 1.7X -Data column - Parquet Vectorized: DataPageV1 172 178 7 91.5 10.9 127.3X -Data column - Parquet Vectorized: DataPageV2 423 427 6 37.2 26.9 51.7X -Data column - Parquet MR: DataPageV1 2301 2301 0 6.8 146.3 9.5X -Data column - Parquet MR: DataPageV2 2073 2075 3 7.6 131.8 10.6X -Data column - ORC Vectorized 279 283 6 56.4 17.7 78.4X -Data column - ORC MR 1871 1880 12 8.4 119.0 11.7X -Partition column - CSV 6626 6650 33 2.4 421.3 3.3X -Partition column - Json 9917 9921 5 1.6 630.5 2.2X -Partition column - Parquet Vectorized: DataPageV1 47 52 9 332.3 3.0 462.1X -Partition column - Parquet Vectorized: DataPageV2 47 51 13 333.2 3.0 463.4X -Partition column - Parquet MR: DataPageV1 1232 1245 19 12.8 78.3 17.8X -Partition column - Parquet MR: DataPageV2 1243 1243 0 12.7 79.0 17.6X -Partition column - ORC Vectorized 47 54 15 336.3 3.0 467.7X -Partition column - ORC MR 1210 1210 1 13.0 76.9 18.1X -Both columns - CSV 23691 23693 3 0.7 1506.3 0.9X -Both columns - Json 13496 13520 33 1.2 858.1 1.6X -Both columns - Parquet Vectorized: DataPageV1 198 203 6 79.5 12.6 110.6X -Both columns - Parquet Vectorized: DataPageV2 453 457 6 34.7 28.8 48.3X -Both columns - Parquet MR: DataPageV1 2500 2516 22 6.3 159.0 8.7X -Both columns - Parquet MR: DataPageV2 2098 2103 7 7.5 133.4 10.4X -Both columns - ORC Vectorized 310 314 6 50.7 19.7 70.6X -Both columns - ORC MR 1944 1955 15 8.1 123.6 11.3X +Data column - CSV 32546 32601 77 0.5 2069.2 1.0X +Data column - Json 16186 16506 453 1.0 1029.1 2.0X +Data column - Parquet Vectorized: DataPageV1 304 312 9 51.8 19.3 107.2X +Data column - Parquet Vectorized: DataPageV2 645 662 15 24.4 41.0 50.5X +Data column - Parquet MR: DataPageV1 3209 3267 83 4.9 204.0 10.1X +Data column - Parquet MR: DataPageV2 2893 2942 69 5.4 184.0 
11.2X +Data column - ORC Vectorized 405 418 12 38.8 25.8 80.3X +Data column - ORC MR 2743 2804 85 5.7 174.4 11.9X +Partition column - CSV 8546 8618 102 1.8 543.3 3.8X +Partition column - Json 13380 13456 108 1.2 850.7 2.4X +Partition column - Parquet Vectorized: DataPageV1 64 71 5 247.4 4.0 511.9X +Partition column - Parquet Vectorized: DataPageV2 66 73 5 237.7 4.2 491.8X +Partition column - Parquet MR: DataPageV1 1524 1551 38 10.3 96.9 21.4X +Partition column - Parquet MR: DataPageV2 1587 1593 8 9.9 100.9 20.5X +Partition column - ORC Vectorized 69 76 4 227.1 4.4 469.9X +Partition column - ORC MR 1761 1764 4 8.9 112.0 18.5X +Both columns - CSV 29992 30004 17 0.5 1906.8 1.1X +Both columns - Json 17684 17888 288 0.9 1124.3 1.8X +Both columns - Parquet Vectorized: DataPageV1 349 362 11 45.1 22.2 93.4X +Both columns - Parquet Vectorized: DataPageV2 708 712 4 22.2 45.0 45.9X +Both columns - Parquet MR: DataPageV1 3294 3301 10 4.8 209.4 9.9X +Both columns - Parquet MR: DataPageV2 2887 2905 26 5.4 183.5 11.3X +Both columns - ORC Vectorized 441 450 9 35.7 28.0 73.8X +Both columns - ORC MR 2842 2925 117 5.5 180.7 11.5X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11439 11463 35 0.9 1090.9 1.0X -SQL Json 11155 11157 3 0.9 1063.8 1.0X -SQL Parquet Vectorized: DataPageV1 1254 1268 20 8.4 119.6 9.1X -SQL Parquet Vectorized: DataPageV2 2697 2698 2 3.9 257.2 4.2X 
-SQL Parquet MR: DataPageV1 3244 3248 5 3.2 309.4 3.5X -SQL Parquet MR: DataPageV2 4168 4172 6 2.5 397.5 2.7X -ParquetReader Vectorized: DataPageV1 925 930 6 11.3 88.3 12.4X -ParquetReader Vectorized: DataPageV2 2366 2367 1 4.4 225.7 4.8X -SQL ORC Vectorized 899 909 10 11.7 85.7 12.7X -SQL ORC MR 3040 3040 0 3.4 290.0 3.8X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 14613 14725 159 0.7 1393.6 1.0X +SQL Json 14816 14860 63 0.7 1413.0 1.0X +SQL Parquet Vectorized: DataPageV1 1644 1669 35 6.4 156.8 8.9X +SQL Parquet Vectorized: DataPageV2 3433 3470 52 3.1 327.4 4.3X +SQL Parquet MR: DataPageV1 4374 4389 20 2.4 417.2 3.3X +SQL Parquet MR: DataPageV2 5581 5611 43 1.9 532.2 2.6X +ParquetReader Vectorized: DataPageV1 1213 1222 12 8.6 115.7 12.0X +ParquetReader Vectorized: DataPageV2 3007 3012 7 3.5 286.8 4.9X +SQL ORC Vectorized 1120 1126 9 9.4 106.8 13.1X +SQL ORC MR 3961 4000 55 2.6 377.8 3.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7298 7299 2 1.4 696.0 1.0X -SQL Json 7964 7967 4 1.3 759.5 0.9X -SQL Parquet Vectorized: DataPageV1 1018 1020 4 10.3 97.1 7.2X -SQL Parquet Vectorized: DataPageV2 1922 1924 3 5.5 183.3 3.8X -SQL Parquet MR: DataPageV1 2470 2472 4 4.2 235.5 3.0X -SQL Parquet MR: DataPageV2 3008 3011 4 3.5 286.9 2.4X -ParquetReader Vectorized: DataPageV1 938 946 6 11.2 89.5 7.8X -ParquetReader Vectorized: DataPageV2 1840 1841 2 5.7 175.5 4.0X -SQL ORC Vectorized 1271 1274 5 8.3 121.2 5.7X -SQL ORC MR 2990 3002 17 3.5 285.2 2.4X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 10591 10743 216 1.0 1010.0 1.0X 
+SQL Json 10195 10432 336 1.0 972.2 1.0X +SQL Parquet Vectorized: DataPageV1 1279 1304 35 8.2 122.0 8.3X +SQL Parquet Vectorized: DataPageV2 2399 2437 55 4.4 228.7 4.4X +SQL Parquet MR: DataPageV1 3324 3402 109 3.2 317.0 3.2X +SQL Parquet MR: DataPageV2 4077 4084 10 2.6 388.8 2.6X +ParquetReader Vectorized: DataPageV1 1161 1164 4 9.0 110.7 9.1X +ParquetReader Vectorized: DataPageV2 2363 2372 12 4.4 225.4 4.5X +SQL ORC Vectorized 1255 1276 30 8.4 119.7 8.4X +SQL ORC MR 3544 3556 16 3.0 338.0 3.0X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4689 4700 15 2.2 447.2 1.0X -SQL Json 4272 4278 8 2.5 407.4 1.1X -SQL Parquet Vectorized: DataPageV1 212 215 5 49.4 20.2 22.1X -SQL Parquet Vectorized: DataPageV2 583 586 4 18.0 55.6 8.0X -SQL Parquet MR: DataPageV1 1474 1475 2 7.1 140.6 3.2X -SQL Parquet MR: DataPageV2 1467 1477 14 7.1 139.9 3.2X -ParquetReader Vectorized: DataPageV1 222 225 4 47.2 21.2 21.1X -ParquetReader Vectorized: DataPageV2 594 595 2 17.7 56.6 7.9X -SQL ORC Vectorized 393 396 4 26.7 37.5 11.9X -SQL ORC MR 1496 1512 22 7.0 142.7 3.1X +SQL CSV 7609 7708 141 1.4 725.6 1.0X +SQL Json 6258 6310 74 1.7 596.8 1.2X +SQL Parquet Vectorized: DataPageV1 279 286 9 37.6 26.6 27.3X +SQL Parquet Vectorized: DataPageV2 817 825 7 12.8 77.9 9.3X +SQL Parquet MR: DataPageV1 2195 2202 11 4.8 209.3 3.5X +SQL Parquet MR: DataPageV2 1993 2001 12 5.3 190.0 3.8X +ParquetReader Vectorized: DataPageV1 289 294 7 36.3 27.6 26.3X +ParquetReader Vectorized: DataPageV2 822 826 5 12.7 78.4 9.3X +SQL ORC Vectorized 446 461 11 23.5 42.5 17.1X +SQL ORC MR 1933 1941 11 5.4 184.3 3.9X ================================================================================================ Single Column Scan 
From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2460 2475 21 0.4 2346.2 1.0X -SQL Json 2985 2986 0 0.4 2847.2 0.8X -SQL Parquet Vectorized: DataPageV1 37 39 4 28.1 35.6 65.9X -SQL Parquet Vectorized: DataPageV2 55 58 7 19.0 52.6 44.6X -SQL Parquet MR: DataPageV1 180 182 2 5.8 171.6 13.7X -SQL Parquet MR: DataPageV2 158 160 2 6.7 150.4 15.6X -SQL ORC Vectorized 45 48 9 23.2 43.1 54.4X -SQL ORC MR 148 149 2 7.1 140.7 16.7X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 3012 3041 41 0.3 2872.0 1.0X +SQL Json 3779 3780 1 0.3 3603.8 0.8X +SQL Parquet Vectorized: DataPageV1 58 65 11 18.1 55.2 52.0X +SQL Parquet Vectorized: DataPageV2 80 88 12 13.1 76.4 37.6X +SQL Parquet MR: DataPageV1 235 243 6 4.5 223.9 12.8X +SQL Parquet MR: DataPageV2 218 232 13 4.8 208.2 13.8X +SQL ORC Vectorized 66 73 4 15.8 63.3 45.4X +SQL ORC MR 200 208 6 5.2 190.7 15.1X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6459 6474 21 0.2 6159.6 1.0X -SQL Json 12049 12142 132 0.1 11490.4 0.5X -SQL Parquet Vectorized: DataPageV1 52 55 6 20.2 49.4 124.6X -SQL Parquet Vectorized: DataPageV2 69 72 7 15.3 65.3 94.3X -SQL 
Parquet MR: DataPageV1 196 197 1 5.4 186.6 33.0X -SQL Parquet MR: DataPageV2 177 179 2 5.9 168.3 36.6X -SQL ORC Vectorized 59 63 9 17.8 56.1 109.8X -SQL ORC MR 168 179 5 6.2 160.6 38.3X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +SQL CSV 7066 7100 49 0.1 6738.2 1.0X +SQL Json 14074 14208 190 0.1 13421.7 0.5X +SQL Parquet Vectorized: DataPageV1 78 87 11 13.5 74.0 91.1X +SQL Parquet Vectorized: DataPageV2 105 113 12 10.0 100.3 67.2X +SQL Parquet MR: DataPageV1 262 273 9 4.0 249.6 27.0X +SQL Parquet MR: DataPageV2 246 252 5 4.3 234.2 28.8X +SQL ORC Vectorized 84 92 11 12.5 79.8 84.4X +SQL ORC MR 222 238 9 4.7 212.0 31.8X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10783 10790 10 0.1 10283.3 1.0X -SQL Json 22031 22277 348 0.0 21010.5 0.5X -SQL Parquet Vectorized: DataPageV1 82 85 5 12.9 77.8 132.2X -SQL Parquet Vectorized: DataPageV2 99 104 7 10.5 94.9 108.4X -SQL Parquet MR: DataPageV1 229 233 3 4.6 218.6 47.0X -SQL Parquet MR: DataPageV2 203 211 10 5.2 194.0 53.0X -SQL ORC Vectorized 75 78 6 13.9 72.0 142.9X -SQL ORC MR 194 199 7 5.4 184.6 55.7X +SQL CSV 12654 12862 295 0.1 12067.8 1.0X +SQL Json 26593 26840 350 0.0 25361.2 0.5X +SQL Parquet Vectorized: DataPageV1 117 129 7 8.9 111.8 107.9X +SQL Parquet Vectorized: DataPageV2 140 150 11 7.5 133.9 90.2X +SQL Parquet MR: DataPageV1 315 324 6 3.3 300.7 40.1X +SQL Parquet MR: DataPageV2 287 295 8 3.7 273.3 44.2X +SQL ORC Vectorized 107 116 15 9.8 101.7 118.6X +SQL ORC MR 255 262 6 4.1 243.1 49.6X From ca200687ab8b86c849edf7d766bc0261d7b41a08 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 27 Jan 2022 16:39:53 -0800 Subject: [PATCH 07/20] Fix 
for off heap memory not being initialized. Added off heap mode to unit test. --- .../VectorizedDeltaLengthByteArrayReader.java | 1 + .../parquet/ParquetEncodingSuite.scala | 100 ++++++++++-------- 2 files changed, 56 insertions(+), 45 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index 3ca489659fce6..ab9bd35715826 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -50,6 +50,7 @@ public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase i public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { if (memoryMode == MemoryMode.OFF_HEAP) { lengthsVector = new OffHeapColumnVector(valueCount, IntegerType); + lengthsVector.putInts(0, valueCount, 0); } else { lengthsVector = new OnHeapColumnVector(valueCount, IntegerType); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 8ad660478d450..12015b779fe42 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -27,6 +27,7 @@ import org.apache.parquet.column.{Encoding, ParquetProperties} import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.TestUtils +import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import 
org.apache.spark.sql.internal.SQLConf @@ -47,6 +48,13 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess null.asInstanceOf[Duration], null.asInstanceOf[java.lang.Boolean]) + private def withMemoryModes(f: String => Unit): Unit = { + Seq(MemoryMode.OFF_HEAP, MemoryMode.ON_HEAP).foreach(mode => { + val offHeap = if (mode == MemoryMode.OFF_HEAP) "true" else "false" + f(offHeap) + }) + } + test("All Types Dictionary") { (1 :: 1000 :: Nil).foreach { n => { withTempPath { dir => @@ -141,52 +149,54 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess ) val hadoopConf = spark.sessionState.newHadoopConfWithOptions(extraOptions) - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - ParquetOutputFormat.JOB_SUMMARY_LEVEL -> "ALL") { - withTempPath { dir => - val path = s"${dir.getCanonicalPath}/test.parquet" - - // Have more than 2 * 4096 records (so we have multiple tasks and each task - // reads at least twice from the reader). This will catch any issues with state - // maintained by the reader(s) - // Add at least one string with a null - val data = (1 to 8197).map { i => - ( i, - i.toLong, i.toShort, Array[Byte](i.toByte), - if (i % 2 == 1) s"test_$i" else null, - DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + ((i % 9) + 1))), - DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + (i % 10))), - Period.of(1, (i % 11 ) + 1, 0), - Duration.ofMillis( ((i % 9) + 1) * 100), - new BigDecimal(java.lang.Long.toUnsignedString(i * 100000)) - ) + withMemoryModes { offHeapMode => + withSQLConf( + SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED.key -> offHeapMode, + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + ParquetOutputFormat.JOB_SUMMARY_LEVEL -> "ALL") { + withTempPath { dir => + val path = s"${dir.getCanonicalPath}/test.parquet" + // Have more than 2 * 4096 records (so we have multiple tasks and each task + // reads at least twice from the reader). 
This will catch any issues with state + // maintained by the reader(s) + // Add at least one string with a null + val data = (1 to 81971).map { i => + (i, + i.toLong, i.toShort, Array[Byte](i.toByte), + if (i % 2 == 1) s"test_$i" else null, + DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + ((i % 9) + 1))), + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + (i % 10))), + Period.of(1, (i % 11) + 1, 0), + Duration.ofMillis(((i % 9) + 1) * 100), + new BigDecimal(java.lang.Long.toUnsignedString(i * 100000)) + ) + } + + spark.createDataFrame(data) + .write.options(extraOptions).mode("overwrite").parquet(path) + + val blockMetadata = readFooter(new Path(path), hadoopConf).getBlocks.asScala.head + val columnChunkMetadataList = blockMetadata.getColumns.asScala + + // Verify that indeed delta encoding is used for each column + assert(columnChunkMetadataList.length === 10) + assert(columnChunkMetadataList(0).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(1).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(2).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + // Both fixed-length byte array and variable-length byte array (also called BINARY) + // are use DELTA_BYTE_ARRAY for encoding + assert(columnChunkMetadataList(3).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) + assert(columnChunkMetadataList(4).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) + + assert(columnChunkMetadataList(5).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(6).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(7).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(8).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(9).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) + + val actual = spark.read.parquet(path).collect() + 
assert(actual.sortBy(_.getInt(0)) === data.map(Row.fromTuple)); } - - spark.createDataFrame(data) - .write.options(extraOptions).mode("overwrite").parquet(path) - - val blockMetadata = readFooter(new Path(path), hadoopConf).getBlocks.asScala.head - val columnChunkMetadataList = blockMetadata.getColumns.asScala - - // Verify that indeed delta encoding is used for each column - assert(columnChunkMetadataList.length === 10) - assert(columnChunkMetadataList(0).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(1).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(2).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - // Both fixed-length byte array and variable-length byte array (also called BINARY) - // are use DELTA_BYTE_ARRAY for encoding - assert(columnChunkMetadataList(3).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) - assert(columnChunkMetadataList(4).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) - - assert(columnChunkMetadataList(5).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(6).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(7).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(8).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(9).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) - - val actual = spark.read.parquet(path).collect() - assert(actual.sortBy(_.getInt(0)) === data.map(Row.fromTuple)); } } } From 6ad1dbec56a8baf22664aea3e70db323dfdc2342 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Mon, 31 Jan 2022 10:54:57 -0800 Subject: [PATCH 08/20] Remove use of OffHeap vectors for internal buffers. 
Skip writing to output for zero length values in DeltaLengthByteArrayReader --- .../parquet/VectorizedColumnReader.java | 8 ++--- .../VectorizedDeltaByteArrayReader.java | 23 +++----------- .../VectorizedDeltaLengthByteArrayReader.java | 31 ++++++------------- .../VectorizedParquetRecordReader.java | 3 +- .../parquet/VectorizedValuesReader.java | 6 ---- .../ParquetDeltaByteArrayEncodingSuite.scala | 3 +- ...uetDeltaLengthByteArrayEncodingSuite.scala | 5 ++- 7 files changed, 21 insertions(+), 58 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index dd3a46b256ecb..c4f1c0b63bedf 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -38,7 +38,6 @@ import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; import org.apache.parquet.schema.PrimitiveType; -import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.types.Decimal; @@ -91,7 +90,6 @@ public class VectorizedColumnReader { private final LogicalTypeAnnotation logicalTypeAnnotation; private final String datetimeRebaseMode; private final ParsedVersion writerVersion; - private final MemoryMode memoryMode; public VectorizedColumnReader( ColumnDescriptor descriptor, @@ -103,8 +101,7 @@ public VectorizedColumnReader( String datetimeRebaseTz, String int96RebaseMode, String int96RebaseTz, - ParsedVersion writerVersion, - MemoryMode memoryMode) throws IOException { + ParsedVersion writerVersion) throws IOException { this.descriptor = descriptor; this.pageReader = pageReader; this.readState = new ParquetReadState(descriptor.getMaxDefinitionLevel(), rowIndexes); @@ -138,7 +135,6 @@ 
public VectorizedColumnReader( assert "LEGACY".equals(int96RebaseMode) || "EXCEPTION".equals(int96RebaseMode) || "CORRECTED".equals(int96RebaseMode); this.writerVersion = writerVersion; - this.memoryMode = memoryMode; } private boolean isLazyDecodingSupported(PrimitiveType.PrimitiveTypeName typeName) { @@ -306,7 +302,7 @@ private ValuesReader getValuesReader(Encoding encoding) { case PLAIN: return new VectorizedPlainValuesReader(); case DELTA_BYTE_ARRAY: - return new VectorizedDeltaByteArrayReader(memoryMode); + return new VectorizedDeltaByteArrayReader(); case DELTA_BINARY_PACKED: return new VectorizedDeltaBinaryPackedReader(); case RLE: diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index e15a2b243e334..8c7781b379e32 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -23,8 +23,6 @@ import org.apache.parquet.column.values.RequiresPreviousReader; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.api.Binary; -import org.apache.spark.memory.MemoryMode; -import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; @@ -38,7 +36,6 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase implements VectorizedValuesReader, RequiresPreviousReader { - private final MemoryMode memoryMode; private final VectorizedDeltaBinaryPackedReader prefixLengthReader = new VectorizedDeltaBinaryPackedReader(); private final VectorizedDeltaLengthByteArrayReader suffixReader; @@ -50,25 +47,15 @@ public class 
VectorizedDeltaByteArrayReader extends VectorizedReaderBase //temporary variable used by getBinary private final WritableColumnVector binaryValVector; - VectorizedDeltaByteArrayReader(MemoryMode memoryMode){ - this.memoryMode = memoryMode; - this.suffixReader = new VectorizedDeltaLengthByteArrayReader(memoryMode); - if (memoryMode == MemoryMode.OFF_HEAP) { - binaryValVector = new OffHeapColumnVector(1, BinaryType); - } else { - binaryValVector = new OnHeapColumnVector(1, BinaryType); - } + VectorizedDeltaByteArrayReader() { + this.suffixReader = new VectorizedDeltaLengthByteArrayReader(); + binaryValVector = new OnHeapColumnVector(1, BinaryType); } @Override public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - if (memoryMode == MemoryMode.OFF_HEAP) { - prefixLengthVector = new OffHeapColumnVector(valueCount, IntegerType); - suffixVector = new OffHeapColumnVector(valueCount, BinaryType); - } else { - prefixLengthVector = new OnHeapColumnVector(valueCount, IntegerType); - suffixVector = new OnHeapColumnVector(valueCount, BinaryType); - } + prefixLengthVector = new OnHeapColumnVector(valueCount, IntegerType); + suffixVector = new OnHeapColumnVector(valueCount, BinaryType); prefixLengthReader.initFromPage(valueCount, in); prefixLengthReader.readIntegers(prefixLengthReader.getTotalValueCount(), prefixLengthVector, 0); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index ab9bd35715826..c7a2485db62c2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -23,8 +23,6 @@ import java.nio.ByteBuffer; import 
org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.io.ParquetDecodingException; -import org.apache.spark.memory.MemoryMode; -import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; @@ -35,25 +33,18 @@ public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase implements VectorizedValuesReader { - private final MemoryMode memoryMode; private final VectorizedDeltaBinaryPackedReader lengthReader = new VectorizedDeltaBinaryPackedReader(); private ByteBufferInputStream in; private WritableColumnVector lengthsVector; private int currentRow = 0; - VectorizedDeltaLengthByteArrayReader(MemoryMode memoryMode) { - this.memoryMode = memoryMode; + VectorizedDeltaLengthByteArrayReader() { } @Override public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - if (memoryMode == MemoryMode.OFF_HEAP) { - lengthsVector = new OffHeapColumnVector(valueCount, IntegerType); - lengthsVector.putInts(0, valueCount, 0); - } else { - lengthsVector = new OnHeapColumnVector(valueCount, IntegerType); - } + lengthsVector = new OnHeapColumnVector(valueCount, IntegerType); lengthReader.initFromPage(valueCount, in); lengthReader.readIntegers(lengthReader.getTotalValueCount(), lengthsVector, 0); this.in = in.remainingStream(); @@ -66,20 +57,18 @@ public void readBinary(int total, WritableColumnVector c, int rowId) { } ByteBuffer buffer; ByteBufferOutputWriter outputWriter; - if (memoryMode == MemoryMode.OFF_HEAP) { - outputWriter = ByteBufferOutputWriter::copyWriteByteBuffer; - } else { - outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer; - } + outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer; int length; for (int i = 0; i < total; i++) { length = lengthsVector.getInt(rowId + i); - try { - buffer = in.slice(length); - } catch (EOFException e) { - throw new 
ParquetDecodingException("Failed to read " + length + " bytes"); + if (length > 0) { + try { + buffer = in.slice(length); + } catch (EOFException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes"); + } + outputWriter.write(c, rowId + i, buffer, length); } - outputWriter.write(c, rowId + i, buffer, length); currentRow++; } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 84d6d025eaf09..401d8f0091c36 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -368,8 +368,7 @@ private void checkEndOfRowGroup() throws IOException { datetimeRebaseTz, int96RebaseMode, int96RebaseTz, - writerVersion, - MEMORY_MODE + writerVersion ); } totalCountLoadedSoFar += pages.getRowCount(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index 8a2bd51ee7e48..1efa409a3a853 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -99,12 +99,6 @@ static void writeArrayByteBuffer(WritableColumnVector c, int rowId, ByteBuffer v length); } - static void copyWriteByteBuffer(WritableColumnVector c, int rowId, ByteBuffer val, int length) { - byte[] bytes = new byte[length]; - val.get(bytes); - c.putByteArray(rowId, bytes); - } - static void skipWrite(WritableColumnVector c, int rowId, ByteBuffer val, int length) { } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala index a4e8cacd85b14..c71c7c6219c98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala @@ -20,7 +20,6 @@ import org.apache.parquet.bytes.DirectByteBufferAllocator import org.apache.parquet.column.values.Utils import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter -import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StringType} @@ -39,7 +38,7 @@ class ParquetDeltaByteArrayEncodingSuite extends ParquetCompatibilityTest with S protected override def beforeEach(): Unit = { writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator) - reader = new VectorizedDeltaByteArrayReader(MemoryMode.ON_HEAP) + reader = new VectorizedDeltaByteArrayReader() super.beforeAll() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala index 3cb62d6500f4e..89b73771f507f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala @@ -21,7 +21,6 @@ import org.apache.parquet.column.values.Utils import 
org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter import org.apache.parquet.io.api.Binary -import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StringType} @@ -42,7 +41,7 @@ class ParquetDeltaLengthByteArrayEncodingSuite protected override def beforeEach(): Unit = { writer = new DeltaLengthByteArrayValuesWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator) - reader = new VectorizedDeltaLengthByteArrayReader(MemoryMode.OFF_HEAP) + reader = new VectorizedDeltaLengthByteArrayReader() super.beforeAll() } @@ -69,7 +68,7 @@ class ParquetDeltaLengthByteArrayEncodingSuite reader.skipBinary(1) i += 2 } - reader = new VectorizedDeltaLengthByteArrayReader(MemoryMode.OFF_HEAP) + reader = new VectorizedDeltaLengthByteArrayReader() reader.initFromPage(values.length, writer.getBytes.toInputStream) writableColumnVector = new OnHeapColumnVector(values.length, StringType) var skipCount = 0 From 6f364d9c8706cddccdb16de7d7b232710735673e Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Wed, 2 Feb 2022 12:56:18 -0800 Subject: [PATCH 09/20] more review comments addressed --- .../datasources/parquet/VectorizedDeltaByteArrayReader.java | 2 +- .../parquet/VectorizedDeltaLengthByteArrayReader.java | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index 8c7781b379e32..32e3e41f6a167 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -69,7 +69,7 
@@ public Binary readBinary(int len) { return Binary.fromConstantByteArray(binaryValVector.getBinary(0)); } - public void readValues(int total, WritableColumnVector c, int rowId, + private void readValues(int total, WritableColumnVector c, int rowId, ByteBufferOutputWriter outputWriter) { if (total == 0) { return; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index c7a2485db62c2..8f3dab5aa767c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -56,8 +56,7 @@ public void readBinary(int total, WritableColumnVector c, int rowId) { return; } ByteBuffer buffer; - ByteBufferOutputWriter outputWriter; - outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer; + ByteBufferOutputWriter outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer; int length; for (int i = 0; i < total; i++) { length = lengthsVector.getInt(rowId + i); @@ -69,8 +68,8 @@ public void readBinary(int total, WritableColumnVector c, int rowId) { } outputWriter.write(c, rowId + i, buffer, length); } - currentRow++; } + currentRow += total; } @Override From 80a4ceb73df548cb7f385dcae67bdd3985dbb308 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Wed, 9 Feb 2022 12:25:28 -0800 Subject: [PATCH 10/20] Still more review comments addressed --- .../parquet/SpecificParquetRecordReaderBase.java | 2 ++ .../parquet/VectorizedColumnReader.java | 6 +++--- .../parquet/VectorizedDeltaByteArrayReader.java | 16 +++++++--------- .../VectorizedDeltaLengthByteArrayReader.java | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index d937231f3730b..5669534cd111a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -71,6 +71,8 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReader 0) { - remaining -= in.skip(length); + remaining -= in.skip(remaining); } } currentRow += total; From 406d1768019eed1516925847a59bd2ad5fc7883b Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Fri, 11 Feb 2022 10:52:42 -0800 Subject: [PATCH 11/20] Remove unnecessary check for 'total' parameter in 'readValues' --- .../datasources/parquet/VectorizedDeltaByteArrayReader.java | 4 ---- .../parquet/VectorizedDeltaLengthByteArrayReader.java | 3 --- 2 files changed, 7 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index 9e1470826901e..b41f0775bb6eb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -71,10 +71,6 @@ public Binary readBinary(int len) { private void readValues(int total, WritableColumnVector c, int rowId, ByteBufferOutputWriter outputWriter) { - if (total == 0) { - return; - } - for (int i = 0; i < total; i++) { // NOTE: due to PARQUET-246, it is important that we // respect prefixLength which was read from prefixLengthReader, diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index 3a30838b7163d..7b4373ad0bd80 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -52,9 +52,6 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce @Override public void readBinary(int total, WritableColumnVector c, int rowId) { - if (total == 0) { - return; - } ByteBuffer buffer; ByteBufferOutputWriter outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer; int length; From be62ad60f5262f12fcc67b8fa2e33c841ddefb69 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Mon, 14 Feb 2022 10:01:12 -0800 Subject: [PATCH 12/20] Remove check for zero length in DeltaLengthByteArrayReader, and add unit test with empty strings. 
--- .../VectorizedDeltaLengthByteArrayReader.java | 12 +++++----- ...uetDeltaLengthByteArrayEncodingSuite.scala | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index 7b4373ad0bd80..3676c59b78336 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -57,14 +57,12 @@ public void readBinary(int total, WritableColumnVector c, int rowId) { int length; for (int i = 0; i < total; i++) { length = lengthsVector.getInt(rowId + i); - if (length > 0) { - try { - buffer = in.slice(length); - } catch (EOFException e) { - throw new ParquetDecodingException("Failed to read " + length + " bytes"); - } - outputWriter.write(c, rowId + i, buffer, length); + try { + buffer = in.slice(length); + } catch (EOFException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes"); } + outputWriter.write(c, rowId + i, buffer, length); } currentRow += total; } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala index 89b73771f507f..17dc70df42a6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala @@ -16,6 +16,9 @@ */ package org.apache.spark.sql.execution.datasources.parquet +import 
java.util.Random + +import org.apache.commons.lang3.RandomStringUtils import org.apache.parquet.bytes.{ByteBufferInputStream, DirectByteBufferAllocator} import org.apache.parquet.column.values.Utils import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter @@ -56,6 +59,12 @@ class ParquetDeltaLengthByteArrayEncodingSuite readAndValidate(reader, writer.getBytes.toInputStream, values.length, values) } + test("random strings with empty strings") { + val values = getRandomStringSamplesWithEmptyStrings(1000, 32) + writeData(writer, values) + readAndValidate(reader, writer.getBytes.toInputStream, values.length, values) + } + test("skip with random strings") { val values = Utils.getRandomStringSamples(1000, 32) writeData(writer, values) @@ -117,4 +126,17 @@ class ParquetDeltaLengthByteArrayEncodingSuite } } + def getRandomStringSamplesWithEmptyStrings(numSamples: Int, maxLength: Int): Array[String] = { + val randomLen = new Random + val randomEmpty = new Random + val samples: Array[String] = new Array[String](numSamples) + for (i <- 0 until numSamples) { + var maxLen: Int = randomLen.nextInt(maxLength) + if(randomEmpty.nextInt() % 11 != 0) { + maxLen = 0; + } + samples(i) = RandomStringUtils.randomAlphanumeric(0, maxLen) + } + samples + } } From 166afe12726f81b9698fd47d0557d409aaed2baf Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 15 Feb 2022 17:49:35 -0800 Subject: [PATCH 13/20] Update benchmark --- .../DataSourceReadBenchmark-jdk11-results.txt | 470 +++++++++--------- .../DataSourceReadBenchmark-jdk17-results.txt | 470 +++++++++--------- .../DataSourceReadBenchmark-results.txt | 470 +++++++++--------- 3 files changed, 705 insertions(+), 705 deletions(-) diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt index b3a894f8ad397..b8b86906c455f 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt +++ 
b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10153 10161 12 1.5 645.5 1.0X -SQL Json 8463 8512 69 1.9 538.0 1.2X -SQL Parquet Vectorized: DataPageV1 131 149 14 120.0 8.3 77.5X -SQL Parquet Vectorized: DataPageV2 98 112 15 161.2 6.2 104.0X -SQL Parquet MR: DataPageV1 1968 1968 0 8.0 125.1 5.2X -SQL Parquet MR: DataPageV2 1735 1739 6 9.1 110.3 5.9X -SQL ORC Vectorized 164 198 41 96.0 10.4 62.0X -SQL ORC MR 1572 1581 12 10.0 100.0 6.5X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 14859 14914 77 1.1 944.7 1.0X +SQL Json 9418 9457 55 1.7 598.8 1.6X +SQL Parquet Vectorized: DataPageV1 109 128 14 144.6 6.9 136.6X +SQL Parquet Vectorized: DataPageV2 79 89 8 199.3 5.0 188.3X +SQL Parquet MR: DataPageV1 1699 1743 62 9.3 108.0 8.7X +SQL Parquet MR: DataPageV2 1462 1489 38 10.8 93.0 10.2X +SQL ORC Vectorized 165 200 33 95.3 10.5 90.0X +SQL ORC MR 1409 1420 16 11.2 89.6 10.5X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 102 107 14 154.6 6.5 1.0X 
-ParquetReader Vectorized: DataPageV2 83 88 10 189.1 5.3 1.2X -ParquetReader Vectorized -> Row: DataPageV1 57 59 3 275.7 3.6 1.8X -ParquetReader Vectorized -> Row: DataPageV2 38 40 3 416.3 2.4 2.7X +ParquetReader Vectorized: DataPageV1 101 104 3 155.2 6.4 1.0X +ParquetReader Vectorized: DataPageV2 82 85 5 192.0 5.2 1.2X +ParquetReader Vectorized -> Row: DataPageV1 48 50 2 324.6 3.1 2.1X +ParquetReader Vectorized -> Row: DataPageV2 29 31 3 539.4 1.9 3.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12263 12285 31 1.3 779.6 1.0X -SQL Json 9495 9501 9 1.7 603.6 1.3X -SQL Parquet Vectorized: DataPageV1 162 175 10 97.1 10.3 75.7X -SQL Parquet Vectorized: DataPageV2 161 172 12 97.9 10.2 76.4X -SQL Parquet MR: DataPageV1 2074 2105 44 7.6 131.9 5.9X -SQL Parquet MR: DataPageV2 1974 1981 9 8.0 125.5 6.2X -SQL ORC Vectorized 187 218 30 84.3 11.9 65.7X -SQL ORC MR 1529 1553 34 10.3 97.2 8.0X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 17619 17639 28 0.9 1120.2 1.0X +SQL Json 10590 10606 23 1.5 673.3 1.7X +SQL Parquet Vectorized: DataPageV1 178 194 10 88.2 11.3 98.8X +SQL Parquet Vectorized: DataPageV2 178 188 9 88.2 11.3 98.7X +SQL Parquet MR: DataPageV1 1884 1887 4 8.4 119.8 9.4X +SQL Parquet MR: DataPageV2 1689 1742 75 9.3 107.4 10.4X +SQL ORC Vectorized 162 193 24 97.0 10.3 108.7X +SQL ORC MR 1505 1552 67 10.5 95.7 11.7X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single TINYINT Column Scan: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 205 214 12 76.8 13.0 1.0X -ParquetReader Vectorized: DataPageV2 204 211 10 77.2 13.0 1.0X -ParquetReader Vectorized -> Row: DataPageV1 122 132 24 128.7 7.8 1.7X -ParquetReader Vectorized -> Row: DataPageV2 122 126 6 128.4 7.8 1.7X +ParquetReader Vectorized: DataPageV1 230 236 13 68.3 14.6 1.0X +ParquetReader Vectorized: DataPageV2 228 233 8 69.1 14.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 138 150 26 113.7 8.8 1.7X +ParquetReader Vectorized -> Row: DataPageV2 137 140 2 114.5 8.7 1.7X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12946 12955 12 1.2 823.1 1.0X -SQL Json 9871 9876 8 1.6 627.6 1.3X -SQL Parquet Vectorized: DataPageV1 157 200 34 100.0 10.0 82.3X -SQL Parquet Vectorized: DataPageV2 229 242 14 68.8 14.5 56.7X -SQL Parquet MR: DataPageV1 2388 2389 2 6.6 151.8 5.4X -SQL Parquet MR: DataPageV2 2080 2087 10 7.6 132.2 6.2X -SQL ORC Vectorized 240 285 23 65.6 15.2 54.0X -SQL ORC MR 1699 1732 46 9.3 108.0 7.6X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 18488 18494 8 0.9 1175.5 1.0X +SQL Json 11190 11195 7 1.4 711.4 1.7X +SQL Parquet Vectorized: DataPageV1 125 155 34 125.7 8.0 147.7X +SQL Parquet Vectorized: DataPageV2 183 192 9 86.1 11.6 101.2X +SQL Parquet MR: DataPageV1 2153 2160 10 7.3 136.9 8.6X +SQL Parquet MR: DataPageV2 1876 1889 18 8.4 
119.3 9.9X +SQL ORC Vectorized 212 257 23 74.4 13.4 87.4X +SQL ORC MR 1653 1658 7 9.5 105.1 11.2X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 242 245 5 65.0 15.4 1.0X -ParquetReader Vectorized: DataPageV2 309 314 9 50.9 19.7 0.8X -ParquetReader Vectorized -> Row: DataPageV1 227 268 18 69.3 14.4 1.1X -ParquetReader Vectorized -> Row: DataPageV2 294 312 25 53.5 18.7 0.8X +ParquetReader Vectorized: DataPageV1 198 201 5 79.6 12.6 1.0X +ParquetReader Vectorized: DataPageV2 256 260 3 61.5 16.3 0.8X +ParquetReader Vectorized -> Row: DataPageV1 193 226 14 81.4 12.3 1.0X +ParquetReader Vectorized -> Row: DataPageV2 250 253 2 62.8 15.9 0.8X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14331 14347 22 1.1 911.2 1.0X -SQL Json 10406 10434 40 1.5 661.6 1.4X -SQL Parquet Vectorized: DataPageV1 153 196 41 102.7 9.7 93.6X -SQL Parquet Vectorized: DataPageV2 378 415 30 41.6 24.0 37.9X -SQL Parquet MR: DataPageV1 2439 2446 11 6.4 155.1 5.9X -SQL Parquet MR: DataPageV2 2181 2188 10 7.2 138.7 6.6X -SQL ORC Vectorized 320 346 25 49.2 20.3 44.8X -SQL ORC MR 1851 1853 3 8.5 117.7 7.7X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 18614 18703 125 0.8 1183.5 1.0X +SQL 
Json 11673 11711 53 1.3 742.2 1.6X +SQL Parquet Vectorized: DataPageV1 128 154 26 123.1 8.1 145.7X +SQL Parquet Vectorized: DataPageV2 270 302 23 58.3 17.1 69.0X +SQL Parquet MR: DataPageV1 2117 2145 39 7.4 134.6 8.8X +SQL Parquet MR: DataPageV2 1855 1860 7 8.5 117.9 10.0X +SQL ORC Vectorized 277 292 16 56.7 17.6 67.2X +SQL ORC MR 1623 1629 9 9.7 103.2 11.5X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 258 262 9 60.9 16.4 1.0X -ParquetReader Vectorized: DataPageV2 481 484 3 32.7 30.6 0.5X -ParquetReader Vectorized -> Row: DataPageV1 250 275 26 62.9 15.9 1.0X -ParquetReader Vectorized -> Row: DataPageV2 475 502 27 33.1 30.2 0.5X +ParquetReader Vectorized: DataPageV1 225 226 1 69.9 14.3 1.0X +ParquetReader Vectorized: DataPageV2 362 365 2 43.4 23.0 0.6X +ParquetReader Vectorized -> Row: DataPageV1 193 218 18 81.5 12.3 1.2X +ParquetReader Vectorized -> Row: DataPageV2 360 366 6 43.7 22.9 0.6X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18450 18451 2 0.9 1173.0 1.0X -SQL Json 12553 12562 13 1.3 798.1 1.5X -SQL Parquet Vectorized: DataPageV1 259 272 12 60.8 16.5 71.3X -SQL Parquet Vectorized: DataPageV2 534 566 22 29.4 34.0 34.5X -SQL Parquet MR: DataPageV1 2529 2537 11 6.2 160.8 7.3X -SQL Parquet MR: DataPageV2 2331 2334 4 6.7 
148.2 7.9X -SQL ORC Vectorized 424 460 36 37.1 27.0 43.5X -SQL ORC MR 2009 2023 20 7.8 127.7 9.2X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 23939 23953 20 0.7 1522.0 1.0X +SQL Json 14445 14449 5 1.1 918.4 1.7X +SQL Parquet Vectorized: DataPageV1 186 229 28 84.7 11.8 128.9X +SQL Parquet Vectorized: DataPageV2 459 493 25 34.3 29.2 52.2X +SQL Parquet MR: DataPageV1 2180 2184 7 7.2 138.6 11.0X +SQL Parquet MR: DataPageV2 1954 1973 27 8.1 124.2 12.3X +SQL ORC Vectorized 368 392 24 42.8 23.4 65.1X +SQL ORC MR 1793 1794 2 8.8 114.0 13.4X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 322 338 21 48.9 20.5 1.0X -ParquetReader Vectorized: DataPageV2 674 683 12 23.3 42.9 0.5X -ParquetReader Vectorized -> Row: DataPageV1 352 358 9 44.7 22.4 0.9X -ParquetReader Vectorized -> Row: DataPageV2 628 660 22 25.0 39.9 0.5X +ParquetReader Vectorized: DataPageV1 280 293 18 56.1 17.8 1.0X +ParquetReader Vectorized: DataPageV2 577 602 48 27.3 36.7 0.5X +ParquetReader Vectorized -> Row: DataPageV1 314 321 10 50.1 19.9 0.9X +ParquetReader Vectorized -> Row: DataPageV2 581 584 4 27.1 37.0 0.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14806 14816 14 1.1 941.3 1.0X 
-SQL Json 11968 11969 1 1.3 760.9 1.2X -SQL Parquet Vectorized: DataPageV1 150 184 26 105.0 9.5 98.8X -SQL Parquet Vectorized: DataPageV2 147 183 32 107.2 9.3 100.9X -SQL Parquet MR: DataPageV1 2338 2352 19 6.7 148.7 6.3X -SQL Parquet MR: DataPageV2 2221 2267 65 7.1 141.2 6.7X -SQL ORC Vectorized 475 494 29 33.1 30.2 31.1X -SQL ORC MR 1967 1978 16 8.0 125.1 7.5X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 19278 19291 18 0.8 1225.6 1.0X +SQL Json 13366 13381 21 1.2 849.8 1.4X +SQL Parquet Vectorized: DataPageV1 130 152 23 120.8 8.3 148.1X +SQL Parquet Vectorized: DataPageV2 135 157 17 116.8 8.6 143.2X +SQL Parquet MR: DataPageV1 2126 2137 15 7.4 135.2 9.1X +SQL Parquet MR: DataPageV2 1970 1985 21 8.0 125.2 9.8X +SQL ORC Vectorized 387 396 11 40.7 24.6 49.8X +SQL ORC MR 1831 1832 1 8.6 116.4 10.5X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 236 241 8 66.7 15.0 1.0X -ParquetReader Vectorized: DataPageV2 237 241 9 66.3 15.1 1.0X -ParquetReader Vectorized -> Row: DataPageV1 218 244 25 72.1 13.9 1.1X -ParquetReader Vectorized -> Row: DataPageV2 218 251 21 72.2 13.8 1.1X +ParquetReader Vectorized: DataPageV1 194 197 5 81.1 12.3 1.0X +ParquetReader Vectorized: DataPageV2 194 197 7 81.2 12.3 1.0X +ParquetReader Vectorized -> Row: DataPageV1 225 253 18 69.9 14.3 0.9X +ParquetReader Vectorized -> Row: DataPageV2 224 252 18 70.2 14.2 0.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 
2.80GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19335 19346 15 0.8 1229.3 1.0X -SQL Json 16112 16121 13 1.0 1024.4 1.2X -SQL Parquet Vectorized: DataPageV1 257 278 29 61.1 16.4 75.1X -SQL Parquet Vectorized: DataPageV2 258 268 9 60.9 16.4 74.9X -SQL Parquet MR: DataPageV1 2542 2557 20 6.2 161.6 7.6X -SQL Parquet MR: DataPageV2 2416 2439 32 6.5 153.6 8.0X -SQL ORC Vectorized 593 605 18 26.5 37.7 32.6X -SQL ORC MR 2134 2141 11 7.4 135.7 9.1X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 24691 24705 19 0.6 1569.8 1.0X +SQL Json 18028 18028 0 0.9 1146.2 1.4X +SQL Parquet Vectorized: DataPageV1 190 225 28 83.0 12.0 130.3X +SQL Parquet Vectorized: DataPageV2 188 230 26 83.9 11.9 131.7X +SQL Parquet MR: DataPageV1 2362 2365 4 6.7 150.2 10.5X +SQL Parquet MR: DataPageV2 2061 2078 25 7.6 131.0 12.0X +SQL ORC Vectorized 499 524 37 31.6 31.7 49.5X +SQL ORC MR 1870 1880 14 8.4 118.9 13.2X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 322 346 24 48.8 20.5 1.0X -ParquetReader Vectorized: DataPageV2 326 326 1 48.3 20.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 350 359 9 44.9 22.3 0.9X -ParquetReader Vectorized -> Row: DataPageV2 348 358 10 45.2 22.1 0.9X +ParquetReader Vectorized: DataPageV1 276 295 21 57.0 17.5 1.0X +ParquetReader Vectorized: DataPageV2 278 289 17 56.6 17.7 1.0X +ParquetReader Vectorized -> Row: DataPageV1 315 326 15 50.0 20.0 0.9X +ParquetReader 
Vectorized -> Row: DataPageV2 315 323 8 49.9 20.0 0.9X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13899 14000 142 0.8 1325.5 1.0X -SQL Json 11275 11289 20 0.9 1075.3 1.2X -SQL Parquet Vectorized: DataPageV1 2092 2107 21 5.0 199.5 6.6X -SQL Parquet Vectorized: DataPageV2 3073 3074 2 3.4 293.0 4.5X -SQL Parquet MR: DataPageV1 4192 4212 29 2.5 399.8 3.3X -SQL Parquet MR: DataPageV2 4133 4194 87 2.5 394.1 3.4X -SQL ORC Vectorized 2218 2219 1 4.7 211.5 6.3X -SQL ORC MR 3767 3776 12 2.8 359.3 3.7X +SQL CSV 16840 16908 96 0.6 1606.0 1.0X +SQL Json 12496 12513 25 0.8 1191.7 1.3X +SQL Parquet Vectorized: DataPageV1 2169 2172 5 4.8 206.9 7.8X +SQL Parquet Vectorized: DataPageV2 3102 3119 24 3.4 295.9 5.4X +SQL Parquet MR: DataPageV1 4140 4144 5 2.5 394.8 4.1X +SQL Parquet MR: DataPageV2 3988 3996 12 2.6 380.3 4.2X +SQL ORC Vectorized 2180 2196 23 4.8 207.9 7.7X +SQL ORC MR 3765 3766 2 2.8 359.0 4.5X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Repeated String: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7367 7387 28 1.4 702.6 1.0X -SQL Json 6817 6817 0 1.5 650.1 1.1X -SQL Parquet Vectorized: DataPageV1 602 618 15 17.4 57.5 12.2X -SQL Parquet Vectorized: DataPageV2 599 610 15 17.5 57.1 12.3X -SQL Parquet MR: DataPageV1 1888 1936 68 5.6 180.0 3.9X -SQL Parquet MR: DataPageV2 2000 2018 25 5.2 190.7 3.7X -SQL ORC Vectorized 527 545 22 19.9 50.2 14.0X -SQL ORC MR 1916 1927 16 5.5 182.7 3.8X +SQL CSV 9960 9960 0 1.1 949.8 1.0X +SQL Json 7625 7712 123 1.4 727.2 1.3X +SQL Parquet Vectorized: DataPageV1 577 582 6 18.2 55.0 17.3X +SQL Parquet Vectorized: DataPageV2 584 592 6 18.0 55.7 17.1X +SQL Parquet MR: DataPageV1 1722 1736 19 6.1 164.2 5.8X +SQL Parquet MR: DataPageV2 1662 1668 9 6.3 158.5 6.0X +SQL ORC Vectorized 483 524 27 21.7 46.1 20.6X +SQL ORC MR 1841 1850 14 5.7 175.5 5.4X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 19239 19257 25 0.8 1223.2 1.0X -Data column - Json 12387 12393 8 1.3 787.6 1.6X -Data column - Parquet Vectorized: DataPageV1 227 269 25 69.2 14.5 84.6X -Data column - Parquet Vectorized: DataPageV2 612 651 28 25.7 38.9 31.4X -Data column - Parquet MR: DataPageV1 2989 3016 39 5.3 190.0 6.4X -Data column - Parquet MR: DataPageV2 2750 2754 5 5.7 
174.8 7.0X -Data column - ORC Vectorized 426 467 33 37.0 27.1 45.2X -Data column - ORC MR 2513 2538 35 6.3 159.8 7.7X -Partition column - CSV 6623 6627 5 2.4 421.1 2.9X -Partition column - Json 10234 10235 2 1.5 650.7 1.9X -Partition column - Parquet Vectorized: DataPageV1 56 73 19 279.8 3.6 342.2X -Partition column - Parquet Vectorized: DataPageV2 57 72 19 278.3 3.6 340.4X -Partition column - Parquet MR: DataPageV1 1392 1417 36 11.3 88.5 13.8X -Partition column - Parquet MR: DataPageV2 1390 1416 37 11.3 88.4 13.8X -Partition column - ORC Vectorized 56 89 36 283.2 3.5 346.4X -Partition column - ORC MR 1578 1581 4 10.0 100.4 12.2X -Both columns - CSV 19178 19181 4 0.8 1219.3 1.0X -Both columns - Json 13104 13105 1 1.2 833.1 1.5X -Both columns - Parquet Vectorized: DataPageV1 314 338 21 50.2 19.9 61.4X -Both columns - Parquet Vectorized: DataPageV2 708 741 54 22.2 45.0 27.2X -Both columns - Parquet MR: DataPageV1 3083 3105 31 5.1 196.0 6.2X -Both columns - Parquet MR: DataPageV2 2897 2901 6 5.4 184.2 6.6X -Both columns - ORC Vectorized 456 504 39 34.5 29.0 42.1X -Both columns - ORC MR 2594 2597 4 6.1 164.9 7.4X +Data column - CSV 23787 23788 2 0.7 1512.3 1.0X +Data column - Json 13993 14011 25 1.1 889.7 1.7X +Data column - Parquet Vectorized: DataPageV1 184 235 36 85.4 11.7 129.2X +Data column - Parquet Vectorized: DataPageV2 531 542 15 29.6 33.7 44.8X +Data column - Parquet MR: DataPageV1 2539 2547 13 6.2 161.4 9.4X +Data column - Parquet MR: DataPageV2 2299 2301 3 6.8 146.2 10.3X +Data column - ORC Vectorized 379 403 23 41.5 24.1 62.8X +Data column - ORC MR 2047 2070 33 7.7 130.1 11.6X +Partition column - CSV 6834 6835 1 2.3 434.5 3.5X +Partition column - Json 11444 11478 49 1.4 727.6 2.1X +Partition column - Parquet Vectorized: DataPageV1 51 71 22 308.6 3.2 466.7X +Partition column - Parquet Vectorized: DataPageV2 51 61 16 310.5 3.2 469.5X +Partition column - Parquet MR: DataPageV1 1203 1214 15 13.1 76.5 19.8X +Partition column - Parquet MR: DataPageV2 1210 1224 
20 13.0 76.9 19.7X +Partition column - ORC Vectorized 52 67 14 303.1 3.3 458.4X +Partition column - ORC MR 1338 1342 5 11.8 85.1 17.8X +Both columns - CSV 24051 24052 2 0.7 1529.1 1.0X +Both columns - Json 15016 15030 20 1.0 954.7 1.6X +Both columns - Parquet Vectorized: DataPageV1 235 269 27 66.9 15.0 101.2X +Both columns - Parquet Vectorized: DataPageV2 563 617 60 27.9 35.8 42.2X +Both columns - Parquet MR: DataPageV1 2525 2555 43 6.2 160.5 9.4X +Both columns - Parquet MR: DataPageV2 2256 2267 15 7.0 143.5 10.5X +Both columns - ORC Vectorized 407 454 51 38.7 25.9 58.5X +Both columns - ORC MR 2153 2155 2 7.3 136.9 11.0X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9165 9257 130 1.1 874.1 1.0X -SQL Json 10230 10234 7 1.0 975.6 0.9X -SQL Parquet Vectorized: DataPageV1 1275 1315 56 8.2 121.6 7.2X -SQL Parquet Vectorized: DataPageV2 2406 2407 0 4.4 229.5 3.8X -SQL Parquet MR: DataPageV1 4005 4009 7 2.6 381.9 2.3X -SQL Parquet MR: DataPageV2 4358 4366 12 2.4 415.6 2.1X -ParquetReader Vectorized: DataPageV1 985 995 13 10.6 94.0 9.3X -ParquetReader Vectorized: DataPageV2 2039 2061 32 5.1 194.4 4.5X -SQL ORC Vectorized 1048 1072 34 10.0 99.9 8.7X -SQL ORC MR 3179 3196 24 3.3 303.2 2.9X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 11812 11849 53 0.9 1126.4 1.0X +SQL Json 
11454 11467 18 0.9 1092.3 1.0X +SQL Parquet Vectorized: DataPageV1 1250 1276 37 8.4 119.2 9.5X +SQL Parquet Vectorized: DataPageV2 2248 2261 17 4.7 214.4 5.3X +SQL Parquet MR: DataPageV1 3629 3630 1 2.9 346.1 3.3X +SQL Parquet MR: DataPageV2 3929 3934 6 2.7 374.7 3.0X +ParquetReader Vectorized: DataPageV1 921 922 2 11.4 87.8 12.8X +ParquetReader Vectorized: DataPageV2 1890 1890 0 5.5 180.3 6.2X +SQL ORC Vectorized 1079 1105 36 9.7 102.9 10.9X +SQL ORC MR 3042 3070 40 3.4 290.1 3.9X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6749 6750 2 1.6 643.6 1.0X -SQL Json 7960 7967 10 1.3 759.1 0.8X -SQL Parquet Vectorized: DataPageV1 1078 1105 37 9.7 102.8 6.3X -SQL Parquet Vectorized: DataPageV2 1939 1941 3 5.4 184.9 3.5X -SQL Parquet MR: DataPageV1 3090 3099 13 3.4 294.7 2.2X -SQL Parquet MR: DataPageV2 3274 3286 17 3.2 312.3 2.1X -ParquetReader Vectorized: DataPageV1 1058 1067 13 9.9 100.9 6.4X -ParquetReader Vectorized: DataPageV2 1847 1848 2 5.7 176.2 3.7X -SQL ORC Vectorized 1307 1307 0 8.0 124.6 5.2X -SQL ORC MR 3078 3122 62 3.4 293.6 2.2X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 8399 8410 16 1.2 801.0 1.0X +SQL Json 8892 8905 18 1.2 848.0 0.9X +SQL Parquet Vectorized: DataPageV1 1065 1092 38 9.8 101.6 7.9X +SQL Parquet Vectorized: DataPageV2 1747 1747 0 6.0 166.6 4.8X +SQL Parquet MR: DataPageV1 2718 2719 1 3.9 259.2 3.1X +SQL Parquet MR: DataPageV2 2955 2964 12 3.5 281.8 2.8X +ParquetReader Vectorized: DataPageV1 1082 1084 3 9.7 103.2 7.8X +ParquetReader Vectorized: DataPageV2 1707 1713 9 6.1 162.8 4.9X +SQL ORC Vectorized 1345 1357 17 7.8 128.3 6.2X +SQL ORC MR 3012 3046 47 3.5 287.3 
2.8X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4818 4824 8 2.2 459.5 1.0X -SQL Json 4853 4878 35 2.2 462.8 1.0X -SQL Parquet Vectorized: DataPageV1 255 264 6 41.1 24.3 18.9X -SQL Parquet Vectorized: DataPageV2 711 716 4 14.7 67.8 6.8X -SQL Parquet MR: DataPageV1 2024 2024 1 5.2 193.0 2.4X -SQL Parquet MR: DataPageV2 1920 1922 3 5.5 183.1 2.5X -ParquetReader Vectorized: DataPageV1 272 275 2 38.5 26.0 17.7X -ParquetReader Vectorized: DataPageV2 719 726 6 14.6 68.5 6.7X -SQL ORC Vectorized 478 523 52 21.9 45.6 10.1X -SQL ORC MR 1772 1775 5 5.9 169.0 2.7X +SQL CSV 6169 6176 10 1.7 588.3 1.0X +SQL Json 5352 5376 35 2.0 510.4 1.2X +SQL Parquet Vectorized: DataPageV1 248 255 7 42.3 23.6 24.9X +SQL Parquet Vectorized: DataPageV2 364 372 10 28.8 34.7 17.0X +SQL Parquet MR: DataPageV1 1624 1626 3 6.5 154.9 3.8X +SQL Parquet MR: DataPageV2 1520 1526 8 6.9 145.0 4.1X +ParquetReader Vectorized: DataPageV1 259 262 1 40.4 24.7 23.8X +ParquetReader Vectorized: DataPageV2 376 378 2 27.9 35.9 16.4X +SQL ORC Vectorized 414 438 30 25.3 39.5 14.9X +SQL ORC MR 1580 1596 22 6.6 150.7 3.9X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2157 2171 20 0.5 2056.9 1.0X -SQL Json 2929 2931 3 0.4 2793.5 0.7X -SQL Parquet Vectorized: DataPageV1 46 63 17 22.6 44.2 46.6X -SQL Parquet Vectorized: DataPageV2 68 82 17 15.4 64.8 31.7X -SQL Parquet MR: DataPageV1 204 224 27 5.1 194.4 10.6X -SQL Parquet MR: DataPageV2 188 209 33 5.6 179.6 11.5X -SQL ORC Vectorized 57 73 20 18.4 54.3 37.9X -SQL ORC MR 172 191 20 6.1 163.8 12.6X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 2403 2408 8 0.4 2291.5 1.0X +SQL Json 2925 2934 14 0.4 2789.3 0.8X +SQL Parquet Vectorized: DataPageV1 41 56 15 25.7 38.9 58.9X +SQL Parquet Vectorized: DataPageV2 60 71 15 17.4 57.5 39.8X +SQL Parquet MR: DataPageV1 179 193 19 5.9 170.7 13.4X +SQL Parquet MR: DataPageV2 159 182 29 6.6 152.1 15.1X +SQL ORC Vectorized 50 66 19 21.0 47.6 48.1X +SQL ORC MR 153 177 49 6.9 145.7 15.7X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5271 5277 8 0.2 5027.2 1.0X -SQL Json 11690 11705 21 0.1 11148.8 0.5X -SQL Parquet Vectorized: DataPageV1 62 89 22 17.0 58.7 85.6X -SQL Parquet Vectorized: DataPageV2 83 104 28 12.7 79.0 63.6X -SQL Parquet MR: DataPageV1 222 239 26 4.7 211.3 23.8X -SQL Parquet MR: DataPageV2 207 244 49 5.1 197.1 25.5X -SQL ORC Vectorized 70 91 24 15.1 66.4 75.7X -SQL ORC MR 187 200 21 5.6 178.1 28.2X - -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 5265 5268 5 0.2 5021.1 1.0X +SQL Json 11138 11189 73 0.1 10621.6 0.5X +SQL Parquet Vectorized: DataPageV1 58 83 22 
18.1 55.4 90.7X +SQL Parquet Vectorized: DataPageV2 74 96 25 14.1 70.7 71.0X +SQL Parquet MR: DataPageV1 194 216 31 5.4 185.4 27.1X +SQL Parquet MR: DataPageV2 175 192 24 6.0 167.3 30.0X +SQL ORC Vectorized 65 83 20 16.1 62.1 80.8X +SQL ORC MR 170 187 27 6.2 161.9 31.0X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9264 9267 4 0.1 8835.3 1.0X -SQL Json 21871 22402 751 0.0 20857.9 0.4X -SQL Parquet Vectorized: DataPageV1 90 127 29 11.7 85.8 102.9X -SQL Parquet Vectorized: DataPageV2 112 134 20 9.3 107.1 82.5X -SQL Parquet MR: DataPageV1 255 281 26 4.1 243.4 36.3X -SQL Parquet MR: DataPageV2 241 273 32 4.3 230.3 38.4X -SQL ORC Vectorized 85 99 21 12.3 81.3 108.7X -SQL ORC MR 205 220 21 5.1 195.7 45.1X +SQL CSV 8781 8782 1 0.1 8374.4 1.0X +SQL Json 19880 20088 293 0.1 18959.3 0.4X +SQL Parquet Vectorized: DataPageV1 85 111 20 12.4 80.7 103.8X +SQL Parquet Vectorized: DataPageV2 103 124 24 10.2 98.4 85.1X +SQL Parquet MR: DataPageV1 232 253 19 4.5 221.6 37.8X +SQL Parquet MR: DataPageV2 209 234 23 5.0 199.0 42.1X +SQL ORC Vectorized 86 101 23 12.2 81.9 102.3X +SQL ORC MR 194 212 19 5.4 185.4 45.2X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt index 976ab2f166b23..c13dddbd8c265 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 
+OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9960 10062 144 1.6 633.2 1.0X -SQL Json 7971 8037 92 2.0 506.8 1.2X -SQL Parquet Vectorized: DataPageV1 116 138 14 136.0 7.4 86.1X -SQL Parquet Vectorized: DataPageV2 87 104 14 180.1 5.6 114.1X -SQL Parquet MR: DataPageV1 1708 1712 5 9.2 108.6 5.8X -SQL Parquet MR: DataPageV2 1554 1555 1 10.1 98.8 6.4X -SQL ORC Vectorized 174 182 7 90.3 11.1 57.2X -SQL ORC MR 1510 1512 2 10.4 96.0 6.6X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 11833 12041 295 1.3 752.3 1.0X +SQL Json 8231 8329 138 1.9 523.3 1.4X +SQL Parquet Vectorized: DataPageV1 86 100 14 181.9 5.5 136.9X +SQL Parquet Vectorized: DataPageV2 65 76 8 241.0 4.2 181.3X +SQL Parquet MR: DataPageV1 1629 1632 4 9.7 103.5 7.3X +SQL Parquet MR: DataPageV2 1475 1489 21 10.7 93.8 8.0X +SQL ORC Vectorized 167 176 8 94.3 10.6 70.9X +SQL ORC MR 1386 1391 8 11.4 88.1 8.5X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 61 62 3 258.8 3.9 1.0X -ParquetReader Vectorized: DataPageV2 44 56 24 356.2 2.8 1.4X -ParquetReader Vectorized -> Row: DataPageV1 50 52 2 312.0 3.2 1.2X -ParquetReader Vectorized -> Row: DataPageV2 32 33 1 494.9 2.0 1.9X +ParquetReader Vectorized: DataPageV1 71 73 2 220.3 4.5 1.0X +ParquetReader Vectorized: DataPageV2 60 60 1 263.5 3.8 1.2X +ParquetReader 
Vectorized -> Row: DataPageV1 38 39 1 415.0 2.4 1.9X +ParquetReader Vectorized -> Row: DataPageV2 26 27 1 601.2 1.7 2.7X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11898 11909 15 1.3 756.5 1.0X -SQL Json 8991 9265 388 1.7 571.6 1.3X -SQL Parquet Vectorized: DataPageV1 116 125 9 135.2 7.4 102.3X -SQL Parquet Vectorized: DataPageV2 118 125 8 133.0 7.5 100.6X -SQL Parquet MR: DataPageV1 1965 1982 24 8.0 124.9 6.1X -SQL Parquet MR: DataPageV2 1830 1836 8 8.6 116.4 6.5X -SQL ORC Vectorized 167 175 8 94.3 10.6 71.3X -SQL ORC MR 1471 1472 1 10.7 93.5 8.1X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 13011 13048 51 1.2 827.2 1.0X +SQL Json 9623 9668 64 1.6 611.8 1.4X +SQL Parquet Vectorized: DataPageV1 98 107 11 160.4 6.2 132.7X +SQL Parquet Vectorized: DataPageV2 97 104 6 161.9 6.2 133.9X +SQL Parquet MR: DataPageV1 1763 1793 42 8.9 112.1 7.4X +SQL Parquet MR: DataPageV2 1631 1637 9 9.6 103.7 8.0X +SQL ORC Vectorized 150 156 7 105.1 9.5 86.9X +SQL ORC MR 1384 1414 43 11.4 88.0 9.4X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 118 120 2 133.0 7.5 1.0X -ParquetReader Vectorized: DataPageV2 118 120 4 133.2 7.5 1.0X -ParquetReader Vectorized -> Row: 
DataPageV1 71 73 3 220.1 4.5 1.7X -ParquetReader Vectorized -> Row: DataPageV2 72 74 2 217.7 4.6 1.6X +ParquetReader Vectorized: DataPageV1 101 102 1 155.6 6.4 1.0X +ParquetReader Vectorized: DataPageV2 101 102 2 155.2 6.4 1.0X +ParquetReader Vectorized -> Row: DataPageV1 63 65 2 250.5 4.0 1.6X +ParquetReader Vectorized -> Row: DataPageV2 63 64 1 249.8 4.0 1.6X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12575 12596 29 1.3 799.5 1.0X -SQL Json 9500 9751 355 1.7 604.0 1.3X -SQL Parquet Vectorized: DataPageV1 152 162 11 103.8 9.6 83.0X -SQL Parquet Vectorized: DataPageV2 206 214 6 76.3 13.1 61.0X -SQL Parquet MR: DataPageV1 2150 2170 28 7.3 136.7 5.8X -SQL Parquet MR: DataPageV2 1972 1981 12 8.0 125.4 6.4X -SQL ORC Vectorized 208 217 7 75.5 13.2 60.4X -SQL ORC MR 1626 1638 18 9.7 103.4 7.7X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 13946 13950 6 1.1 886.7 1.0X +SQL Json 10195 10204 12 1.5 648.2 1.4X +SQL Parquet Vectorized: DataPageV1 131 140 8 120.5 8.3 106.9X +SQL Parquet Vectorized: DataPageV2 183 189 5 86.0 11.6 76.3X +SQL Parquet MR: DataPageV1 1979 2023 63 7.9 125.8 7.0X +SQL Parquet MR: DataPageV2 1729 1757 39 9.1 109.9 8.1X +SQL ORC Vectorized 198 206 7 79.4 12.6 70.4X +SQL ORC MR 1547 1562 21 10.2 98.4 9.0X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 223 226 2 70.6 14.2 1.0X -ParquetReader Vectorized: DataPageV2 281 388 190 56.1 17.8 0.8X -ParquetReader Vectorized -> Row: DataPageV1 207 210 2 75.9 13.2 1.1X -ParquetReader Vectorized -> Row: DataPageV2 262 269 9 60.0 16.7 0.8X +ParquetReader Vectorized: DataPageV1 200 208 9 78.5 12.7 1.0X +ParquetReader Vectorized: DataPageV2 249 251 1 63.3 15.8 0.8X +ParquetReader Vectorized -> Row: DataPageV1 196 201 7 80.4 12.4 1.0X +ParquetReader Vectorized -> Row: DataPageV2 245 246 1 64.3 15.5 0.8X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13952 13964 18 1.1 887.0 1.0X -SQL Json 9848 9848 1 1.6 626.1 1.4X -SQL Parquet Vectorized: DataPageV1 146 153 5 107.4 9.3 95.3X -SQL Parquet Vectorized: DataPageV2 294 309 19 53.5 18.7 47.4X -SQL Parquet MR: DataPageV1 2129 2131 3 7.4 135.4 6.6X -SQL Parquet MR: DataPageV2 2033 2042 13 7.7 129.3 6.9X -SQL ORC Vectorized 239 244 4 65.8 15.2 58.3X -SQL ORC MR 1650 1654 6 9.5 104.9 8.5X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 15404 15408 6 1.0 979.3 1.0X +SQL Json 10952 10960 12 1.4 696.3 1.4X +SQL Parquet Vectorized: DataPageV1 147 162 15 107.3 9.3 105.0X +SQL Parquet Vectorized: DataPageV2 286 318 18 55.0 18.2 53.9X +SQL Parquet MR: DataPageV1 2014 2051 52 7.8 128.1 7.6X +SQL Parquet MR: DataPageV2 1877 1887 14 8.4 119.3 8.2X +SQL ORC Vectorized 230 243 17 68.3 14.7 66.8X +SQL ORC MR 1608 
1650 59 9.8 102.3 9.6X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 236 240 2 66.7 15.0 1.0X -ParquetReader Vectorized: DataPageV2 404 423 31 38.9 25.7 0.6X -ParquetReader Vectorized -> Row: DataPageV1 232 237 7 67.7 14.8 1.0X -ParquetReader Vectorized -> Row: DataPageV2 375 381 5 41.9 23.9 0.6X +ParquetReader Vectorized: DataPageV1 232 235 4 67.9 14.7 1.0X +ParquetReader Vectorized: DataPageV2 370 371 1 42.6 23.5 0.6X +ParquetReader Vectorized -> Row: DataPageV1 204 208 4 77.0 13.0 1.1X +ParquetReader Vectorized -> Row: DataPageV2 343 344 2 45.9 21.8 0.7X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18407 18894 690 0.9 1170.3 1.0X -SQL Json 12745 12771 37 1.2 810.3 1.4X -SQL Parquet Vectorized: DataPageV1 246 255 10 64.0 15.6 74.8X -SQL Parquet Vectorized: DataPageV2 559 592 38 28.1 35.5 32.9X -SQL Parquet MR: DataPageV1 2379 2387 11 6.6 151.3 7.7X -SQL Parquet MR: DataPageV2 2188 2196 12 7.2 139.1 8.4X -SQL ORC Vectorized 368 406 51 42.7 23.4 50.0X -SQL ORC MR 1980 2003 32 7.9 125.9 9.3X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 20572 20592 27 0.8 1308.0 1.0X +SQL Json 13884 13889 6 1.1 882.7 1.5X +SQL Parquet Vectorized: DataPageV1 167 194 19 
93.9 10.6 122.9X +SQL Parquet Vectorized: DataPageV2 362 374 15 43.5 23.0 56.9X +SQL Parquet MR: DataPageV1 2348 2352 5 6.7 149.3 8.8X +SQL Parquet MR: DataPageV2 1921 1922 1 8.2 122.1 10.7X +SQL ORC Vectorized 271 324 32 58.0 17.2 75.9X +SQL ORC MR 1742 1744 3 9.0 110.8 11.8X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 313 315 2 50.3 19.9 1.0X -ParquetReader Vectorized: DataPageV2 617 623 6 25.5 39.2 0.5X -ParquetReader Vectorized -> Row: DataPageV1 276 307 20 57.0 17.5 1.1X -ParquetReader Vectorized -> Row: DataPageV2 590 599 8 26.6 37.5 0.5X +ParquetReader Vectorized: DataPageV1 248 259 10 63.3 15.8 1.0X +ParquetReader Vectorized: DataPageV2 431 446 22 36.5 27.4 0.6X +ParquetReader Vectorized -> Row: DataPageV1 293 295 2 53.7 18.6 0.8X +ParquetReader Vectorized -> Row: DataPageV2 448 466 19 35.1 28.5 0.6X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14449 14465 22 1.1 918.7 1.0X -SQL Json 11933 11944 15 1.3 758.7 1.2X -SQL Parquet Vectorized: DataPageV1 145 174 48 108.5 9.2 99.7X -SQL Parquet Vectorized: DataPageV2 178 186 13 88.5 11.3 81.3X -SQL Parquet MR: DataPageV1 2134 2158 34 7.4 135.7 6.8X -SQL Parquet MR: DataPageV2 2014 2026 17 7.8 128.0 7.2X -SQL ORC Vectorized 442 452 18 35.6 28.1 32.7X -SQL ORC MR 1941 1944 5 
8.1 123.4 7.4X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 15514 15667 218 1.0 986.3 1.0X +SQL Json 12242 12254 17 1.3 778.3 1.3X +SQL Parquet Vectorized: DataPageV1 124 132 7 126.6 7.9 124.9X +SQL Parquet Vectorized: DataPageV2 122 128 5 128.6 7.8 126.9X +SQL Parquet MR: DataPageV1 2015 2022 10 7.8 128.1 7.7X +SQL Parquet MR: DataPageV2 1819 1821 4 8.6 115.6 8.5X +SQL ORC Vectorized 390 410 14 40.3 24.8 39.7X +SQL ORC MR 1745 1750 6 9.0 111.0 8.9X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 239 244 5 65.9 15.2 1.0X -ParquetReader Vectorized: DataPageV2 236 301 117 66.6 15.0 1.0X -ParquetReader Vectorized -> Row: DataPageV1 217 237 14 72.6 13.8 1.1X -ParquetReader Vectorized -> Row: DataPageV2 236 238 2 66.6 15.0 1.0X +ParquetReader Vectorized: DataPageV1 202 207 5 77.8 12.9 1.0X +ParquetReader Vectorized: DataPageV2 224 228 6 70.1 14.3 0.9X +ParquetReader Vectorized -> Row: DataPageV1 232 235 5 67.7 14.8 0.9X +ParquetReader Vectorized -> Row: DataPageV2 233 247 26 67.6 14.8 0.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19310 19577 378 0.8 1227.7 1.0X -SQL Json 16132 16146 20 1.0 1025.7 1.2X -SQL Parquet Vectorized: DataPageV1 246 262 18 64.0 
15.6 78.5X -SQL Parquet Vectorized: DataPageV2 216 255 21 72.7 13.8 89.2X -SQL Parquet MR: DataPageV1 2379 3134 1068 6.6 151.3 8.1X -SQL Parquet MR: DataPageV2 6344 6376 46 2.5 403.3 3.0X -SQL ORC Vectorized 484 525 55 32.5 30.8 39.9X -SQL ORC MR 1998 1998 0 7.9 127.0 9.7X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 20827 20829 3 0.8 1324.2 1.0X +SQL Json 17161 17174 18 0.9 1091.1 1.2X +SQL Parquet Vectorized: DataPageV1 200 210 8 78.6 12.7 104.0X +SQL Parquet Vectorized: DataPageV2 202 211 8 77.7 12.9 102.9X +SQL Parquet MR: DataPageV1 2255 2292 53 7.0 143.4 9.2X +SQL Parquet MR: DataPageV2 2087 2091 4 7.5 132.7 10.0X +SQL ORC Vectorized 486 498 9 32.4 30.9 42.9X +SQL ORC MR 1886 1894 10 8.3 119.9 11.0X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 294 308 12 53.5 18.7 1.0X -ParquetReader Vectorized: DataPageV2 316 324 9 49.8 20.1 0.9X -ParquetReader Vectorized -> Row: DataPageV1 289 321 17 54.5 18.4 1.0X -ParquetReader Vectorized -> Row: DataPageV2 317 319 2 49.6 20.2 0.9X +ParquetReader Vectorized: DataPageV1 283 296 31 55.5 18.0 1.0X +ParquetReader Vectorized: DataPageV2 266 278 12 59.1 16.9 1.1X +ParquetReader Vectorized -> Row: DataPageV1 265 278 9 59.4 16.8 1.1X +ParquetReader Vectorized -> Row: DataPageV2 263 275 10 59.9 16.7 1.1X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU 
@ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13320 13335 21 0.8 1270.3 1.0X -SQL Json 11087 11093 8 0.9 1057.3 1.2X -SQL Parquet Vectorized: DataPageV1 2098 2115 24 5.0 200.1 6.3X -SQL Parquet Vectorized: DataPageV2 2843 2961 167 3.7 271.1 4.7X -SQL Parquet MR: DataPageV1 4169 4201 45 2.5 397.6 3.2X -SQL Parquet MR: DataPageV2 4397 4414 24 2.4 419.4 3.0X -SQL ORC Vectorized 2864 4244 1951 3.7 273.2 4.7X -SQL ORC MR 3544 3549 7 3.0 338.0 3.8X +SQL CSV 14490 14490 0 0.7 1381.8 1.0X +SQL Json 11765 11779 20 0.9 1122.0 1.2X +SQL Parquet Vectorized: DataPageV1 2134 2168 48 4.9 203.5 6.8X +SQL Parquet Vectorized: DataPageV2 2694 2700 9 3.9 256.9 5.4X +SQL Parquet MR: DataPageV1 4082 4090 10 2.6 389.3 3.5X +SQL Parquet MR: DataPageV2 4170 4171 0 2.5 397.7 3.5X +SQL ORC Vectorized 2136 2138 2 4.9 203.7 6.8X +SQL ORC MR 3714 3752 53 2.8 354.2 3.9X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7305 7308 4 1.4 696.7 1.0X -SQL Json 6621 6622 1 1.6 631.4 1.1X -SQL Parquet Vectorized: DataPageV1 608 612 4 17.2 58.0 12.0X -SQL Parquet Vectorized: DataPageV2 611 618 7 17.2 58.2 12.0X -SQL 
Parquet MR: DataPageV1 1706 1710 5 6.1 162.7 4.3X -SQL Parquet MR: DataPageV2 1640 1653 19 6.4 156.4 4.5X -SQL ORC Vectorized 501 504 3 20.9 47.7 14.6X -SQL ORC MR 1909 1926 24 5.5 182.1 3.8X +SQL CSV 7936 7973 53 1.3 756.8 1.0X +SQL Json 6859 6861 3 1.5 654.1 1.2X +SQL Parquet Vectorized: DataPageV1 651 657 6 16.1 62.1 12.2X +SQL Parquet Vectorized: DataPageV2 650 655 4 16.1 62.0 12.2X +SQL Parquet MR: DataPageV1 1650 1674 34 6.4 157.3 4.8X +SQL Parquet MR: DataPageV2 1594 1624 42 6.6 152.1 5.0X +SQL ORC Vectorized 487 509 25 21.5 46.4 16.3X +SQL ORC MR 1751 1780 41 6.0 167.0 4.5X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 18479 18682 288 0.9 1174.8 1.0X -Data column - Json 12533 12541 11 1.3 796.8 1.5X -Data column - Parquet Vectorized: DataPageV1 272 303 35 57.8 17.3 67.9X -Data column - Parquet Vectorized: DataPageV2 629 653 18 25.0 40.0 29.4X -Data column - Parquet MR: DataPageV1 2777 2782 7 5.7 176.5 6.7X -Data column - Parquet MR: DataPageV2 2581 2603 31 6.1 164.1 7.2X -Data column - ORC Vectorized 418 440 27 37.6 26.6 44.2X -Data column - ORC MR 2297 2332 50 6.8 146.0 8.0X -Partition column - CSV 6496 6569 104 2.4 413.0 2.8X -Partition column - Json 10072 10077 7 1.6 640.3 1.8X -Partition column - Parquet Vectorized: DataPageV1 56 63 5 282.4 3.5 331.7X -Partition column - Parquet Vectorized: DataPageV2 54 58 6 290.0 3.4 340.6X 
-Partition column - Parquet MR: DataPageV1 1356 1360 6 11.6 86.2 13.6X -Partition column - Parquet MR: DataPageV2 1347 1348 2 11.7 85.6 13.7X -Partition column - ORC Vectorized 55 60 6 284.3 3.5 334.0X -Partition column - ORC MR 1458 1468 15 10.8 92.7 12.7X -Both columns - CSV 19228 20030 1133 0.8 1222.5 1.0X -Both columns - Json 13465 13516 71 1.2 856.1 1.4X -Both columns - Parquet Vectorized: DataPageV1 267 269 2 59.0 17.0 69.3X -Both columns - Parquet Vectorized: DataPageV2 659 691 29 23.9 41.9 28.1X -Both columns - Parquet MR: DataPageV1 2819 2850 44 5.6 179.2 6.6X -Both columns - Parquet MR: DataPageV2 2584 2585 1 6.1 164.3 7.1X -Both columns - ORC Vectorized 389 406 23 40.4 24.7 47.5X -Both columns - ORC MR 2490 2504 19 6.3 158.3 7.4X +Data column - CSV 20442 20527 119 0.8 1299.7 1.0X +Data column - Json 13399 13400 1 1.2 851.9 1.5X +Data column - Parquet Vectorized: DataPageV1 204 223 26 77.1 13.0 100.2X +Data column - Parquet Vectorized: DataPageV2 367 402 22 42.9 23.3 55.7X +Data column - Parquet MR: DataPageV1 2619 2631 18 6.0 166.5 7.8X +Data column - Parquet MR: DataPageV2 2354 2367 18 6.7 149.7 8.7X +Data column - ORC Vectorized 373 386 13 42.1 23.7 54.7X +Data column - ORC MR 2013 2013 1 7.8 128.0 10.2X +Partition column - CSV 6545 6565 28 2.4 416.1 3.1X +Partition column - Json 10543 10549 10 1.5 670.3 1.9X +Partition column - Parquet Vectorized: DataPageV1 44 48 6 359.1 2.8 466.8X +Partition column - Parquet Vectorized: DataPageV2 44 49 7 357.7 2.8 464.9X +Partition column - Parquet MR: DataPageV1 1241 1243 3 12.7 78.9 16.5X +Partition column - Parquet MR: DataPageV2 1227 1234 10 12.8 78.0 16.7X +Partition column - ORC Vectorized 45 50 4 347.1 2.9 451.1X +Partition column - ORC MR 1323 1332 12 11.9 84.1 15.4X +Both columns - CSV 20858 20873 22 0.8 1326.1 1.0X +Both columns - Json 14078 14082 5 1.1 895.1 1.5X +Both columns - Parquet Vectorized: DataPageV1 224 234 8 70.2 14.2 91.3X +Both columns - Parquet Vectorized: DataPageV2 441 450 9 35.7 28.0 
46.4X +Both columns - Parquet MR: DataPageV1 2655 2664 13 5.9 168.8 7.7X +Both columns - Parquet MR: DataPageV2 2298 2312 19 6.8 146.1 8.9X +Both columns - ORC Vectorized 377 391 22 41.8 24.0 54.3X +Both columns - ORC MR 2131 2135 6 7.4 135.5 9.6X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8769 8915 207 1.2 836.2 1.0X -SQL Json 9987 9998 16 1.0 952.4 0.9X -SQL Parquet Vectorized: DataPageV1 1362 1365 4 7.7 129.9 6.4X -SQL Parquet Vectorized: DataPageV2 2118 2124 8 5.0 202.0 4.1X -SQL Parquet MR: DataPageV1 3631 4788 1637 2.9 346.3 2.4X -SQL Parquet MR: DataPageV2 4213 4227 19 2.5 401.8 2.1X -ParquetReader Vectorized: DataPageV1 1003 1012 13 10.5 95.6 8.7X -ParquetReader Vectorized: DataPageV2 1789 1797 12 5.9 170.6 4.9X -SQL ORC Vectorized 980 1029 70 10.7 93.5 8.9X -SQL ORC MR 3057 3060 4 3.4 291.5 2.9X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 9558 9562 6 1.1 911.5 1.0X +SQL Json 10497 10513 22 1.0 1001.1 0.9X +SQL Parquet Vectorized: DataPageV1 1409 1419 15 7.4 134.4 6.8X +SQL Parquet Vectorized: DataPageV2 2264 2270 9 4.6 215.9 4.2X +SQL Parquet MR: DataPageV1 3468 3473 7 3.0 330.7 2.8X +SQL Parquet MR: DataPageV2 3688 3768 112 2.8 351.8 2.6X +ParquetReader Vectorized: DataPageV1 1005 1008 4 10.4 95.8 9.5X +ParquetReader Vectorized: DataPageV2 1538 1538 0 6.8 
146.7 6.2X +SQL ORC Vectorized 1099 1123 34 9.5 104.8 8.7X +SQL ORC MR 2959 2963 6 3.5 282.2 3.2X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6508 6509 1 1.6 620.7 1.0X -SQL Json 7775 7790 22 1.3 741.5 0.8X -SQL Parquet Vectorized: DataPageV1 988 992 4 10.6 94.2 6.6X -SQL Parquet Vectorized: DataPageV2 1605 1612 10 6.5 153.1 4.1X -SQL Parquet MR: DataPageV1 2837 2840 4 3.7 270.6 2.3X -SQL Parquet MR: DataPageV2 3077 3082 7 3.4 293.5 2.1X -ParquetReader Vectorized: DataPageV1 930 951 30 11.3 88.6 7.0X -ParquetReader Vectorized: DataPageV2 1511 1516 6 6.9 144.1 4.3X -SQL ORC Vectorized 1209 1227 25 8.7 115.3 5.4X -SQL ORC MR 2889 2895 9 3.6 275.5 2.3X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 6723 6723 1 1.6 641.1 1.0X +SQL Json 8098 8098 1 1.3 772.3 0.8X +SQL Parquet Vectorized: DataPageV1 1046 1046 0 10.0 99.7 6.4X +SQL Parquet Vectorized: DataPageV2 1308 1315 10 8.0 124.7 5.1X +SQL Parquet MR: DataPageV1 2666 2669 4 3.9 254.2 2.5X +SQL Parquet MR: DataPageV2 2810 2811 1 3.7 268.0 2.4X +ParquetReader Vectorized: DataPageV1 952 954 2 11.0 90.8 7.1X +ParquetReader Vectorized: DataPageV2 1201 1202 1 8.7 114.6 5.6X +SQL ORC Vectorized 1279 1285 8 8.2 122.0 5.3X +SQL ORC MR 2880 2939 84 3.6 274.6 2.3X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4587 4604 24 2.3 437.5 1.0X -SQL Json 4677 4679 3 2.2 
446.0 1.0X -SQL Parquet Vectorized: DataPageV1 235 239 5 44.7 22.4 19.5X -SQL Parquet Vectorized: DataPageV2 620 622 2 16.9 59.1 7.4X -SQL Parquet MR: DataPageV1 1804 1812 11 5.8 172.1 2.5X -SQL Parquet MR: DataPageV2 1605 1659 76 6.5 153.1 2.9X -ParquetReader Vectorized: DataPageV1 235 237 2 44.7 22.4 19.6X -ParquetReader Vectorized: DataPageV2 613 617 3 17.1 58.5 7.5X -SQL ORC Vectorized 387 391 2 27.1 36.9 11.8X -SQL ORC MR 1632 1635 4 6.4 155.7 2.8X +SQL CSV 4638 4645 11 2.3 442.3 1.0X +SQL Json 4646 4650 5 2.3 443.1 1.0X +SQL Parquet Vectorized: DataPageV1 226 231 4 46.4 21.6 20.5X +SQL Parquet Vectorized: DataPageV2 307 311 2 34.2 29.3 15.1X +SQL Parquet MR: DataPageV1 1593 1597 5 6.6 151.9 2.9X +SQL Parquet MR: DataPageV2 1452 1460 12 7.2 138.4 3.2X +ParquetReader Vectorized: DataPageV1 235 237 2 44.6 22.4 19.7X +ParquetReader Vectorized: DataPageV2 314 315 2 33.4 29.9 14.8X +SQL ORC Vectorized 392 398 4 26.7 37.4 11.8X +SQL ORC MR 1470 1483 18 7.1 140.2 3.2X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2056 2068 17 0.5 1960.9 1.0X -SQL Json 3193 3242 70 0.3 3045.3 0.6X -SQL Parquet Vectorized: DataPageV1 45 50 7 23.1 43.3 45.3X -SQL Parquet Vectorized: DataPageV2 66 72 5 15.8 63.4 30.9X -SQL Parquet MR: DataPageV1 192 197 6 5.4 183.5 10.7X -SQL Parquet MR: DataPageV2 180 186 7 5.8 171.3 11.4X -SQL ORC Vectorized 55 60 6 
19.2 52.1 37.6X -SQL ORC MR 164 169 7 6.4 156.2 12.6X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 2187 2191 6 0.5 2085.6 1.0X +SQL Json 2957 2978 30 0.4 2820.3 0.7X +SQL Parquet Vectorized: DataPageV1 36 39 5 29.2 34.3 60.8X +SQL Parquet Vectorized: DataPageV2 48 50 4 22.0 45.4 46.0X +SQL Parquet MR: DataPageV1 179 184 4 5.8 171.1 12.2X +SQL Parquet MR: DataPageV2 156 163 5 6.7 149.2 14.0X +SQL ORC Vectorized 46 49 4 22.9 43.7 47.7X +SQL ORC MR 143 148 4 7.3 136.2 15.3X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4292 4319 38 0.2 4093.3 1.0X -SQL Json 12613 12769 221 0.1 12028.9 0.3X -SQL Parquet Vectorized: DataPageV1 59 78 23 17.6 56.7 72.2X -SQL Parquet Vectorized: DataPageV2 82 87 6 12.8 78.1 52.4X -SQL Parquet MR: DataPageV1 214 219 6 4.9 204.0 20.1X -SQL Parquet MR: DataPageV2 197 204 6 5.3 188.3 21.7X -SQL ORC Vectorized 70 77 8 14.9 67.0 61.1X -SQL ORC MR 186 200 12 5.6 177.3 23.1X - -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 4728 4728 0 0.2 4508.8 1.0X +SQL Json 11030 11074 62 0.1 10519.5 0.4X +SQL Parquet Vectorized: DataPageV1 48 53 8 21.9 45.7 98.6X +SQL Parquet Vectorized: DataPageV2 59 62 4 17.6 56.7 79.5X +SQL Parquet MR: DataPageV1 197 201 4 5.3 187.8 24.0X +SQL Parquet MR: DataPageV2 174 178 5 6.0 165.8 27.2X +SQL ORC Vectorized 59 63 5 17.7 56.6 79.7X +SQL ORC MR 157 162 5 6.7 150.1 30.0X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7312 7322 14 0.1 6973.3 1.0X -SQL Json 24046 24366 452 0.0 22932.0 0.3X -SQL Parquet Vectorized: DataPageV1 86 94 7 12.1 82.4 84.6X -SQL Parquet Vectorized: DataPageV2 109 113 7 9.7 103.5 67.4X -SQL Parquet MR: DataPageV1 245 257 12 4.3 233.6 29.9X -SQL Parquet MR: DataPageV2 228 234 8 4.6 217.1 32.1X -SQL ORC Vectorized 86 91 6 12.2 82.1 84.9X -SQL ORC MR 199 209 12 5.3 190.2 36.7X +SQL CSV 7841 7851 14 0.1 7477.6 1.0X +SQL Json 20753 20875 172 0.1 19791.5 0.4X +SQL Parquet Vectorized: DataPageV1 74 79 4 14.2 70.5 106.0X +SQL Parquet Vectorized: DataPageV2 84 88 4 12.5 79.9 93.6X +SQL Parquet MR: DataPageV1 223 228 6 4.7 213.1 35.1X +SQL Parquet MR: DataPageV2 197 202 5 5.3 188.3 39.7X +SQL ORC Vectorized 73 77 4 14.3 69.9 106.9X +SQL ORC MR 171 175 3 6.1 163.5 45.7X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index fed5d5a84933c..54b57801cfb81 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20534 20708 247 0.8 1305.5 1.0X -SQL Json 10166 10196 42 1.5 646.3 2.0X -SQL Parquet Vectorized: DataPageV1 148 185 22 106.5 9.4 139.0X -SQL Parquet Vectorized: DataPageV2 110 
138 26 142.4 7.0 185.9X -SQL Parquet MR: DataPageV1 2098 2108 14 7.5 133.4 9.8X -SQL Parquet MR: DataPageV2 1865 1875 13 8.4 118.6 11.0X -SQL ORC Vectorized 199 209 8 79.2 12.6 103.3X -SQL ORC MR 2047 2053 8 7.7 130.2 10.0X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 11081 11162 114 1.4 704.5 1.0X +SQL Json 7274 7299 36 2.2 462.5 1.5X +SQL Parquet Vectorized: DataPageV1 98 110 8 160.6 6.2 113.1X +SQL Parquet Vectorized: DataPageV2 69 77 6 228.7 4.4 161.1X +SQL Parquet MR: DataPageV1 1560 1566 8 10.1 99.2 7.1X +SQL Parquet MR: DataPageV2 1360 1369 12 11.6 86.5 8.1X +SQL ORC Vectorized 147 153 6 106.7 9.4 75.2X +SQL ORC MR 1378 1404 36 11.4 87.6 8.0X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 122 130 6 129.4 7.7 1.0X -ParquetReader Vectorized: DataPageV2 110 117 5 142.7 7.0 1.1X -ParquetReader Vectorized -> Row: DataPageV1 54 56 2 291.8 3.4 2.3X -ParquetReader Vectorized -> Row: DataPageV2 46 49 2 339.2 2.9 2.6X +ParquetReader Vectorized: DataPageV1 91 93 3 173.4 5.8 1.0X +ParquetReader Vectorized: DataPageV2 83 86 5 188.9 5.3 1.1X +ParquetReader Vectorized -> Row: DataPageV1 38 38 1 415.3 2.4 2.4X +ParquetReader Vectorized -> Row: DataPageV2 30 31 1 518.9 1.9 3.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 22495 22582 124 0.7 1430.2 1.0X -SQL Json 11601 11684 118 1.4 737.5 1.9X -SQL Parquet Vectorized: DataPageV1 173 188 22 90.8 11.0 129.9X -SQL Parquet Vectorized: DataPageV2 169 185 20 93.1 10.7 133.2X -SQL Parquet MR: DataPageV1 2408 2434 37 6.5 153.1 9.3X -SQL Parquet MR: DataPageV2 2218 2222 5 7.1 141.0 10.1X -SQL ORC Vectorized 161 172 10 97.4 10.3 139.3X -SQL ORC MR 1926 1949 33 8.2 122.4 11.7X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 13865 14012 208 1.1 881.5 1.0X +SQL Json 8898 9468 805 1.8 565.7 1.6X +SQL Parquet Vectorized: DataPageV1 110 114 5 143.5 7.0 126.5X +SQL Parquet Vectorized: DataPageV2 107 113 10 146.8 6.8 129.4X +SQL Parquet MR: DataPageV1 1776 1823 66 8.9 112.9 7.8X +SQL Parquet MR: DataPageV2 1670 1723 74 9.4 106.2 8.3X +SQL ORC Vectorized 155 157 2 101.4 9.9 89.4X +SQL ORC MR 1425 1460 49 11.0 90.6 9.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 220 229 6 71.4 14.0 1.0X -ParquetReader Vectorized: DataPageV2 223 228 4 70.7 14.1 1.0X -ParquetReader Vectorized -> Row: DataPageV1 203 212 12 77.3 12.9 1.1X -ParquetReader Vectorized -> Row: DataPageV2 201 208 5 78.1 12.8 1.1X +ParquetReader Vectorized: DataPageV1 161 163 2 97.7 10.2 1.0X +ParquetReader Vectorized: DataPageV2 161 168 22 97.8 10.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 137 138 1 115.1 8.7 1.2X +ParquetReader Vectorized -> Row: DataPageV2 137 139 3 115.0 8.7 1.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure 
-Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 23404 23570 234 0.7 1488.0 1.0X -SQL Json 12152 12199 67 1.3 772.6 1.9X -SQL Parquet Vectorized: DataPageV1 266 279 10 59.2 16.9 88.1X -SQL Parquet Vectorized: DataPageV2 265 275 8 59.3 16.9 88.2X -SQL Parquet MR: DataPageV1 2721 2762 58 5.8 173.0 8.6X -SQL Parquet MR: DataPageV2 2299 2326 38 6.8 146.2 10.2X -SQL ORC Vectorized 227 232 4 69.4 14.4 103.3X -SQL ORC MR 2020 2118 139 7.8 128.5 11.6X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 15112 15131 26 1.0 960.8 1.0X +SQL Json 9439 9444 6 1.7 600.1 1.6X +SQL Parquet Vectorized: DataPageV1 170 173 3 92.3 10.8 88.7X +SQL Parquet Vectorized: DataPageV2 175 178 4 89.7 11.1 86.2X +SQL Parquet MR: DataPageV1 2033 2050 24 7.7 129.3 7.4X +SQL Parquet MR: DataPageV2 1656 1661 7 9.5 105.3 9.1X +SQL ORC Vectorized 156 161 5 100.9 9.9 96.9X +SQL ORC MR 1474 1490 22 10.7 93.7 10.3X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 306 322 14 51.3 19.5 1.0X -ParquetReader Vectorized: DataPageV2 348 354 6 45.2 22.1 0.9X -ParquetReader Vectorized -> Row: DataPageV1 304 315 8 51.7 19.4 1.0X -ParquetReader Vectorized -> Row: DataPageV2 349 358 7 45.1 22.2 0.9X +ParquetReader Vectorized: DataPageV1 226 228 3 69.6 14.4 1.0X +ParquetReader 
Vectorized: DataPageV2 259 261 3 60.7 16.5 0.9X +ParquetReader Vectorized -> Row: DataPageV1 210 215 8 74.9 13.4 1.1X +ParquetReader Vectorized -> Row: DataPageV2 245 246 2 64.3 15.6 0.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 25064 25080 22 0.6 1593.5 1.0X -SQL Json 12730 12829 140 1.2 809.3 2.0X -SQL Parquet Vectorized: DataPageV1 210 223 8 74.8 13.4 119.2X -SQL Parquet Vectorized: DataPageV2 374 394 11 42.0 23.8 66.9X -SQL Parquet MR: DataPageV1 2710 2757 66 5.8 172.3 9.2X -SQL Parquet MR: DataPageV2 2378 2385 9 6.6 151.2 10.5X -SQL ORC Vectorized 306 316 6 51.5 19.4 82.0X -SQL ORC MR 2199 2218 27 7.2 139.8 11.4X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 16706 16719 18 0.9 1062.1 1.0X +SQL Json 10038 10044 10 1.6 638.2 1.7X +SQL Parquet Vectorized: DataPageV1 119 123 7 132.0 7.6 140.2X +SQL Parquet Vectorized: DataPageV2 243 248 6 64.8 15.4 68.8X +SQL Parquet MR: DataPageV1 2045 2047 2 7.7 130.0 8.2X +SQL Parquet MR: DataPageV2 1731 1739 12 9.1 110.0 9.7X +SQL ORC Vectorized 215 219 3 73.1 13.7 77.6X +SQL ORC MR 1527 1534 10 10.3 97.1 10.9X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 286 307 19 55.0 18.2 1.0X -ParquetReader Vectorized: DataPageV2 
468 479 11 33.6 29.8 0.6X -ParquetReader Vectorized -> Row: DataPageV1 282 293 11 55.7 18.0 1.0X -ParquetReader Vectorized -> Row: DataPageV2 467 478 8 33.7 29.7 0.6X +ParquetReader Vectorized: DataPageV1 203 207 5 77.6 12.9 1.0X +ParquetReader Vectorized: DataPageV2 326 328 2 48.3 20.7 0.6X +ParquetReader Vectorized -> Row: DataPageV1 189 191 2 83.1 12.0 1.1X +ParquetReader Vectorized -> Row: DataPageV2 311 335 55 50.6 19.8 0.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 32250 32360 155 0.5 2050.4 1.0X -SQL Json 16196 16367 241 1.0 1029.7 2.0X -SQL Parquet Vectorized: DataPageV1 311 325 14 50.7 19.7 103.9X -SQL Parquet Vectorized: DataPageV2 629 641 13 25.0 40.0 51.2X -SQL Parquet MR: DataPageV1 2893 2916 33 5.4 184.0 11.1X -SQL Parquet MR: DataPageV2 2506 2528 30 6.3 159.3 12.9X -SQL ORC Vectorized 388 403 9 40.5 24.7 83.1X -SQL ORC MR 2214 2232 25 7.1 140.8 14.6X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 22154 22163 14 0.7 1408.5 1.0X +SQL Json 12754 12760 8 1.2 810.9 1.7X +SQL Parquet Vectorized: DataPageV1 185 190 8 85.2 11.7 119.9X +SQL Parquet Vectorized: DataPageV2 386 391 6 40.8 24.5 57.5X +SQL Parquet MR: DataPageV1 2111 2112 2 7.4 134.2 10.5X +SQL Parquet MR: DataPageV2 1808 1808 0 8.7 115.0 12.3X +SQL ORC Vectorized 267 273 6 58.9 17.0 82.9X +SQL ORC MR 1603 1609 8 9.8 101.9 13.8X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 373 398 15 42.2 23.7 1.0X -ParquetReader Vectorized: DataPageV2 709 715 6 22.2 45.1 0.5X -ParquetReader Vectorized -> Row: DataPageV1 379 388 8 41.5 24.1 1.0X -ParquetReader Vectorized -> Row: DataPageV2 691 704 11 22.8 44.0 0.5X +ParquetReader Vectorized: DataPageV1 258 281 55 61.0 16.4 1.0X +ParquetReader Vectorized: DataPageV2 471 479 10 33.4 29.9 0.5X +ParquetReader Vectorized -> Row: DataPageV1 247 250 4 63.8 15.7 1.0X +ParquetReader Vectorized -> Row: DataPageV2 457 462 9 34.4 29.1 0.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 26962 27080 167 0.6 1714.2 1.0X -SQL Json 15554 15738 261 1.0 988.9 1.7X -SQL Parquet Vectorized: DataPageV1 196 206 10 80.1 12.5 137.3X -SQL Parquet Vectorized: DataPageV2 191 202 11 82.3 12.1 141.2X -SQL Parquet MR: DataPageV1 2674 2698 35 5.9 170.0 10.1X -SQL Parquet MR: DataPageV2 2502 2512 14 6.3 159.1 10.8X -SQL ORC Vectorized 427 448 16 36.9 27.1 63.2X -SQL ORC MR 2271 2299 39 6.9 144.4 11.9X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 17565 17590 35 0.9 1116.7 1.0X +SQL Json 12371 12397 37 1.3 786.5 1.4X +SQL Parquet Vectorized: DataPageV1 130 135 7 121.1 8.3 135.2X +SQL Parquet Vectorized: DataPageV2 129 133 8 121.6 8.2 135.7X +SQL Parquet MR: DataPageV1 2002 2013 15 7.9 127.3 8.8X +SQL Parquet MR: DataPageV2 1823 1824 1 8.6 115.9 9.6X +SQL ORC Vectorized 356 
359 2 44.1 22.7 49.3X +SQL ORC MR 1626 1650 34 9.7 103.4 10.8X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 290 301 11 54.2 18.4 1.0X -ParquetReader Vectorized: DataPageV2 288 300 12 54.7 18.3 1.0X -ParquetReader Vectorized -> Row: DataPageV1 281 291 7 55.9 17.9 1.0X -ParquetReader Vectorized -> Row: DataPageV2 282 290 9 55.8 17.9 1.0X +ParquetReader Vectorized: DataPageV1 200 203 4 78.5 12.7 1.0X +ParquetReader Vectorized: DataPageV2 200 203 3 78.6 12.7 1.0X +ParquetReader Vectorized -> Row: DataPageV1 187 189 3 84.3 11.9 1.1X +ParquetReader Vectorized -> Row: DataPageV2 188 189 2 83.8 11.9 1.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 33265 33351 122 0.5 2114.9 1.0X -SQL Json 20959 21022 89 0.8 1332.5 1.6X -SQL Parquet Vectorized: DataPageV1 291 308 11 54.1 18.5 114.4X -SQL Parquet Vectorized: DataPageV2 291 299 7 54.1 18.5 114.4X -SQL Parquet MR: DataPageV1 2838 2892 76 5.5 180.4 11.7X -SQL Parquet MR: DataPageV2 2699 2700 0 5.8 171.6 12.3X -SQL ORC Vectorized 504 527 22 31.2 32.0 66.0X -SQL ORC MR 2355 2365 14 6.7 149.7 14.1X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 23516 23535 26 0.7 1495.1 1.0X +SQL Json 17023 17023 0 0.9 1082.3 1.4X +SQL 
Parquet Vectorized: DataPageV1 184 189 7 85.4 11.7 127.7X +SQL Parquet Vectorized: DataPageV2 184 188 4 85.5 11.7 127.8X +SQL Parquet MR: DataPageV1 2098 2100 3 7.5 133.4 11.2X +SQL Parquet MR: DataPageV2 1939 1949 13 8.1 123.3 12.1X +SQL ORC Vectorized 392 393 1 40.2 24.9 60.0X +SQL ORC MR 1735 1747 16 9.1 110.3 13.6X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 386 395 7 40.8 24.5 1.0X -ParquetReader Vectorized: DataPageV2 375 386 14 41.9 23.9 1.0X -ParquetReader Vectorized -> Row: DataPageV1 372 387 16 42.2 23.7 1.0X -ParquetReader Vectorized -> Row: DataPageV2 374 384 15 42.0 23.8 1.0X +ParquetReader Vectorized: DataPageV1 261 265 6 60.2 16.6 1.0X +ParquetReader Vectorized: DataPageV2 260 264 7 60.4 16.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 246 249 6 64.1 15.6 1.1X +ParquetReader Vectorized -> Row: DataPageV2 245 247 2 64.2 15.6 1.1X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 23318 23395 110 0.4 2223.7 1.0X -SQL Json 14676 14727 72 0.7 1399.7 1.6X -SQL Parquet Vectorized: DataPageV1 2420 2445 35 4.3 230.8 9.6X 
-SQL Parquet Vectorized: DataPageV2 3734 3790 79 2.8 356.1 6.2X -SQL Parquet MR: DataPageV1 4865 4933 97 2.2 463.9 4.8X -SQL Parquet MR: DataPageV2 4998 5030 45 2.1 476.7 4.7X -SQL ORC Vectorized 2395 2444 69 4.4 228.4 9.7X -SQL ORC MR 4666 4669 5 2.2 444.9 5.0X +SQL CSV 15440 15456 22 0.7 1472.5 1.0X +SQL Json 11982 11993 16 0.9 1142.7 1.3X +SQL Parquet Vectorized: DataPageV1 2103 2106 5 5.0 200.5 7.3X +SQL Parquet Vectorized: DataPageV2 3012 3024 18 3.5 287.2 5.1X +SQL Parquet MR: DataPageV1 3874 3880 7 2.7 369.5 4.0X +SQL Parquet MR: DataPageV2 3816 3821 7 2.7 363.9 4.0X +SQL ORC Vectorized 2073 2076 4 5.1 197.7 7.4X +SQL ORC MR 3705 3708 4 2.8 353.4 4.2X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14158 14278 170 0.7 1350.2 1.0X -SQL Json 8620 8768 209 1.2 822.1 1.6X -SQL Parquet Vectorized: DataPageV1 856 868 10 12.2 81.6 16.5X -SQL Parquet Vectorized: DataPageV2 876 881 5 12.0 83.5 16.2X -SQL Parquet MR: DataPageV1 2207 2228 29 4.8 210.5 6.4X -SQL Parquet MR: DataPageV2 2130 2139 13 4.9 203.2 6.6X -SQL ORC Vectorized 559 569 15 18.8 53.3 25.3X -SQL ORC MR 2375 2388 18 4.4 226.5 6.0X +SQL CSV 8007 8041 48 1.3 763.6 1.0X +SQL Json 7111 7114 4 1.5 678.2 1.1X +SQL Parquet Vectorized: DataPageV1 735 737 1 14.3 70.1 10.9X +SQL Parquet Vectorized: DataPageV2 739 746 8 14.2 70.5 10.8X +SQL Parquet MR: DataPageV1 1657 1657 1 6.3 158.0 4.8X +SQL Parquet MR: DataPageV2 
1566 1571 7 6.7 149.3 5.1X +SQL ORC Vectorized 449 450 2 23.4 42.8 17.9X +SQL ORC MR 1829 1842 18 5.7 174.4 4.4X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 32546 32601 77 0.5 2069.2 1.0X -Data column - Json 16186 16506 453 1.0 1029.1 2.0X -Data column - Parquet Vectorized: DataPageV1 304 312 9 51.8 19.3 107.2X -Data column - Parquet Vectorized: DataPageV2 645 662 15 24.4 41.0 50.5X -Data column - Parquet MR: DataPageV1 3209 3267 83 4.9 204.0 10.1X -Data column - Parquet MR: DataPageV2 2893 2942 69 5.4 184.0 11.2X -Data column - ORC Vectorized 405 418 12 38.8 25.8 80.3X -Data column - ORC MR 2743 2804 85 5.7 174.4 11.9X -Partition column - CSV 8546 8618 102 1.8 543.3 3.8X -Partition column - Json 13380 13456 108 1.2 850.7 2.4X -Partition column - Parquet Vectorized: DataPageV1 64 71 5 247.4 4.0 511.9X -Partition column - Parquet Vectorized: DataPageV2 66 73 5 237.7 4.2 491.8X -Partition column - Parquet MR: DataPageV1 1524 1551 38 10.3 96.9 21.4X -Partition column - Parquet MR: DataPageV2 1587 1593 8 9.9 100.9 20.5X -Partition column - ORC Vectorized 69 76 4 227.1 4.4 469.9X -Partition column - ORC MR 1761 1764 4 8.9 112.0 18.5X -Both columns - CSV 29992 30004 17 0.5 1906.8 1.1X -Both columns - Json 17684 17888 288 0.9 1124.3 1.8X -Both columns - Parquet Vectorized: DataPageV1 349 362 11 45.1 22.2 93.4X -Both columns - Parquet Vectorized: 
DataPageV2 708 712 4 22.2 45.0 45.9X -Both columns - Parquet MR: DataPageV1 3294 3301 10 4.8 209.4 9.9X -Both columns - Parquet MR: DataPageV2 2887 2905 26 5.4 183.5 11.3X -Both columns - ORC Vectorized 441 450 9 35.7 28.0 73.8X -Both columns - ORC MR 2842 2925 117 5.5 180.7 11.5X +Data column - CSV 21400 21426 38 0.7 1360.6 1.0X +Data column - Json 12486 12493 10 1.3 793.8 1.7X +Data column - Parquet Vectorized: DataPageV1 174 178 5 90.6 11.0 123.2X +Data column - Parquet Vectorized: DataPageV2 420 426 8 37.4 26.7 50.9X +Data column - Parquet MR: DataPageV1 2400 2405 7 6.6 152.6 8.9X +Data column - Parquet MR: DataPageV2 2069 2075 9 7.6 131.5 10.3X +Data column - ORC Vectorized 275 278 6 57.3 17.5 77.9X +Data column - ORC MR 1831 1835 5 8.6 116.4 11.7X +Partition column - CSV 6967 6973 9 2.3 443.0 3.1X +Partition column - Json 10130 10130 1 1.6 644.0 2.1X +Partition column - Parquet Vectorized: DataPageV1 44 49 10 356.2 2.8 484.6X +Partition column - Parquet Vectorized: DataPageV2 43 47 10 362.6 2.8 493.4X +Partition column - Parquet MR: DataPageV1 1164 1168 5 13.5 74.0 18.4X +Partition column - Parquet MR: DataPageV2 1163 1167 6 13.5 74.0 18.4X +Partition column - ORC Vectorized 50 53 12 312.7 3.2 425.5X +Partition column - ORC MR 1138 1140 4 13.8 72.3 18.8X +Both columns - CSV 21872 21873 2 0.7 1390.6 1.0X +Both columns - Json 13404 13422 25 1.2 852.2 1.6X +Both columns - Parquet Vectorized: DataPageV1 198 203 5 79.6 12.6 108.3X +Both columns - Parquet Vectorized: DataPageV2 445 451 6 35.3 28.3 48.1X +Both columns - Parquet MR: DataPageV1 2400 2401 1 6.6 152.6 8.9X +Both columns - Parquet MR: DataPageV2 2107 2107 1 7.5 133.9 10.2X +Both columns - ORC Vectorized 303 308 5 51.9 19.3 70.5X +Both columns - ORC MR 1926 1944 24 8.2 122.5 11.1X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 
64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14613 14725 159 0.7 1393.6 1.0X -SQL Json 14816 14860 63 0.7 1413.0 1.0X -SQL Parquet Vectorized: DataPageV1 1644 1669 35 6.4 156.8 8.9X -SQL Parquet Vectorized: DataPageV2 3433 3470 52 3.1 327.4 4.3X -SQL Parquet MR: DataPageV1 4374 4389 20 2.4 417.2 3.3X -SQL Parquet MR: DataPageV2 5581 5611 43 1.9 532.2 2.6X -ParquetReader Vectorized: DataPageV1 1213 1222 12 8.6 115.7 12.0X -ParquetReader Vectorized: DataPageV2 3007 3012 7 3.5 286.8 4.9X -SQL ORC Vectorized 1120 1126 9 9.4 106.8 13.1X -SQL ORC MR 3961 4000 55 2.6 377.8 3.7X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 10410 10425 22 1.0 992.7 1.0X +SQL Json 11140 11158 25 0.9 1062.4 0.9X +SQL Parquet Vectorized: DataPageV1 1272 1272 1 8.2 121.3 8.2X +SQL Parquet Vectorized: DataPageV2 2709 2717 11 3.9 258.4 3.8X +SQL Parquet MR: DataPageV1 3451 3465 20 3.0 329.1 3.0X +SQL Parquet MR: DataPageV2 4202 4216 20 2.5 400.7 2.5X +ParquetReader Vectorized: DataPageV1 923 927 6 11.4 88.0 11.3X +ParquetReader Vectorized: DataPageV2 2371 2372 2 4.4 226.1 4.4X +SQL ORC Vectorized 892 908 15 11.8 85.1 11.7X +SQL ORC MR 3093 3094 2 3.4 294.9 3.4X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10591 10743 216 1.0 1010.0 
1.0X -SQL Json 10195 10432 336 1.0 972.2 1.0X -SQL Parquet Vectorized: DataPageV1 1279 1304 35 8.2 122.0 8.3X -SQL Parquet Vectorized: DataPageV2 2399 2437 55 4.4 228.7 4.4X -SQL Parquet MR: DataPageV1 3324 3402 109 3.2 317.0 3.2X -SQL Parquet MR: DataPageV2 4077 4084 10 2.6 388.8 2.6X -ParquetReader Vectorized: DataPageV1 1161 1164 4 9.0 110.7 9.1X -ParquetReader Vectorized: DataPageV2 2363 2372 12 4.4 225.4 4.5X -SQL ORC Vectorized 1255 1276 30 8.4 119.7 8.4X -SQL ORC MR 3544 3556 16 3.0 338.0 3.0X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 7716 7717 2 1.4 735.9 1.0X +SQL Json 12784 12792 11 0.8 1219.2 0.6X +SQL Parquet Vectorized: DataPageV1 1020 1027 9 10.3 97.3 7.6X +SQL Parquet Vectorized: DataPageV2 1815 1818 5 5.8 173.1 4.3X +SQL Parquet MR: DataPageV1 3592 3602 15 2.9 342.5 2.1X +SQL Parquet MR: DataPageV2 3517 3536 27 3.0 335.4 2.2X +ParquetReader Vectorized: DataPageV1 951 952 2 11.0 90.7 8.1X +ParquetReader Vectorized: DataPageV2 1731 1732 1 6.1 165.1 4.5X +SQL ORC Vectorized 1188 1191 4 8.8 113.3 6.5X +SQL ORC MR 2894 2896 3 3.6 276.0 2.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7609 7708 141 1.4 725.6 1.0X -SQL Json 6258 6310 74 1.7 596.8 1.2X -SQL Parquet Vectorized: DataPageV1 279 286 9 37.6 26.6 27.3X -SQL Parquet Vectorized: DataPageV2 817 825 7 12.8 77.9 9.3X -SQL Parquet MR: DataPageV1 2195 2202 11 4.8 209.3 3.5X -SQL Parquet MR: DataPageV2 1993 2001 12 5.3 190.0 3.8X -ParquetReader Vectorized: DataPageV1 289 294 7 36.3 27.6 26.3X -ParquetReader Vectorized: DataPageV2 822 826 5 12.7 78.4 9.3X -SQL ORC Vectorized 446 461 11 23.5 42.5 17.1X -SQL ORC MR 1933 1941 11 5.4 
184.3 3.9X +SQL CSV 4861 4866 8 2.2 463.6 1.0X +SQL Json 6272 6282 14 1.7 598.2 0.8X +SQL Parquet Vectorized: DataPageV1 212 215 4 49.5 20.2 22.9X +SQL Parquet Vectorized: DataPageV2 332 336 4 31.6 31.7 14.6X +SQL Parquet MR: DataPageV1 2437 2440 4 4.3 232.5 2.0X +SQL Parquet MR: DataPageV2 1897 1924 39 5.5 180.9 2.6X +ParquetReader Vectorized: DataPageV1 220 221 1 47.7 20.9 22.1X +ParquetReader Vectorized: DataPageV2 340 344 4 30.9 32.4 14.3X +SQL ORC Vectorized 363 365 2 28.9 34.6 13.4X +SQL ORC MR 1389 1392 4 7.6 132.4 3.5X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 3012 3041 41 0.3 2872.0 1.0X -SQL Json 3779 3780 1 0.3 3603.8 0.8X -SQL Parquet Vectorized: DataPageV1 58 65 11 18.1 55.2 52.0X -SQL Parquet Vectorized: DataPageV2 80 88 12 13.1 76.4 37.6X -SQL Parquet MR: DataPageV1 235 243 6 4.5 223.9 12.8X -SQL Parquet MR: DataPageV2 218 232 13 4.8 208.2 13.8X -SQL ORC Vectorized 66 73 4 15.8 63.3 45.4X -SQL ORC MR 200 208 6 5.2 190.7 15.1X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 2746 2759 18 0.4 2618.8 1.0X +SQL Json 3135 3140 8 0.3 2989.3 0.9X +SQL Parquet Vectorized: DataPageV1 37 40 5 28.4 35.2 74.3X +SQL Parquet Vectorized: DataPageV2 54 56 5 19.5 51.2 51.1X +SQL Parquet MR: DataPageV1 177 180 4 5.9 168.6 15.5X +SQL Parquet MR: DataPageV2 189 194 
3 5.5 180.3 14.5X +SQL ORC Vectorized 44 47 9 23.6 42.3 61.9X +SQL ORC MR 144 147 2 7.3 137.4 19.1X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7066 7100 49 0.1 6738.2 1.0X -SQL Json 14074 14208 190 0.1 13421.7 0.5X -SQL Parquet Vectorized: DataPageV1 78 87 11 13.5 74.0 91.1X -SQL Parquet Vectorized: DataPageV2 105 113 12 10.0 100.3 67.2X -SQL Parquet MR: DataPageV1 262 273 9 4.0 249.6 27.0X -SQL Parquet MR: DataPageV2 246 252 5 4.3 234.2 28.8X -SQL ORC Vectorized 84 92 11 12.5 79.8 84.4X -SQL ORC MR 222 238 9 4.7 212.0 31.8X - -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 5391 5398 10 0.2 5141.6 1.0X +SQL Json 10851 10932 114 0.1 10348.6 0.5X +SQL Parquet Vectorized: DataPageV1 50 52 5 20.9 47.8 107.5X +SQL Parquet Vectorized: DataPageV2 66 69 5 15.8 63.3 81.2X +SQL Parquet MR: DataPageV1 196 198 2 5.4 186.5 27.6X +SQL Parquet MR: DataPageV2 205 208 3 5.1 195.3 26.3X +SQL ORC Vectorized 60 62 7 17.5 57.1 90.1X +SQL ORC MR 160 163 3 6.5 152.7 33.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12654 12862 295 0.1 12067.8 1.0X -SQL Json 26593 26840 350 0.0 25361.2 0.5X -SQL Parquet Vectorized: DataPageV1 117 129 7 8.9 111.8 107.9X -SQL Parquet Vectorized: DataPageV2 140 150 11 7.5 133.9 90.2X -SQL Parquet MR: DataPageV1 315 324 6 3.3 300.7 40.1X -SQL Parquet MR: DataPageV2 287 295 8 3.7 273.3 
44.2X -SQL ORC Vectorized 107 116 15 9.8 101.7 118.6X -SQL ORC MR 255 262 6 4.1 243.1 49.6X +SQL CSV 8716 8719 4 0.1 8311.9 1.0X +SQL Json 20077 20197 170 0.1 19147.3 0.4X +SQL Parquet Vectorized: DataPageV1 78 84 10 13.4 74.5 111.6X +SQL Parquet Vectorized: DataPageV2 94 98 4 11.1 89.9 92.4X +SQL Parquet MR: DataPageV1 225 230 5 4.7 214.4 38.8X +SQL Parquet MR: DataPageV2 225 236 8 4.7 214.8 38.7X +SQL ORC Vectorized 82 84 4 12.8 77.8 106.8X +SQL ORC MR 183 187 6 5.7 174.2 47.7X From 0583e2f1347947a2e03ca1b1b10a2414f6fe8122 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Fri, 4 Mar 2022 16:37:43 -0800 Subject: [PATCH 14/20] Evaluate suffix array lazily in VectorizedDeltaLengthByteArrayReader --- .../VectorizedDeltaByteArrayReader.java | 19 +++++++++---------- .../VectorizedDeltaLengthByteArrayReader.java | 9 +++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index b41f0775bb6eb..936fa2ce78cd8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -39,8 +39,7 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase private final VectorizedDeltaBinaryPackedReader prefixLengthReader; private final VectorizedDeltaLengthByteArrayReader suffixReader; private WritableColumnVector prefixLengthVector; - private WritableColumnVector suffixVector; - private byte[] previous = new byte[0]; + private ByteBuffer previous = null; private int currentRow = 0; // temporary variable used by getBinary @@ -55,12 +54,10 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase @Override public void initFromPage(int valueCount, 
ByteBufferInputStream in) throws IOException { prefixLengthVector = new OnHeapColumnVector(valueCount, IntegerType); - suffixVector = new OnHeapColumnVector(valueCount, BinaryType); prefixLengthReader.initFromPage(valueCount, in); prefixLengthReader.readIntegers(prefixLengthReader.getTotalValueCount(), prefixLengthVector, 0); suffixReader.initFromPage(valueCount, in); - suffixReader.readBinary(prefixLengthReader.getTotalValueCount(), suffixVector, 0); } @Override @@ -78,8 +75,10 @@ private void readValues(int total, WritableColumnVector c, int rowId, // value of the page should have an empty prefix, it may not // because of PARQUET-246. int prefixLength = prefixLengthVector.getInt(currentRow); - byte[] suffix = suffixVector.getBinary(currentRow); - int length = prefixLength + suffix.length; + ByteBuffer suffix = suffixReader.getBytes(currentRow); + byte[] suffixArray = suffix.array(); + int suffixLength = suffix.limit() - suffix.position(); + int length = prefixLength + suffixLength; // We have to do this to materialize the output if (prefixLength != 0) { @@ -91,13 +90,13 @@ private void readValues(int total, WritableColumnVector c, int rowId, // is a _slow_ byte by byte copy // The following always uses the faster system arraycopy method byte[] out = new byte[length]; - System.arraycopy(previous, 0, out, 0, prefixLength); - System.arraycopy(suffix, 0, out, prefixLength, suffix.length); - previous = out; + System.arraycopy(previous.array(), previous.position(), out, 0, prefixLength); + System.arraycopy(suffixArray, suffix.position(), out, prefixLength, suffixLength); + previous = ByteBuffer.wrap(out); } else { previous = suffix; } - outputWriter.write(c, rowId + i, ByteBuffer.wrap(previous), previous.length); + outputWriter.write(c, rowId + i, previous, previous.limit() - previous.position()); currentRow++; } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java 
b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index 3676c59b78336..068de94b49770 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -67,6 +67,15 @@ public void readBinary(int total, WritableColumnVector c, int rowId) { currentRow += total; } + public ByteBuffer getBytes(int rowId) { + int length = lengthsVector.getInt(rowId); + try { + return in.slice(length); + } catch (EOFException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes", e); + } + } + @Override public void skipBinary(int total) { if (total == 0) { From 31eee9fce3b04ecb0c1e271a742bc5d39d9d4127 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Fri, 4 Mar 2022 17:33:45 -0800 Subject: [PATCH 15/20] In DeltaLengthByteArrayReader avoid extra copy if memory mode is on_heap.
--- .../VectorizedDeltaByteArrayReader.java | 57 ++++++++++++------- .../vectorized/OffHeapColumnVector.java | 7 +++ .../vectorized/OnHeapColumnVector.java | 6 ++ .../vectorized/WritableColumnVector.java | 3 + 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index 936fa2ce78cd8..b8d375ee25592 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -39,16 +39,18 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase private final VectorizedDeltaBinaryPackedReader prefixLengthReader; private final VectorizedDeltaLengthByteArrayReader suffixReader; private WritableColumnVector prefixLengthVector; - private ByteBuffer previous = null; + private ByteBuffer previous; private int currentRow = 0; // temporary variable used by getBinary private final WritableColumnVector binaryValVector; + private final WritableColumnVector tempBinaryValVector; VectorizedDeltaByteArrayReader() { this.prefixLengthReader = new VectorizedDeltaBinaryPackedReader(); this.suffixReader = new VectorizedDeltaLengthByteArrayReader(); binaryValVector = new OnHeapColumnVector(1, BinaryType); + tempBinaryValVector = new OnHeapColumnVector(1, BinaryType); } @Override @@ -62,12 +64,11 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce @Override public Binary readBinary(int len) { - readValues(1, binaryValVector, 0, ByteBufferOutputWriter::writeArrayByteBuffer); + readValues(1, binaryValVector, 0); return Binary.fromConstantByteArray(binaryValVector.getBinary(0)); } - private void readValues(int total, WritableColumnVector c, int rowId, - 
ByteBufferOutputWriter outputWriter) { + private void readValues(int total, WritableColumnVector c, int rowId) { for (int i = 0; i < total; i++) { // NOTE: due to PARQUET-246, it is important that we // respect prefixLength which was read from prefixLengthReader, @@ -81,29 +82,21 @@ private void readValues(int total, WritableColumnVector c, int rowId, int length = prefixLength + suffixLength; // We have to do this to materialize the output + WritableColumnVector arrayData = c.arrayData(); + int offset = arrayData.getElementsAppended(); if (prefixLength != 0) { - // We could do - // c.putByteArray(rowId + i, previous, 0, prefixLength); - // c.putByteArray(rowId+i, suffix, prefixLength, suffix.length); - // previous = c.getBinary(rowId+1); - // but it incurs the same cost of copying the values twice _and_ c.getBinary - // is a _slow_ byte by byte copy - // The following always uses the faster system arraycopy method - byte[] out = new byte[length]; - System.arraycopy(previous.array(), previous.position(), out, 0, prefixLength); - System.arraycopy(suffixArray, suffix.position(), out, prefixLength, suffixLength); - previous = ByteBuffer.wrap(out); - } else { - previous = suffix; + arrayData.appendBytes(prefixLength, previous.array(), previous.position()); } - outputWriter.write(c, rowId + i, previous, previous.limit() - previous.position()); + arrayData.appendBytes(suffixLength, suffixArray, suffix.position()); + c.putArray(rowId + i, offset, length); + previous = arrayData.getBytesUnsafe(offset, length); currentRow++; } } @Override public void readBinary(int total, WritableColumnVector c, int rowId) { - readValues(total, c, rowId, ByteBufferOutputWriter::writeArrayByteBuffer); + readValues(total, c, rowId); } /** @@ -121,9 +114,29 @@ public void setPreviousReader(ValuesReader reader) { @Override public void skipBinary(int total) { - // we have to read all the values so that we always have the correct 'previous' - // we just don't write it to the output vector - 
readValues(total, null, currentRow, ByteBufferOutputWriter::skipWrite); + WritableColumnVector c1 = tempBinaryValVector; + WritableColumnVector c2 = binaryValVector; + + for (int i = 0; i < total; i++) { + int prefixLength = prefixLengthVector.getInt(currentRow); + ByteBuffer suffix = suffixReader.getBytes(currentRow); + byte[] suffixArray = suffix.array(); + int suffixLength = suffix.limit() - suffix.position(); + int length = prefixLength + suffixLength; + + WritableColumnVector arrayData = c1.arrayData(); + c1.reset(); + if (prefixLength != 0) { + arrayData.appendBytes(prefixLength, previous.array(), previous.position()); + } + arrayData.appendBytes(suffixLength, suffixArray, suffix.position()); + previous = arrayData.getBytesUnsafe(0, length); + currentRow++; + + WritableColumnVector tmp = c1; + c1 = c2; + c2 = tmp; + } } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index bbe96819a618b..a7abc5a53bddd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -221,6 +221,13 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { return UTF8String.fromAddress(null, data + rowId, count); } + @Override + public ByteBuffer getBytesUnsafe(int rowId, int count) { + byte[] array = new byte[count]; + Platform.copyMemory(null, data + rowId, array, Platform.BYTE_ARRAY_OFFSET, count); + return ByteBuffer.wrap(array); + } + // // APIs dealing with shorts // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 833a93f2a2bdb..10ff78e38de49 100644 --- 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -219,6 +219,12 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { return UTF8String.fromBytes(byteData, rowId, count); } + @Override + public ByteBuffer getBytesUnsafe(int rowId, int count) { + return ByteBuffer.wrap(byteData, rowId, count); + } + + // // APIs dealing with Shorts // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 5e01c372793f1..4c0d58dc5be45 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -18,6 +18,7 @@ import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; import com.google.common.annotations.VisibleForTesting; @@ -443,6 +444,8 @@ public byte[] getBinary(int rowId) { } } + public abstract ByteBuffer getBytesUnsafe(int rowId, int count); + /** * Append APIs. These APIs all behave similarly and will append data to the current vector. It * is not valid to mix the put and append APIs. 
The append APIs are slower and should only be From 9eaf3874dcfd183b46b1653f92b7b25793d961c9 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Mon, 7 Mar 2022 11:07:51 -0800 Subject: [PATCH 16/20] Avoid unnecessary check for parameter in skipBytes --- .../parquet/VectorizedDeltaLengthByteArrayReader.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index 068de94b49770..e961bbc31d5a7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -78,9 +78,6 @@ public ByteBuffer getBytes(int rowId) { @Override public void skipBinary(int total) { - if (total == 0) { - return; - } int length; for (int i = 0; i < total; i++) { length = lengthsVector.getInt(currentRow + i); From 1fc0060d89dca7be420f619ebd969707d0f98f21 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Mon, 7 Mar 2022 13:08:25 -0800 Subject: [PATCH 17/20] Update benchmark results --- .../DataSourceReadBenchmark-jdk11-results.txt | 402 +++++++++--------- .../DataSourceReadBenchmark-jdk17-results.txt | 402 +++++++++--------- .../DataSourceReadBenchmark-results.txt | 402 +++++++++--------- 3 files changed, 603 insertions(+), 603 deletions(-) diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt index b8b86906c455f..11fc93406c363 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt @@ -3,158 +3,158 @@ SQL Single Numeric Column Scan 
================================================================================================ OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14859 14914 77 1.1 944.7 1.0X -SQL Json 9418 9457 55 1.7 598.8 1.6X -SQL Parquet Vectorized: DataPageV1 109 128 14 144.6 6.9 136.6X -SQL Parquet Vectorized: DataPageV2 79 89 8 199.3 5.0 188.3X -SQL Parquet MR: DataPageV1 1699 1743 62 9.3 108.0 8.7X -SQL Parquet MR: DataPageV2 1462 1489 38 10.8 93.0 10.2X -SQL ORC Vectorized 165 200 33 95.3 10.5 90.0X -SQL ORC MR 1409 1420 16 11.2 89.6 10.5X +SQL CSV 11809 12046 335 1.3 750.8 1.0X +SQL Json 8588 8592 7 1.8 546.0 1.4X +SQL Parquet Vectorized: DataPageV1 140 162 18 112.0 8.9 84.1X +SQL Parquet Vectorized: DataPageV2 103 117 12 152.6 6.6 114.6X +SQL Parquet MR: DataPageV1 1634 1648 20 9.6 103.9 7.2X +SQL Parquet MR: DataPageV2 1495 1501 9 10.5 95.1 7.9X +SQL ORC Vectorized 180 224 42 87.4 11.4 65.6X +SQL ORC MR 1536 1576 57 10.2 97.7 7.7X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 101 104 3 155.2 6.4 1.0X -ParquetReader Vectorized: DataPageV2 82 85 5 192.0 5.2 1.2X -ParquetReader Vectorized -> Row: DataPageV1 48 50 2 324.6 3.1 2.1X -ParquetReader Vectorized -> Row: DataPageV2 29 31 3 539.4 1.9 3.5X +ParquetReader Vectorized: DataPageV1 109 114 10 144.3 6.9 
1.0X +ParquetReader Vectorized: DataPageV2 90 93 3 175.3 5.7 1.2X +ParquetReader Vectorized -> Row: DataPageV1 58 60 4 271.9 3.7 1.9X +ParquetReader Vectorized -> Row: DataPageV2 39 41 3 404.0 2.5 2.8X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17619 17639 28 0.9 1120.2 1.0X -SQL Json 10590 10606 23 1.5 673.3 1.7X -SQL Parquet Vectorized: DataPageV1 178 194 10 88.2 11.3 98.8X -SQL Parquet Vectorized: DataPageV2 178 188 9 88.2 11.3 98.7X -SQL Parquet MR: DataPageV1 1884 1887 4 8.4 119.8 9.4X -SQL Parquet MR: DataPageV2 1689 1742 75 9.3 107.4 10.4X -SQL ORC Vectorized 162 193 24 97.0 10.3 108.7X -SQL ORC MR 1505 1552 67 10.5 95.7 11.7X +SQL CSV 14515 14526 16 1.1 922.8 1.0X +SQL Json 9862 9863 2 1.6 627.0 1.5X +SQL Parquet Vectorized: DataPageV1 144 167 31 109.5 9.1 101.1X +SQL Parquet Vectorized: DataPageV2 139 159 27 113.4 8.8 104.6X +SQL Parquet MR: DataPageV1 1777 1780 3 8.8 113.0 8.2X +SQL Parquet MR: DataPageV2 1690 1691 2 9.3 107.4 8.6X +SQL ORC Vectorized 201 238 46 78.3 12.8 72.2X +SQL ORC MR 1513 1522 14 10.4 96.2 9.6X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 230 236 13 68.3 14.6 1.0X -ParquetReader Vectorized: DataPageV2 228 233 8 69.1 14.5 1.0X -ParquetReader Vectorized -> Row: DataPageV1 138 150 26 113.7 8.8 1.7X -ParquetReader 
Vectorized -> Row: DataPageV2 137 140 2 114.5 8.7 1.7X +ParquetReader Vectorized: DataPageV1 182 192 11 86.6 11.5 1.0X +ParquetReader Vectorized: DataPageV2 181 188 7 86.9 11.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 96 99 4 163.3 6.1 1.9X +ParquetReader Vectorized -> Row: DataPageV2 96 99 3 163.4 6.1 1.9X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18488 18494 8 0.9 1175.5 1.0X -SQL Json 11190 11195 7 1.4 711.4 1.7X -SQL Parquet Vectorized: DataPageV1 125 155 34 125.7 8.0 147.7X -SQL Parquet Vectorized: DataPageV2 183 192 9 86.1 11.6 101.2X -SQL Parquet MR: DataPageV1 2153 2160 10 7.3 136.9 8.6X -SQL Parquet MR: DataPageV2 1876 1889 18 8.4 119.3 9.9X -SQL ORC Vectorized 212 257 23 74.4 13.4 87.4X -SQL ORC MR 1653 1658 7 9.5 105.1 11.2X +SQL CSV 15326 15437 156 1.0 974.4 1.0X +SQL Json 10281 10290 13 1.5 653.7 1.5X +SQL Parquet Vectorized: DataPageV1 164 212 36 95.9 10.4 93.4X +SQL Parquet Vectorized: DataPageV2 230 244 11 68.5 14.6 66.7X +SQL Parquet MR: DataPageV1 2108 2111 4 7.5 134.0 7.3X +SQL Parquet MR: DataPageV2 1940 1963 33 8.1 123.3 7.9X +SQL ORC Vectorized 229 279 34 68.7 14.6 66.9X +SQL ORC MR 1903 1906 3 8.3 121.0 8.1X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 198 201 5 79.6 12.6 1.0X -ParquetReader Vectorized: 
DataPageV2 256 260 3 61.5 16.3 0.8X -ParquetReader Vectorized -> Row: DataPageV1 193 226 14 81.4 12.3 1.0X -ParquetReader Vectorized -> Row: DataPageV2 250 253 2 62.8 15.9 0.8X +ParquetReader Vectorized: DataPageV1 253 262 10 62.2 16.1 1.0X +ParquetReader Vectorized: DataPageV2 323 327 9 48.8 20.5 0.8X +ParquetReader Vectorized -> Row: DataPageV1 280 288 8 56.3 17.8 0.9X +ParquetReader Vectorized -> Row: DataPageV2 301 314 21 52.2 19.1 0.8X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18614 18703 125 0.8 1183.5 1.0X -SQL Json 11673 11711 53 1.3 742.2 1.6X -SQL Parquet Vectorized: DataPageV1 128 154 26 123.1 8.1 145.7X -SQL Parquet Vectorized: DataPageV2 270 302 23 58.3 17.1 69.0X -SQL Parquet MR: DataPageV1 2117 2145 39 7.4 134.6 8.8X -SQL Parquet MR: DataPageV2 1855 1860 7 8.5 117.9 10.0X -SQL ORC Vectorized 277 292 16 56.7 17.6 67.2X -SQL ORC MR 1623 1629 9 9.7 103.2 11.5X +SQL CSV 16756 16776 28 0.9 1065.3 1.0X +SQL Json 10690 10692 3 1.5 679.6 1.6X +SQL Parquet Vectorized: DataPageV1 160 208 45 98.1 10.2 104.5X +SQL Parquet Vectorized: DataPageV2 390 423 23 40.3 24.8 43.0X +SQL Parquet MR: DataPageV1 2196 2201 8 7.2 139.6 7.6X +SQL Parquet MR: DataPageV2 2065 2072 10 7.6 131.3 8.1X +SQL ORC Vectorized 323 338 10 48.7 20.5 51.9X +SQL ORC MR 1899 1906 11 8.3 120.7 8.8X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 225 226 1 69.9 14.3 1.0X -ParquetReader Vectorized: DataPageV2 362 365 2 43.4 23.0 0.6X -ParquetReader Vectorized -> Row: DataPageV1 193 218 18 81.5 12.3 1.2X -ParquetReader Vectorized -> Row: DataPageV2 360 366 6 43.7 22.9 0.6X +ParquetReader Vectorized: DataPageV1 278 285 9 56.6 17.7 1.0X +ParquetReader Vectorized: DataPageV2 514 518 2 30.6 32.7 0.5X +ParquetReader Vectorized -> Row: DataPageV1 308 316 11 51.0 19.6 0.9X +ParquetReader Vectorized -> Row: DataPageV2 498 525 27 31.6 31.6 0.6X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 23939 23953 20 0.7 1522.0 1.0X -SQL Json 14445 14449 5 1.1 918.4 1.7X -SQL Parquet Vectorized: DataPageV1 186 229 28 84.7 11.8 128.9X -SQL Parquet Vectorized: DataPageV2 459 493 25 34.3 29.2 52.2X -SQL Parquet MR: DataPageV1 2180 2184 7 7.2 138.6 11.0X -SQL Parquet MR: DataPageV2 1954 1973 27 8.1 124.2 12.3X -SQL ORC Vectorized 368 392 24 42.8 23.4 65.1X -SQL ORC MR 1793 1794 2 8.8 114.0 13.4X +SQL CSV 21841 21851 14 0.7 1388.6 1.0X +SQL Json 12828 12843 21 1.2 815.6 1.7X +SQL Parquet Vectorized: DataPageV1 241 279 19 65.2 15.3 90.6X +SQL Parquet Vectorized: DataPageV2 554 596 29 28.4 35.2 39.5X +SQL Parquet MR: DataPageV1 2404 2428 34 6.5 152.8 9.1X +SQL Parquet MR: DataPageV2 2153 2166 18 7.3 136.9 10.1X +SQL ORC Vectorized 417 464 62 37.7 26.5 52.4X +SQL ORC MR 2136 2146 14 7.4 135.8 10.2X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL 
CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 280 293 18 56.1 17.8 1.0X -ParquetReader Vectorized: DataPageV2 577 602 48 27.3 36.7 0.5X -ParquetReader Vectorized -> Row: DataPageV1 314 321 10 50.1 19.9 0.9X -ParquetReader Vectorized -> Row: DataPageV2 581 584 4 27.1 37.0 0.5X +ParquetReader Vectorized: DataPageV1 324 357 34 48.6 20.6 1.0X +ParquetReader Vectorized: DataPageV2 694 702 11 22.6 44.2 0.5X +ParquetReader Vectorized -> Row: DataPageV1 378 385 8 41.6 24.0 0.9X +ParquetReader Vectorized -> Row: DataPageV2 701 708 8 22.4 44.6 0.5X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19278 19291 18 0.8 1225.6 1.0X -SQL Json 13366 13381 21 1.2 849.8 1.4X -SQL Parquet Vectorized: DataPageV1 130 152 23 120.8 8.3 148.1X -SQL Parquet Vectorized: DataPageV2 135 157 17 116.8 8.6 143.2X -SQL Parquet MR: DataPageV1 2126 2137 15 7.4 135.2 9.1X -SQL Parquet MR: DataPageV2 1970 1985 21 8.0 125.2 9.8X -SQL ORC Vectorized 387 396 11 40.7 24.6 49.8X -SQL ORC MR 1831 1832 1 8.6 116.4 10.5X +SQL CSV 17238 17239 2 0.9 1096.0 1.0X +SQL Json 12295 12307 18 1.3 781.7 1.4X +SQL Parquet Vectorized: DataPageV1 162 203 27 96.8 10.3 106.1X +SQL Parquet Vectorized: DataPageV2 157 194 32 100.4 10.0 110.0X +SQL Parquet MR: DataPageV1 2163 2165 3 7.3 137.5 8.0X +SQL Parquet MR: DataPageV2 2014 2014 1 7.8 128.0 8.6X +SQL ORC Vectorized 458 462 5 34.4 29.1 37.7X +SQL ORC MR 1984 1984 0 7.9 126.1 8.7X OpenJDK 64-Bit Server VM 
11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 194 197 5 81.1 12.3 1.0X -ParquetReader Vectorized: DataPageV2 194 197 7 81.2 12.3 1.0X -ParquetReader Vectorized -> Row: DataPageV1 225 253 18 69.9 14.3 0.9X -ParquetReader Vectorized -> Row: DataPageV2 224 252 18 70.2 14.2 0.9X +ParquetReader Vectorized: DataPageV1 252 259 10 62.3 16.0 1.0X +ParquetReader Vectorized: DataPageV2 252 256 9 62.3 16.0 1.0X +ParquetReader Vectorized -> Row: DataPageV1 259 307 40 60.7 16.5 1.0X +ParquetReader Vectorized -> Row: DataPageV2 260 295 25 60.5 16.5 1.0X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 24691 24705 19 0.6 1569.8 1.0X -SQL Json 18028 18028 0 0.9 1146.2 1.4X -SQL Parquet Vectorized: DataPageV1 190 225 28 83.0 12.0 130.3X -SQL Parquet Vectorized: DataPageV2 188 230 26 83.9 11.9 131.7X -SQL Parquet MR: DataPageV1 2362 2365 4 6.7 150.2 10.5X -SQL Parquet MR: DataPageV2 2061 2078 25 7.6 131.0 12.0X -SQL ORC Vectorized 499 524 37 31.6 31.7 49.5X -SQL ORC MR 1870 1880 14 8.4 118.9 13.2X +SQL CSV 22485 22536 72 0.7 1429.5 1.0X +SQL Json 16281 16286 8 1.0 1035.1 1.4X +SQL Parquet Vectorized: DataPageV1 232 288 35 67.9 14.7 97.1X +SQL Parquet Vectorized: DataPageV2 277 290 9 56.8 17.6 81.2X +SQL Parquet MR: DataPageV1 2331 2341 15 6.7 148.2 9.6X +SQL Parquet MR: DataPageV2 2216 2229 18 
7.1 140.9 10.1X +SQL ORC Vectorized 561 569 9 28.0 35.7 40.1X +SQL ORC MR 2118 2137 27 7.4 134.6 10.6X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 276 295 21 57.0 17.5 1.0X -ParquetReader Vectorized: DataPageV2 278 289 17 56.6 17.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 315 326 15 50.0 20.0 0.9X -ParquetReader Vectorized -> Row: DataPageV2 315 323 8 49.9 20.0 0.9X +ParquetReader Vectorized: DataPageV1 355 356 1 44.3 22.6 1.0X +ParquetReader Vectorized: DataPageV2 355 356 1 44.3 22.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 379 386 9 41.5 24.1 0.9X +ParquetReader Vectorized -> Row: DataPageV2 379 389 10 41.5 24.1 0.9X ================================================================================================ @@ -162,17 +162,17 @@ Int and String Scan ================================================================================================ OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 16840 16908 96 0.6 1606.0 1.0X -SQL Json 12496 12513 25 0.8 1191.7 1.3X -SQL Parquet Vectorized: DataPageV1 2169 2172 5 4.8 206.9 7.8X -SQL Parquet Vectorized: DataPageV2 3102 3119 24 3.4 295.9 5.4X -SQL Parquet MR: DataPageV1 4140 4144 5 2.5 394.8 4.1X -SQL Parquet MR: DataPageV2 3988 3996 12 2.6 380.3 4.2X -SQL ORC Vectorized 2180 2196 23 4.8 
207.9 7.7X -SQL ORC MR 3765 3766 2 2.8 359.0 4.5X +SQL CSV 15733 15738 8 0.7 1500.4 1.0X +SQL Json 11953 11969 22 0.9 1140.0 1.3X +SQL Parquet Vectorized: DataPageV1 2100 2137 52 5.0 200.2 7.5X +SQL Parquet Vectorized: DataPageV2 2525 2535 14 4.2 240.8 6.2X +SQL Parquet MR: DataPageV1 4075 4110 49 2.6 388.6 3.9X +SQL Parquet MR: DataPageV2 3991 4014 34 2.6 380.6 3.9X +SQL ORC Vectorized 2323 2355 45 4.5 221.5 6.8X +SQL ORC MR 3776 3882 150 2.8 360.1 4.2X ================================================================================================ @@ -180,17 +180,17 @@ Repeated String Scan ================================================================================================ OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9960 9960 0 1.1 949.8 1.0X -SQL Json 7625 7712 123 1.4 727.2 1.3X -SQL Parquet Vectorized: DataPageV1 577 582 6 18.2 55.0 17.3X -SQL Parquet Vectorized: DataPageV2 584 592 6 18.0 55.7 17.1X -SQL Parquet MR: DataPageV1 1722 1736 19 6.1 164.2 5.8X -SQL Parquet MR: DataPageV2 1662 1668 9 6.3 158.5 6.0X -SQL ORC Vectorized 483 524 27 21.7 46.1 20.6X -SQL ORC MR 1841 1850 14 5.7 175.5 5.4X +SQL CSV 8921 8966 63 1.2 850.7 1.0X +SQL Json 7215 7218 5 1.5 688.1 1.2X +SQL Parquet Vectorized: DataPageV1 604 627 23 17.3 57.6 14.8X +SQL Parquet Vectorized: DataPageV2 606 620 18 17.3 57.8 14.7X +SQL Parquet MR: DataPageV1 1686 1693 10 6.2 160.8 5.3X +SQL Parquet MR: DataPageV2 1660 1665 8 6.3 158.3 5.4X +SQL ORC Vectorized 541 548 7 19.4 51.6 16.5X +SQL ORC MR 1920 1930 13 5.5 183.1 4.6X ================================================================================================ @@ -198,33 +198,33 @@ Partitioned Table Scan 
================================================================================================ OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 23787 23788 2 0.7 1512.3 1.0X -Data column - Json 13993 14011 25 1.1 889.7 1.7X -Data column - Parquet Vectorized: DataPageV1 184 235 36 85.4 11.7 129.2X -Data column - Parquet Vectorized: DataPageV2 531 542 15 29.6 33.7 44.8X -Data column - Parquet MR: DataPageV1 2539 2547 13 6.2 161.4 9.4X -Data column - Parquet MR: DataPageV2 2299 2301 3 6.8 146.2 10.3X -Data column - ORC Vectorized 379 403 23 41.5 24.1 62.8X -Data column - ORC MR 2047 2070 33 7.7 130.1 11.6X -Partition column - CSV 6834 6835 1 2.3 434.5 3.5X -Partition column - Json 11444 11478 49 1.4 727.6 2.1X -Partition column - Parquet Vectorized: DataPageV1 51 71 22 308.6 3.2 466.7X -Partition column - Parquet Vectorized: DataPageV2 51 61 16 310.5 3.2 469.5X -Partition column - Parquet MR: DataPageV1 1203 1214 15 13.1 76.5 19.8X -Partition column - Parquet MR: DataPageV2 1210 1224 20 13.0 76.9 19.7X -Partition column - ORC Vectorized 52 67 14 303.1 3.3 458.4X -Partition column - ORC MR 1338 1342 5 11.8 85.1 17.8X -Both columns - CSV 24051 24052 2 0.7 1529.1 1.0X -Both columns - Json 15016 15030 20 1.0 954.7 1.6X -Both columns - Parquet Vectorized: DataPageV1 235 269 27 66.9 15.0 101.2X -Both columns - Parquet Vectorized: DataPageV2 563 617 60 27.9 35.8 42.2X -Both columns - Parquet MR: DataPageV1 2525 2555 43 6.2 160.5 9.4X -Both columns - Parquet MR: DataPageV2 2256 2267 15 7.0 143.5 10.5X -Both columns - ORC Vectorized 407 454 51 38.7 25.9 58.5X -Both columns - ORC MR 2153 2155 2 7.3 136.9 11.0X +Data column - CSV 21951 
21976 36 0.7 1395.6 1.0X +Data column - Json 12896 12905 14 1.2 819.9 1.7X +Data column - Parquet Vectorized: DataPageV1 247 307 48 63.6 15.7 88.7X +Data column - Parquet Vectorized: DataPageV2 657 686 25 23.9 41.8 33.4X +Data column - Parquet MR: DataPageV1 2705 2708 3 5.8 172.0 8.1X +Data column - Parquet MR: DataPageV2 2621 2621 0 6.0 166.6 8.4X +Data column - ORC Vectorized 440 468 30 35.7 28.0 49.9X +Data column - ORC MR 2553 2565 17 6.2 162.3 8.6X +Partition column - CSV 6640 6641 1 2.4 422.2 3.3X +Partition column - Json 10499 10512 19 1.5 667.5 2.1X +Partition column - Parquet Vectorized: DataPageV1 60 79 24 261.4 3.8 364.8X +Partition column - Parquet Vectorized: DataPageV2 58 81 26 270.2 3.7 377.0X +Partition column - Parquet MR: DataPageV1 1387 1412 35 11.3 88.2 15.8X +Partition column - Parquet MR: DataPageV2 1383 1407 34 11.4 87.9 15.9X +Partition column - ORC Vectorized 61 85 25 256.8 3.9 358.4X +Partition column - ORC MR 1552 1553 1 10.1 98.7 14.1X +Both columns - CSV 21896 21919 32 0.7 1392.1 1.0X +Both columns - Json 13645 13664 27 1.2 867.5 1.6X +Both columns - Parquet Vectorized: DataPageV1 307 351 33 51.3 19.5 71.6X +Both columns - Parquet Vectorized: DataPageV2 698 740 36 22.5 44.4 31.4X +Both columns - Parquet MR: DataPageV1 2804 2821 24 5.6 178.3 7.8X +Both columns - Parquet MR: DataPageV2 2624 2636 16 6.0 166.8 8.4X +Both columns - ORC Vectorized 462 521 53 34.0 29.4 47.5X +Both columns - ORC MR 2564 2580 22 6.1 163.0 8.6X ================================================================================================ @@ -232,49 +232,49 @@ String with Nulls Scan ================================================================================================ OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11812 11849 53 0.9 1126.4 1.0X -SQL Json 11454 11467 18 0.9 1092.3 1.0X -SQL Parquet Vectorized: DataPageV1 1250 1276 37 8.4 119.2 9.5X -SQL Parquet Vectorized: DataPageV2 2248 2261 17 4.7 214.4 5.3X -SQL Parquet MR: DataPageV1 3629 3630 1 2.9 346.1 3.3X -SQL Parquet MR: DataPageV2 3929 3934 6 2.7 374.7 3.0X -ParquetReader Vectorized: DataPageV1 921 922 2 11.4 87.8 12.8X -ParquetReader Vectorized: DataPageV2 1890 1890 0 5.5 180.3 6.2X -SQL ORC Vectorized 1079 1105 36 9.7 102.9 10.9X -SQL ORC MR 3042 3070 40 3.4 290.1 3.9X +SQL CSV 10818 10826 11 1.0 1031.6 1.0X +SQL Json 10812 10833 29 1.0 1031.2 1.0X +SQL Parquet Vectorized: DataPageV1 1301 1312 15 8.1 124.1 8.3X +SQL Parquet Vectorized: DataPageV2 1953 1982 42 5.4 186.2 5.5X +SQL Parquet MR: DataPageV1 3677 3680 5 2.9 350.6 2.9X +SQL Parquet MR: DataPageV2 3970 3972 2 2.6 378.6 2.7X +ParquetReader Vectorized: DataPageV1 1004 1016 16 10.4 95.8 10.8X +ParquetReader Vectorized: DataPageV2 1606 1622 22 6.5 153.2 6.7X +SQL ORC Vectorized 1160 1182 30 9.0 110.7 9.3X +SQL ORC MR 3266 3330 90 3.2 311.4 3.3X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8399 8410 16 1.2 801.0 1.0X -SQL Json 8892 8905 18 1.2 848.0 0.9X -SQL Parquet Vectorized: DataPageV1 1065 1092 38 9.8 101.6 7.9X -SQL Parquet Vectorized: DataPageV2 1747 1747 0 6.0 166.6 4.8X -SQL Parquet MR: DataPageV1 2718 2719 1 3.9 259.2 3.1X -SQL Parquet MR: DataPageV2 2955 2964 12 3.5 281.8 2.8X -ParquetReader Vectorized: DataPageV1 1082 1084 3 9.7 103.2 7.8X -ParquetReader Vectorized: DataPageV2 
1707 1713 9 6.1 162.8 4.9X -SQL ORC Vectorized 1345 1357 17 7.8 128.3 6.2X -SQL ORC MR 3012 3046 47 3.5 287.3 2.8X +SQL CSV 7971 7981 15 1.3 760.2 1.0X +SQL Json 8266 8269 3 1.3 788.4 1.0X +SQL Parquet Vectorized: DataPageV1 1025 1036 15 10.2 97.8 7.8X +SQL Parquet Vectorized: DataPageV2 1432 1440 11 7.3 136.6 5.6X +SQL Parquet MR: DataPageV1 2792 2806 20 3.8 266.3 2.9X +SQL Parquet MR: DataPageV2 2958 2992 47 3.5 282.1 2.7X +ParquetReader Vectorized: DataPageV1 1010 1024 20 10.4 96.3 7.9X +ParquetReader Vectorized: DataPageV2 1331 1335 4 7.9 127.0 6.0X +SQL ORC Vectorized 1266 1271 6 8.3 120.8 6.3X +SQL ORC MR 3032 3089 81 3.5 289.2 2.6X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6169 6176 10 1.7 588.3 1.0X -SQL Json 5352 5376 35 2.0 510.4 1.2X -SQL Parquet Vectorized: DataPageV1 248 255 7 42.3 23.6 24.9X -SQL Parquet Vectorized: DataPageV2 364 372 10 28.8 34.7 17.0X -SQL Parquet MR: DataPageV1 1624 1626 3 6.5 154.9 3.8X -SQL Parquet MR: DataPageV2 1520 1526 8 6.9 145.0 4.1X -ParquetReader Vectorized: DataPageV1 259 262 1 40.4 24.7 23.8X -ParquetReader Vectorized: DataPageV2 376 378 2 27.9 35.9 16.4X -SQL ORC Vectorized 414 438 30 25.3 39.5 14.9X -SQL ORC MR 1580 1596 22 6.6 150.7 3.9X +SQL CSV 5829 5833 5 1.8 555.9 1.0X +SQL Json 4966 4978 17 2.1 473.6 1.2X +SQL Parquet Vectorized: DataPageV1 236 244 7 44.5 22.5 24.7X +SQL Parquet Vectorized: DataPageV2 305 315 13 34.4 29.1 19.1X +SQL Parquet MR: DataPageV1 1777 1784 10 5.9 169.5 3.3X +SQL Parquet MR: DataPageV2 1635 1637 4 6.4 155.9 3.6X +ParquetReader Vectorized: DataPageV1 242 246 2 43.2 23.1 24.0X +ParquetReader Vectorized: DataPageV2 309 313 7 34.0 29.5 18.9X +SQL 
ORC Vectorized 391 419 53 26.8 37.3 14.9X +SQL ORC MR 1686 1687 1 6.2 160.8 3.5X ================================================================================================ @@ -282,42 +282,42 @@ Single Column Scan From Wide Columns ================================================================================================ OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2403 2408 8 0.4 2291.5 1.0X -SQL Json 2925 2934 14 0.4 2789.3 0.8X -SQL Parquet Vectorized: DataPageV1 41 56 15 25.7 38.9 58.9X -SQL Parquet Vectorized: DataPageV2 60 71 15 17.4 57.5 39.8X -SQL Parquet MR: DataPageV1 179 193 19 5.9 170.7 13.4X -SQL Parquet MR: DataPageV2 159 182 29 6.6 152.1 15.1X -SQL ORC Vectorized 50 66 19 21.0 47.6 48.1X -SQL ORC MR 153 177 49 6.9 145.7 15.7X +SQL CSV 2301 2305 6 0.5 2194.0 1.0X +SQL Json 2874 2895 29 0.4 2741.1 0.8X +SQL Parquet Vectorized: DataPageV1 47 66 20 22.3 44.8 48.9X +SQL Parquet Vectorized: DataPageV2 74 90 25 14.2 70.5 31.1X +SQL Parquet MR: DataPageV1 198 219 26 5.3 189.0 11.6X +SQL Parquet MR: DataPageV2 178 207 45 5.9 170.1 12.9X +SQL ORC Vectorized 59 76 20 17.6 56.7 38.7X +SQL ORC MR 173 193 24 6.1 164.6 13.3X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5265 5268 5 0.2 5021.1 1.0X -SQL Json 11138 11189 73 0.1 10621.6 0.5X -SQL Parquet Vectorized: 
DataPageV1 58 83 22 18.1 55.4 90.7X -SQL Parquet Vectorized: DataPageV2 74 96 25 14.1 70.7 71.0X -SQL Parquet MR: DataPageV1 194 216 31 5.4 185.4 27.1X -SQL Parquet MR: DataPageV2 175 192 24 6.0 167.3 30.0X -SQL ORC Vectorized 65 83 20 16.1 62.1 80.8X -SQL ORC MR 170 187 27 6.2 161.9 31.0X +SQL CSV 5418 5425 9 0.2 5167.2 1.0X +SQL Json 11463 11574 156 0.1 10932.3 0.5X +SQL Parquet Vectorized: DataPageV1 66 103 28 15.8 63.4 81.5X +SQL Parquet Vectorized: DataPageV2 90 115 27 11.7 85.5 60.4X +SQL Parquet MR: DataPageV1 218 234 23 4.8 208.3 24.8X +SQL Parquet MR: DataPageV2 199 225 29 5.3 190.1 27.2X +SQL ORC Vectorized 76 106 31 13.7 72.8 71.0X +SQL ORC MR 193 216 28 5.4 184.2 28.0X OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8781 8782 1 0.1 8374.4 1.0X -SQL Json 19880 20088 293 0.1 18959.3 0.4X -SQL Parquet Vectorized: DataPageV1 85 111 20 12.4 80.7 103.8X -SQL Parquet Vectorized: DataPageV2 103 124 24 10.2 98.4 85.1X -SQL Parquet MR: DataPageV1 232 253 19 4.5 221.6 37.8X -SQL Parquet MR: DataPageV2 209 234 23 5.0 199.0 42.1X -SQL ORC Vectorized 86 101 23 12.2 81.9 102.3X -SQL ORC MR 194 212 19 5.4 185.4 45.2X +SQL CSV 9430 9430 0 0.1 8993.3 1.0X +SQL Json 21268 21347 111 0.0 20283.1 0.4X +SQL Parquet Vectorized: DataPageV1 97 124 24 10.9 92.1 97.6X +SQL Parquet Vectorized: DataPageV2 119 136 19 8.8 113.6 79.2X +SQL Parquet MR: DataPageV1 254 285 35 4.1 242.1 37.1X +SQL Parquet MR: DataPageV2 231 260 30 4.5 220.0 40.9X +SQL ORC Vectorized 95 119 31 11.1 90.4 99.5X +SQL ORC MR 214 219 5 4.9 203.6 44.2X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt 
b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt index c13dddbd8c265..8ff176457af10 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt @@ -3,158 +3,158 @@ SQL Single Numeric Column Scan ================================================================================================ OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11833 12041 295 1.3 752.3 1.0X -SQL Json 8231 8329 138 1.9 523.3 1.4X -SQL Parquet Vectorized: DataPageV1 86 100 14 181.9 5.5 136.9X -SQL Parquet Vectorized: DataPageV2 65 76 8 241.0 4.2 181.3X -SQL Parquet MR: DataPageV1 1629 1632 4 9.7 103.5 7.3X -SQL Parquet MR: DataPageV2 1475 1489 21 10.7 93.8 8.0X -SQL ORC Vectorized 167 176 8 94.3 10.6 70.9X -SQL ORC MR 1386 1391 8 11.4 88.1 8.5X +SQL CSV 9610 10067 646 1.6 611.0 1.0X +SQL Json 8316 8410 133 1.9 528.7 1.2X +SQL Parquet Vectorized: DataPageV1 123 145 10 127.7 7.8 78.0X +SQL Parquet Vectorized: DataPageV2 93 108 12 170.0 5.9 103.8X +SQL Parquet MR: DataPageV1 1766 1768 2 8.9 112.3 5.4X +SQL Parquet MR: DataPageV2 1540 1543 3 10.2 97.9 6.2X +SQL ORC Vectorized 175 182 6 89.6 11.2 54.8X +SQL ORC MR 1517 1533 22 10.4 96.5 6.3X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: 
DataPageV1 71 73 2 220.3 4.5 1.0X -ParquetReader Vectorized: DataPageV2 60 60 1 263.5 3.8 1.2X -ParquetReader Vectorized -> Row: DataPageV1 38 39 1 415.0 2.4 1.9X -ParquetReader Vectorized -> Row: DataPageV2 26 27 1 601.2 1.7 2.7X +ParquetReader Vectorized: DataPageV1 61 63 2 256.3 3.9 1.0X +ParquetReader Vectorized: DataPageV2 44 45 2 356.3 2.8 1.4X +ParquetReader Vectorized -> Row: DataPageV1 51 51 1 311.3 3.2 1.2X +ParquetReader Vectorized -> Row: DataPageV2 32 33 2 492.4 2.0 1.9X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13011 13048 51 1.2 827.2 1.0X -SQL Json 9623 9668 64 1.6 611.8 1.4X -SQL Parquet Vectorized: DataPageV1 98 107 11 160.4 6.2 132.7X -SQL Parquet Vectorized: DataPageV2 97 104 6 161.9 6.2 133.9X -SQL Parquet MR: DataPageV1 1763 1793 42 8.9 112.1 7.4X -SQL Parquet MR: DataPageV2 1631 1637 9 9.6 103.7 8.0X -SQL ORC Vectorized 150 156 7 105.1 9.5 86.9X -SQL ORC MR 1384 1414 43 11.4 88.0 9.4X +SQL CSV 14866 14885 26 1.1 945.2 1.0X +SQL Json 9585 9586 3 1.6 609.4 1.6X +SQL Parquet Vectorized: DataPageV1 119 131 12 132.4 7.6 125.2X +SQL Parquet Vectorized: DataPageV2 119 125 5 132.0 7.6 124.7X +SQL Parquet MR: DataPageV1 1954 2025 101 8.0 124.2 7.6X +SQL Parquet MR: DataPageV2 1800 1824 35 8.7 114.4 8.3X +SQL ORC Vectorized 169 176 6 93.0 10.8 87.9X +SQL ORC MR 1432 1467 50 11.0 91.0 10.4X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 101 102 1 155.6 6.4 1.0X -ParquetReader Vectorized: DataPageV2 101 102 2 155.2 6.4 1.0X -ParquetReader Vectorized -> Row: DataPageV1 63 65 2 250.5 4.0 1.6X -ParquetReader Vectorized -> Row: DataPageV2 63 64 1 249.8 4.0 1.6X +ParquetReader Vectorized: DataPageV1 118 120 2 133.0 7.5 1.0X +ParquetReader Vectorized: DataPageV2 119 120 2 132.6 7.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 72 73 2 218.1 4.6 1.6X +ParquetReader Vectorized -> Row: DataPageV2 72 74 2 217.7 4.6 1.6X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13946 13950 6 1.1 886.7 1.0X -SQL Json 10195 10204 12 1.5 648.2 1.4X -SQL Parquet Vectorized: DataPageV1 131 140 8 120.5 8.3 106.9X -SQL Parquet Vectorized: DataPageV2 183 189 5 86.0 11.6 76.3X -SQL Parquet MR: DataPageV1 1979 2023 63 7.9 125.8 7.0X -SQL Parquet MR: DataPageV2 1729 1757 39 9.1 109.9 8.1X -SQL ORC Vectorized 198 206 7 79.4 12.6 70.4X -SQL ORC MR 1547 1562 21 10.2 98.4 9.0X +SQL CSV 14601 14699 139 1.1 928.3 1.0X +SQL Json 9446 9517 101 1.7 600.5 1.5X +SQL Parquet Vectorized: DataPageV1 156 168 15 101.1 9.9 93.8X +SQL Parquet Vectorized: DataPageV2 197 213 15 79.6 12.6 73.9X +SQL Parquet MR: DataPageV1 2113 2130 23 7.4 134.4 6.9X +SQL Parquet MR: DataPageV2 1739 1784 64 9.0 110.5 8.4X +SQL ORC Vectorized 192 205 10 81.9 12.2 76.0X +SQL ORC MR 1518 1588 100 10.4 96.5 9.6X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 
Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 200 208 9 78.5 12.7 1.0X -ParquetReader Vectorized: DataPageV2 249 251 1 63.3 15.8 0.8X -ParquetReader Vectorized -> Row: DataPageV1 196 201 7 80.4 12.4 1.0X -ParquetReader Vectorized -> Row: DataPageV2 245 246 1 64.3 15.5 0.8X +ParquetReader Vectorized: DataPageV1 215 221 6 73.2 13.7 1.0X +ParquetReader Vectorized: DataPageV2 269 278 8 58.5 17.1 0.8X +ParquetReader Vectorized -> Row: DataPageV1 206 208 2 76.2 13.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 244 262 10 64.4 15.5 0.9X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15404 15408 6 1.0 979.3 1.0X -SQL Json 10952 10960 12 1.4 696.3 1.4X -SQL Parquet Vectorized: DataPageV1 147 162 15 107.3 9.3 105.0X -SQL Parquet Vectorized: DataPageV2 286 318 18 55.0 18.2 53.9X -SQL Parquet MR: DataPageV1 2014 2051 52 7.8 128.1 7.6X -SQL Parquet MR: DataPageV2 1877 1887 14 8.4 119.3 8.2X -SQL ORC Vectorized 230 243 17 68.3 14.7 66.8X -SQL ORC MR 1608 1650 59 9.8 102.3 9.6X +SQL CSV 15886 16086 282 1.0 1010.0 1.0X +SQL Json 9872 9880 12 1.6 627.6 1.6X +SQL Parquet Vectorized: DataPageV1 174 195 22 90.4 11.1 91.3X +SQL Parquet Vectorized: DataPageV2 393 409 16 40.0 25.0 40.4X +SQL Parquet MR: DataPageV1 1953 2064 157 8.1 124.2 8.1X +SQL Parquet MR: DataPageV2 2215 2231 23 7.1 140.8 7.2X +SQL ORC Vectorized 280 314 22 56.1 17.8 56.7X +SQL ORC MR 1681 1706 35 9.4 106.9 9.5X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on 
Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 232 235 4 67.9 14.7 1.0X -ParquetReader Vectorized: DataPageV2 370 371 1 42.6 23.5 0.6X -ParquetReader Vectorized -> Row: DataPageV1 204 208 4 77.0 13.0 1.1X -ParquetReader Vectorized -> Row: DataPageV2 343 344 2 45.9 21.8 0.7X +ParquetReader Vectorized: DataPageV1 253 263 8 62.1 16.1 1.0X +ParquetReader Vectorized: DataPageV2 450 461 15 34.9 28.6 0.6X +ParquetReader Vectorized -> Row: DataPageV1 241 253 12 65.2 15.3 1.1X +ParquetReader Vectorized -> Row: DataPageV2 437 448 14 36.0 27.8 0.6X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20572 20592 27 0.8 1308.0 1.0X -SQL Json 13884 13889 6 1.1 882.7 1.5X -SQL Parquet Vectorized: DataPageV1 167 194 19 93.9 10.6 122.9X -SQL Parquet Vectorized: DataPageV2 362 374 15 43.5 23.0 56.9X -SQL Parquet MR: DataPageV1 2348 2352 5 6.7 149.3 8.8X -SQL Parquet MR: DataPageV2 1921 1922 1 8.2 122.1 10.7X -SQL ORC Vectorized 271 324 32 58.0 17.2 75.9X -SQL ORC MR 1742 1744 3 9.0 110.8 11.8X +SQL CSV 20641 20744 145 0.8 1312.3 1.0X +SQL Json 13055 13122 95 1.2 830.0 1.6X +SQL Parquet Vectorized: DataPageV1 246 267 16 63.8 15.7 83.8X +SQL Parquet Vectorized: DataPageV2 513 532 16 30.7 32.6 40.2X +SQL Parquet MR: DataPageV1 2354 2387 47 6.7 149.7 8.8X +SQL Parquet MR: DataPageV2 2118 2148 43 7.4 134.6 9.7X +SQL ORC 
Vectorized 418 437 17 37.6 26.6 49.4X +SQL ORC MR 1808 1852 61 8.7 115.0 11.4X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 248 259 10 63.3 15.8 1.0X -ParquetReader Vectorized: DataPageV2 431 446 22 36.5 27.4 0.6X -ParquetReader Vectorized -> Row: DataPageV1 293 295 2 53.7 18.6 0.8X -ParquetReader Vectorized -> Row: DataPageV2 448 466 19 35.1 28.5 0.6X +ParquetReader Vectorized: DataPageV1 306 315 5 51.5 19.4 1.0X +ParquetReader Vectorized: DataPageV2 584 591 11 26.9 37.1 0.5X +ParquetReader Vectorized -> Row: DataPageV1 288 299 14 54.6 18.3 1.1X +ParquetReader Vectorized -> Row: DataPageV2 549 557 8 28.6 34.9 0.6X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15514 15667 218 1.0 986.3 1.0X -SQL Json 12242 12254 17 1.3 778.3 1.3X -SQL Parquet Vectorized: DataPageV1 124 132 7 126.6 7.9 124.9X -SQL Parquet Vectorized: DataPageV2 122 128 5 128.6 7.8 126.9X -SQL Parquet MR: DataPageV1 2015 2022 10 7.8 128.1 7.7X -SQL Parquet MR: DataPageV2 1819 1821 4 8.6 115.6 8.5X -SQL ORC Vectorized 390 410 14 40.3 24.8 39.7X -SQL ORC MR 1745 1750 6 9.0 111.0 8.9X +SQL CSV 17024 17292 378 0.9 1082.4 1.0X +SQL Json 11724 11904 255 1.3 745.4 1.5X +SQL Parquet Vectorized: DataPageV1 174 186 11 90.6 11.0 98.1X +SQL Parquet Vectorized: DataPageV2 173 189 14 90.9 11.0 98.4X 
+SQL Parquet MR: DataPageV1 1932 2037 148 8.1 122.9 8.8X +SQL Parquet MR: DataPageV2 1947 1976 41 8.1 123.8 8.7X +SQL ORC Vectorized 432 459 36 36.4 27.5 39.4X +SQL ORC MR 1984 1985 1 7.9 126.1 8.6X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 202 207 5 77.8 12.9 1.0X -ParquetReader Vectorized: DataPageV2 224 228 6 70.1 14.3 0.9X -ParquetReader Vectorized -> Row: DataPageV1 232 235 5 67.7 14.8 0.9X -ParquetReader Vectorized -> Row: DataPageV2 233 247 26 67.6 14.8 0.9X +ParquetReader Vectorized: DataPageV1 257 259 2 61.2 16.3 1.0X +ParquetReader Vectorized: DataPageV2 239 254 10 65.8 15.2 1.1X +ParquetReader Vectorized -> Row: DataPageV1 259 260 1 60.8 16.4 1.0X +ParquetReader Vectorized -> Row: DataPageV2 258 262 6 61.0 16.4 1.0X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20827 20829 3 0.8 1324.2 1.0X -SQL Json 17161 17174 18 0.9 1091.1 1.2X -SQL Parquet Vectorized: DataPageV1 200 210 8 78.6 12.7 104.0X -SQL Parquet Vectorized: DataPageV2 202 211 8 77.7 12.9 102.9X -SQL Parquet MR: DataPageV1 2255 2292 53 7.0 143.4 9.2X -SQL Parquet MR: DataPageV2 2087 2091 4 7.5 132.7 10.0X -SQL ORC Vectorized 486 498 9 32.4 30.9 42.9X -SQL ORC MR 1886 1894 10 8.3 119.9 11.0X +SQL CSV 22592 22594 4 0.7 1436.3 1.0X +SQL Json 16252 16271 26 1.0 1033.3 1.4X +SQL 
Parquet Vectorized: DataPageV1 247 271 22 63.6 15.7 91.3X +SQL Parquet Vectorized: DataPageV2 252 266 14 62.4 16.0 89.6X +SQL Parquet MR: DataPageV1 2337 2352 21 6.7 148.6 9.7X +SQL Parquet MR: DataPageV2 2187 2223 50 7.2 139.1 10.3X +SQL ORC Vectorized 489 526 25 32.2 31.1 46.2X +SQL ORC MR 1816 1892 107 8.7 115.5 12.4X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 283 296 31 55.5 18.0 1.0X -ParquetReader Vectorized: DataPageV2 266 278 12 59.1 16.9 1.1X -ParquetReader Vectorized -> Row: DataPageV1 265 278 9 59.4 16.8 1.1X -ParquetReader Vectorized -> Row: DataPageV2 263 275 10 59.9 16.7 1.1X +ParquetReader Vectorized: DataPageV1 291 304 8 54.0 18.5 1.0X +ParquetReader Vectorized: DataPageV2 298 309 7 52.9 18.9 1.0X +ParquetReader Vectorized -> Row: DataPageV1 330 338 16 47.7 21.0 0.9X +ParquetReader Vectorized -> Row: DataPageV2 331 338 12 47.5 21.1 0.9X ================================================================================================ @@ -162,17 +162,17 @@ Int and String Scan ================================================================================================ OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14490 14490 0 0.7 1381.8 1.0X -SQL Json 11765 11779 20 0.9 1122.0 1.2X -SQL Parquet Vectorized: DataPageV1 2134 2168 48 4.9 203.5 
6.8X -SQL Parquet Vectorized: DataPageV2 2694 2700 9 3.9 256.9 5.4X -SQL Parquet MR: DataPageV1 4082 4090 10 2.6 389.3 3.5X -SQL Parquet MR: DataPageV2 4170 4171 0 2.5 397.7 3.5X -SQL ORC Vectorized 2136 2138 2 4.9 203.7 6.8X -SQL ORC MR 3714 3752 53 2.8 354.2 3.9X +SQL CSV 14365 14780 587 0.7 1369.9 1.0X +SQL Json 10718 10772 76 1.0 1022.2 1.3X +SQL Parquet Vectorized: DataPageV1 1932 1988 80 5.4 184.2 7.4X +SQL Parquet Vectorized: DataPageV2 2298 2317 27 4.6 219.2 6.2X +SQL Parquet MR: DataPageV1 3829 3957 181 2.7 365.1 3.8X +SQL Parquet MR: DataPageV2 4176 4208 46 2.5 398.3 3.4X +SQL ORC Vectorized 2026 2046 28 5.2 193.2 7.1X +SQL ORC MR 3566 3580 21 2.9 340.0 4.0X ================================================================================================ @@ -180,17 +180,17 @@ Repeated String Scan ================================================================================================ OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7936 7973 53 1.3 756.8 1.0X -SQL Json 6859 6861 3 1.5 654.1 1.2X -SQL Parquet Vectorized: DataPageV1 651 657 6 16.1 62.1 12.2X -SQL Parquet Vectorized: DataPageV2 650 655 4 16.1 62.0 12.2X -SQL Parquet MR: DataPageV1 1650 1674 34 6.4 157.3 4.8X -SQL Parquet MR: DataPageV2 1594 1624 42 6.6 152.1 5.0X -SQL ORC Vectorized 487 509 25 21.5 46.4 16.3X -SQL ORC MR 1751 1780 41 6.0 167.0 4.5X +SQL CSV 9372 9373 1 1.1 893.8 1.0X +SQL Json 6862 6865 4 1.5 654.4 1.4X +SQL Parquet Vectorized: DataPageV1 606 613 8 17.3 57.8 15.5X +SQL Parquet Vectorized: DataPageV2 611 615 3 17.2 58.3 15.3X +SQL Parquet MR: DataPageV1 1713 1721 11 6.1 163.3 5.5X +SQL Parquet MR: DataPageV2 1721 1724 4 6.1 164.1 5.4X +SQL ORC 
Vectorized 467 469 2 22.5 44.5 20.1X +SQL ORC MR 1816 1818 2 5.8 173.2 5.2X ================================================================================================ @@ -198,33 +198,33 @@ Partitioned Table Scan ================================================================================================ OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 20442 20527 119 0.8 1299.7 1.0X -Data column - Json 13399 13400 1 1.2 851.9 1.5X -Data column - Parquet Vectorized: DataPageV1 204 223 26 77.1 13.0 100.2X -Data column - Parquet Vectorized: DataPageV2 367 402 22 42.9 23.3 55.7X -Data column - Parquet MR: DataPageV1 2619 2631 18 6.0 166.5 7.8X -Data column - Parquet MR: DataPageV2 2354 2367 18 6.7 149.7 8.7X -Data column - ORC Vectorized 373 386 13 42.1 23.7 54.7X -Data column - ORC MR 2013 2013 1 7.8 128.0 10.2X -Partition column - CSV 6545 6565 28 2.4 416.1 3.1X -Partition column - Json 10543 10549 10 1.5 670.3 1.9X -Partition column - Parquet Vectorized: DataPageV1 44 48 6 359.1 2.8 466.8X -Partition column - Parquet Vectorized: DataPageV2 44 49 7 357.7 2.8 464.9X -Partition column - Parquet MR: DataPageV1 1241 1243 3 12.7 78.9 16.5X -Partition column - Parquet MR: DataPageV2 1227 1234 10 12.8 78.0 16.7X -Partition column - ORC Vectorized 45 50 4 347.1 2.9 451.1X -Partition column - ORC MR 1323 1332 12 11.9 84.1 15.4X -Both columns - CSV 20858 20873 22 0.8 1326.1 1.0X -Both columns - Json 14078 14082 5 1.1 895.1 1.5X -Both columns - Parquet Vectorized: DataPageV1 224 234 8 70.2 14.2 91.3X -Both columns - Parquet Vectorized: DataPageV2 441 450 9 35.7 28.0 46.4X -Both columns - Parquet MR: DataPageV1 2655 2664 13 5.9 
168.8 7.7X -Both columns - Parquet MR: DataPageV2 2298 2312 19 6.8 146.1 8.9X -Both columns - ORC Vectorized 377 391 22 41.8 24.0 54.3X -Both columns - ORC MR 2131 2135 6 7.4 135.5 9.6X +Data column - CSV 21799 22053 360 0.7 1385.9 1.0X +Data column - Json 12978 12985 10 1.2 825.1 1.7X +Data column - Parquet Vectorized: DataPageV1 261 277 15 60.4 16.6 83.7X +Data column - Parquet Vectorized: DataPageV2 601 647 42 26.2 38.2 36.3X +Data column - Parquet MR: DataPageV1 2796 2798 2 5.6 177.8 7.8X +Data column - Parquet MR: DataPageV2 2595 2626 43 6.1 165.0 8.4X +Data column - ORC Vectorized 428 449 25 36.8 27.2 50.9X +Data column - ORC MR 2162 2274 159 7.3 137.5 10.1X +Partition column - CSV 5804 5922 167 2.7 369.0 3.8X +Partition column - Json 10410 10455 64 1.5 661.8 2.1X +Partition column - Parquet Vectorized: DataPageV1 56 60 6 280.9 3.6 389.3X +Partition column - Parquet Vectorized: DataPageV2 55 59 5 286.5 3.5 397.1X +Partition column - Parquet MR: DataPageV1 1357 1357 1 11.6 86.3 16.1X +Partition column - Parquet MR: DataPageV2 1339 1339 0 11.7 85.1 16.3X +Partition column - ORC Vectorized 57 61 5 276.3 3.6 382.9X +Partition column - ORC MR 1346 1351 7 11.7 85.6 16.2X +Both columns - CSV 20812 21349 759 0.8 1323.2 1.0X +Both columns - Json 13061 13372 440 1.2 830.4 1.7X +Both columns - Parquet Vectorized: DataPageV1 265 275 6 59.3 16.9 82.1X +Both columns - Parquet Vectorized: DataPageV2 619 637 20 25.4 39.4 35.2X +Both columns - Parquet MR: DataPageV1 2827 2830 4 5.6 179.8 7.7X +Both columns - Parquet MR: DataPageV2 2593 2603 14 6.1 164.8 8.4X +Both columns - ORC Vectorized 391 432 37 40.2 24.9 55.7X +Both columns - ORC MR 2438 2455 25 6.5 155.0 8.9X ================================================================================================ @@ -232,49 +232,49 @@ String with Nulls Scan ================================================================================================ OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9558 9562 6 1.1 911.5 1.0X -SQL Json 10497 10513 22 1.0 1001.1 0.9X -SQL Parquet Vectorized: DataPageV1 1409 1419 15 7.4 134.4 6.8X -SQL Parquet Vectorized: DataPageV2 2264 2270 9 4.6 215.9 4.2X -SQL Parquet MR: DataPageV1 3468 3473 7 3.0 330.7 2.8X -SQL Parquet MR: DataPageV2 3688 3768 112 2.8 351.8 2.6X -ParquetReader Vectorized: DataPageV1 1005 1008 4 10.4 95.8 9.5X -ParquetReader Vectorized: DataPageV2 1538 1538 0 6.8 146.7 6.2X -SQL ORC Vectorized 1099 1123 34 9.5 104.8 8.7X -SQL ORC MR 2959 2963 6 3.5 282.2 3.2X +SQL CSV 10697 10736 56 1.0 1020.1 1.0X +SQL Json 9722 9963 341 1.1 927.2 1.1X +SQL Parquet Vectorized: DataPageV1 1337 1342 6 7.8 127.6 8.0X +SQL Parquet Vectorized: DataPageV2 1731 1757 38 6.1 165.1 6.2X +SQL Parquet MR: DataPageV1 3581 3584 4 2.9 341.5 3.0X +SQL Parquet MR: DataPageV2 3996 4001 7 2.6 381.1 2.7X +ParquetReader Vectorized: DataPageV1 1006 1015 13 10.4 96.0 10.6X +ParquetReader Vectorized: DataPageV2 1476 1477 2 7.1 140.7 7.2X +SQL ORC Vectorized 957 1042 120 11.0 91.3 11.2X +SQL ORC MR 3060 3068 11 3.4 291.8 3.5X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6723 6723 1 1.6 641.1 1.0X -SQL Json 8098 8098 1 1.3 772.3 0.8X -SQL Parquet Vectorized: DataPageV1 1046 1046 0 10.0 99.7 6.4X -SQL Parquet Vectorized: DataPageV2 1308 1315 10 8.0 124.7 5.1X -SQL Parquet MR: DataPageV1 2666 2669 4 3.9 
254.2 2.5X -SQL Parquet MR: DataPageV2 2810 2811 1 3.7 268.0 2.4X -ParquetReader Vectorized: DataPageV1 952 954 2 11.0 90.8 7.1X -ParquetReader Vectorized: DataPageV2 1201 1202 1 8.7 114.6 5.6X -SQL ORC Vectorized 1279 1285 8 8.2 122.0 5.3X -SQL ORC MR 2880 2939 84 3.6 274.6 2.3X +SQL CSV 7299 7300 1 1.4 696.1 1.0X +SQL Json 7453 7659 292 1.4 710.8 1.0X +SQL Parquet Vectorized: DataPageV1 896 916 32 11.7 85.4 8.1X +SQL Parquet Vectorized: DataPageV2 1282 1283 1 8.2 122.3 5.7X +SQL Parquet MR: DataPageV1 2586 2678 130 4.1 246.6 2.8X +SQL Parquet MR: DataPageV2 3061 3066 6 3.4 291.9 2.4X +ParquetReader Vectorized: DataPageV1 913 915 3 11.5 87.0 8.0X +ParquetReader Vectorized: DataPageV2 1181 1183 3 8.9 112.6 6.2X +SQL ORC Vectorized 1102 1111 13 9.5 105.1 6.6X +SQL ORC MR 2916 3002 121 3.6 278.1 2.5X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4638 4645 11 2.3 442.3 1.0X -SQL Json 4646 4650 5 2.3 443.1 1.0X -SQL Parquet Vectorized: DataPageV1 226 231 4 46.4 21.6 20.5X -SQL Parquet Vectorized: DataPageV2 307 311 2 34.2 29.3 15.1X -SQL Parquet MR: DataPageV1 1593 1597 5 6.6 151.9 2.9X -SQL Parquet MR: DataPageV2 1452 1460 12 7.2 138.4 3.2X -ParquetReader Vectorized: DataPageV1 235 237 2 44.6 22.4 19.7X -ParquetReader Vectorized: DataPageV2 314 315 2 33.4 29.9 14.8X -SQL ORC Vectorized 392 398 4 26.7 37.4 11.8X -SQL ORC MR 1470 1483 18 7.1 140.2 3.2X +SQL CSV 4615 4619 6 2.3 440.1 1.0X +SQL Json 4926 4927 1 2.1 469.8 0.9X +SQL Parquet Vectorized: DataPageV1 240 246 5 43.8 22.9 19.3X +SQL Parquet Vectorized: DataPageV2 287 295 4 36.5 27.4 16.1X +SQL Parquet MR: DataPageV1 1774 1781 10 5.9 169.2 2.6X +SQL Parquet MR: DataPageV2 1772 
1773 1 5.9 169.0 2.6X +ParquetReader Vectorized: DataPageV1 238 240 2 44.0 22.7 19.4X +ParquetReader Vectorized: DataPageV2 285 288 3 36.8 27.2 16.2X +SQL ORC Vectorized 382 392 6 27.4 36.5 12.1X +SQL ORC MR 1616 1617 2 6.5 154.1 2.9X ================================================================================================ @@ -282,42 +282,42 @@ Single Column Scan From Wide Columns ================================================================================================ OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2187 2191 6 0.5 2085.6 1.0X -SQL Json 2957 2978 30 0.4 2820.3 0.7X -SQL Parquet Vectorized: DataPageV1 36 39 5 29.2 34.3 60.8X -SQL Parquet Vectorized: DataPageV2 48 50 4 22.0 45.4 46.0X -SQL Parquet MR: DataPageV1 179 184 4 5.8 171.1 12.2X -SQL Parquet MR: DataPageV2 156 163 5 6.7 149.2 14.0X -SQL ORC Vectorized 46 49 4 22.9 43.7 47.7X -SQL ORC MR 143 148 4 7.3 136.2 15.3X +SQL CSV 2051 2052 2 0.5 1956.1 1.0X +SQL Json 3230 3232 3 0.3 3080.6 0.6X +SQL Parquet Vectorized: DataPageV1 45 50 7 23.2 43.2 45.3X +SQL Parquet Vectorized: DataPageV2 67 72 8 15.6 64.1 30.5X +SQL Parquet MR: DataPageV1 191 198 8 5.5 181.9 10.8X +SQL Parquet MR: DataPageV2 176 181 6 6.0 167.7 11.7X +SQL ORC Vectorized 55 60 6 19.0 52.7 37.1X +SQL ORC MR 164 168 4 6.4 156.1 12.5X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4728 4728 0 0.2 4508.8 1.0X -SQL Json 11030 11074 62 0.1 10519.5 0.4X -SQL Parquet Vectorized: DataPageV1 48 53 8 21.9 45.7 98.6X -SQL Parquet Vectorized: DataPageV2 59 62 4 17.6 56.7 79.5X -SQL Parquet MR: DataPageV1 197 201 4 5.3 187.8 24.0X -SQL Parquet MR: DataPageV2 174 178 5 6.0 165.8 27.2X -SQL ORC Vectorized 59 63 5 17.7 56.6 79.7X -SQL ORC MR 157 162 5 6.7 150.1 30.0X +SQL CSV 4530 4530 0 0.2 4320.0 1.0X +SQL Json 12530 12536 9 0.1 11949.2 0.4X +SQL Parquet Vectorized: DataPageV1 60 65 6 17.4 57.6 75.0X +SQL Parquet Vectorized: DataPageV2 83 91 8 12.6 79.1 54.6X +SQL Parquet MR: DataPageV1 211 216 7 5.0 201.2 21.5X +SQL Parquet MR: DataPageV2 195 204 12 5.4 186.0 23.2X +SQL ORC Vectorized 70 75 5 14.9 67.1 64.4X +SQL ORC MR 182 191 11 5.8 173.5 24.9X OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7841 7851 14 0.1 7477.6 1.0X -SQL Json 20753 20875 172 0.1 19791.5 0.4X -SQL Parquet Vectorized: DataPageV1 74 79 4 14.2 70.5 106.0X -SQL Parquet Vectorized: DataPageV2 84 88 4 12.5 79.9 93.6X -SQL Parquet MR: DataPageV1 223 228 6 4.7 213.1 35.1X -SQL Parquet MR: DataPageV2 197 202 5 5.3 188.3 39.7X -SQL ORC Vectorized 73 77 4 14.3 69.9 106.9X -SQL ORC MR 171 175 3 6.1 163.5 45.7X +SQL CSV 7758 7763 7 0.1 7398.8 1.0X +SQL Json 24530 24546 23 0.0 23393.2 0.3X +SQL Parquet Vectorized: DataPageV1 91 96 6 11.5 87.1 84.9X +SQL Parquet Vectorized: DataPageV2 113 118 6 9.2 108.1 68.4X +SQL Parquet MR: DataPageV1 246 254 8 4.3 234.2 31.6X +SQL Parquet MR: DataPageV2 229 235 6 4.6 218.7 33.8X +SQL ORC 
Vectorized 88 92 6 11.9 83.8 88.3X +SQL ORC MR 205 214 9 5.1 195.2 37.9X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index 54b57801cfb81..1a7ebe51057be 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -3,158 +3,158 @@ SQL Single Numeric Column Scan ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11081 11162 114 1.4 704.5 1.0X -SQL Json 7274 7299 36 2.2 462.5 1.5X -SQL Parquet Vectorized: DataPageV1 98 110 8 160.6 6.2 113.1X -SQL Parquet Vectorized: DataPageV2 69 77 6 228.7 4.4 161.1X -SQL Parquet MR: DataPageV1 1560 1566 8 10.1 99.2 7.1X -SQL Parquet MR: DataPageV2 1360 1369 12 11.6 86.5 8.1X -SQL ORC Vectorized 147 153 6 106.7 9.4 75.2X -SQL ORC MR 1378 1404 36 11.4 87.6 8.0X +SQL CSV 12972 13210 337 1.2 824.8 1.0X +SQL Json 7440 7634 275 2.1 473.0 1.7X +SQL Parquet Vectorized: DataPageV1 125 137 10 125.8 8.0 103.7X +SQL Parquet Vectorized: DataPageV2 93 103 20 168.4 5.9 138.9X +SQL Parquet MR: DataPageV1 1621 1657 52 9.7 103.0 8.0X +SQL Parquet MR: DataPageV2 1396 1420 34 11.3 88.7 9.3X +SQL ORC Vectorized 178 186 16 88.5 11.3 73.0X +SQL ORC MR 1501 1503 4 10.5 95.4 8.6X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 91 93 3 173.4 5.8 1.0X -ParquetReader Vectorized: DataPageV2 83 86 5 188.9 5.3 1.1X -ParquetReader Vectorized -> Row: DataPageV1 38 38 1 415.3 2.4 2.4X -ParquetReader Vectorized -> Row: DataPageV2 30 31 1 518.9 1.9 3.0X +ParquetReader Vectorized: DataPageV1 132 134 4 119.3 8.4 1.0X +ParquetReader Vectorized: DataPageV2 115 117 3 136.7 7.3 1.1X +ParquetReader Vectorized -> Row: DataPageV1 57 58 1 275.1 3.6 2.3X +ParquetReader Vectorized -> Row: DataPageV2 41 41 1 387.9 2.6 3.3X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13865 14012 208 1.1 881.5 1.0X -SQL Json 8898 9468 805 1.8 565.7 1.6X -SQL Parquet Vectorized: DataPageV1 110 114 5 143.5 7.0 126.5X -SQL Parquet Vectorized: DataPageV2 107 113 10 146.8 6.8 129.4X -SQL Parquet MR: DataPageV1 1776 1823 66 8.9 112.9 7.8X -SQL Parquet MR: DataPageV2 1670 1723 74 9.4 106.2 8.3X -SQL ORC Vectorized 155 157 2 101.4 9.9 89.4X -SQL ORC MR 1425 1460 49 11.0 90.6 9.7X +SQL CSV 15808 15867 83 1.0 1005.0 1.0X +SQL Json 9119 9174 78 1.7 579.8 1.7X +SQL Parquet Vectorized: DataPageV1 157 163 7 100.2 10.0 100.7X +SQL Parquet Vectorized: DataPageV2 156 161 5 100.6 9.9 101.1X +SQL Parquet MR: DataPageV1 1846 1871 36 8.5 117.4 8.6X +SQL Parquet MR: DataPageV2 1702 1707 7 9.2 108.2 9.3X +SQL ORC Vectorized 130 134 2 120.7 8.3 121.3X +SQL ORC MR 1536 1542 9 10.2 97.7 10.3X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 
Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 161 163 2 97.7 10.2 1.0X -ParquetReader Vectorized: DataPageV2 161 168 22 97.8 10.2 1.0X -ParquetReader Vectorized -> Row: DataPageV1 137 138 1 115.1 8.7 1.2X -ParquetReader Vectorized -> Row: DataPageV2 137 139 3 115.0 8.7 1.2X +ParquetReader Vectorized: DataPageV1 198 202 5 79.3 12.6 1.0X +ParquetReader Vectorized: DataPageV2 197 199 3 79.8 12.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 188 190 3 83.4 12.0 1.1X +ParquetReader Vectorized -> Row: DataPageV2 188 190 3 83.5 12.0 1.1X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15112 15131 26 1.0 960.8 1.0X -SQL Json 9439 9444 6 1.7 600.1 1.6X -SQL Parquet Vectorized: DataPageV1 170 173 3 92.3 10.8 88.7X -SQL Parquet Vectorized: DataPageV2 175 178 4 89.7 11.1 86.2X -SQL Parquet MR: DataPageV1 2033 2050 24 7.7 129.3 7.4X -SQL Parquet MR: DataPageV2 1656 1661 7 9.5 105.3 9.1X -SQL ORC Vectorized 156 161 5 100.9 9.9 96.9X -SQL ORC MR 1474 1490 22 10.7 93.7 10.3X +SQL CSV 16474 16493 27 1.0 1047.4 1.0X +SQL Json 9477 9478 1 1.7 602.6 1.7X +SQL Parquet Vectorized: DataPageV1 211 216 7 74.4 13.4 77.9X +SQL Parquet Vectorized: DataPageV2 215 221 5 73.0 13.7 76.5X +SQL Parquet MR: DataPageV1 2114 2133 28 7.4 134.4 7.8X +SQL Parquet MR: DataPageV2 1792 1808 22 8.8 113.9 9.2X +SQL ORC Vectorized 179 182 4 88.0 11.4 92.2X +SQL ORC MR 1586 1588 2 9.9 100.8 10.4X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 
5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 226 228 3 69.6 14.4 1.0X -ParquetReader Vectorized: DataPageV2 259 261 3 60.7 16.5 0.9X -ParquetReader Vectorized -> Row: DataPageV1 210 215 8 74.9 13.4 1.1X -ParquetReader Vectorized -> Row: DataPageV2 245 246 2 64.3 15.6 0.9X +ParquetReader Vectorized: DataPageV1 254 257 5 62.0 16.1 1.0X +ParquetReader Vectorized: DataPageV2 299 302 4 52.6 19.0 0.8X +ParquetReader Vectorized -> Row: DataPageV1 236 238 4 66.7 15.0 1.1X +ParquetReader Vectorized -> Row: DataPageV2 281 283 4 56.0 17.9 0.9X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 16706 16719 18 0.9 1062.1 1.0X -SQL Json 10038 10044 10 1.6 638.2 1.7X -SQL Parquet Vectorized: DataPageV1 119 123 7 132.0 7.6 140.2X -SQL Parquet Vectorized: DataPageV2 243 248 6 64.8 15.4 68.8X -SQL Parquet MR: DataPageV1 2045 2047 2 7.7 130.0 8.2X -SQL Parquet MR: DataPageV2 1731 1739 12 9.1 110.0 9.7X -SQL ORC Vectorized 215 219 3 73.1 13.7 77.6X -SQL ORC MR 1527 1534 10 10.3 97.1 10.9X +SQL CSV 18049 18086 52 0.9 1147.5 1.0X +SQL Json 10073 10074 1 1.6 640.4 1.8X +SQL Parquet Vectorized: DataPageV1 177 184 9 89.1 11.2 102.3X +SQL Parquet Vectorized: DataPageV2 301 306 6 52.2 19.1 59.9X +SQL Parquet MR: DataPageV1 2120 2134 21 7.4 134.8 8.5X +SQL Parquet MR: DataPageV2 1855 1893 54 8.5 117.9 9.7X +SQL ORC Vectorized 
246 249 1 63.8 15.7 73.2X +SQL ORC MR 1655 1660 6 9.5 105.2 10.9X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 203 207 5 77.6 12.9 1.0X -ParquetReader Vectorized: DataPageV2 326 328 2 48.3 20.7 0.6X -ParquetReader Vectorized -> Row: DataPageV1 189 191 2 83.1 12.0 1.1X -ParquetReader Vectorized -> Row: DataPageV2 311 335 55 50.6 19.8 0.7X +ParquetReader Vectorized: DataPageV1 239 243 5 65.8 15.2 1.0X +ParquetReader Vectorized: DataPageV2 384 387 4 40.9 24.4 0.6X +ParquetReader Vectorized -> Row: DataPageV1 223 224 3 70.7 14.2 1.1X +ParquetReader Vectorized -> Row: DataPageV2 366 370 7 43.0 23.3 0.7X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 22154 22163 14 0.7 1408.5 1.0X -SQL Json 12754 12760 8 1.2 810.9 1.7X -SQL Parquet Vectorized: DataPageV1 185 190 8 85.2 11.7 119.9X -SQL Parquet Vectorized: DataPageV2 386 391 6 40.8 24.5 57.5X -SQL Parquet MR: DataPageV1 2111 2112 2 7.4 134.2 10.5X -SQL Parquet MR: DataPageV2 1808 1808 0 8.7 115.0 12.3X -SQL ORC Vectorized 267 273 6 58.9 17.0 82.9X -SQL ORC MR 1603 1609 8 9.8 101.9 13.8X +SQL CSV 22703 22737 48 0.7 1443.4 1.0X +SQL Json 12723 12743 28 1.2 808.9 1.8X +SQL Parquet Vectorized: DataPageV1 228 261 76 69.1 14.5 99.7X +SQL Parquet Vectorized: DataPageV2 465 472 7 33.8 29.5 48.9X +SQL Parquet MR: 
DataPageV1 2166 2168 3 7.3 137.7 10.5X +SQL Parquet MR: DataPageV2 1921 1936 21 8.2 122.1 11.8X +SQL ORC Vectorized 307 313 10 51.2 19.5 73.9X +SQL ORC MR 1730 1745 21 9.1 110.0 13.1X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 258 281 55 61.0 16.4 1.0X -ParquetReader Vectorized: DataPageV2 471 479 10 33.4 29.9 0.5X -ParquetReader Vectorized -> Row: DataPageV1 247 250 4 63.8 15.7 1.0X -ParquetReader Vectorized -> Row: DataPageV2 457 462 9 34.4 29.1 0.6X +ParquetReader Vectorized: DataPageV1 309 316 10 51.0 19.6 1.0X +ParquetReader Vectorized: DataPageV2 559 563 5 28.1 35.5 0.6X +ParquetReader Vectorized -> Row: DataPageV1 292 296 6 53.9 18.6 1.1X +ParquetReader Vectorized -> Row: DataPageV2 541 547 8 29.1 34.4 0.6X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17565 17590 35 0.9 1116.7 1.0X -SQL Json 12371 12397 37 1.3 786.5 1.4X -SQL Parquet Vectorized: DataPageV1 130 135 7 121.1 8.3 135.2X -SQL Parquet Vectorized: DataPageV2 129 133 8 121.6 8.2 135.7X -SQL Parquet MR: DataPageV1 2002 2013 15 7.9 127.3 8.8X -SQL Parquet MR: DataPageV2 1823 1824 1 8.6 115.9 9.6X -SQL ORC Vectorized 356 359 2 44.1 22.7 49.3X -SQL ORC MR 1626 1650 34 9.7 103.4 10.8X +SQL CSV 18790 18808 25 0.8 1194.6 1.0X +SQL Json 11572 11579 10 1.4 735.7 1.6X +SQL Parquet 
Vectorized: DataPageV1 155 158 5 101.7 9.8 121.6X +SQL Parquet Vectorized: DataPageV2 158 162 6 99.6 10.0 119.0X +SQL Parquet MR: DataPageV1 2041 2050 12 7.7 129.8 9.2X +SQL Parquet MR: DataPageV2 1903 1905 3 8.3 121.0 9.9X +SQL ORC Vectorized 357 359 2 44.1 22.7 52.7X +SQL ORC MR 1745 1755 15 9.0 110.9 10.8X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 200 203 4 78.5 12.7 1.0X -ParquetReader Vectorized: DataPageV2 200 203 3 78.6 12.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 187 189 3 84.3 11.9 1.1X -ParquetReader Vectorized -> Row: DataPageV2 188 189 2 83.8 11.9 1.1X +ParquetReader Vectorized: DataPageV1 239 243 4 65.7 15.2 1.0X +ParquetReader Vectorized: DataPageV2 240 243 4 65.7 15.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 221 225 4 71.1 14.1 1.1X +ParquetReader Vectorized -> Row: DataPageV2 223 225 4 70.6 14.2 1.1X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 23516 23535 26 0.7 1495.1 1.0X -SQL Json 17023 17023 0 0.9 1082.3 1.4X -SQL Parquet Vectorized: DataPageV1 184 189 7 85.4 11.7 127.7X -SQL Parquet Vectorized: DataPageV2 184 188 4 85.5 11.7 127.8X -SQL Parquet MR: DataPageV1 2098 2100 3 7.5 133.4 11.2X -SQL Parquet MR: DataPageV2 1939 1949 13 8.1 123.3 12.1X -SQL ORC Vectorized 392 393 1 40.2 24.9 60.0X -SQL ORC MR 1735 
1747 16 9.1 110.3 13.6X +SQL CSV 23476 23478 3 0.7 1492.6 1.0X +SQL Json 14568 15103 757 1.1 926.2 1.6X +SQL Parquet Vectorized: DataPageV1 212 230 16 74.2 13.5 110.7X +SQL Parquet Vectorized: DataPageV2 209 218 8 75.4 13.3 112.5X +SQL Parquet MR: DataPageV1 1943 2080 194 8.1 123.5 12.1X +SQL Parquet MR: DataPageV2 1824 1830 9 8.6 116.0 12.9X +SQL ORC Vectorized 395 419 20 39.9 25.1 59.5X +SQL ORC MR 1844 1855 15 8.5 117.2 12.7X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 261 265 6 60.2 16.6 1.0X -ParquetReader Vectorized: DataPageV2 260 264 7 60.4 16.6 1.0X -ParquetReader Vectorized -> Row: DataPageV1 246 249 6 64.1 15.6 1.1X -ParquetReader Vectorized -> Row: DataPageV2 245 247 2 64.2 15.6 1.1X +ParquetReader Vectorized: DataPageV1 280 322 88 56.1 17.8 1.0X +ParquetReader Vectorized: DataPageV2 282 301 19 55.8 17.9 1.0X +ParquetReader Vectorized -> Row: DataPageV1 284 290 4 55.3 18.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 287 293 9 54.8 18.3 1.0X ================================================================================================ @@ -162,17 +162,17 @@ Int and String Scan ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15440 15456 22 0.7 
1472.5 1.0X -SQL Json 11982 11993 16 0.9 1142.7 1.3X -SQL Parquet Vectorized: DataPageV1 2103 2106 5 5.0 200.5 7.3X -SQL Parquet Vectorized: DataPageV2 3012 3024 18 3.5 287.2 5.1X -SQL Parquet MR: DataPageV1 3874 3880 7 2.7 369.5 4.0X -SQL Parquet MR: DataPageV2 3816 3821 7 2.7 363.9 4.0X -SQL ORC Vectorized 2073 2076 4 5.1 197.7 7.4X -SQL ORC MR 3705 3708 4 2.8 353.4 4.2X +SQL CSV 14663 15652 1399 0.7 1398.4 1.0X +SQL Json 10757 10845 125 1.0 1025.9 1.4X +SQL Parquet Vectorized: DataPageV1 1815 1933 166 5.8 173.1 8.1X +SQL Parquet Vectorized: DataPageV2 2244 2297 75 4.7 214.0 6.5X +SQL Parquet MR: DataPageV1 3491 3685 273 3.0 333.0 4.2X +SQL Parquet MR: DataPageV2 3600 3627 37 2.9 343.4 4.1X +SQL ORC Vectorized 1804 1895 129 5.8 172.0 8.1X +SQL ORC MR 3181 3379 280 3.3 303.4 4.6X ================================================================================================ @@ -180,17 +180,17 @@ Repeated String Scan ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8007 8041 48 1.3 763.6 1.0X -SQL Json 7111 7114 4 1.5 678.2 1.1X -SQL Parquet Vectorized: DataPageV1 735 737 1 14.3 70.1 10.9X -SQL Parquet Vectorized: DataPageV2 739 746 8 14.2 70.5 10.8X -SQL Parquet MR: DataPageV1 1657 1657 1 6.3 158.0 4.8X -SQL Parquet MR: DataPageV2 1566 1571 7 6.7 149.3 5.1X -SQL ORC Vectorized 449 450 2 23.4 42.8 17.9X -SQL ORC MR 1829 1842 18 5.7 174.4 4.4X +SQL CSV 8466 8778 441 1.2 807.4 1.0X +SQL Json 6389 6454 93 1.6 609.3 1.3X +SQL Parquet Vectorized: DataPageV1 644 675 52 16.3 61.4 13.1X +SQL Parquet Vectorized: DataPageV2 640 668 44 16.4 61.0 13.2X +SQL 
Parquet MR: DataPageV1 1579 1602 33 6.6 150.6 5.4X +SQL Parquet MR: DataPageV2 1536 1539 4 6.8 146.5 5.5X +SQL ORC Vectorized 439 443 4 23.9 41.9 19.3X +SQL ORC MR 1787 1806 27 5.9 170.5 4.7X ================================================================================================ @@ -198,33 +198,33 @@ Partitioned Table Scan ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 21400 21426 38 0.7 1360.6 1.0X -Data column - Json 12486 12493 10 1.3 793.8 1.7X -Data column - Parquet Vectorized: DataPageV1 174 178 5 90.6 11.0 123.2X -Data column - Parquet Vectorized: DataPageV2 420 426 8 37.4 26.7 50.9X -Data column - Parquet MR: DataPageV1 2400 2405 7 6.6 152.6 8.9X -Data column - Parquet MR: DataPageV2 2069 2075 9 7.6 131.5 10.3X -Data column - ORC Vectorized 275 278 6 57.3 17.5 77.9X -Data column - ORC MR 1831 1835 5 8.6 116.4 11.7X -Partition column - CSV 6967 6973 9 2.3 443.0 3.1X -Partition column - Json 10130 10130 1 1.6 644.0 2.1X -Partition column - Parquet Vectorized: DataPageV1 44 49 10 356.2 2.8 484.6X -Partition column - Parquet Vectorized: DataPageV2 43 47 10 362.6 2.8 493.4X -Partition column - Parquet MR: DataPageV1 1164 1168 5 13.5 74.0 18.4X -Partition column - Parquet MR: DataPageV2 1163 1167 6 13.5 74.0 18.4X -Partition column - ORC Vectorized 50 53 12 312.7 3.2 425.5X -Partition column - ORC MR 1138 1140 4 13.8 72.3 18.8X -Both columns - CSV 21872 21873 2 0.7 1390.6 1.0X -Both columns - Json 13404 13422 25 1.2 852.2 1.6X -Both columns - Parquet Vectorized: DataPageV1 198 203 5 79.6 12.6 108.3X -Both columns - 
Parquet Vectorized: DataPageV2 445 451 6 35.3 28.3 48.1X -Both columns - Parquet MR: DataPageV1 2400 2401 1 6.6 152.6 8.9X -Both columns - Parquet MR: DataPageV2 2107 2107 1 7.5 133.9 10.2X -Both columns - ORC Vectorized 303 308 5 51.9 19.3 70.5X -Both columns - ORC MR 1926 1944 24 8.2 122.5 11.1X +Data column - CSV 22527 22546 26 0.7 1432.3 1.0X +Data column - Json 12533 12712 254 1.3 796.8 1.8X +Data column - Parquet Vectorized: DataPageV1 229 244 14 68.7 14.6 98.3X +Data column - Parquet Vectorized: DataPageV2 508 519 16 31.0 32.3 44.3X +Data column - Parquet MR: DataPageV1 2525 2535 13 6.2 160.6 8.9X +Data column - Parquet MR: DataPageV2 2194 2209 21 7.2 139.5 10.3X +Data column - ORC Vectorized 315 317 2 50.0 20.0 71.6X +Data column - ORC MR 2098 2100 3 7.5 133.4 10.7X +Partition column - CSV 6747 6753 9 2.3 429.0 3.3X +Partition column - Json 10080 10102 32 1.6 640.8 2.2X +Partition column - Parquet Vectorized: DataPageV1 60 63 2 262.8 3.8 376.4X +Partition column - Parquet Vectorized: DataPageV2 58 63 8 270.2 3.7 387.1X +Partition column - Parquet MR: DataPageV1 1152 1155 4 13.6 73.3 19.5X +Partition column - Parquet MR: DataPageV2 1149 1149 1 13.7 73.0 19.6X +Partition column - ORC Vectorized 61 64 3 259.8 3.8 372.1X +Partition column - ORC MR 1332 1332 0 11.8 84.7 16.9X +Both columns - CSV 23030 23042 17 0.7 1464.2 1.0X +Both columns - Json 13569 13581 16 1.2 862.7 1.7X +Both columns - Parquet Vectorized: DataPageV1 268 277 11 58.7 17.0 84.0X +Both columns - Parquet Vectorized: DataPageV2 551 557 7 28.6 35.0 40.9X +Both columns - Parquet MR: DataPageV1 2556 2557 0 6.2 162.5 8.8X +Both columns - Parquet MR: DataPageV2 2287 2292 7 6.9 145.4 9.9X +Both columns - ORC Vectorized 361 363 2 43.6 22.9 62.5X +Both columns - ORC MR 2158 2161 5 7.3 137.2 10.4X ================================================================================================ @@ -232,49 +232,49 @@ String with Nulls Scan 
================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10410 10425 22 1.0 992.7 1.0X -SQL Json 11140 11158 25 0.9 1062.4 0.9X -SQL Parquet Vectorized: DataPageV1 1272 1272 1 8.2 121.3 8.2X -SQL Parquet Vectorized: DataPageV2 2709 2717 11 3.9 258.4 3.8X -SQL Parquet MR: DataPageV1 3451 3465 20 3.0 329.1 3.0X -SQL Parquet MR: DataPageV2 4202 4216 20 2.5 400.7 2.5X -ParquetReader Vectorized: DataPageV1 923 927 6 11.4 88.0 11.3X -ParquetReader Vectorized: DataPageV2 2371 2372 2 4.4 226.1 4.4X -SQL ORC Vectorized 892 908 15 11.8 85.1 11.7X -SQL ORC MR 3093 3094 2 3.4 294.9 3.4X +SQL CSV 11418 11463 63 0.9 1088.9 1.0X +SQL Json 9698 9938 339 1.1 924.9 1.2X +SQL Parquet Vectorized: DataPageV1 1176 1207 45 8.9 112.1 9.7X +SQL Parquet Vectorized: DataPageV2 1652 1669 24 6.3 157.6 6.9X +SQL Parquet MR: DataPageV1 3041 3119 109 3.4 290.0 3.8X +SQL Parquet MR: DataPageV2 4030 4110 114 2.6 384.3 2.8X +ParquetReader Vectorized: DataPageV1 1008 1014 8 10.4 96.2 11.3X +ParquetReader Vectorized: DataPageV2 1247 1305 82 8.4 118.9 9.2X +SQL ORC Vectorized 820 856 56 12.8 78.2 13.9X +SQL ORC MR 2762 2807 64 3.8 263.4 4.1X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7716 7717 2 1.4 735.9 1.0X -SQL Json 12784 12792 11 0.8 
1219.2 0.6X -SQL Parquet Vectorized: DataPageV1 1020 1027 9 10.3 97.3 7.6X -SQL Parquet Vectorized: DataPageV2 1815 1818 5 5.8 173.1 4.3X -SQL Parquet MR: DataPageV1 3592 3602 15 2.9 342.5 2.1X -SQL Parquet MR: DataPageV2 3517 3536 27 3.0 335.4 2.2X -ParquetReader Vectorized: DataPageV1 951 952 2 11.0 90.7 8.1X -ParquetReader Vectorized: DataPageV2 1731 1732 1 6.1 165.1 4.5X -SQL ORC Vectorized 1188 1191 4 8.8 113.3 6.5X -SQL ORC MR 2894 2896 3 3.6 276.0 2.7X +SQL CSV 6752 6756 5 1.6 644.0 1.0X +SQL Json 7469 7549 112 1.4 712.3 0.9X +SQL Parquet Vectorized: DataPageV1 912 990 67 11.5 87.0 7.4X +SQL Parquet Vectorized: DataPageV2 1141 1215 104 9.2 108.8 5.9X +SQL Parquet MR: DataPageV1 2256 2418 229 4.6 215.1 3.0X +SQL Parquet MR: DataPageV2 2712 2882 241 3.9 258.6 2.5X +ParquetReader Vectorized: DataPageV1 956 960 6 11.0 91.2 7.1X +ParquetReader Vectorized: DataPageV2 1211 1211 1 8.7 115.5 5.6X +SQL ORC Vectorized 1135 1135 1 9.2 108.2 6.0X +SQL ORC MR 2716 2766 70 3.9 259.0 2.5X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4861 4866 8 2.2 463.6 1.0X -SQL Json 6272 6282 14 1.7 598.2 0.8X -SQL Parquet Vectorized: DataPageV1 212 215 4 49.5 20.2 22.9X -SQL Parquet Vectorized: DataPageV2 332 336 4 31.6 31.7 14.6X -SQL Parquet MR: DataPageV1 2437 2440 4 4.3 232.5 2.0X -SQL Parquet MR: DataPageV2 1897 1924 39 5.5 180.9 2.6X -ParquetReader Vectorized: DataPageV1 220 221 1 47.7 20.9 22.1X -ParquetReader Vectorized: DataPageV2 340 344 4 30.9 32.4 14.3X -SQL ORC Vectorized 363 365 2 28.9 34.6 13.4X -SQL ORC MR 1389 1392 4 7.6 132.4 3.5X +SQL CSV 4496 4710 303 2.3 428.8 1.0X +SQL Json 4324 4343 28 2.4 412.3 1.0X +SQL Parquet Vectorized: 
DataPageV1 221 244 9 47.5 21.0 20.4X +SQL Parquet Vectorized: DataPageV2 270 288 13 38.8 25.8 16.6X +SQL Parquet MR: DataPageV1 1451 1461 15 7.2 138.3 3.1X +SQL Parquet MR: DataPageV2 1364 1368 5 7.7 130.0 3.3X +ParquetReader Vectorized: DataPageV1 256 258 2 40.9 24.5 17.5X +ParquetReader Vectorized: DataPageV2 273 291 17 38.4 26.0 16.5X +SQL ORC Vectorized 345 367 24 30.4 32.9 13.0X +SQL ORC MR 1508 1509 2 7.0 143.8 3.0X ================================================================================================ @@ -282,42 +282,42 @@ Single Column Scan From Wide Columns ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2746 2759 18 0.4 2618.8 1.0X -SQL Json 3135 3140 8 0.3 2989.3 0.9X -SQL Parquet Vectorized: DataPageV1 37 40 5 28.4 35.2 74.3X -SQL Parquet Vectorized: DataPageV2 54 56 5 19.5 51.2 51.1X -SQL Parquet MR: DataPageV1 177 180 4 5.9 168.6 15.5X -SQL Parquet MR: DataPageV2 189 194 3 5.5 180.3 14.5X -SQL ORC Vectorized 44 47 9 23.6 42.3 61.9X -SQL ORC MR 144 147 2 7.3 137.4 19.1X +SQL CSV 2036 2140 147 0.5 1941.4 1.0X +SQL Json 2796 2927 186 0.4 2666.5 0.7X +SQL Parquet Vectorized: DataPageV1 47 52 7 22.2 45.0 43.1X +SQL Parquet Vectorized: DataPageV2 64 69 7 16.4 61.2 31.7X +SQL Parquet MR: DataPageV1 176 190 11 5.9 168.1 11.5X +SQL Parquet MR: DataPageV2 157 171 6 6.7 149.3 13.0X +SQL ORC Vectorized 52 56 10 20.3 49.2 39.5X +SQL ORC MR 142 152 8 7.4 135.9 14.3X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 
2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5391 5398 10 0.2 5141.6 1.0X -SQL Json 10851 10932 114 0.1 10348.6 0.5X -SQL Parquet Vectorized: DataPageV1 50 52 5 20.9 47.8 107.5X -SQL Parquet Vectorized: DataPageV2 66 69 5 15.8 63.3 81.2X -SQL Parquet MR: DataPageV1 196 198 2 5.4 186.5 27.6X -SQL Parquet MR: DataPageV2 205 208 3 5.1 195.3 26.3X -SQL ORC Vectorized 60 62 7 17.5 57.1 90.1X -SQL ORC MR 160 163 3 6.5 152.7 33.7X +SQL CSV 5384 5560 249 0.2 5134.8 1.0X +SQL Json 10934 11224 410 0.1 10427.1 0.5X +SQL Parquet Vectorized: DataPageV1 62 67 7 16.8 59.5 86.3X +SQL Parquet Vectorized: DataPageV2 79 85 7 13.3 75.3 68.1X +SQL Parquet MR: DataPageV1 198 211 9 5.3 188.6 27.2X +SQL Parquet MR: DataPageV2 177 188 9 5.9 168.7 30.4X +SQL ORC Vectorized 67 73 10 15.6 64.0 80.2X +SQL ORC MR 160 172 8 6.6 152.3 33.7X OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8716 8719 4 0.1 8311.9 1.0X -SQL Json 20077 20197 170 0.1 19147.3 0.4X -SQL Parquet Vectorized: DataPageV1 78 84 10 13.4 74.5 111.6X -SQL Parquet Vectorized: DataPageV2 94 98 4 11.1 89.9 92.4X -SQL Parquet MR: DataPageV1 225 230 5 4.7 214.4 38.8X -SQL Parquet MR: DataPageV2 225 236 8 4.7 214.8 38.7X -SQL ORC Vectorized 82 84 4 12.8 77.8 106.8X -SQL ORC MR 183 187 6 5.7 174.2 47.7X +SQL CSV 9602 9882 396 0.1 9157.0 1.0X +SQL Json 21369 21987 874 0.0 20379.5 0.4X +SQL Parquet Vectorized: DataPageV1 90 97 7 11.7 85.4 107.2X +SQL Parquet Vectorized: DataPageV2 107 115 7 9.8 102.0 
89.8X +SQL Parquet MR: DataPageV1 227 234 14 4.6 216.1 42.4X +SQL Parquet MR: DataPageV2 204 216 10 5.1 194.4 47.1X +SQL ORC Vectorized 81 89 8 12.9 77.6 118.1X +SQL ORC MR 181 195 12 5.8 172.3 53.2X From d95100aebe85965784b63e4af5fa3671908d6ff9 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 15 Mar 2022 13:43:49 -0700 Subject: [PATCH 18/20] More review comments --- .../datasources/parquet/VectorizedDeltaByteArrayReader.java | 3 ++- .../parquet/VectorizedDeltaLengthByteArrayReader.java | 4 +--- .../spark/sql/execution/vectorized/WritableColumnVector.java | 4 ++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index b8d375ee25592..baee79eb0da4f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -42,8 +42,9 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase private ByteBuffer previous; private int currentRow = 0; - // temporary variable used by getBinary + // temporary variable used by readBinary private final WritableColumnVector binaryValVector; + // temporary variable used by skipBinary private final WritableColumnVector tempBinaryValVector; VectorizedDeltaByteArrayReader() { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index e961bbc31d5a7..1841e71569ac3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ 
b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -78,10 +78,8 @@ public ByteBuffer getBytes(int rowId) { @Override public void skipBinary(int total) { - int length; for (int i = 0; i < total; i++) { - length = lengthsVector.getInt(currentRow + i); - int remaining = length; + int remaining = lengthsVector.getInt(currentRow + i); while (remaining > 0) { remaining -= in.skip(remaining); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 4c0d58dc5be45..d1897a071c67a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -444,6 +444,10 @@ public byte[] getBinary(int rowId) { } } + /** + * Gets the values of bytes from [rowId, rowId + count), as a ByteBuffer. + * This method is similar to {@link ColumnVector#getBytes(int, int)}, but avoids making a copy. 
+ */ public abstract ByteBuffer getBytesUnsafe(int rowId, int count); /** From 6d273f03cd96542460cfe7d6be01a8a2a48588bc Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Wed, 16 Mar 2022 11:01:01 -0700 Subject: [PATCH 19/20] More review comments addressed --- .../execution/datasources/parquet/VectorizedColumnReader.java | 3 ++- .../parquet/VectorizedDeltaBinaryPackedReader.java | 1 + .../datasources/parquet/VectorizedDeltaByteArrayReader.java | 4 ++-- .../parquet/VectorizedDeltaLengthByteArrayReader.java | 1 - .../datasources/parquet/VectorizedParquetRecordReader.java | 3 +-- .../execution/datasources/parquet/VectorizedValuesReader.java | 3 +-- .../parquet/ParquetDeltaByteArrayEncodingSuite.scala | 1 - .../execution/datasources/parquet/ParquetEncodingSuite.scala | 2 +- 8 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 49a71356a8938..ee09d2b2a3be9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -290,9 +290,10 @@ private void initDataReader( } catch (IOException e) { throw new IOException("could not read page in col " + descriptor, e); } + // for PARQUET-246 (See VectorizedDeltaByteArrayReader.setPreviousValues) if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader instanceof RequiresPreviousReader) { - // previous reader can only be set if reading sequentially + // previousReader can only be set if reading sequentially ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java 
b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java index a11a8ebc86b7a..3218c20ece893 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java @@ -98,6 +98,7 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce firstValue = BytesUtils.readZigZagVarLong(in); } + // True value count. May be less than valueCount because of nulls int getTotalValueCount() { return totalValueCount; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index baee79eb0da4f..d7ad089e41b6b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -42,9 +42,9 @@ public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase private ByteBuffer previous; private int currentRow = 0; - // temporary variable used by readBinary + // Temporary variable used by readBinary private final WritableColumnVector binaryValVector; - // temporary variable used by skipBinary + // Temporary variable used by skipBinary private final WritableColumnVector tempBinaryValVector; VectorizedDeltaByteArrayReader() { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java index 1841e71569ac3..ac5b8527f5e13 100644 --- 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -86,5 +86,4 @@ public void skipBinary(int total) { } currentRow += total; } - } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 401d8f0091c36..cbf60125e1284 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -368,8 +368,7 @@ private void checkEndOfRowGroup() throws IOException { datetimeRebaseTz, int96RebaseMode, int96RebaseTz, - writerVersion - ); + writerVersion); } totalCountLoadedSoFar += pages.getRowCount(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index 1efa409a3a853..4308614338499 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.parquet; import java.nio.ByteBuffer; + import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.parquet.io.api.Binary; @@ -100,7 +101,5 @@ static void writeArrayByteBuffer(WritableColumnVector c, int rowId, ByteBuffer v } static void skipWrite(WritableColumnVector c, int rowId, ByteBuffer val, int length) { } - } - } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala index c71c7c6219c98..c54eef348f342 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala @@ -140,5 +140,4 @@ class ParquetDeltaByteArrayEncodingSuite extends ParquetCompatibilityTest with S i += skipCount + 1 } } - } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 12015b779fe42..07e2849ce6f19 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -160,7 +160,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess // reads at least twice from the reader). 
This will catch any issues with state // maintained by the reader(s) // Add at least one string with a null - val data = (1 to 81971).map { i => + val data = (1 to 8193).map { i => (i, i.toLong, i.toShort, Array[Byte](i.toByte), if (i % 2 == 1) s"test_$i" else null, From 1d150224df1c09304a7fe5a5afca2fdf493798b7 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Wed, 16 Mar 2022 15:34:53 -0700 Subject: [PATCH 20/20] Cleaner naming for WritableColumnVector.getBytesUnsafe --- .../datasources/parquet/VectorizedDeltaByteArrayReader.java | 4 ++-- .../spark/sql/execution/vectorized/OffHeapColumnVector.java | 6 ++---- .../spark/sql/execution/vectorized/OnHeapColumnVector.java | 2 +- .../sql/execution/vectorized/WritableColumnVector.java | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index d7ad089e41b6b..b3fc54a8d152c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -90,7 +90,7 @@ private void readValues(int total, WritableColumnVector c, int rowId) { } arrayData.appendBytes(suffixLength, suffixArray, suffix.position()); c.putArray(rowId + i, offset, length); - previous = arrayData.getBytesUnsafe(offset, length); + previous = arrayData.getByteBuffer(offset, length); currentRow++; } } @@ -131,7 +131,7 @@ public void skipBinary(int total) { arrayData.appendBytes(prefixLength, previous.array(), previous.position()); } arrayData.appendBytes(suffixLength, suffixArray, suffix.position()); - previous = arrayData.getBytesUnsafe(0, length); + previous = arrayData.getByteBuffer(0, length); currentRow++; WritableColumnVector tmp = c1; diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index a7abc5a53bddd..42552c7afc624 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -222,10 +222,8 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { } @Override - public ByteBuffer getBytesUnsafe(int rowId, int count) { - byte[] array = new byte[count]; - Platform.copyMemory(null, data + rowId, array, Platform.BYTE_ARRAY_OFFSET, count); - return ByteBuffer.wrap(array); + public ByteBuffer getByteBuffer(int rowId, int count) { + return ByteBuffer.wrap(getBytes(rowId, count)); } // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 10ff78e38de49..d246a3c24e4a6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -220,7 +220,7 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { } @Override - public ByteBuffer getBytesUnsafe(int rowId, int count) { + public ByteBuffer getByteBuffer(int rowId, int count) { return ByteBuffer.wrap(byteData, rowId, count); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index d1897a071c67a..ae457a16123d2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -448,7 +448,7 @@ public byte[] 
getBinary(int rowId) { * Gets the values of bytes from [rowId, rowId + count), as a ByteBuffer. * This method is similar to {@link ColumnVector#getBytes(int, int)}, but avoids making a copy. */ - public abstract ByteBuffer getBytesUnsafe(int rowId, int count); + public abstract ByteBuffer getByteBuffer(int rowId, int count); /** * Append APIs. These APIs all behave similarly and will append data to the current vector. It