diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt index 25c43d8273df8..11fc93406c363 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9636 9771 191 1.6 612.6 1.0X -SQL Json 7960 8227 378 2.0 506.1 1.2X -SQL Parquet Vectorized: DataPageV1 113 129 12 139.7 7.2 85.6X -SQL Parquet Vectorized: DataPageV2 84 93 12 186.6 5.4 114.3X -SQL Parquet MR: DataPageV1 1466 1470 6 10.7 93.2 6.6X -SQL Parquet MR: DataPageV2 1334 1347 18 11.8 84.8 7.2X -SQL ORC Vectorized 163 197 27 96.3 10.4 59.0X -SQL ORC MR 1554 1558 6 10.1 98.8 6.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 11809 12046 335 1.3 750.8 1.0X +SQL Json 8588 8592 7 1.8 546.0 1.4X +SQL Parquet Vectorized: DataPageV1 140 162 18 112.0 8.9 84.1X +SQL Parquet Vectorized: DataPageV2 103 117 12 152.6 6.6 114.6X +SQL Parquet MR: DataPageV1 1634 1648 20 9.6 103.9 7.2X +SQL Parquet MR: DataPageV2 1495 1501 9 10.5 95.1 7.9X +SQL ORC Vectorized 180 224 42 87.4 11.4 65.6X +SQL ORC MR 1536 1576 57 10.2 97.7 7.7X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 94 103 13 167.1 6.0 1.0X -ParquetReader Vectorized: DataPageV2 77 86 11 204.3 4.9 1.2X -ParquetReader Vectorized -> Row: DataPageV1 44 47 4 357.0 2.8 2.1X -ParquetReader Vectorized -> Row: DataPageV2 35 37 3 445.2 2.2 2.7X +ParquetReader Vectorized: DataPageV1 109 114 10 144.3 6.9 1.0X +ParquetReader Vectorized: DataPageV2 90 93 3 175.3 5.7 1.2X +ParquetReader Vectorized -> Row: DataPageV1 58 60 4 271.9 3.7 1.9X +ParquetReader Vectorized -> Row: DataPageV2 39 41 3 404.0 2.5 2.8X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11479 11919 622 1.4 729.8 1.0X -SQL Json 9894 9922 39 1.6 629.1 1.2X -SQL Parquet Vectorized: DataPageV1 123 156 30 128.3 7.8 93.6X -SQL Parquet Vectorized: DataPageV2 126 138 19 125.2 8.0 91.4X -SQL Parquet MR: DataPageV1 1986 2500 726 7.9 126.3 5.8X -SQL Parquet MR: DataPageV2 1810 1898 126 8.7 115.1 6.3X -SQL ORC Vectorized 174 210 30 90.5 11.0 66.1X -SQL ORC MR 1645 1652 9 9.6 104.6 7.0X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 14515 14526 16 1.1 922.8 1.0X +SQL Json 9862 9863 2 1.6 627.0 1.5X +SQL Parquet Vectorized: DataPageV1 144 167 31 109.5 9.1 101.1X +SQL Parquet Vectorized: DataPageV2 139 159 27 113.4 8.8 104.6X +SQL Parquet MR: DataPageV1 1777 1780 3 8.8 113.0 8.2X +SQL Parquet MR: DataPageV2 1690 1691 2 9.3 107.4 8.6X +SQL ORC Vectorized 201 238 46 78.3 12.8 72.2X +SQL ORC MR 1513 1522 14 10.4 96.2 9.6X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) 
Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 166 177 14 94.9 10.5 1.0X -ParquetReader Vectorized: DataPageV2 165 172 11 95.3 10.5 1.0X -ParquetReader Vectorized -> Row: DataPageV1 95 100 5 165.7 6.0 1.7X -ParquetReader Vectorized -> Row: DataPageV2 85 89 6 186.0 5.4 2.0X +ParquetReader Vectorized: DataPageV1 182 192 11 86.6 11.5 1.0X +ParquetReader Vectorized: DataPageV2 181 188 7 86.9 11.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 96 99 4 163.3 6.1 1.9X +ParquetReader Vectorized -> Row: DataPageV2 96 99 3 163.4 6.1 1.9X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12176 12646 664 1.3 774.1 1.0X -SQL Json 9696 9729 46 1.6 616.5 1.3X -SQL Parquet Vectorized: DataPageV1 151 201 33 103.9 9.6 80.4X -SQL Parquet Vectorized: DataPageV2 216 235 15 72.7 13.8 56.3X -SQL Parquet MR: DataPageV1 1915 2017 145 8.2 121.8 6.4X -SQL Parquet MR: DataPageV2 1954 1978 33 8.0 124.3 6.2X -SQL ORC Vectorized 197 235 25 79.7 12.6 61.7X -SQL ORC MR 1769 1829 85 8.9 112.5 6.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 15326 15437 156 1.0 974.4 1.0X +SQL Json 10281 10290 13 1.5 653.7 1.5X +SQL Parquet Vectorized: DataPageV1 164 212 36 95.9 10.4 93.4X +SQL Parquet Vectorized: DataPageV2 230 244 11 68.5 14.6 66.7X +SQL Parquet MR: DataPageV1 2108 2111 4 7.5 134.0 7.3X +SQL Parquet MR: DataPageV2 1940 1963 33 8.1 123.3 7.9X +SQL 
ORC Vectorized 229 279 34 68.7 14.6 66.9X +SQL ORC MR 1903 1906 3 8.3 121.0 8.1X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 230 237 12 68.5 14.6 1.0X -ParquetReader Vectorized: DataPageV2 293 298 9 53.6 18.7 0.8X -ParquetReader Vectorized -> Row: DataPageV1 215 265 23 73.2 13.7 1.1X -ParquetReader Vectorized -> Row: DataPageV2 279 301 32 56.3 17.8 0.8X +ParquetReader Vectorized: DataPageV1 253 262 10 62.2 16.1 1.0X +ParquetReader Vectorized: DataPageV2 323 327 9 48.8 20.5 0.8X +ParquetReader Vectorized -> Row: DataPageV1 280 288 8 56.3 17.8 0.9X +ParquetReader Vectorized -> Row: DataPageV2 301 314 21 52.2 19.1 0.8X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13069 13409 482 1.2 830.9 1.0X -SQL Json 10599 10621 32 1.5 673.9 1.2X -SQL Parquet Vectorized: DataPageV1 142 177 34 110.6 9.0 91.9X -SQL Parquet Vectorized: DataPageV2 313 359 28 50.2 19.9 41.7X -SQL Parquet MR: DataPageV1 1979 2044 92 7.9 125.8 6.6X -SQL Parquet MR: DataPageV2 1958 2030 101 8.0 124.5 6.7X -SQL ORC Vectorized 277 303 21 56.7 17.6 47.1X -SQL ORC MR 1692 1782 128 9.3 107.6 7.7X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 16756 16776 28 0.9 1065.3 1.0X +SQL Json 10690 10692 3 1.5 679.6 1.6X +SQL Parquet Vectorized: DataPageV1 160 208 45 98.1 10.2 104.5X +SQL Parquet 
Vectorized: DataPageV2 390 423 23 40.3 24.8 43.0X +SQL Parquet MR: DataPageV1 2196 2201 8 7.2 139.6 7.6X +SQL Parquet MR: DataPageV2 2065 2072 10 7.6 131.3 8.1X +SQL ORC Vectorized 323 338 10 48.7 20.5 51.9X +SQL ORC MR 1899 1906 11 8.3 120.7 8.8X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 253 269 18 62.1 16.1 1.0X -ParquetReader Vectorized: DataPageV2 1197 1199 3 13.1 76.1 0.2X -ParquetReader Vectorized -> Row: DataPageV1 273 361 110 57.7 17.3 0.9X -ParquetReader Vectorized -> Row: DataPageV2 379 438 37 41.5 24.1 0.7X +ParquetReader Vectorized: DataPageV1 278 285 9 56.6 17.7 1.0X +ParquetReader Vectorized: DataPageV2 514 518 2 30.6 32.7 0.5X +ParquetReader Vectorized -> Row: DataPageV1 308 316 11 51.0 19.6 0.9X +ParquetReader Vectorized -> Row: DataPageV2 498 525 27 31.6 31.6 0.6X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17143 17467 458 0.9 1089.9 1.0X -SQL Json 11507 12198 977 1.4 731.6 1.5X -SQL Parquet Vectorized: DataPageV1 238 253 19 66.0 15.2 71.9X -SQL Parquet Vectorized: DataPageV2 502 567 48 31.3 31.9 34.1X -SQL Parquet MR: DataPageV1 2333 2335 3 6.7 148.4 7.3X -SQL Parquet MR: DataPageV2 1948 1972 34 8.1 123.8 8.8X -SQL ORC Vectorized 389 408 20 40.5 24.7 44.1X -SQL ORC MR 1726 1817 128 9.1 109.7 9.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 
5.11.0-1025-azure +SQL CSV 21841 21851 14 0.7 1388.6 1.0X +SQL Json 12828 12843 21 1.2 815.6 1.7X +SQL Parquet Vectorized: DataPageV1 241 279 19 65.2 15.3 90.6X +SQL Parquet Vectorized: DataPageV2 554 596 29 28.4 35.2 39.5X +SQL Parquet MR: DataPageV1 2404 2428 34 6.5 152.8 9.1X +SQL Parquet MR: DataPageV2 2153 2166 18 7.3 136.9 10.1X +SQL ORC Vectorized 417 464 62 37.7 26.5 52.4X +SQL ORC MR 2136 2146 14 7.4 135.8 10.2X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 289 340 43 54.4 18.4 1.0X -ParquetReader Vectorized: DataPageV2 572 609 27 27.5 36.4 0.5X -ParquetReader Vectorized -> Row: DataPageV1 329 353 48 47.8 20.9 0.9X -ParquetReader Vectorized -> Row: DataPageV2 639 654 18 24.6 40.6 0.5X +ParquetReader Vectorized: DataPageV1 324 357 34 48.6 20.6 1.0X +ParquetReader Vectorized: DataPageV2 694 702 11 22.6 44.2 0.5X +ParquetReader Vectorized -> Row: DataPageV1 378 385 8 41.6 24.0 0.9X +ParquetReader Vectorized -> Row: DataPageV2 701 708 8 22.4 44.6 0.5X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13721 13812 129 1.1 872.4 1.0X -SQL Json 12147 17632 2196 1.3 772.3 1.1X -SQL Parquet Vectorized: DataPageV1 138 164 25 113.9 8.8 99.4X -SQL Parquet Vectorized: DataPageV2 151 180 26 104.4 9.6 91.1X -SQL Parquet MR: DataPageV1 2006 2078 101 7.8 127.6 6.8X -SQL Parquet MR: 
DataPageV2 2038 2040 2 7.7 129.6 6.7X -SQL ORC Vectorized 465 475 10 33.8 29.6 29.5X -SQL ORC MR 1814 1860 64 8.7 115.4 7.6X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 17238 17239 2 0.9 1096.0 1.0X +SQL Json 12295 12307 18 1.3 781.7 1.4X +SQL Parquet Vectorized: DataPageV1 162 203 27 96.8 10.3 106.1X +SQL Parquet Vectorized: DataPageV2 157 194 32 100.4 10.0 110.0X +SQL Parquet MR: DataPageV1 2163 2165 3 7.3 137.5 8.0X +SQL Parquet MR: DataPageV2 2014 2014 1 7.8 128.0 8.6X +SQL ORC Vectorized 458 462 5 34.4 29.1 37.7X +SQL ORC MR 1984 1984 0 7.9 126.1 8.7X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 275 404 187 57.2 17.5 1.0X -ParquetReader Vectorized: DataPageV2 275 287 12 57.2 17.5 1.0X -ParquetReader Vectorized -> Row: DataPageV1 227 265 24 69.2 14.4 1.2X -ParquetReader Vectorized -> Row: DataPageV2 228 259 28 69.1 14.5 1.2X +ParquetReader Vectorized: DataPageV1 252 259 10 62.3 16.0 1.0X +ParquetReader Vectorized: DataPageV2 252 256 9 62.3 16.0 1.0X +ParquetReader Vectorized -> Row: DataPageV1 259 307 40 60.7 16.5 1.0X +ParquetReader Vectorized -> Row: DataPageV2 260 295 25 60.5 16.5 1.0X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17269 17620 496 0.9 1097.9 1.0X -SQL Json 15636 15952 447 1.0 994.1 1.1X -SQL Parquet Vectorized: 
DataPageV1 238 267 18 66.0 15.1 72.5X -SQL Parquet Vectorized: DataPageV2 222 260 21 70.9 14.1 77.9X -SQL Parquet MR: DataPageV1 2418 2457 56 6.5 153.7 7.1X -SQL Parquet MR: DataPageV2 2194 2207 18 7.2 139.5 7.9X -SQL ORC Vectorized 519 528 14 30.3 33.0 33.3X -SQL ORC MR 1760 1770 14 8.9 111.9 9.8X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 22485 22536 72 0.7 1429.5 1.0X +SQL Json 16281 16286 8 1.0 1035.1 1.4X +SQL Parquet Vectorized: DataPageV1 232 288 35 67.9 14.7 97.1X +SQL Parquet Vectorized: DataPageV2 277 290 9 56.8 17.6 81.2X +SQL Parquet MR: DataPageV1 2331 2341 15 6.7 148.2 9.6X +SQL Parquet MR: DataPageV2 2216 2229 18 7.1 140.9 10.1X +SQL ORC Vectorized 561 569 9 28.0 35.7 40.1X +SQL ORC MR 2118 2137 27 7.4 134.6 10.6X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 284 305 30 55.3 18.1 1.0X -ParquetReader Vectorized: DataPageV2 286 286 1 55.1 18.2 1.0X -ParquetReader Vectorized -> Row: DataPageV1 325 337 16 48.4 20.6 0.9X -ParquetReader Vectorized -> Row: DataPageV2 346 361 16 45.5 22.0 0.8X +ParquetReader Vectorized: DataPageV1 355 356 1 44.3 22.6 1.0X +ParquetReader Vectorized: DataPageV2 355 356 1 44.3 22.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 379 386 9 41.5 24.1 0.9X +ParquetReader Vectorized -> Row: DataPageV2 379 389 10 41.5 24.1 0.9X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 
5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12428 12714 405 0.8 1185.2 1.0X -SQL Json 11088 11251 231 0.9 1057.4 1.1X -SQL Parquet Vectorized: DataPageV1 1990 1997 10 5.3 189.8 6.2X -SQL Parquet Vectorized: DataPageV2 2551 2618 95 4.1 243.3 4.9X -SQL Parquet MR: DataPageV1 3903 3913 15 2.7 372.2 3.2X -SQL Parquet MR: DataPageV2 3734 3920 263 2.8 356.1 3.3X -SQL ORC Vectorized 2153 2155 3 4.9 205.3 5.8X -SQL ORC MR 3485 3549 91 3.0 332.4 3.6X +SQL CSV 15733 15738 8 0.7 1500.4 1.0X +SQL Json 11953 11969 22 0.9 1140.0 1.3X +SQL Parquet Vectorized: DataPageV1 2100 2137 52 5.0 200.2 7.5X +SQL Parquet Vectorized: DataPageV2 2525 2535 14 4.2 240.8 6.2X +SQL Parquet MR: DataPageV1 4075 4110 49 2.6 388.6 3.9X +SQL Parquet MR: DataPageV2 3991 4014 34 2.6 380.6 3.9X +SQL ORC Vectorized 2323 2355 45 4.5 221.5 6.8X +SQL ORC MR 3776 3882 150 2.8 360.1 4.2X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7116 7167 72 1.5 678.7 1.0X -SQL Json 6700 6741 58 1.6 639.0 1.1X -SQL Parquet Vectorized: DataPageV1 526 556 36 19.9 50.1 13.5X -SQL Parquet Vectorized: DataPageV2 518 533 15 20.2 49.4 13.7X -SQL Parquet MR: DataPageV1 1504 1656 216 7.0 143.4 4.7X -SQL Parquet MR: DataPageV2 1676 1676 1 6.3 
159.8 4.2X -SQL ORC Vectorized 497 518 20 21.1 47.4 14.3X -SQL ORC MR 1657 1787 183 6.3 158.1 4.3X +SQL CSV 8921 8966 63 1.2 850.7 1.0X +SQL Json 7215 7218 5 1.5 688.1 1.2X +SQL Parquet Vectorized: DataPageV1 604 627 23 17.3 57.6 14.8X +SQL Parquet Vectorized: DataPageV2 606 620 18 17.3 57.8 14.7X +SQL Parquet MR: DataPageV1 1686 1693 10 6.2 160.8 5.3X +SQL Parquet MR: DataPageV2 1660 1665 8 6.3 158.3 5.4X +SQL ORC Vectorized 541 548 7 19.4 51.6 16.5X +SQL ORC MR 1920 1930 13 5.5 183.1 4.6X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 18247 18411 232 0.9 1160.1 1.0X -Data column - Json 10860 11264 571 1.4 690.5 1.7X -Data column - Parquet Vectorized: DataPageV1 223 274 26 70.6 14.2 81.9X -Data column - Parquet Vectorized: DataPageV2 537 559 23 29.3 34.1 34.0X -Data column - Parquet MR: DataPageV1 2411 2517 150 6.5 153.3 7.6X -Data column - Parquet MR: DataPageV2 2299 2356 81 6.8 146.2 7.9X -Data column - ORC Vectorized 417 433 11 37.7 26.5 43.8X -Data column - ORC MR 2107 2178 101 7.5 134.0 8.7X -Partition column - CSV 6090 6186 136 2.6 387.2 3.0X -Partition column - Json 9479 9603 176 1.7 602.7 1.9X -Partition column - Parquet Vectorized: DataPageV1 49 69 28 322.0 3.1 373.6X -Partition column - Parquet Vectorized: DataPageV2 49 63 23 322.1 3.1 373.7X -Partition column - Parquet MR: DataPageV1 1200 1225 36 13.1 76.3 15.2X -Partition column - Parquet MR: DataPageV2 1199 1240 57 13.1 76.3 
15.2X -Partition column - ORC Vectorized 53 77 26 295.0 3.4 342.2X -Partition column - ORC MR 1287 1346 83 12.2 81.8 14.2X -Both columns - CSV 17671 18140 663 0.9 1123.5 1.0X -Both columns - Json 11675 12167 696 1.3 742.3 1.6X -Both columns - Parquet Vectorized: DataPageV1 298 303 9 52.9 18.9 61.3X -Both columns - Parquet Vectorized: DataPageV2 541 580 36 29.1 34.4 33.7X -Both columns - Parquet MR: DataPageV1 2448 2491 60 6.4 155.6 7.5X -Both columns - Parquet MR: DataPageV2 2303 2352 69 6.8 146.4 7.9X -Both columns - ORC Vectorized 385 406 25 40.9 24.5 47.4X -Both columns - ORC MR 2118 2202 120 7.4 134.6 8.6X +Data column - CSV 21951 21976 36 0.7 1395.6 1.0X +Data column - Json 12896 12905 14 1.2 819.9 1.7X +Data column - Parquet Vectorized: DataPageV1 247 307 48 63.6 15.7 88.7X +Data column - Parquet Vectorized: DataPageV2 657 686 25 23.9 41.8 33.4X +Data column - Parquet MR: DataPageV1 2705 2708 3 5.8 172.0 8.1X +Data column - Parquet MR: DataPageV2 2621 2621 0 6.0 166.6 8.4X +Data column - ORC Vectorized 440 468 30 35.7 28.0 49.9X +Data column - ORC MR 2553 2565 17 6.2 162.3 8.6X +Partition column - CSV 6640 6641 1 2.4 422.2 3.3X +Partition column - Json 10499 10512 19 1.5 667.5 2.1X +Partition column - Parquet Vectorized: DataPageV1 60 79 24 261.4 3.8 364.8X +Partition column - Parquet Vectorized: DataPageV2 58 81 26 270.2 3.7 377.0X +Partition column - Parquet MR: DataPageV1 1387 1412 35 11.3 88.2 15.8X +Partition column - Parquet MR: DataPageV2 1383 1407 34 11.4 87.9 15.9X +Partition column - ORC Vectorized 61 85 25 256.8 3.9 358.4X +Partition column - ORC MR 1552 1553 1 10.1 98.7 14.1X +Both columns - CSV 21896 21919 32 0.7 1392.1 1.0X +Both columns - Json 13645 13664 27 1.2 867.5 1.6X +Both columns - Parquet Vectorized: DataPageV1 307 351 33 51.3 19.5 71.6X +Both columns - Parquet Vectorized: DataPageV2 698 740 36 22.5 44.4 31.4X +Both columns - Parquet MR: DataPageV1 2804 2821 24 5.6 178.3 7.8X +Both columns - Parquet MR: DataPageV2 2624 2636 16 6.0 166.8 
8.4X +Both columns - ORC Vectorized 462 521 53 34.0 29.4 47.5X +Both columns - ORC MR 2564 2580 22 6.1 163.0 8.6X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7966 12723 2892 1.3 759.7 1.0X -SQL Json 9897 10008 157 1.1 943.9 0.8X -SQL Parquet Vectorized: DataPageV1 1176 1264 125 8.9 112.1 6.8X -SQL Parquet Vectorized: DataPageV2 2224 2326 144 4.7 212.1 3.6X -SQL Parquet MR: DataPageV1 3431 3483 73 3.1 327.2 2.3X -SQL Parquet MR: DataPageV2 3845 4043 280 2.7 366.7 2.1X -ParquetReader Vectorized: DataPageV1 1055 1056 2 9.9 100.6 7.6X -ParquetReader Vectorized: DataPageV2 2093 2119 37 5.0 199.6 3.8X -SQL ORC Vectorized 1129 1217 125 9.3 107.7 7.1X -SQL ORC MR 2931 2982 72 3.6 279.5 2.7X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 10818 10826 11 1.0 1031.6 1.0X +SQL Json 10812 10833 29 1.0 1031.2 1.0X +SQL Parquet Vectorized: DataPageV1 1301 1312 15 8.1 124.1 8.3X +SQL Parquet Vectorized: DataPageV2 1953 1982 42 5.4 186.2 5.5X +SQL Parquet MR: DataPageV1 3677 3680 5 2.9 350.6 2.9X +SQL Parquet MR: DataPageV2 3970 3972 2 2.6 378.6 2.7X +ParquetReader Vectorized: DataPageV1 1004 1016 16 10.4 95.8 10.8X +ParquetReader Vectorized: DataPageV2 1606 1622 22 6.5 153.2 6.7X +SQL ORC Vectorized 1160 1182 30 9.0 110.7 9.3X +SQL ORC MR 3266 3330 90 3.2 311.4 3.3X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 
String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6338 6508 240 1.7 604.4 1.0X -SQL Json 7149 7247 138 1.5 681.8 0.9X -SQL Parquet Vectorized: DataPageV1 937 984 45 11.2 89.3 6.8X -SQL Parquet Vectorized: DataPageV2 1582 1608 37 6.6 150.9 4.0X -SQL Parquet MR: DataPageV1 2525 2721 277 4.2 240.8 2.5X -SQL Parquet MR: DataPageV2 2969 2974 7 3.5 283.1 2.1X -ParquetReader Vectorized: DataPageV1 933 940 12 11.2 88.9 6.8X -ParquetReader Vectorized: DataPageV2 1535 1549 20 6.8 146.4 4.1X -SQL ORC Vectorized 1144 1204 86 9.2 109.1 5.5X -SQL ORC MR 2816 2822 8 3.7 268.6 2.3X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 7971 7981 15 1.3 760.2 1.0X +SQL Json 8266 8269 3 1.3 788.4 1.0X +SQL Parquet Vectorized: DataPageV1 1025 1036 15 10.2 97.8 7.8X +SQL Parquet Vectorized: DataPageV2 1432 1440 11 7.3 136.6 5.6X +SQL Parquet MR: DataPageV1 2792 2806 20 3.8 266.3 2.9X +SQL Parquet MR: DataPageV2 2958 2992 47 3.5 282.1 2.7X +ParquetReader Vectorized: DataPageV1 1010 1024 20 10.4 96.3 7.9X +ParquetReader Vectorized: DataPageV2 1331 1335 4 7.9 127.0 6.0X +SQL ORC Vectorized 1266 1271 6 8.3 120.8 6.3X +SQL ORC MR 3032 3089 81 3.5 289.2 2.6X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4443 4504 86 2.4 423.7 1.0X -SQL Json 4528 4563 49 2.3 431.8 1.0X -SQL Parquet Vectorized: DataPageV1 213 233 15 49.2 20.3 20.8X -SQL Parquet Vectorized: DataPageV2 267 294 22 39.3 25.4 16.7X -SQL Parquet MR: DataPageV1 1691 1700 13 6.2 161.2 2.6X -SQL Parquet MR: DataPageV2 1515 1565 70 6.9 144.5 
2.9X -ParquetReader Vectorized: DataPageV1 228 231 2 46.0 21.7 19.5X -ParquetReader Vectorized: DataPageV2 285 296 9 36.8 27.1 15.6X -SQL ORC Vectorized 369 425 82 28.4 35.2 12.1X -SQL ORC MR 1457 1463 9 7.2 138.9 3.0X +SQL CSV 5829 5833 5 1.8 555.9 1.0X +SQL Json 4966 4978 17 2.1 473.6 1.2X +SQL Parquet Vectorized: DataPageV1 236 244 7 44.5 22.5 24.7X +SQL Parquet Vectorized: DataPageV2 305 315 13 34.4 29.1 19.1X +SQL Parquet MR: DataPageV1 1777 1784 10 5.9 169.5 3.3X +SQL Parquet MR: DataPageV2 1635 1637 4 6.4 155.9 3.6X +ParquetReader Vectorized: DataPageV1 242 246 2 43.2 23.1 24.0X +ParquetReader Vectorized: DataPageV2 309 313 7 34.0 29.5 18.9X +SQL ORC Vectorized 391 419 53 26.8 37.3 14.9X +SQL ORC MR 1686 1687 1 6.2 160.8 3.5X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2374 2377 5 0.4 2264.2 1.0X -SQL Json 2693 2726 46 0.4 2568.5 0.9X -SQL Parquet Vectorized: DataPageV1 44 62 16 23.8 42.0 54.0X -SQL Parquet Vectorized: DataPageV2 63 81 21 16.5 60.5 37.5X -SQL Parquet MR: DataPageV1 173 198 27 6.1 164.6 13.8X -SQL Parquet MR: DataPageV2 161 193 30 6.5 153.5 14.8X -SQL ORC Vectorized 53 71 18 19.9 50.2 45.1X -SQL ORC MR 149 182 34 7.0 142.3 15.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 2301 2305 6 0.5 2194.0 1.0X +SQL Json 2874 2895 29 0.4 2741.1 0.8X +SQL Parquet Vectorized: DataPageV1 47 66 20 22.3 44.8 48.9X +SQL Parquet 
Vectorized: DataPageV2 74 90 25 14.2 70.5 31.1X +SQL Parquet MR: DataPageV1 198 219 26 5.3 189.0 11.6X +SQL Parquet MR: DataPageV2 178 207 45 5.9 170.1 12.9X +SQL ORC Vectorized 59 76 20 17.6 56.7 38.7X +SQL ORC MR 173 193 24 6.1 164.6 13.3X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5149 5193 62 0.2 4910.9 1.0X -SQL Json 10556 10891 475 0.1 10066.5 0.5X -SQL Parquet Vectorized: DataPageV1 64 96 28 16.3 61.3 80.1X -SQL Parquet Vectorized: DataPageV2 83 106 22 12.6 79.1 62.0X -SQL Parquet MR: DataPageV1 196 232 25 5.3 187.4 26.2X -SQL Parquet MR: DataPageV2 184 221 28 5.7 175.1 28.0X -SQL ORC Vectorized 74 98 31 14.1 70.8 69.3X -SQL ORC MR 182 214 38 5.8 173.9 28.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure +SQL CSV 5418 5425 9 0.2 5167.2 1.0X +SQL Json 11463 11574 156 0.1 10932.3 0.5X +SQL Parquet Vectorized: DataPageV1 66 103 28 15.8 63.4 81.5X +SQL Parquet Vectorized: DataPageV2 90 115 27 11.7 85.5 60.4X +SQL Parquet MR: DataPageV1 218 234 23 4.8 208.3 24.8X +SQL Parquet MR: DataPageV2 199 225 29 5.3 190.1 27.2X +SQL ORC Vectorized 76 106 31 13.7 72.8 71.0X +SQL ORC MR 193 216 28 5.4 184.2 28.0X + +OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9077 9107 43 0.1 8656.2 1.0X -SQL Json 20131 20886 1067 0.1 19198.5 0.5X -SQL Parquet Vectorized: DataPageV1 93 124 26 11.3 88.8 97.5X -SQL Parquet Vectorized: DataPageV2 103 128 29 10.2 98.5 87.9X 
-SQL Parquet MR: DataPageV1 218 257 35 4.8 207.6 41.7X -SQL Parquet MR: DataPageV2 213 255 29 4.9 202.7 42.7X -SQL ORC Vectorized 80 95 20 13.0 76.6 112.9X -SQL ORC MR 187 207 20 5.6 178.0 48.6X +SQL CSV 9430 9430 0 0.1 8993.3 1.0X +SQL Json 21268 21347 111 0.0 20283.1 0.4X +SQL Parquet Vectorized: DataPageV1 97 124 24 10.9 92.1 97.6X +SQL Parquet Vectorized: DataPageV2 119 136 19 8.8 113.6 79.2X +SQL Parquet MR: DataPageV1 254 285 35 4.1 242.1 37.1X +SQL Parquet MR: DataPageV2 231 260 30 4.5 220.0 40.9X +SQL ORC Vectorized 95 119 31 11.1 90.4 99.5X +SQL ORC MR 214 219 5 4.9 203.6 44.2X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt index ecba57c0c3cc3..8ff176457af10 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15972 16369 561 1.0 1015.5 1.0X -SQL Json 9543 9580 54 1.6 606.7 1.7X -SQL Parquet Vectorized: DataPageV1 115 144 19 136.3 7.3 138.4X -SQL Parquet Vectorized: DataPageV2 95 109 15 165.1 6.1 167.6X -SQL Parquet MR: DataPageV1 2098 2119 30 7.5 133.4 7.6X -SQL Parquet MR: DataPageV2 2007 2012 6 7.8 127.6 8.0X -SQL ORC Vectorized 211 225 16 74.5 13.4 75.7X -SQL ORC MR 2077 2103 36 7.6 132.1 7.7X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU 
E5-2673 v3 @ 2.40GHz +SQL CSV 9610 10067 646 1.6 611.0 1.0X +SQL Json 8316 8410 133 1.9 528.7 1.2X +SQL Parquet Vectorized: DataPageV1 123 145 10 127.7 7.8 78.0X +SQL Parquet Vectorized: DataPageV2 93 108 12 170.0 5.9 103.8X +SQL Parquet MR: DataPageV1 1766 1768 2 8.9 112.3 5.4X +SQL Parquet MR: DataPageV2 1540 1543 3 10.2 97.9 6.2X +SQL ORC Vectorized 175 182 6 89.6 11.2 54.8X +SQL ORC MR 1517 1533 22 10.4 96.5 6.3X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 43 47 2 369.4 2.7 1.0X -ParquetReader Vectorized: DataPageV2 30 34 2 518.5 1.9 1.4X -ParquetReader Vectorized -> Row: DataPageV1 47 50 2 333.6 3.0 0.9X -ParquetReader Vectorized -> Row: DataPageV2 31 35 2 504.8 2.0 1.4X +ParquetReader Vectorized: DataPageV1 61 63 2 256.3 3.9 1.0X +ParquetReader Vectorized: DataPageV2 44 45 2 356.3 2.8 1.4X +ParquetReader Vectorized -> Row: DataPageV1 51 51 1 311.3 3.2 1.2X +ParquetReader Vectorized -> Row: DataPageV2 32 33 2 492.4 2.0 1.9X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17468 17543 105 0.9 1110.6 1.0X -SQL Json 11059 11065 8 1.4 703.1 1.6X -SQL Parquet Vectorized: DataPageV1 128 142 15 123.1 8.1 136.7X -SQL Parquet Vectorized: DataPageV2 126 141 8 125.2 8.0 139.1X -SQL Parquet MR: DataPageV1 2305 2331 36 6.8 146.5 7.6X 
-SQL Parquet MR: DataPageV2 2075 2095 28 7.6 131.9 8.4X -SQL ORC Vectorized 172 191 16 91.5 10.9 101.6X -SQL ORC MR 1777 1796 26 8.8 113.0 9.8X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 14866 14885 26 1.1 945.2 1.0X +SQL Json 9585 9586 3 1.6 609.4 1.6X +SQL Parquet Vectorized: DataPageV1 119 131 12 132.4 7.6 125.2X +SQL Parquet Vectorized: DataPageV2 119 125 5 132.0 7.6 124.7X +SQL Parquet MR: DataPageV1 1954 2025 101 8.0 124.2 7.6X +SQL Parquet MR: DataPageV2 1800 1824 35 8.7 114.4 8.3X +SQL ORC Vectorized 169 176 6 93.0 10.8 87.9X +SQL ORC MR 1432 1467 50 11.0 91.0 10.4X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 72 77 5 219.4 4.6 1.0X -ParquetReader Vectorized: DataPageV2 72 77 3 217.9 4.6 1.0X -ParquetReader Vectorized -> Row: DataPageV1 76 83 6 206.6 4.8 0.9X -ParquetReader Vectorized -> Row: DataPageV2 75 80 3 210.3 4.8 1.0X +ParquetReader Vectorized: DataPageV1 118 120 2 133.0 7.5 1.0X +ParquetReader Vectorized: DataPageV2 119 120 2 132.6 7.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 72 73 2 218.1 4.6 1.6X +ParquetReader Vectorized -> Row: DataPageV2 72 74 2 217.7 4.6 1.6X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18330 18332 3 
0.9 1165.4 1.0X -SQL Json 11383 11429 66 1.4 723.7 1.6X -SQL Parquet Vectorized: DataPageV1 179 197 13 88.0 11.4 102.5X -SQL Parquet Vectorized: DataPageV2 239 263 18 65.7 15.2 76.6X -SQL Parquet MR: DataPageV1 2552 2567 21 6.2 162.3 7.2X -SQL Parquet MR: DataPageV2 2389 2436 67 6.6 151.9 7.7X -SQL ORC Vectorized 246 263 14 64.0 15.6 74.6X -SQL ORC MR 1965 2002 52 8.0 124.9 9.3X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 14601 14699 139 1.1 928.3 1.0X +SQL Json 9446 9517 101 1.7 600.5 1.5X +SQL Parquet Vectorized: DataPageV1 156 168 15 101.1 9.9 93.8X +SQL Parquet Vectorized: DataPageV2 197 213 15 79.6 12.6 73.9X +SQL Parquet MR: DataPageV1 2113 2130 23 7.4 134.4 6.9X +SQL Parquet MR: DataPageV2 1739 1784 64 9.0 110.5 8.4X +SQL ORC Vectorized 192 205 10 81.9 12.2 76.0X +SQL ORC MR 1518 1588 100 10.4 96.5 9.6X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 253 263 11 62.2 16.1 1.0X -ParquetReader Vectorized: DataPageV2 306 317 7 51.4 19.4 0.8X -ParquetReader Vectorized -> Row: DataPageV1 246 250 4 64.0 15.6 1.0X -ParquetReader Vectorized -> Row: DataPageV2 316 321 4 49.8 20.1 0.8X +ParquetReader Vectorized: DataPageV1 215 221 6 73.2 13.7 1.0X +ParquetReader Vectorized: DataPageV2 269 278 8 58.5 17.1 0.8X +ParquetReader Vectorized -> Row: DataPageV1 206 208 2 76.2 13.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 244 262 10 64.4 15.5 0.9X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU 
@ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19573 19822 352 0.8 1244.4 1.0X -SQL Json 12141 12217 107 1.3 771.9 1.6X -SQL Parquet Vectorized: DataPageV1 192 222 28 81.8 12.2 101.8X -SQL Parquet Vectorized: DataPageV2 345 373 24 45.6 21.9 56.7X -SQL Parquet MR: DataPageV1 2736 2741 7 5.7 173.9 7.2X -SQL Parquet MR: DataPageV2 2467 2536 97 6.4 156.9 7.9X -SQL ORC Vectorized 332 356 20 47.4 21.1 59.0X -SQL ORC MR 2188 2193 7 7.2 139.1 8.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 15886 16086 282 1.0 1010.0 1.0X +SQL Json 9872 9880 12 1.6 627.6 1.6X +SQL Parquet Vectorized: DataPageV1 174 195 22 90.4 11.1 91.3X +SQL Parquet Vectorized: DataPageV2 393 409 16 40.0 25.0 40.4X +SQL Parquet MR: DataPageV1 1953 2064 157 8.1 124.2 8.1X +SQL Parquet MR: DataPageV2 2215 2231 23 7.1 140.8 7.2X +SQL ORC Vectorized 280 314 22 56.1 17.8 56.7X +SQL ORC MR 1681 1706 35 9.4 106.9 9.5X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 291 295 4 54.1 18.5 1.0X -ParquetReader Vectorized: DataPageV2 493 518 39 31.9 31.3 0.6X -ParquetReader Vectorized -> Row: DataPageV1 300 306 8 52.5 19.1 1.0X -ParquetReader Vectorized -> Row: DataPageV2 471 483 11 33.4 30.0 0.6X +ParquetReader Vectorized: DataPageV1 253 263 8 62.1 16.1 1.0X +ParquetReader Vectorized: DataPageV2 450 461 15 34.9 28.6 0.6X +ParquetReader Vectorized -> Row: DataPageV1 241 253 12 65.2 15.3 1.1X +ParquetReader Vectorized -> Row: 
DataPageV2 437 448 14 36.0 27.8 0.6X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 24692 24718 37 0.6 1569.9 1.0X -SQL Json 14839 14875 50 1.1 943.5 1.7X -SQL Parquet Vectorized: DataPageV1 295 316 29 53.3 18.7 83.7X -SQL Parquet Vectorized: DataPageV2 477 505 24 32.9 30.4 51.7X -SQL Parquet MR: DataPageV1 2841 2981 197 5.5 180.6 8.7X -SQL Parquet MR: DataPageV2 2616 2632 23 6.0 166.3 9.4X -SQL ORC Vectorized 388 403 11 40.5 24.7 63.6X -SQL ORC MR 2274 2372 138 6.9 144.6 10.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 20641 20744 145 0.8 1312.3 1.0X +SQL Json 13055 13122 95 1.2 830.0 1.6X +SQL Parquet Vectorized: DataPageV1 246 267 16 63.8 15.7 83.8X +SQL Parquet Vectorized: DataPageV2 513 532 16 30.7 32.6 40.2X +SQL Parquet MR: DataPageV1 2354 2387 47 6.7 149.7 8.8X +SQL Parquet MR: DataPageV2 2118 2148 43 7.4 134.6 9.7X +SQL ORC Vectorized 418 437 17 37.6 26.6 49.4X +SQL ORC MR 1808 1852 61 8.7 115.0 11.4X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 376 387 9 41.9 23.9 1.0X -ParquetReader Vectorized: DataPageV2 585 591 6 26.9 37.2 0.6X -ParquetReader Vectorized -> Row: DataPageV1 377 387 9 41.8 23.9 1.0X -ParquetReader Vectorized -> Row: DataPageV2 
576 586 10 27.3 36.6 0.7X +ParquetReader Vectorized: DataPageV1 306 315 5 51.5 19.4 1.0X +ParquetReader Vectorized: DataPageV2 584 591 11 26.9 37.1 0.5X +ParquetReader Vectorized -> Row: DataPageV1 288 299 14 54.6 18.3 1.1X +ParquetReader Vectorized -> Row: DataPageV2 549 557 8 28.6 34.9 0.6X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20566 20651 119 0.8 1307.6 1.0X -SQL Json 14337 14409 101 1.1 911.5 1.4X -SQL Parquet Vectorized: DataPageV1 154 167 8 101.9 9.8 133.2X -SQL Parquet Vectorized: DataPageV2 157 178 14 99.9 10.0 130.6X -SQL Parquet MR: DataPageV1 2730 2730 1 5.8 173.5 7.5X -SQL Parquet MR: DataPageV2 2459 2491 45 6.4 156.3 8.4X -SQL ORC Vectorized 479 501 15 32.9 30.4 43.0X -SQL ORC MR 2293 2343 71 6.9 145.8 9.0X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 17024 17292 378 0.9 1082.4 1.0X +SQL Json 11724 11904 255 1.3 745.4 1.5X +SQL Parquet Vectorized: DataPageV1 174 186 11 90.6 11.0 98.1X +SQL Parquet Vectorized: DataPageV2 173 189 14 90.9 11.0 98.4X +SQL Parquet MR: DataPageV1 1932 2037 148 8.1 122.9 8.8X +SQL Parquet MR: DataPageV2 1947 1976 41 8.1 123.8 8.7X +SQL ORC Vectorized 432 459 36 36.4 27.5 39.4X +SQL ORC MR 1984 1985 1 7.9 126.1 8.6X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 272 283 9 57.9 17.3 1.0X -ParquetReader Vectorized: DataPageV2 250 288 27 62.9 15.9 1.1X -ParquetReader Vectorized -> Row: DataPageV1 291 301 6 54.1 18.5 0.9X -ParquetReader Vectorized -> Row: DataPageV2 293 305 14 53.6 18.6 0.9X +ParquetReader Vectorized: DataPageV1 257 259 2 61.2 16.3 1.0X +ParquetReader Vectorized: DataPageV2 239 254 10 65.8 15.2 1.1X +ParquetReader Vectorized -> Row: DataPageV1 259 260 1 60.8 16.4 1.0X +ParquetReader Vectorized -> Row: DataPageV2 258 262 6 61.0 16.4 1.0X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 25753 25874 171 0.6 1637.3 1.0X -SQL Json 19097 19391 416 0.8 1214.2 1.3X -SQL Parquet Vectorized: DataPageV1 273 288 11 57.6 17.4 94.3X -SQL Parquet Vectorized: DataPageV2 240 277 25 65.5 15.3 107.3X -SQL Parquet MR: DataPageV1 2969 3042 103 5.3 188.8 8.7X -SQL Parquet MR: DataPageV2 2692 2747 78 5.8 171.1 9.6X -SQL ORC Vectorized 601 626 20 26.2 38.2 42.8X -SQL ORC MR 2458 2467 13 6.4 156.3 10.5X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 22592 22594 4 0.7 1436.3 1.0X +SQL Json 16252 16271 26 1.0 1033.3 1.4X +SQL Parquet Vectorized: DataPageV1 247 271 22 63.6 15.7 91.3X +SQL Parquet Vectorized: DataPageV2 252 266 14 62.4 16.0 89.6X +SQL Parquet MR: DataPageV1 2337 2352 21 6.7 148.6 9.7X +SQL Parquet MR: DataPageV2 2187 2223 50 7.2 139.1 10.3X +SQL ORC Vectorized 489 526 25 32.2 31.1 46.2X +SQL 
ORC MR 1816 1892 107 8.7 115.5 12.4X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 354 363 7 44.4 22.5 1.0X -ParquetReader Vectorized: DataPageV2 345 359 12 45.5 22.0 1.0X -ParquetReader Vectorized -> Row: DataPageV1 337 345 8 46.7 21.4 1.1X -ParquetReader Vectorized -> Row: DataPageV2 335 364 21 46.9 21.3 1.1X +ParquetReader Vectorized: DataPageV1 291 304 8 54.0 18.5 1.0X +ParquetReader Vectorized: DataPageV2 298 309 7 52.9 18.9 1.0X +ParquetReader Vectorized -> Row: DataPageV1 330 338 16 47.7 21.0 0.9X +ParquetReader Vectorized -> Row: DataPageV2 331 338 12 47.5 21.1 0.9X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18074 18101 37 0.6 1723.7 1.0X -SQL Json 13211 13214 5 0.8 1259.9 1.4X -SQL Parquet Vectorized: DataPageV1 2249 2286 53 4.7 214.5 8.0X -SQL Parquet Vectorized: DataPageV2 2804 2818 20 3.7 267.4 6.4X -SQL Parquet MR: DataPageV1 4708 4779 100 2.2 449.0 3.8X -SQL Parquet MR: DataPageV2 4868 5046 251 2.2 464.3 3.7X -SQL ORC Vectorized 2145 2160 20 4.9 204.6 8.4X -SQL ORC MR 4180 4308 182 2.5 398.6 4.3X +SQL CSV 14365 
14780 587 0.7 1369.9 1.0X +SQL Json 10718 10772 76 1.0 1022.2 1.3X +SQL Parquet Vectorized: DataPageV1 1932 1988 80 5.4 184.2 7.4X +SQL Parquet Vectorized: DataPageV2 2298 2317 27 4.6 219.2 6.2X +SQL Parquet MR: DataPageV1 3829 3957 181 2.7 365.1 3.8X +SQL Parquet MR: DataPageV2 4176 4208 46 2.5 398.3 3.4X +SQL ORC Vectorized 2026 2046 28 5.2 193.2 7.1X +SQL ORC MR 3566 3580 21 2.9 340.0 4.0X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11320 11376 78 0.9 1079.6 1.0X -SQL Json 7593 7664 101 1.4 724.1 1.5X -SQL Parquet Vectorized: DataPageV1 633 639 9 16.6 60.3 17.9X -SQL Parquet Vectorized: DataPageV2 621 644 20 16.9 59.2 18.2X -SQL Parquet MR: DataPageV1 2111 2157 65 5.0 201.3 5.4X -SQL Parquet MR: DataPageV2 2018 2064 65 5.2 192.4 5.6X -SQL ORC Vectorized 505 540 36 20.8 48.2 22.4X -SQL ORC MR 2302 2360 82 4.6 219.5 4.9X +SQL CSV 9372 9373 1 1.1 893.8 1.0X +SQL Json 6862 6865 4 1.5 654.4 1.4X +SQL Parquet Vectorized: DataPageV1 606 613 8 17.3 57.8 15.5X +SQL Parquet Vectorized: DataPageV2 611 615 3 17.2 58.3 15.3X +SQL Parquet MR: DataPageV1 1713 1721 11 6.1 163.3 5.5X +SQL Parquet MR: DataPageV2 1721 1724 4 6.1 164.1 5.4X +SQL ORC Vectorized 467 469 2 22.5 44.5 20.1X +SQL ORC MR 1816 1818 2 5.8 173.2 5.2X ================================================================================================ Partitioned Table Scan 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 24867 25261 556 0.6 1581.0 1.0X -Data column - Json 13937 13987 70 1.1 886.1 1.8X -Data column - Parquet Vectorized: DataPageV1 252 264 8 62.3 16.0 98.5X -Data column - Parquet Vectorized: DataPageV2 547 560 13 28.8 34.7 45.5X -Data column - Parquet MR: DataPageV1 3492 3509 25 4.5 222.0 7.1X -Data column - Parquet MR: DataPageV2 3148 3208 84 5.0 200.2 7.9X -Data column - ORC Vectorized 493 512 21 31.9 31.3 50.5X -Data column - ORC MR 2925 2943 26 5.4 185.9 8.5X -Partition column - CSV 7847 7851 5 2.0 498.9 3.2X -Partition column - Json 11759 11908 210 1.3 747.6 2.1X -Partition column - Parquet Vectorized: DataPageV1 60 67 7 262.3 3.8 414.7X -Partition column - Parquet Vectorized: DataPageV2 57 65 9 274.2 3.6 433.5X -Partition column - Parquet MR: DataPageV1 1762 1768 8 8.9 112.1 14.1X -Partition column - Parquet MR: DataPageV2 1742 1783 59 9.0 110.7 14.3X -Partition column - ORC Vectorized 59 71 7 265.6 3.8 419.9X -Partition column - ORC MR 1743 1764 29 9.0 110.8 14.3X -Both columns - CSV 25859 25924 92 0.6 1644.1 1.0X -Both columns - Json 14693 14764 101 1.1 934.2 1.7X -Both columns - Parquet Vectorized: DataPageV1 341 395 66 46.2 21.7 73.0X -Both columns - Parquet Vectorized: DataPageV2 624 643 13 25.2 39.7 39.9X -Both columns - Parquet MR: DataPageV1 3541 3611 99 4.4 225.2 7.0X -Both columns - Parquet MR: DataPageV2 3279 3301 32 4.8 208.4 7.6X -Both columns - ORC Vectorized 434 483 40 36.2 27.6 57.3X -Both columns - 
ORC MR 2946 2964 26 5.3 187.3 8.4X +Data column - CSV 21799 22053 360 0.7 1385.9 1.0X +Data column - Json 12978 12985 10 1.2 825.1 1.7X +Data column - Parquet Vectorized: DataPageV1 261 277 15 60.4 16.6 83.7X +Data column - Parquet Vectorized: DataPageV2 601 647 42 26.2 38.2 36.3X +Data column - Parquet MR: DataPageV1 2796 2798 2 5.6 177.8 7.8X +Data column - Parquet MR: DataPageV2 2595 2626 43 6.1 165.0 8.4X +Data column - ORC Vectorized 428 449 25 36.8 27.2 50.9X +Data column - ORC MR 2162 2274 159 7.3 137.5 10.1X +Partition column - CSV 5804 5922 167 2.7 369.0 3.8X +Partition column - Json 10410 10455 64 1.5 661.8 2.1X +Partition column - Parquet Vectorized: DataPageV1 56 60 6 280.9 3.6 389.3X +Partition column - Parquet Vectorized: DataPageV2 55 59 5 286.5 3.5 397.1X +Partition column - Parquet MR: DataPageV1 1357 1357 1 11.6 86.3 16.1X +Partition column - Parquet MR: DataPageV2 1339 1339 0 11.7 85.1 16.3X +Partition column - ORC Vectorized 57 61 5 276.3 3.6 382.9X +Partition column - ORC MR 1346 1351 7 11.7 85.6 16.2X +Both columns - CSV 20812 21349 759 0.8 1323.2 1.0X +Both columns - Json 13061 13372 440 1.2 830.4 1.7X +Both columns - Parquet Vectorized: DataPageV1 265 275 6 59.3 16.9 82.1X +Both columns - Parquet Vectorized: DataPageV2 619 637 20 25.4 39.4 35.2X +Both columns - Parquet MR: DataPageV1 2827 2830 4 5.6 179.8 7.7X +Both columns - Parquet MR: DataPageV2 2593 2603 14 6.1 164.8 8.4X +Both columns - ORC Vectorized 391 432 37 40.2 24.9 55.7X +Both columns - ORC MR 2438 2455 25 6.5 155.0 8.9X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan 
(0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13698 13783 121 0.8 1306.3 1.0X -SQL Json 11030 11144 161 1.0 1051.9 1.2X -SQL Parquet Vectorized: DataPageV1 1695 1699 7 6.2 161.6 8.1X -SQL Parquet Vectorized: DataPageV2 2740 2744 5 3.8 261.3 5.0X -SQL Parquet MR: DataPageV1 4547 4594 66 2.3 433.7 3.0X -SQL Parquet MR: DataPageV2 5382 5455 103 1.9 513.3 2.5X -ParquetReader Vectorized: DataPageV1 1238 1238 0 8.5 118.0 11.1X -ParquetReader Vectorized: DataPageV2 2312 2325 19 4.5 220.5 5.9X -SQL ORC Vectorized 1134 1147 18 9.2 108.2 12.1X -SQL ORC MR 3966 4015 69 2.6 378.2 3.5X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 10697 10736 56 1.0 1020.1 1.0X +SQL Json 9722 9963 341 1.1 927.2 1.1X +SQL Parquet Vectorized: DataPageV1 1337 1342 6 7.8 127.6 8.0X +SQL Parquet Vectorized: DataPageV2 1731 1757 38 6.1 165.1 6.2X +SQL Parquet MR: DataPageV1 3581 3584 4 2.9 341.5 3.0X +SQL Parquet MR: DataPageV2 3996 4001 7 2.6 381.1 2.7X +ParquetReader Vectorized: DataPageV1 1006 1015 13 10.4 96.0 10.6X +ParquetReader Vectorized: DataPageV2 1476 1477 2 7.1 140.7 7.2X +SQL ORC Vectorized 957 1042 120 11.0 91.3 11.2X +SQL ORC MR 3060 3068 11 3.4 291.8 3.5X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10613 10658 64 1.0 1012.1 1.0X -SQL Json 8973 8996 33 1.2 855.7 1.2X -SQL Parquet Vectorized: DataPageV1 1208 1221 18 8.7 115.2 8.8X -SQL Parquet Vectorized: DataPageV2 1949 1950 1 5.4 185.9 5.4X -SQL Parquet MR: DataPageV1 3701 3716 21 2.8 353.0 2.9X -SQL Parquet 
MR: DataPageV2 4150 4192 60 2.5 395.8 2.6X -ParquetReader Vectorized: DataPageV1 1191 1192 1 8.8 113.6 8.9X -ParquetReader Vectorized: DataPageV2 1874 1917 61 5.6 178.7 5.7X -SQL ORC Vectorized 1338 1365 38 7.8 127.6 7.9X -SQL ORC MR 3659 3674 21 2.9 349.0 2.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 7299 7300 1 1.4 696.1 1.0X +SQL Json 7453 7659 292 1.4 710.8 1.0X +SQL Parquet Vectorized: DataPageV1 896 916 32 11.7 85.4 8.1X +SQL Parquet Vectorized: DataPageV2 1282 1283 1 8.2 122.3 5.7X +SQL Parquet MR: DataPageV1 2586 2678 130 4.1 246.6 2.8X +SQL Parquet MR: DataPageV2 3061 3066 6 3.4 291.9 2.4X +ParquetReader Vectorized: DataPageV1 913 915 3 11.5 87.0 8.0X +ParquetReader Vectorized: DataPageV2 1181 1183 3 8.9 112.6 6.2X +SQL ORC Vectorized 1102 1111 13 9.5 105.1 6.6X +SQL ORC MR 2916 3002 121 3.6 278.1 2.5X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8714 8809 134 1.2 831.0 1.0X -SQL Json 5801 5819 25 1.8 553.2 1.5X -SQL Parquet Vectorized: DataPageV1 297 316 11 35.3 28.3 29.3X -SQL Parquet Vectorized: DataPageV2 363 382 12 28.9 34.6 24.0X -SQL Parquet MR: DataPageV1 2350 2366 22 4.5 224.1 3.7X -SQL Parquet MR: DataPageV2 2132 2183 73 4.9 203.3 4.1X -ParquetReader Vectorized: DataPageV1 296 310 13 35.4 28.2 29.4X -ParquetReader Vectorized: DataPageV2 368 372 3 28.5 35.1 23.7X -SQL ORC Vectorized 474 487 10 22.1 45.2 18.4X -SQL ORC MR 2025 2031 9 5.2 193.1 4.3X +SQL CSV 4615 4619 6 2.3 440.1 1.0X +SQL Json 4926 4927 1 2.1 469.8 0.9X +SQL Parquet Vectorized: DataPageV1 240 246 5 43.8 22.9 19.3X +SQL Parquet Vectorized: DataPageV2 287 295 4 36.5 27.4 16.1X +SQL Parquet MR: DataPageV1 1774 
1781 10 5.9 169.2 2.6X +SQL Parquet MR: DataPageV2 1772 1773 1 5.9 169.0 2.6X +ParquetReader Vectorized: DataPageV1 238 240 2 44.0 22.7 19.4X +ParquetReader Vectorized: DataPageV2 285 288 3 36.8 27.2 16.2X +SQL ORC Vectorized 382 392 6 27.4 36.5 12.1X +SQL ORC MR 1616 1617 2 6.5 154.1 2.9X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2677 2687 14 0.4 2553.2 1.0X -SQL Json 3581 3588 10 0.3 3414.8 0.7X -SQL Parquet Vectorized: DataPageV1 52 59 7 20.2 49.6 51.5X -SQL Parquet Vectorized: DataPageV2 68 75 7 15.4 65.0 39.3X -SQL Parquet MR: DataPageV1 245 257 9 4.3 233.6 10.9X -SQL Parquet MR: DataPageV2 224 237 8 4.7 213.7 11.9X -SQL ORC Vectorized 64 70 5 16.3 61.3 41.7X -SQL ORC MR 208 216 8 5.0 198.2 12.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 2051 2052 2 0.5 1956.1 1.0X +SQL Json 3230 3232 3 0.3 3080.6 0.6X +SQL Parquet Vectorized: DataPageV1 45 50 7 23.2 43.2 45.3X +SQL Parquet Vectorized: DataPageV2 67 72 8 15.6 64.1 30.5X +SQL Parquet MR: DataPageV1 191 198 8 5.5 181.9 10.8X +SQL Parquet MR: DataPageV2 176 181 6 6.0 167.7 11.7X +SQL ORC Vectorized 55 60 6 19.0 52.7 37.1X +SQL ORC MR 164 168 4 6.4 156.1 12.5X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 
columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5753 5771 25 0.2 5486.7 1.0X -SQL Json 13801 13851 71 0.1 13161.9 0.4X -SQL Parquet Vectorized: DataPageV1 75 83 9 14.1 71.1 77.2X -SQL Parquet Vectorized: DataPageV2 84 93 7 12.4 80.6 68.1X -SQL Parquet MR: DataPageV1 269 280 7 3.9 256.5 21.4X -SQL Parquet MR: DataPageV2 251 258 8 4.2 238.9 23.0X -SQL ORC Vectorized 82 88 6 12.8 78.3 70.1X -SQL ORC MR 223 239 8 4.7 213.0 25.8X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +SQL CSV 4530 4530 0 0.2 4320.0 1.0X +SQL Json 12530 12536 9 0.1 11949.2 0.4X +SQL Parquet Vectorized: DataPageV1 60 65 6 17.4 57.6 75.0X +SQL Parquet Vectorized: DataPageV2 83 91 8 12.6 79.1 54.6X +SQL Parquet MR: DataPageV1 211 216 7 5.0 201.2 21.5X +SQL Parquet MR: DataPageV2 195 204 12 5.4 186.0 23.2X +SQL ORC Vectorized 70 75 5 14.9 67.1 64.4X +SQL ORC MR 182 191 11 5.8 173.5 24.9X + +OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1028-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9487 9503 24 0.1 9047.1 1.0X -SQL Json 26109 26240 186 0.0 24899.2 0.4X -SQL Parquet Vectorized: DataPageV1 100 110 10 10.4 95.8 94.5X -SQL Parquet Vectorized: DataPageV2 113 119 6 9.3 107.3 84.3X -SQL Parquet MR: DataPageV1 280 296 11 3.7 267.2 33.9X -SQL Parquet MR: DataPageV2 281 321 68 3.7 268.0 33.8X -SQL ORC Vectorized 92 101 8 11.4 87.5 103.4X -SQL ORC MR 228 245 10 4.6 217.7 41.6X +SQL CSV 7758 7763 7 0.1 7398.8 1.0X +SQL Json 24530 24546 23 0.0 23393.2 0.3X +SQL Parquet Vectorized: DataPageV1 91 96 6 11.5 87.1 84.9X +SQL Parquet Vectorized: 
DataPageV2 113 118 6 9.2 108.1 68.4X +SQL Parquet MR: DataPageV1 246 254 8 4.3 234.2 31.6X +SQL Parquet MR: DataPageV2 229 235 6 4.6 218.7 33.8X +SQL ORC Vectorized 88 92 6 11.9 83.8 88.3X +SQL ORC MR 205 214 9 5.1 195.2 37.9X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index 6a2b6bfb4a0a8..1a7ebe51057be 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,322 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11570 12144 812 1.4 735.6 1.0X -SQL Json 7542 7568 37 2.1 479.5 1.5X -SQL Parquet Vectorized: DataPageV1 129 144 16 121.9 8.2 89.7X -SQL Parquet Vectorized: DataPageV2 92 106 20 170.3 5.9 125.2X -SQL Parquet MR: DataPageV1 1416 1419 3 11.1 90.0 8.2X -SQL Parquet MR: DataPageV2 1281 1359 110 12.3 81.4 9.0X -SQL ORC Vectorized 161 176 10 97.4 10.3 71.6X -SQL ORC MR 1525 1545 29 10.3 96.9 7.6X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 12972 13210 337 1.2 824.8 1.0X +SQL Json 7440 7634 275 2.1 473.0 1.7X +SQL Parquet Vectorized: DataPageV1 125 137 10 125.8 8.0 103.7X +SQL Parquet Vectorized: DataPageV2 93 103 20 168.4 5.9 138.9X +SQL Parquet MR: DataPageV1 1621 1657 52 9.7 103.0 8.0X +SQL Parquet MR: DataPageV2 1396 1420 34 11.3 88.7 9.3X +SQL ORC Vectorized 178 186 16 88.5 11.3 73.0X +SQL ORC MR 1501 1503 4 10.5 95.4 8.6X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 
5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 111 118 6 142.3 7.0 1.0X -ParquetReader Vectorized: DataPageV2 116 117 2 135.7 7.4 1.0X -ParquetReader Vectorized -> Row: DataPageV1 48 49 1 324.9 3.1 2.3X -ParquetReader Vectorized -> Row: DataPageV2 39 39 1 405.8 2.5 2.9X +ParquetReader Vectorized: DataPageV1 132 134 4 119.3 8.4 1.0X +ParquetReader Vectorized: DataPageV2 115 117 3 136.7 7.3 1.1X +ParquetReader Vectorized -> Row: DataPageV1 57 58 1 275.1 3.6 2.3X +ParquetReader Vectorized -> Row: DataPageV2 41 41 1 387.9 2.6 3.3X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13807 14535 1030 1.1 877.8 1.0X -SQL Json 8079 8094 21 1.9 513.6 1.7X -SQL Parquet Vectorized: DataPageV1 139 152 12 113.0 8.9 99.2X -SQL Parquet Vectorized: DataPageV2 140 147 5 112.5 8.9 98.7X -SQL Parquet MR: DataPageV1 1637 1741 148 9.6 104.1 8.4X -SQL Parquet MR: DataPageV2 1522 1636 161 10.3 96.8 9.1X -SQL ORC Vectorized 147 160 10 106.9 9.4 93.8X -SQL ORC MR 1542 1545 4 10.2 98.1 9.0X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 15808 15867 83 1.0 1005.0 1.0X +SQL Json 9119 9174 78 1.7 579.8 1.7X +SQL Parquet Vectorized: DataPageV1 157 163 7 100.2 10.0 100.7X +SQL Parquet Vectorized: DataPageV2 156 161 5 100.6 9.9 101.1X +SQL Parquet MR: DataPageV1 1846 1871 36 8.5 117.4 8.6X +SQL Parquet MR: DataPageV2 1702 1707 7 
9.2 108.2 9.3X +SQL ORC Vectorized 130 134 2 120.7 8.3 121.3X +SQL ORC MR 1536 1542 9 10.2 97.7 10.3X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 166 171 8 94.7 10.6 1.0X -ParquetReader Vectorized: DataPageV2 166 169 4 94.7 10.6 1.0X -ParquetReader Vectorized -> Row: DataPageV1 156 157 2 100.7 9.9 1.1X -ParquetReader Vectorized -> Row: DataPageV2 156 157 2 100.7 9.9 1.1X +ParquetReader Vectorized: DataPageV1 198 202 5 79.3 12.6 1.0X +ParquetReader Vectorized: DataPageV2 197 199 3 79.8 12.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 188 190 3 83.4 12.0 1.1X +ParquetReader Vectorized -> Row: DataPageV2 188 190 3 83.5 12.0 1.1X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15327 15421 133 1.0 974.5 1.0X -SQL Json 8564 8799 332 1.8 544.5 1.8X -SQL Parquet Vectorized: DataPageV1 202 219 11 77.8 12.8 75.8X -SQL Parquet Vectorized: DataPageV2 203 210 8 77.7 12.9 75.7X -SQL Parquet MR: DataPageV1 1874 2004 183 8.4 119.2 8.2X -SQL Parquet MR: DataPageV2 1606 1709 146 9.8 102.1 9.5X -SQL ORC Vectorized 167 179 10 94.1 10.6 91.7X -SQL ORC MR 1404 1408 6 11.2 89.3 10.9X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 16474 16493 27 1.0 1047.4 1.0X +SQL Json 9477 9478 1 1.7 602.6 1.7X +SQL Parquet Vectorized: DataPageV1 211 216 7 74.4 13.4 
77.9X +SQL Parquet Vectorized: DataPageV2 215 221 5 73.0 13.7 76.5X +SQL Parquet MR: DataPageV1 2114 2133 28 7.4 134.4 7.8X +SQL Parquet MR: DataPageV2 1792 1808 22 8.8 113.9 9.2X +SQL ORC Vectorized 179 182 4 88.0 11.4 92.2X +SQL ORC MR 1586 1588 2 9.9 100.8 10.4X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 222 236 13 70.7 14.1 1.0X -ParquetReader Vectorized: DataPageV2 259 268 14 60.8 16.5 0.9X -ParquetReader Vectorized -> Row: DataPageV1 228 248 11 68.9 14.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 264 293 13 59.5 16.8 0.8X +ParquetReader Vectorized: DataPageV1 254 257 5 62.0 16.1 1.0X +ParquetReader Vectorized: DataPageV2 299 302 4 52.6 19.0 0.8X +ParquetReader Vectorized -> Row: DataPageV1 236 238 4 66.7 15.0 1.1X +ParquetReader Vectorized -> Row: DataPageV2 281 283 4 56.0 17.9 0.9X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17479 17651 243 0.9 1111.3 1.0X -SQL Json 9565 9582 25 1.6 608.1 1.8X -SQL Parquet Vectorized: DataPageV1 152 159 8 103.2 9.7 114.7X -SQL Parquet Vectorized: DataPageV2 290 308 18 54.2 18.4 60.3X -SQL Parquet MR: DataPageV1 1861 1980 169 8.5 118.3 9.4X -SQL Parquet MR: DataPageV2 1647 1748 142 9.5 104.7 10.6X -SQL ORC Vectorized 230 251 12 68.3 14.6 75.9X -SQL ORC MR 1645 1648 3 9.6 104.6 10.6X - -OpenJDK 64-Bit Server VM 
1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 18049 18086 52 0.9 1147.5 1.0X +SQL Json 10073 10074 1 1.6 640.4 1.8X +SQL Parquet Vectorized: DataPageV1 177 184 9 89.1 11.2 102.3X +SQL Parquet Vectorized: DataPageV2 301 306 6 52.2 19.1 59.9X +SQL Parquet MR: DataPageV1 2120 2134 21 7.4 134.8 8.5X +SQL Parquet MR: DataPageV2 1855 1893 54 8.5 117.9 9.7X +SQL ORC Vectorized 246 249 1 63.8 15.7 73.2X +SQL ORC MR 1655 1660 6 9.5 105.2 10.9X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 208 213 9 75.7 13.2 1.0X -ParquetReader Vectorized: DataPageV2 355 382 14 44.3 22.6 0.6X -ParquetReader Vectorized -> Row: DataPageV1 212 233 8 74.1 13.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 350 353 7 45.0 22.2 0.6X +ParquetReader Vectorized: DataPageV1 239 243 5 65.8 15.2 1.0X +ParquetReader Vectorized: DataPageV2 384 387 4 40.9 24.4 0.6X +ParquetReader Vectorized -> Row: DataPageV1 223 224 3 70.7 14.2 1.1X +ParquetReader Vectorized -> Row: DataPageV2 366 370 7 43.0 23.3 0.7X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 21825 21944 169 0.7 1387.6 1.0X -SQL Json 11877 11927 71 1.3 755.1 1.8X -SQL Parquet Vectorized: DataPageV1 229 242 18 68.8 14.5 95.5X -SQL Parquet Vectorized: DataPageV2 435 452 23 36.1 27.7 50.1X -SQL Parquet MR: DataPageV1 2050 2184 190 7.7 130.3 10.6X -SQL 
Parquet MR: DataPageV2 1829 1927 138 8.6 116.3 11.9X -SQL ORC Vectorized 287 308 14 54.8 18.3 76.0X -SQL ORC MR 1579 1603 34 10.0 100.4 13.8X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 22703 22737 48 0.7 1443.4 1.0X +SQL Json 12723 12743 28 1.2 808.9 1.8X +SQL Parquet Vectorized: DataPageV1 228 261 76 69.1 14.5 99.7X +SQL Parquet Vectorized: DataPageV2 465 472 7 33.8 29.5 48.9X +SQL Parquet MR: DataPageV1 2166 2168 3 7.3 137.7 10.5X +SQL Parquet MR: DataPageV2 1921 1936 21 8.2 122.1 11.8X +SQL ORC Vectorized 307 313 10 51.2 19.5 73.9X +SQL ORC MR 1730 1745 21 9.1 110.0 13.1X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 299 341 86 52.6 19.0 1.0X -ParquetReader Vectorized: DataPageV2 551 607 110 28.5 35.1 0.5X -ParquetReader Vectorized -> Row: DataPageV1 341 344 4 46.2 21.7 0.9X -ParquetReader Vectorized -> Row: DataPageV2 508 557 33 31.0 32.3 0.6X +ParquetReader Vectorized: DataPageV1 309 316 10 51.0 19.6 1.0X +ParquetReader Vectorized: DataPageV2 559 563 5 28.1 35.5 0.6X +ParquetReader Vectorized -> Row: DataPageV1 292 296 6 53.9 18.6 1.1X +ParquetReader Vectorized -> Row: DataPageV2 541 547 8 29.1 34.4 0.6X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17585 17926 482 0.9 1118.0 1.0X -SQL Json 11927 12180 357 1.3 758.3 1.5X -SQL Parquet 
Vectorized: DataPageV1 150 161 11 104.6 9.6 116.9X -SQL Parquet Vectorized: DataPageV2 150 160 8 104.7 9.5 117.1X -SQL Parquet MR: DataPageV1 1830 1867 52 8.6 116.4 9.6X -SQL Parquet MR: DataPageV2 1715 1828 160 9.2 109.1 10.3X -SQL ORC Vectorized 328 358 15 48.0 20.8 53.6X -SQL ORC MR 1584 1687 145 9.9 100.7 11.1X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 18790 18808 25 0.8 1194.6 1.0X +SQL Json 11572 11579 10 1.4 735.7 1.6X +SQL Parquet Vectorized: DataPageV1 155 158 5 101.7 9.8 121.6X +SQL Parquet Vectorized: DataPageV2 158 162 6 99.6 10.0 119.0X +SQL Parquet MR: DataPageV1 2041 2050 12 7.7 129.8 9.2X +SQL Parquet MR: DataPageV2 1903 1905 3 8.3 121.0 9.9X +SQL ORC Vectorized 357 359 2 44.1 22.7 52.7X +SQL ORC MR 1745 1755 15 9.0 110.9 10.8X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 207 211 8 76.0 13.2 1.0X -ParquetReader Vectorized: DataPageV2 207 220 11 75.8 13.2 1.0X -ParquetReader Vectorized -> Row: DataPageV1 208 214 9 75.7 13.2 1.0X -ParquetReader Vectorized -> Row: DataPageV2 208 213 9 75.6 13.2 1.0X +ParquetReader Vectorized: DataPageV1 239 243 4 65.7 15.2 1.0X +ParquetReader Vectorized: DataPageV2 240 243 4 65.7 15.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 221 225 4 71.1 14.1 1.1X +ParquetReader Vectorized -> Row: DataPageV2 223 225 4 70.6 14.2 1.1X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 22569 22614 63 0.7 1434.9 1.0X -SQL Json 15590 15600 15 1.0 991.2 1.4X -SQL Parquet Vectorized: DataPageV1 225 241 17 69.9 14.3 100.3X -SQL Parquet Vectorized: DataPageV2 219 236 13 72.0 13.9 103.3X -SQL Parquet MR: DataPageV1 2013 2109 136 7.8 128.0 11.2X -SQL Parquet MR: DataPageV2 1850 1967 165 8.5 117.6 12.2X -SQL ORC Vectorized 396 416 25 39.7 25.2 56.9X -SQL ORC MR 1707 1763 79 9.2 108.5 13.2X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 23476 23478 3 0.7 1492.6 1.0X +SQL Json 14568 15103 757 1.1 926.2 1.6X +SQL Parquet Vectorized: DataPageV1 212 230 16 74.2 13.5 110.7X +SQL Parquet Vectorized: DataPageV2 209 218 8 75.4 13.3 112.5X +SQL Parquet MR: DataPageV1 1943 2080 194 8.1 123.5 12.1X +SQL Parquet MR: DataPageV2 1824 1830 9 8.6 116.0 12.9X +SQL ORC Vectorized 395 419 20 39.9 25.1 59.5X +SQL ORC MR 1844 1855 15 8.5 117.2 12.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 280 298 13 56.2 17.8 1.0X -ParquetReader Vectorized: DataPageV2 278 300 21 56.6 17.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 280 299 13 56.2 17.8 1.0X -ParquetReader Vectorized -> Row: DataPageV2 304 307 4 51.8 19.3 0.9X +ParquetReader Vectorized: DataPageV1 280 322 88 56.1 17.8 1.0X +ParquetReader Vectorized: DataPageV2 282 301 19 55.8 17.9 1.0X +ParquetReader Vectorized -> Row: DataPageV1 284 290 4 55.3 18.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 287 293 9 54.8 18.3 1.0X ================================================================================================ Int 
and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15548 16002 641 0.7 1482.8 1.0X -SQL Json 10801 11108 434 1.0 1030.1 1.4X -SQL Parquet Vectorized: DataPageV1 1858 1966 152 5.6 177.2 8.4X -SQL Parquet Vectorized: DataPageV2 2342 2466 175 4.5 223.4 6.6X -SQL Parquet MR: DataPageV1 3873 3908 49 2.7 369.4 4.0X -SQL Parquet MR: DataPageV2 3764 3869 148 2.8 358.9 4.1X -SQL ORC Vectorized 2018 2020 3 5.2 192.5 7.7X -SQL ORC MR 3247 3450 287 3.2 309.7 4.8X +SQL CSV 14663 15652 1399 0.7 1398.4 1.0X +SQL Json 10757 10845 125 1.0 1025.9 1.4X +SQL Parquet Vectorized: DataPageV1 1815 1933 166 5.8 173.1 8.1X +SQL Parquet Vectorized: DataPageV2 2244 2297 75 4.7 214.0 6.5X +SQL Parquet MR: DataPageV1 3491 3685 273 3.0 333.0 4.2X +SQL Parquet MR: DataPageV2 3600 3627 37 2.9 343.4 4.1X +SQL ORC Vectorized 1804 1895 129 5.8 172.0 8.1X +SQL ORC MR 3181 3379 280 3.3 303.4 4.6X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8028 8337 436 1.3 765.6 1.0X -SQL Json 6362 6488 178 
1.6 606.7 1.3X -SQL Parquet Vectorized: DataPageV1 642 673 51 16.3 61.3 12.5X -SQL Parquet Vectorized: DataPageV2 646 678 40 16.2 61.6 12.4X -SQL Parquet MR: DataPageV1 1504 1604 141 7.0 143.5 5.3X -SQL Parquet MR: DataPageV2 1645 1646 1 6.4 156.9 4.9X -SQL ORC Vectorized 386 415 25 27.2 36.8 20.8X -SQL ORC MR 1704 1730 37 6.2 162.5 4.7X +SQL CSV 8466 8778 441 1.2 807.4 1.0X +SQL Json 6389 6454 93 1.6 609.3 1.3X +SQL Parquet Vectorized: DataPageV1 644 675 52 16.3 61.4 13.1X +SQL Parquet Vectorized: DataPageV2 640 668 44 16.4 61.0 13.2X +SQL Parquet MR: DataPageV1 1579 1602 33 6.6 150.6 5.4X +SQL Parquet MR: DataPageV2 1536 1539 4 6.8 146.5 5.5X +SQL ORC Vectorized 439 443 4 23.9 41.9 19.3X +SQL ORC MR 1787 1806 27 5.9 170.5 4.7X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 21472 21514 59 0.7 1365.2 1.0X -Data column - Json 11537 11606 97 1.4 733.5 1.9X -Data column - Parquet Vectorized: DataPageV1 238 256 11 66.1 15.1 90.2X -Data column - Parquet Vectorized: DataPageV2 482 507 17 32.6 30.6 44.6X -Data column - Parquet MR: DataPageV1 2213 2355 200 7.1 140.7 9.7X -Data column - Parquet MR: DataPageV2 2036 2163 179 7.7 129.4 10.5X -Data column - ORC Vectorized 289 310 20 54.4 18.4 74.3X -Data column - ORC MR 1898 1936 54 8.3 120.7 11.3X -Partition column - CSV 6307 6364 80 2.5 401.0 3.4X -Partition column - Json 9167 9253 121 1.7 582.8 2.3X -Partition column - Parquet Vectorized: DataPageV1 
62 66 3 253.5 3.9 346.1X -Partition column - Parquet Vectorized: DataPageV2 61 65 2 259.2 3.9 353.8X -Partition column - Parquet MR: DataPageV1 1086 1088 3 14.5 69.0 19.8X -Partition column - Parquet MR: DataPageV2 1091 1146 78 14.4 69.4 19.7X -Partition column - ORC Vectorized 63 67 2 251.1 4.0 342.9X -Partition column - ORC MR 1173 1175 3 13.4 74.6 18.3X -Both columns - CSV 21458 22038 820 0.7 1364.3 1.0X -Both columns - Json 12697 12712 22 1.2 807.2 1.7X -Both columns - Parquet Vectorized: DataPageV1 275 288 10 57.2 17.5 78.0X -Both columns - Parquet Vectorized: DataPageV2 505 525 24 31.2 32.1 42.5X -Both columns - Parquet MR: DataPageV1 2541 2547 9 6.2 161.5 8.5X -Both columns - Parquet MR: DataPageV2 2059 2060 2 7.6 130.9 10.4X -Both columns - ORC Vectorized 326 349 16 48.3 20.7 66.0X -Both columns - ORC MR 2116 2151 50 7.4 134.5 10.1X +Data column - CSV 22527 22546 26 0.7 1432.3 1.0X +Data column - Json 12533 12712 254 1.3 796.8 1.8X +Data column - Parquet Vectorized: DataPageV1 229 244 14 68.7 14.6 98.3X +Data column - Parquet Vectorized: DataPageV2 508 519 16 31.0 32.3 44.3X +Data column - Parquet MR: DataPageV1 2525 2535 13 6.2 160.6 8.9X +Data column - Parquet MR: DataPageV2 2194 2209 21 7.2 139.5 10.3X +Data column - ORC Vectorized 315 317 2 50.0 20.0 71.6X +Data column - ORC MR 2098 2100 3 7.5 133.4 10.7X +Partition column - CSV 6747 6753 9 2.3 429.0 3.3X +Partition column - Json 10080 10102 32 1.6 640.8 2.2X +Partition column - Parquet Vectorized: DataPageV1 60 63 2 262.8 3.8 376.4X +Partition column - Parquet Vectorized: DataPageV2 58 63 8 270.2 3.7 387.1X +Partition column - Parquet MR: DataPageV1 1152 1155 4 13.6 73.3 19.5X +Partition column - Parquet MR: DataPageV2 1149 1149 1 13.7 73.0 19.6X +Partition column - ORC Vectorized 61 64 3 259.8 3.8 372.1X +Partition column - ORC MR 1332 1332 0 11.8 84.7 16.9X +Both columns - CSV 23030 23042 17 0.7 1464.2 1.0X +Both columns - Json 13569 13581 16 1.2 862.7 1.7X +Both columns - Parquet Vectorized: 
DataPageV1 268 277 11 58.7 17.0 84.0X +Both columns - Parquet Vectorized: DataPageV2 551 557 7 28.6 35.0 40.9X +Both columns - Parquet MR: DataPageV1 2556 2557 0 6.2 162.5 8.8X +Both columns - Parquet MR: DataPageV2 2287 2292 7 6.9 145.4 9.9X +Both columns - ORC Vectorized 361 363 2 43.6 22.9 62.5X +Both columns - ORC MR 2158 2161 5 7.3 137.2 10.4X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10074 10372 422 1.0 960.7 1.0X -SQL Json 10037 10147 156 1.0 957.2 1.0X -SQL Parquet Vectorized: DataPageV1 1192 1226 47 8.8 113.7 8.4X -SQL Parquet Vectorized: DataPageV2 2349 2423 105 4.5 224.0 4.3X -SQL Parquet MR: DataPageV1 2995 3114 168 3.5 285.6 3.4X -SQL Parquet MR: DataPageV2 3847 3900 75 2.7 366.9 2.6X -ParquetReader Vectorized: DataPageV1 888 918 51 11.8 84.7 11.3X -ParquetReader Vectorized: DataPageV2 2128 2159 43 4.9 203.0 4.7X -SQL ORC Vectorized 837 908 61 12.5 79.8 12.0X -SQL ORC MR 2792 2882 127 3.8 266.3 3.6X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 11418 11463 63 0.9 1088.9 1.0X +SQL Json 9698 9938 339 1.1 924.9 1.2X +SQL Parquet Vectorized: DataPageV1 1176 1207 45 8.9 112.1 9.7X +SQL Parquet Vectorized: DataPageV2 1652 1669 24 6.3 157.6 6.9X +SQL Parquet MR: DataPageV1 3041 3119 109 3.4 290.0 3.8X +SQL Parquet MR: DataPageV2 4030 4110 114 2.6 384.3 2.8X +ParquetReader Vectorized: DataPageV1 1008 1014 8 10.4 96.2 11.3X +ParquetReader Vectorized: 
DataPageV2 1247 1305 82 8.4 118.9 9.2X +SQL ORC Vectorized 820 856 56 12.8 78.2 13.9X +SQL ORC MR 2762 2807 64 3.8 263.4 4.1X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7808 7810 3 1.3 744.6 1.0X -SQL Json 7434 7491 82 1.4 708.9 1.1X -SQL Parquet Vectorized: DataPageV1 1037 1044 10 10.1 98.9 7.5X -SQL Parquet Vectorized: DataPageV2 1528 1529 3 6.9 145.7 5.1X -SQL Parquet MR: DataPageV1 2300 2411 156 4.6 219.4 3.4X -SQL Parquet MR: DataPageV2 2637 2639 4 4.0 251.5 3.0X -ParquetReader Vectorized: DataPageV1 843 907 56 12.4 80.4 9.3X -ParquetReader Vectorized: DataPageV2 1424 1446 30 7.4 135.8 5.5X -SQL ORC Vectorized 1131 1132 1 9.3 107.8 6.9X -SQL ORC MR 2781 2856 106 3.8 265.3 2.8X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 6752 6756 5 1.6 644.0 1.0X +SQL Json 7469 7549 112 1.4 712.3 0.9X +SQL Parquet Vectorized: DataPageV1 912 990 67 11.5 87.0 7.4X +SQL Parquet Vectorized: DataPageV2 1141 1215 104 9.2 108.8 5.9X +SQL Parquet MR: DataPageV1 2256 2418 229 4.6 215.1 3.0X +SQL Parquet MR: DataPageV2 2712 2882 241 3.9 258.6 2.5X +ParquetReader Vectorized: DataPageV1 956 960 6 11.0 91.2 7.1X +ParquetReader Vectorized: DataPageV2 1211 1211 1 8.7 115.5 5.6X +SQL ORC Vectorized 1135 1135 1 9.2 108.2 6.0X +SQL ORC MR 2716 2766 70 3.9 259.0 2.5X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5357 5538 255 2.0 510.9 1.0X -SQL Json 4354 4387 47 2.4 
415.2 1.2X -SQL Parquet Vectorized: DataPageV1 212 226 15 49.5 20.2 25.3X -SQL Parquet Vectorized: DataPageV2 265 276 16 39.6 25.2 20.2X -SQL Parquet MR: DataPageV1 1575 1578 4 6.7 150.2 3.4X -SQL Parquet MR: DataPageV2 1624 1638 21 6.5 154.8 3.3X -ParquetReader Vectorized: DataPageV1 219 234 14 47.8 20.9 24.4X -ParquetReader Vectorized: DataPageV2 274 294 17 38.2 26.2 19.5X -SQL ORC Vectorized 370 393 12 28.4 35.3 14.5X -SQL ORC MR 1540 1545 7 6.8 146.9 3.5X +SQL CSV 4496 4710 303 2.3 428.8 1.0X +SQL Json 4324 4343 28 2.4 412.3 1.0X +SQL Parquet Vectorized: DataPageV1 221 244 9 47.5 21.0 20.4X +SQL Parquet Vectorized: DataPageV2 270 288 13 38.8 25.8 16.6X +SQL Parquet MR: DataPageV1 1451 1461 15 7.2 138.3 3.1X +SQL Parquet MR: DataPageV2 1364 1368 5 7.7 130.0 3.3X +ParquetReader Vectorized: DataPageV1 256 258 2 40.9 24.5 17.5X +ParquetReader Vectorized: DataPageV2 273 291 17 38.4 26.0 16.5X +SQL ORC Vectorized 345 367 24 30.4 32.9 13.0X +SQL ORC MR 1508 1509 2 7.0 143.8 3.0X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2159 2212 74 0.5 2059.3 1.0X -SQL Json 2836 2896 84 0.4 2704.5 0.8X -SQL Parquet Vectorized: DataPageV1 54 59 9 19.5 51.4 40.1X -SQL Parquet Vectorized: DataPageV2 66 72 8 15.9 63.1 32.7X -SQL Parquet MR: DataPageV1 173 186 10 6.1 164.5 12.5X -SQL Parquet MR: DataPageV2 159 172 8 6.6 151.8 13.6X -SQL ORC Vectorized 54 60 10 19.2 52.0 39.6X -SQL ORC MR 150 161 
7 7.0 143.3 14.4X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 2036 2140 147 0.5 1941.4 1.0X +SQL Json 2796 2927 186 0.4 2666.5 0.7X +SQL Parquet Vectorized: DataPageV1 47 52 7 22.2 45.0 43.1X +SQL Parquet Vectorized: DataPageV2 64 69 7 16.4 61.2 31.7X +SQL Parquet MR: DataPageV1 176 190 11 5.9 168.1 11.5X +SQL Parquet MR: DataPageV2 157 171 6 6.7 149.3 13.0X +SQL ORC Vectorized 52 56 10 20.3 49.2 39.5X +SQL ORC MR 142 152 8 7.4 135.9 14.3X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5877 5883 8 0.2 5605.0 1.0X -SQL Json 11474 11587 159 0.1 10942.9 0.5X -SQL Parquet Vectorized: DataPageV1 66 72 7 15.9 63.1 88.9X -SQL Parquet Vectorized: DataPageV2 83 90 8 12.6 79.4 70.6X -SQL Parquet MR: DataPageV1 191 201 9 5.5 182.6 30.7X -SQL Parquet MR: DataPageV2 179 187 9 5.9 170.3 32.9X -SQL ORC Vectorized 70 76 12 14.9 67.1 83.5X -SQL ORC MR 167 175 7 6.3 159.2 35.2X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +SQL CSV 5384 5560 249 0.2 5134.8 1.0X +SQL Json 10934 11224 410 0.1 10427.1 0.5X +SQL Parquet Vectorized: DataPageV1 62 67 7 16.8 59.5 86.3X +SQL Parquet Vectorized: DataPageV2 79 85 7 13.3 75.3 68.1X +SQL Parquet MR: DataPageV1 198 211 9 5.3 188.6 27.2X +SQL Parquet MR: DataPageV2 177 188 9 5.9 168.7 30.4X +SQL ORC Vectorized 67 73 10 15.6 64.0 80.2X +SQL ORC MR 160 172 8 6.6 152.3 33.7X + +OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1028-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9695 9965 382 0.1 9245.8 1.0X -SQL Json 22119 23566 2045 0.0 21094.6 0.4X -SQL Parquet Vectorized: DataPageV1 96 104 7 10.9 91.6 100.9X -SQL Parquet Vectorized: DataPageV2 113 121 8 9.3 107.8 85.8X -SQL Parquet MR: DataPageV1 227 243 9 4.6 216.2 42.8X -SQL Parquet MR: DataPageV2 210 225 12 5.0 200.2 46.2X -SQL ORC Vectorized 90 96 10 11.7 85.7 107.9X -SQL ORC MR 188 199 9 5.6 178.9 51.7X +SQL CSV 9602 9882 396 0.1 9157.0 1.0X +SQL Json 21369 21987 874 0.0 20379.5 0.4X +SQL Parquet Vectorized: DataPageV1 90 97 7 11.7 85.4 107.2X +SQL Parquet Vectorized: DataPageV2 107 115 7 9.8 102.0 89.8X +SQL Parquet MR: DataPageV1 227 234 14 4.6 216.1 42.4X +SQL Parquet MR: DataPageV2 204 216 10 5.1 194.4 47.1X +SQL ORC Vectorized 81 89 8 12.9 77.6 118.1X +SQL ORC MR 181 195 12 5.8 172.3 53.2X diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index 07e35c158c8cb..5669534cd111a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -29,6 +29,8 @@ import java.util.Set; import com.google.common.annotations.VisibleForTesting; +import org.apache.parquet.VersionParser; +import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.column.page.PageReadStore; import scala.Option; @@ -69,6 +71,9 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReader fileMetadata = fileReader.getFileMetaData().getKeyValueMetaData(); ReadSupport readSupport = getReadSupportInstance(getReadSupportClass(configuration)); ReadSupport.ReadContext readContext = 
readSupport.init(new InitContext( diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 57a307b1b7b6b..ee09d2b2a3be9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -21,6 +21,8 @@ import java.time.ZoneId; import java.util.PrimitiveIterator; +import org.apache.parquet.CorruptDeltaByteArrays; +import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.BytesUtils; @@ -28,6 +30,7 @@ import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.page.*; +import org.apache.parquet.column.values.RequiresPreviousReader; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; @@ -86,6 +89,7 @@ public class VectorizedColumnReader { private final ColumnDescriptor descriptor; private final LogicalTypeAnnotation logicalTypeAnnotation; private final String datetimeRebaseMode; + private final ParsedVersion writerVersion; public VectorizedColumnReader( ColumnDescriptor descriptor, @@ -96,7 +100,8 @@ public VectorizedColumnReader( String datetimeRebaseMode, String datetimeRebaseTz, String int96RebaseMode, - String int96RebaseTz) throws IOException { + String int96RebaseTz, + ParsedVersion writerVersion) throws IOException { this.descriptor = descriptor; this.pageReader = pageReader; this.readState = new ParquetReadState(descriptor.getMaxDefinitionLevel(), rowIndexes); @@ -129,6 +134,7 @@ public VectorizedColumnReader( 
this.datetimeRebaseMode = datetimeRebaseMode; assert "LEGACY".equals(int96RebaseMode) || "EXCEPTION".equals(int96RebaseMode) || "CORRECTED".equals(int96RebaseMode); + this.writerVersion = writerVersion; } private boolean isLazyDecodingSupported(PrimitiveType.PrimitiveTypeName typeName) { @@ -259,6 +265,7 @@ private void initDataReader( int pageValueCount, Encoding dataEncoding, ByteBufferInputStream in) throws IOException { + ValuesReader previousReader = this.dataColumn; if (dataEncoding.usesDictionary()) { this.dataColumn = null; if (dictionary == null) { @@ -283,6 +290,12 @@ private void initDataReader( } catch (IOException e) { throw new IOException("could not read page in col " + descriptor, e); } + // for PARQUET-246 (See VectorizedDeltaByteArrayReader.setPreviousValues) + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && + previousReader instanceof RequiresPreviousReader) { + // previousReader can only be set if reading sequentially + ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); + } } private ValuesReader getValuesReader(Encoding encoding) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java index 62fb5f8c96bbf..3218c20ece893 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaBinaryPackedReader.java @@ -90,6 +90,7 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce Preconditions.checkArgument(miniSize % 8 == 0, "miniBlockSize must be multiple of 8, but it's " + miniSize); this.miniBlockSizeInValues = (int) miniSize; + // True value count. 
May be less than valueCount because of nulls this.totalValueCount = BytesUtils.readUnsignedVarInt(in); this.bitWidths = new int[miniBlockNumInABlock]; this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; @@ -97,6 +98,11 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce firstValue = BytesUtils.readZigZagVarLong(in); } + // True value count. May be less than valueCount because of nulls + int getTotalValueCount() { + return totalValueCount; + } + @Override public byte readByte() { readValues(1, null, 0, (w, r, v) -> byteVal = (byte) v); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java index 72b760d426eac..b3fc54a8d152c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaByteArrayReader.java @@ -16,50 +16,127 @@ */ package org.apache.spark.sql.execution.datasources.parquet; +import static org.apache.spark.sql.types.DataTypes.BinaryType; +import static org.apache.spark.sql.types.DataTypes.IntegerType; + import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.RequiresPreviousReader; +import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.api.Binary; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import java.io.IOException; import java.nio.ByteBuffer; /** - * An implementation of the Parquet DELTA_BYTE_ARRAY decoder that supports the vectorized interface. 
+ * An implementation of the Parquet DELTA_BYTE_ARRAY decoder that supports the vectorized + * interface. */ -public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase { - private final DeltaByteArrayReader deltaByteArrayReader = new DeltaByteArrayReader(); +public class VectorizedDeltaByteArrayReader extends VectorizedReaderBase + implements VectorizedValuesReader, RequiresPreviousReader { + + private final VectorizedDeltaBinaryPackedReader prefixLengthReader; + private final VectorizedDeltaLengthByteArrayReader suffixReader; + private WritableColumnVector prefixLengthVector; + private ByteBuffer previous; + private int currentRow = 0; + + // Temporary variable used by readBinary + private final WritableColumnVector binaryValVector; + // Temporary variable used by skipBinary + private final WritableColumnVector tempBinaryValVector; + + VectorizedDeltaByteArrayReader() { + this.prefixLengthReader = new VectorizedDeltaBinaryPackedReader(); + this.suffixReader = new VectorizedDeltaLengthByteArrayReader(); + binaryValVector = new OnHeapColumnVector(1, BinaryType); + tempBinaryValVector = new OnHeapColumnVector(1, BinaryType); + } @Override public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - deltaByteArrayReader.initFromPage(valueCount, in); + prefixLengthVector = new OnHeapColumnVector(valueCount, IntegerType); + prefixLengthReader.initFromPage(valueCount, in); + prefixLengthReader.readIntegers(prefixLengthReader.getTotalValueCount(), + prefixLengthVector, 0); + suffixReader.initFromPage(valueCount, in); } @Override public Binary readBinary(int len) { - return deltaByteArrayReader.readBytes(); + readValues(1, binaryValVector, 0); + return Binary.fromConstantByteArray(binaryValVector.getBinary(0)); } - @Override - public void readBinary(int total, WritableColumnVector c, int rowId) { + private void readValues(int total, WritableColumnVector c, int rowId) { for (int i = 0; i < total; i++) { - Binary binary = 
deltaByteArrayReader.readBytes(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - c.putByteArray(rowId + i, buffer.array(), buffer.arrayOffset() + buffer.position(), - binary.length()); - } else { - byte[] bytes = new byte[binary.length()]; - buffer.get(bytes); - c.putByteArray(rowId + i, bytes); + // NOTE: due to PARQUET-246, it is important that we + // respect prefixLength which was read from prefixLengthReader, + // even for the *first* value of a page. Even though the first + // value of the page should have an empty prefix, it may not + // because of PARQUET-246. + int prefixLength = prefixLengthVector.getInt(currentRow); + ByteBuffer suffix = suffixReader.getBytes(currentRow); + byte[] suffixArray = suffix.array(); + int suffixLength = suffix.limit() - suffix.position(); + int length = prefixLength + suffixLength; + + // We have to do this to materialize the output + WritableColumnVector arrayData = c.arrayData(); + int offset = arrayData.getElementsAppended(); + if (prefixLength != 0) { + arrayData.appendBytes(prefixLength, previous.array(), previous.position()); } + arrayData.appendBytes(suffixLength, suffixArray, suffix.position()); + c.putArray(rowId + i, offset, length); + previous = arrayData.getByteBuffer(offset, length); + currentRow++; + } + } + + @Override + public void readBinary(int total, WritableColumnVector c, int rowId) { + readValues(total, c, rowId); + } + + /** + * There was a bug (PARQUET-246) in which DeltaByteArrayWriter's reset() method did not clear the + * previous value state that it tracks internally. This resulted in the first value of all pages + * (except for the first page) to be a delta from the last value of the previous page. In order to + * read corrupted files written with this bug, when reading a new page we need to recover the + * previous page's last value to use it (if needed) to read the first value. 
+ */ + public void setPreviousReader(ValuesReader reader) { + if (reader != null) { + this.previous = ((VectorizedDeltaByteArrayReader) reader).previous; } } @Override public void skipBinary(int total) { + WritableColumnVector c1 = tempBinaryValVector; + WritableColumnVector c2 = binaryValVector; + for (int i = 0; i < total; i++) { - deltaByteArrayReader.skip(); + int prefixLength = prefixLengthVector.getInt(currentRow); + ByteBuffer suffix = suffixReader.getBytes(currentRow); + byte[] suffixArray = suffix.array(); + int suffixLength = suffix.limit() - suffix.position(); + int length = prefixLength + suffixLength; + + WritableColumnVector arrayData = c1.arrayData(); + c1.reset(); + if (prefixLength != 0) { + arrayData.appendBytes(prefixLength, previous.array(), previous.position()); + } + arrayData.appendBytes(suffixLength, suffixArray, suffix.position()); + previous = arrayData.getByteBuffer(0, length); + currentRow++; + + WritableColumnVector tmp = c1; + c1 = c2; + c2 = tmp; } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java new file mode 100644 index 0000000000000..ac5b8527f5e13 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedDeltaLengthByteArrayReader.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.parquet; + +import static org.apache.spark.sql.types.DataTypes.IntegerType; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; + +/** + * An implementation of the Parquet DELTA_LENGTH_BYTE_ARRAY decoder that supports the vectorized + * interface. 
+ */
+public class VectorizedDeltaLengthByteArrayReader extends VectorizedReaderBase implements
+    VectorizedValuesReader {
+
+  // Decodes the block of value lengths stored at the start of the page
+  private final VectorizedDeltaBinaryPackedReader lengthReader;
+  // Stream positioned on the value bytes that follow the encoded lengths
+  private ByteBufferInputStream in;
+  // Length of every value of the page, indexed by the value's position within the page
+  private WritableColumnVector lengthsVector;
+  // Page-relative index of the next value consumed by readBinary/skipBinary
+  private int currentRow = 0;
+
+  VectorizedDeltaLengthByteArrayReader() {
+    lengthReader = new VectorizedDeltaBinaryPackedReader();
+  }
+
+  /**
+   * Eagerly decodes all value lengths of the page into {@code lengthsVector} and keeps the
+   * remainder of the stream, which holds the value bytes.
+   */
+  @Override
+  public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException {
+    lengthsVector = new OnHeapColumnVector(valueCount, IntegerType);
+    lengthReader.initFromPage(valueCount, in);
+    lengthReader.readIntegers(lengthReader.getTotalValueCount(), lengthsVector, 0);
+    this.in = in.remainingStream();
+  }
+
+  /**
+   * Reads the next {@code total} values of the page into {@code c}, starting at output
+   * position {@code rowId}.
+   *
+   * @throws ParquetDecodingException if the stream ends before a value is fully read
+   */
+  @Override
+  public void readBinary(int total, WritableColumnVector c, int rowId) {
+    ByteBuffer buffer;
+    ByteBufferOutputWriter outputWriter = ByteBufferOutputWriter::writeArrayByteBuffer;
+    int length;
+    for (int i = 0; i < total; i++) {
+      // rowId is a position in the output vector; the lengths are indexed by the position
+      // within the page, which is tracked by currentRow.
+      length = lengthsVector.getInt(currentRow + i);
+      try {
+        buffer = in.slice(length);
+      } catch (EOFException e) {
+        throw new ParquetDecodingException("Failed to read " + length + " bytes", e);
+      }
+      outputWriter.write(c, rowId + i, buffer, length);
+    }
+    currentRow += total;
+  }
+
+  /**
+   * Returns the bytes of the next value in the stream, using the length stored for the
+   * page-relative index {@code rowId}. This method does not advance {@code currentRow}; the
+   * stream itself is consumed, so callers must request values in page order.
+   *
+   * @throws ParquetDecodingException if the stream ends before the value is fully read
+   */
+  public ByteBuffer getBytes(int rowId) {
+    int length = lengthsVector.getInt(rowId);
+    try {
+      return in.slice(length);
+    } catch (EOFException e) {
+      throw new ParquetDecodingException("Failed to read " + length + " bytes", e);
+    }
+  }
+
+  /**
+   * Skips the next {@code total} values of the page without materializing them.
+   *
+   * @throws ParquetDecodingException if the stream cannot advance past a value
+   */
+  @Override
+  public void skipBinary(int total) {
+    for (int i = 0; i < total; i++) {
+      int remaining = lengthsVector.getInt(currentRow + i);
+      while (remaining > 0) {
+        long skipped = in.skip(remaining);
+        if (skipped <= 0) {
+          // No progress is possible (e.g. a truncated page); fail instead of spinning.
+          throw new ParquetDecodingException("Failed to skip " + remaining + " bytes");
+        }
+        remaining -= skipped;
+      }
+    }
+    currentRow += total;
+  }
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index
50056bf4073e9..cbf60125e1284 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -367,7 +367,8 @@ private void checkEndOfRowGroup() throws IOException { datetimeRebaseMode, datetimeRebaseTz, int96RebaseMode, - int96RebaseTz); + int96RebaseTz, + writerVersion); } totalCountLoadedSoFar += pages.getRowCount(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index 7ddece068e099..4308614338499 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.parquet; +import java.nio.ByteBuffer; + import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.parquet.io.api.Binary; @@ -86,4 +88,18 @@ interface IntegerOutputWriter { void write(WritableColumnVector outputColumnVector, int rowId, long val); } + @FunctionalInterface + interface ByteBufferOutputWriter { + void write(WritableColumnVector c, int rowId, ByteBuffer val, int length); + + static void writeArrayByteBuffer(WritableColumnVector c, int rowId, ByteBuffer val, + int length) { + c.putByteArray(rowId, + val.array(), + val.arrayOffset() + val.position(), + length); + } + + static void skipWrite(WritableColumnVector c, int rowId, ByteBuffer val, int length) { } + } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index bbe96819a618b..42552c7afc624 100644 --- 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -221,6 +221,11 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { return UTF8String.fromAddress(null, data + rowId, count); } + @Override + public ByteBuffer getByteBuffer(int rowId, int count) { + return ByteBuffer.wrap(getBytes(rowId, count)); + } + // // APIs dealing with shorts // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 833a93f2a2bdb..d246a3c24e4a6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -219,6 +219,12 @@ protected UTF8String getBytesAsUTF8String(int rowId, int count) { return UTF8String.fromBytes(byteData, rowId, count); } + @Override + public ByteBuffer getByteBuffer(int rowId, int count) { + return ByteBuffer.wrap(byteData, rowId, count); + } + + // // APIs dealing with Shorts // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 5e01c372793f1..ae457a16123d2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -18,6 +18,7 @@ import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; import com.google.common.annotations.VisibleForTesting; @@ -443,6 +444,12 @@ public byte[] getBinary(int rowId) { } } + /** + * Gets the values of bytes from [rowId, rowId + count), as a ByteBuffer. 
+ * This method is similar to {@link ColumnVector#getBytes(int, int)}, but avoids making a copy. + */ + public abstract ByteBuffer getByteBuffer(int rowId, int count); + /** * Append APIs. These APIs all behave similarly and will append data to the current vector. It * is not valid to mix the put and append APIs. The append APIs are slower and should only be diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala new file mode 100644 index 0000000000000..c54eef348f342 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaByteArrayEncodingSuite.scala @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.execution.datasources.parquet
+
+import org.apache.parquet.bytes.DirectByteBufferAllocator
+import org.apache.parquet.column.values.Utils
+import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter
+
+import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector}
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types.{IntegerType, StringType}
+
+/**
+ * Read tests for vectorized Delta byte array reader.
+ * Translated from
+ * org.apache.parquet.column.values.delta.TestDeltaByteArray
+ */
+class ParquetDeltaByteArrayEncodingSuite extends ParquetCompatibilityTest with SharedSparkSession {
+  val values: Array[String] = Array("parquet-mr", "parquet", "parquet-format")
+  val randvalues: Array[String] = Utils.getRandomStringSamples(10000, 32)
+
+  var writer: DeltaByteArrayWriter = _
+  var reader: VectorizedDeltaByteArrayReader = _
+  private var writableColumnVector: WritableColumnVector = _
+
+  // Gives every test a fresh writer/reader pair so no encoder or decoder state leaks
+  // from one test into the next.
+  protected override def beforeEach(): Unit = {
+    // Chain to the per-test hook of the parent traits (not the per-suite beforeAll).
+    super.beforeEach()
+    writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator)
+    reader = new VectorizedDeltaByteArrayReader()
+  }
+
+  test("test Serialization") {
+    assertReadWrite(writer, reader, values)
+  }
+
+  test("random strings") {
+    assertReadWrite(writer, reader, randvalues)
+  }
+
+  test("random strings with skip") {
+    assertReadWriteWithSkip(writer, reader, randvalues)
+  }
+
+  test("random strings with skipN") {
+    assertReadWriteWithSkipN(writer, reader, randvalues)
+  }
+
+  // Decodes the two length blocks written at the start of the page with a plain
+  // delta-binary-packed reader: first the prefix lengths, then the suffix lengths.
+  test("test lengths") {
+    var reader = new VectorizedDeltaBinaryPackedReader
+    Utils.writeData(writer, values)
+    val data = writer.getBytes.toInputStream
+    val length = values.length
+    writableColumnVector = new OnHeapColumnVector(length, IntegerType)
+    reader.initFromPage(length, data)
+    reader.readIntegers(length, writableColumnVector, 0)
+    // test prefix lengths
+    assert(0 == writableColumnVector.getInt(0))
+    assert(7 == writableColumnVector.getInt(1))
+    assert(7 == writableColumnVector.getInt(2))
+
+    // The same stream is reused, so the second reader starts right after the prefix lengths.
+    reader = new VectorizedDeltaBinaryPackedReader
+    writableColumnVector = new OnHeapColumnVector(length, IntegerType)
+    reader.initFromPage(length, data)
+    reader.readIntegers(length, writableColumnVector, 0)
+    // test suffix lengths
+    assert(10 == writableColumnVector.getInt(0))
+    assert(0 == writableColumnVector.getInt(1))
+    assert(7 == writableColumnVector.getInt(2))
+  }
+
+  /** Writes `vals`, reads all of them back in one call and compares value by value. */
+  private def assertReadWrite(
+      writer: DeltaByteArrayWriter,
+      reader: VectorizedDeltaByteArrayReader,
+      vals: Array[String]): Unit = {
+    Utils.writeData(writer, vals)
+    val length = vals.length
+    val is = writer.getBytes.toInputStream
+
+    writableColumnVector = new OnHeapColumnVector(length, StringType)
+
+    reader.initFromPage(length, is)
+    reader.readBinary(length, writableColumnVector, 0)
+
+    for (i <- 0 until length) {
+      assert(vals(i).getBytes() sameElements writableColumnVector.getBinary(i))
+    }
+  }
+
+  /** Writes `vals`, then alternates between reading one value and skipping one value. */
+  private def assertReadWriteWithSkip(
+      writer: DeltaByteArrayWriter,
+      reader: VectorizedDeltaByteArrayReader,
+      vals: Array[String]): Unit = {
+    Utils.writeData(writer, vals)
+    val length = vals.length
+    val is = writer.getBytes.toInputStream
+    writableColumnVector = new OnHeapColumnVector(length, StringType)
+    reader.initFromPage(length, is)
+    var i = 0
+    while (i < vals.length) {
+      reader.readBinary(1, writableColumnVector, i)
+      assert(vals(i).getBytes() sameElements writableColumnVector.getBinary(i))
+      reader.skipBinary(1)
+      i += 2
+    }
+  }
+
+  /** Writes `vals`, then reads one value and skips half of the remaining values each round. */
+  private def assertReadWriteWithSkipN(
+      writer: DeltaByteArrayWriter,
+      reader: VectorizedDeltaByteArrayReader,
+      vals: Array[String]): Unit = {
+    Utils.writeData(writer, vals)
+    val length = vals.length
+    val is = writer.getBytes.toInputStream
+    writableColumnVector = new OnHeapColumnVector(length, StringType)
+    reader.initFromPage(length, is)
+    var i = 0
+    while (i < vals.length) {
+      val skipCount = (vals.length - i) / 2
+      reader.readBinary(1, writableColumnVector, i)
+      assert(vals(i).getBytes() sameElements writableColumnVector.getBinary(i))
+      reader.skipBinary(skipCount)
+      i += skipCount + 1
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala new file mode 100644 index 0000000000000..17dc70df42a6d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetDeltaLengthByteArrayEncodingSuite.scala @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.parquet
+
+import java.util.Random
+
+import org.apache.commons.lang3.RandomStringUtils
+import org.apache.parquet.bytes.{ByteBufferInputStream, DirectByteBufferAllocator}
+import org.apache.parquet.column.values.Utils
+import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter
+import org.apache.parquet.io.api.Binary
+
+import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector}
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types.{IntegerType, StringType}
+
+/**
+ * Read tests for vectorized Delta length byte array reader.
+ * Translated from
+ * org.apache.parquet.column.values.delta.TestDeltaLengthByteArray
+ */
+class ParquetDeltaLengthByteArrayEncodingSuite
+    extends ParquetCompatibilityTest
+    with SharedSparkSession {
+  val values: Array[String] = Array("parquet", "hadoop", "mapreduce")
+  var writer: DeltaLengthByteArrayValuesWriter = _
+  var reader: VectorizedDeltaLengthByteArrayReader = _
+  private var writableColumnVector: WritableColumnVector = _
+
+  // Gives every test a fresh writer/reader pair so no encoder or decoder state leaks
+  // from one test into the next.
+  protected override def beforeEach(): Unit = {
+    // Chain to the per-test hook of the parent traits (not the per-suite beforeAll).
+    super.beforeEach()
+    writer =
+      new DeltaLengthByteArrayValuesWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator)
+    reader = new VectorizedDeltaLengthByteArrayReader()
+  }
+
+  test("test serialization") {
+    writeData(writer, values)
+    readAndValidate(reader, writer.getBytes.toInputStream, values.length, values)
+  }
+
+  test("random strings") {
+    val values = Utils.getRandomStringSamples(1000, 32)
+    writeData(writer, values)
+    readAndValidate(reader, writer.getBytes.toInputStream, values.length, values)
+  }
+
+  test("random strings with empty strings") {
+    val values = getRandomStringSamplesWithEmptyStrings(1000, 32)
+    writeData(writer, values)
+    readAndValidate(reader, writer.getBytes.toInputStream, values.length, values)
+  }
+
+  test("skip with random strings") {
+    val values = Utils.getRandomStringSamples(1000, 32)
+    writeData(writer, values)
+
+    // Pass 1: alternate between reading one value and skipping one value.
+    reader.initFromPage(values.length, writer.getBytes.toInputStream)
+    writableColumnVector = new OnHeapColumnVector(values.length, StringType)
+    var i = 0
+    while (i < values.length) {
+      reader.readBinary(1, writableColumnVector, i)
+      assert(values(i).getBytes() sameElements writableColumnVector.getBinary(i))
+      reader.skipBinary(1)
+      i += 2
+    }
+
+    // Pass 2: with a new reader, read one value and skip half of the remaining values.
+    reader = new VectorizedDeltaLengthByteArrayReader()
+    reader.initFromPage(values.length, writer.getBytes.toInputStream)
+    writableColumnVector = new OnHeapColumnVector(values.length, StringType)
+    i = 0
+    while (i < values.length) {
+      val skipCount = (values.length - i) / 2
+      reader.readBinary(1, writableColumnVector, i)
+      assert(values(i).getBytes() sameElements writableColumnVector.getBinary(i))
+      reader.skipBinary(skipCount)
+      i += skipCount + 1
+    }
+  }
+
+  // Read the lengths from the beginning of the buffer and compare with the lengths of the values
+  test("test lengths") {
+    val reader = new VectorizedDeltaBinaryPackedReader
+    writeData(writer, values)
+    val length = values.length
+    writableColumnVector = new OnHeapColumnVector(length, IntegerType)
+    reader.initFromPage(length, writer.getBytes.toInputStream)
+    reader.readIntegers(length, writableColumnVector, 0)
+    for (i <- 0 until length) {
+      assert(values(i).length == writableColumnVector.getInt(i))
+    }
+  }
+
+  /** Appends every string of `values` to `writer`, in order. */
+  private def writeData(writer: DeltaLengthByteArrayValuesWriter, values: Array[String]): Unit = {
+    values.foreach(v => writer.writeBytes(Binary.fromString(v)))
+  }
+
+  /** Reads `length` values from `is` in one call and compares them with `expectedValues`. */
+  private def readAndValidate(
+      reader: VectorizedDeltaLengthByteArrayReader,
+      is: ByteBufferInputStream,
+      length: Int,
+      expectedValues: Array[String]): Unit = {
+
+    writableColumnVector = new OnHeapColumnVector(length, StringType)
+
+    reader.initFromPage(length, is)
+    reader.readBinary(length, writableColumnVector, 0)
+
+    for (i <- 0 until length) {
+      assert(expectedValues(i).getBytes() sameElements writableColumnVector.getBinary(i))
+    }
+  }
+
+  /**
+   * Generates `numSamples` random alphanumeric strings, many of them empty.
+   *
+   * NOTE(review): the upper length bound is forced to 0 whenever the draw is NOT a multiple
+   * of 11, so the large majority of samples come out empty. Confirm whether `== 0` (mostly
+   * non-empty samples) was intended; the original condition is kept unchanged here.
+   */
+  def getRandomStringSamplesWithEmptyStrings(numSamples: Int, maxLength: Int): Array[String] = {
+    val randomLen = new Random
+    val randomEmpty = new Random
+    Array.fill(numSamples) {
+      val drawnLen = randomLen.nextInt(maxLength)
+      val maxLen = if (randomEmpty.nextInt() % 11 != 0) 0 else drawnLen
+      RandomStringUtils.randomAlphanumeric(0, maxLen)
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index f7100a53444aa..07e2849ce6f19 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -27,6 +27,7 @@ import org.apache.parquet.column.{Encoding, ParquetProperties} import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.TestUtils +import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf @@ -47,6 +48,13 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess null.asInstanceOf[Duration], null.asInstanceOf[java.lang.Boolean]) + private def withMemoryModes(f: String => Unit): Unit = { + Seq(MemoryMode.OFF_HEAP, MemoryMode.ON_HEAP).foreach(mode => { + val offHeap = if (mode == MemoryMode.OFF_HEAP) "true" else "false" + f(offHeap) + }) + } + test("All Types Dictionary") { (1 :: 1000 :: Nil).foreach { n => { withTempPath { dir => @@ -141,45 +149,54 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess ) val hadoopConf = spark.sessionState.newHadoopConfWithOptions(extraOptions) - withSQLConf( -
SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - ParquetOutputFormat.JOB_SUMMARY_LEVEL -> "ALL") { - withTempPath { dir => - val path = s"${dir.getCanonicalPath}/test.parquet" - - val data = (1 to 3).map { i => - ( i, i.toLong, i.toShort, Array[Byte](i.toByte), s"test_${i}", - DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + i)), - DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + i)), - Period.of(1, i, 0), Duration.ofMillis(i * 100), - new BigDecimal(java.lang.Long.toUnsignedString(i*100000)) - ) + withMemoryModes { offHeapMode => + withSQLConf( + SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED.key -> offHeapMode, + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + ParquetOutputFormat.JOB_SUMMARY_LEVEL -> "ALL") { + withTempPath { dir => + val path = s"${dir.getCanonicalPath}/test.parquet" + // Have more than 2 * 4096 records (so we have multiple tasks and each task + // reads at least twice from the reader). This will catch any issues with state + // maintained by the reader(s) + // Add at least one string with a null + val data = (1 to 8193).map { i => + (i, + i.toLong, i.toShort, Array[Byte](i.toByte), + if (i % 2 == 1) s"test_$i" else null, + DateTimeUtils.fromJavaDate(Date.valueOf(s"2021-11-0" + ((i % 9) + 1))), + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(s"2020-11-01 12:00:0" + (i % 10))), + Period.of(1, (i % 11) + 1, 0), + Duration.ofMillis(((i % 9) + 1) * 100), + new BigDecimal(java.lang.Long.toUnsignedString(i * 100000)) + ) + } + + spark.createDataFrame(data) + .write.options(extraOptions).mode("overwrite").parquet(path) + + val blockMetadata = readFooter(new Path(path), hadoopConf).getBlocks.asScala.head + val columnChunkMetadataList = blockMetadata.getColumns.asScala + + // Verify that indeed delta encoding is used for each column + assert(columnChunkMetadataList.length === 10) + assert(columnChunkMetadataList(0).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + 
assert(columnChunkMetadataList(1).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(2).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + // Both fixed-length byte array and variable-length byte array (also called BINARY) + // are use DELTA_BYTE_ARRAY for encoding + assert(columnChunkMetadataList(3).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) + assert(columnChunkMetadataList(4).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) + + assert(columnChunkMetadataList(5).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(6).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(7).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(8).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) + assert(columnChunkMetadataList(9).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) + + val actual = spark.read.parquet(path).collect() + assert(actual.sortBy(_.getInt(0)) === data.map(Row.fromTuple)); } - - spark.createDataFrame(data) - .write.options(extraOptions).mode("overwrite").parquet(path) - - val blockMetadata = readFooter(new Path(path), hadoopConf).getBlocks.asScala.head - val columnChunkMetadataList = blockMetadata.getColumns.asScala - - // Verify that indeed delta encoding is used for each column - assert(columnChunkMetadataList.length === 10) - assert(columnChunkMetadataList(0).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(1).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(2).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - // Both fixed-length byte array and variable-length byte array (also called BINARY) - // are use DELTA_BYTE_ARRAY for encoding - assert(columnChunkMetadataList(3).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) - assert(columnChunkMetadataList(4).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) - - 
assert(columnChunkMetadataList(5).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(6).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(7).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(8).getEncodings.contains(Encoding.DELTA_BINARY_PACKED)) - assert(columnChunkMetadataList(9).getEncodings.contains(Encoding.DELTA_BYTE_ARRAY)) - - val actual = spark.read.parquet(path).collect() - assert(actual.sortBy(_.getInt(0)) === data.map(Row.fromTuple)); } } }