diff --git a/pom.xml b/pom.xml index 8c303e9353f75..81a0126539b1d 100644 --- a/pom.xml +++ b/pom.xml @@ -300,6 +300,12 @@ false + + jitpack.io + https://jitpack.io + Jitpack.io repository + + diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 93256bc9e0143..2f82b1ce9c18f 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -274,7 +274,9 @@ object SparkBuild extends PomBuild { "gcs-maven-central-mirror" at "https://maven-central.storage-download.googleapis.com/maven2/", DefaultMavenRepository, Resolver.mavenLocal, - Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) + Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns), + // needed for brotli-codec + "jitpack.io" at "https://jitpack.io" ), externalResolvers := resolvers.value, otherResolvers := SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))).value, diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 73fa60c2173bc..d7e9cac744ed7 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -178,6 +178,12 @@ htmlunit-driver test + + com.github.rdblue + brotli-codec + 0.1.1 + test + target/scala-${scala.binary.version}/classes diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala new file mode 100644 index 0000000000000..92b887e948da9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} + +trait FileSourceCodecSuite extends QueryTest with SQLTestUtils with SharedSparkSession { + + protected def format: String + protected val codecConfigName: String + protected def availableCodecs: Seq[String] + + def testWithAllCodecs(name: String)(f: => Unit): Unit = { + for (codec <- availableCodecs) { + test(s"$name - file source $format - codec: $codec") { + withSQLConf(codecConfigName -> codec) { + f + } + } + } + } + + testWithAllCodecs("write and read") { + withTempPath { dir => + testData + .repartition(5) + .write + .format(format) + .save(dir.getCanonicalPath) + + val df = spark.read.format(format).load(dir.getCanonicalPath) + checkAnswer(df, testData) + } + } +} + +class ParquetCodecSuite extends FileSourceCodecSuite { + + override def format: String = "parquet" + override val codecConfigName: String = SQLConf.PARQUET_COMPRESSION.key + // Exclude "lzo" because it is GPL-licenced so not included in Hadoop. + // TODO(SPARK-36669): "lz4" codec fails due to HADOOP-17891. + override protected def availableCodecs: Seq[String] = + if (System.getProperty("os.arch") == "aarch64") { + // Exclude "brotli" due to PARQUET-1975. + Seq("none", "uncompressed", "snappy", "gzip", "zstd") + } else { + Seq("none", "uncompressed", "snappy", "gzip", "brotli", "zstd") + } +} + +class OrcCodecSuite extends FileSourceCodecSuite { + + override def format: String = "orc" + override val codecConfigName: String = SQLConf.ORC_COMPRESSION.key + override protected def availableCodecs = Seq("none", "uncompressed", "snappy", + "zlib", "zstd", "lz4", "lzo") +}