From 020334ee7ac8120fdfc12261945de0b248c5f245 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Tue, 10 Jun 2025 20:26:29 -0700 Subject: [PATCH 01/17] fix spark client --- plugins/spark/v3.5/spark/build.gradle.kts | 4 +++- .../in-dev/unreleased/polaris-spark-client.md | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index a2a54e26be..7c57c1a994 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -113,7 +113,7 @@ dependencies { } tasks.register("createPolarisSparkJar") { - archiveClassifier = "bundle" + archiveClassifier = null isZip64 = true // include the LICENSE and NOTICE files for the shadow Jar @@ -140,3 +140,5 @@ tasks.register("createPolarisSparkJar") { } tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } + +tasks.named("jar") { archiveClassifier.set("testJar") } diff --git a/site/content/in-dev/unreleased/polaris-spark-client.md b/site/content/in-dev/unreleased/polaris-spark-client.md index 4ceb536a9c..65ef3a5bb8 100644 --- a/site/content/in-dev/unreleased/polaris-spark-client.md +++ b/site/content/in-dev/unreleased/polaris-spark-client.md @@ -128,3 +128,21 @@ The Polaris Spark client has the following functionality limitations: 3) Rename a Delta table is not supported. 4) ALTER TABLE ... SET LOCATION is not supported for DELTA table. 5) For other non-Iceberg tables like csv, it is not supported. + +## Iceberg Spark Runtime Client compatibility with Polaris Spark Client +The Polaris Spark Client today is not designed to be used with Iceberg Spark Runtime client together. In other words, +there is no guarantee provided if both `org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:` and +`org.apache.polaris:polaris-spark-3.5_2.12:` are provided for the Spark `package` configuration. + +In order to help the usage of IcebergExtension use the same client, Polaris Spark Client also ships a version of +Iceberg Spark Runtime with it, the version information is described in the following table: + +| Spark Client Version | Iceberg Spark Runtime Version | +|----------------------|-------------------------------| +| 1.0.0 | 1.9.0 | + +## Trouble Shooting +1. When starting spark, sometimes it complains it failed to download a package, for example, + `download failed: org.apache.commons#commons-compress;1.26.2!commons-compress.jar`. If that happens, you can explicitly + specify the package in the `package` configuration, for example, `--package org.apache.commons:commons-compress:1.27.1` + From 33e3fdee87decbea92921a7253fd8db2467971e3 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 11 Jun 2025 14:05:07 -0700 Subject: [PATCH 02/17] fix test failure and address feedback --- .../v3.5/getting-started/notebooks/SparkPolaris.ipynb | 2 +- plugins/spark/v3.5/regtests/run.sh | 9 +++++++-- plugins/spark/v3.5/spark/build.gradle.kts | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb b/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb index f4459b0714..2d4d17b956 100644 --- a/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb +++ b/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb @@ -266,7 +266,7 @@ "from pyspark.sql import SparkSession\n", "\n", "spark = (SparkSession.builder\n", - " .config(\"spark.jars\", \"../polaris_libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar\")\n", + " .config(\"spark.jars\", \"../polaris_libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar\")\n", " .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.2.1\")\n", " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n", " .config('spark.sql.iceberg.vectorization.enabled', 'false')\n", diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh index acf5d1a906..216b95386b 100755 --- a/plugins/spark/v3.5/regtests/run.sh +++ b/plugins/spark/v3.5/regtests/run.sh @@ -70,9 +70,14 @@ SPARK_VERSION="3.5.5" for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}" - # find the project jar SPARK_DIR=${SPARK_ROOT_DIR}/spark - JAR_PATH=$(find ${SPARK_DIR} -name "polaris-spark-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.*-bundle.jar" -print -quit) + POLARIS_ROOT_DIR=$(dirname $(dirname $(dirname $(dirname ${SPARK_DIR})))) + echo "POLARIS ROOT DIR=${POLARIS_ROOT_DIR}" + # read the current polaris version + read -r POLARIS_VERISON < ${POLARIS_ROOT_DIR}/version.txt + echo "POLARIS VERSION=${POLARIS_VERISON}" + # find the spark client jar + JAR_PATH=$(find ${SPARK_DIR} -name "polaris-spark-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-${POLARIS_VERISON}.jar" -print -quit) echo "find jar ${JAR_PATH}" SPARK_EXISTS="TRUE" diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 7c57c1a994..a95f091b6d 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -141,4 +141,4 @@ tasks.register("createPolarisSparkJar") { tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } -tasks.named("jar") { archiveClassifier.set("testJar") } +tasks.named("jar") { archiveClassifier.set("defaultJar") } From dbb23e1b8c4f3f4a1da48a6e459d11ed7f966758 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 11 Jun 2025 14:58:30 -0700 Subject: [PATCH 03/17] fix error --- plugins/spark/v3.5/spark/build.gradle.kts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index a95f091b6d..fd8cfcd525 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -139,6 +139,12 @@ tasks.register("createPolarisSparkJar") { relocate("org.apache.avro", "org.apache.polaris.shaded.org.apache.avro") } +tasks.named("jar") { isEnabled = false } + tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } -tasks.named("jar") { archiveClassifier.set("defaultJar") } +tasks.named("test") { + dependsOn("sourcesJar") +} + +// tasks.named("jar") { archiveClassifier.set("defaultJar") } From 855697634408c0e112cdaafad65a2aade75d5c47 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 11 Jun 2025 16:08:54 -0700 Subject: [PATCH 04/17] update regression test --- plugins/spark/v3.5/regtests/run.sh | 7 +------ plugins/spark/v3.5/spark/build.gradle.kts | 10 ++-------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh index 216b95386b..447f8c83c6 100755 --- a/plugins/spark/v3.5/regtests/run.sh +++ b/plugins/spark/v3.5/regtests/run.sh @@ -71,13 +71,8 @@ SPARK_VERSION="3.5.5" for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}" SPARK_DIR=${SPARK_ROOT_DIR}/spark - POLARIS_ROOT_DIR=$(dirname $(dirname $(dirname $(dirname ${SPARK_DIR})))) - echo "POLARIS ROOT DIR=${POLARIS_ROOT_DIR}" - # read the current polaris version - read -r POLARIS_VERISON < ${POLARIS_ROOT_DIR}/version.txt - echo "POLARIS VERSION=${POLARIS_VERISON}" # find the spark client jar - JAR_PATH=$(find ${SPARK_DIR} -name "polaris-spark-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-${POLARIS_VERISON}.jar" -print -quit) + JAR_PATH=$(find ${SPARK_DIR} -name "polaris-spark-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.*-SNAPSHOT.jar" -print -quit) echo "find jar ${JAR_PATH}" SPARK_EXISTS="TRUE" diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index fd8cfcd525..240b162cf2 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -135,16 +135,10 @@ tasks.register("createPolarisSparkJar") { exclude(dependency("org.apache.avro:avro*.*")) } - relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml.jackson") + relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml") relocate("org.apache.avro", "org.apache.polaris.shaded.org.apache.avro") } -tasks.named("jar") { isEnabled = false } - tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } -tasks.named("test") { - dependsOn("sourcesJar") -} - -// tasks.named("jar") { archiveClassifier.set("defaultJar") } +tasks.named("jar") { archiveClassifier.set("defaultJar") } From 028cc36bd7f392e7f9b62da0dbafad3307a27927 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 11 Jun 2025 16:18:55 -0700 Subject: [PATCH 05/17] update classifier name --- plugins/spark/v3.5/spark/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 240b162cf2..01ce71d3dd 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -141,4 +141,4 @@ tasks.register("createPolarisSparkJar") { tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } -tasks.named("jar") { archiveClassifier.set("defaultJar") } +tasks.named("jar") { archiveClassifier.set("internal") } From 4c8a9fb2d9a38f2dfc3ec686ebb76cad9a7ffeaf Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 11 Jun 2025 16:30:02 -0700 Subject: [PATCH 06/17] address comment --- plugins/spark/v3.5/spark/build.gradle.kts | 9 +++++++-- .../in-dev/unreleased/polaris-spark-client.md | 14 +++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 01ce71d3dd..972e526066 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -113,7 +113,6 @@ dependencies { } tasks.register("createPolarisSparkJar") { - archiveClassifier = null isZip64 = true // include the LICENSE and NOTICE files for the shadow Jar @@ -141,4 +140,10 @@ tasks.register("createPolarisSparkJar") { tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } -tasks.named("jar") { archiveClassifier.set("internal") } +tasks.named("jar") { + // retain the default jar job, and add a classifier to avoid conflict + // with the createPolarisSparkJar. This jar is needed by the task "test", + // which can not be switched to depends on createPolarisSparkJar due to + // relocation of com.fasterxml. + archiveClassifier.set("internal") +} diff --git a/site/content/in-dev/unreleased/polaris-spark-client.md b/site/content/in-dev/unreleased/polaris-spark-client.md index 65ef3a5bb8..2c41c0cfd9 100644 --- a/site/content/in-dev/unreleased/polaris-spark-client.md +++ b/site/content/in-dev/unreleased/polaris-spark-client.md @@ -129,17 +129,17 @@ The Polaris Spark client has the following functionality limitations: 4) ALTER TABLE ... SET LOCATION is not supported for DELTA table. 5) For other non-Iceberg tables like csv, it is not supported. -## Iceberg Spark Runtime Client compatibility with Polaris Spark Client -The Polaris Spark Client today is not designed to be used with Iceberg Spark Runtime client together. In other words, +## Iceberg Spark Client compatibility with Polaris Spark Client +The Polaris Spark client today is not designed to be used with Iceberg Spark client together. In other words, there is no guarantee provided if both `org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:` and `org.apache.polaris:polaris-spark-3.5_2.12:` are provided for the Spark `package` configuration. -In order to help the usage of IcebergExtension use the same client, Polaris Spark Client also ships a version of -Iceberg Spark Runtime with it, the version information is described in the following table: +In order to help the usage of IcebergExtension use the same client, Polaris Spark client also ships a version of +Iceberg Spark client with it, the version information is described in the following table: -| Spark Client Version | Iceberg Spark Runtime Version | -|----------------------|-------------------------------| -| 1.0.0 | 1.9.0 | +| Spark Client Version | Iceberg Spark Client Version | +|----------------------|------------------------------| +| 1.0.0 | 1.9.0 | ## Trouble Shooting 1. When starting spark, sometimes it complains it failed to download a package, for example, From f020db3609beed992f4f754f22de1524df435ad5 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 12 Jun 2025 10:46:21 -0700 Subject: [PATCH 07/17] add change --- .../publishing/PublishingHelperPlugin.kt | 5 ----- plugins/spark/v3.5/spark/build.gradle.kts | 18 ++++++------------ 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/build-logic/src/main/kotlin/publishing/PublishingHelperPlugin.kt b/build-logic/src/main/kotlin/publishing/PublishingHelperPlugin.kt index d4d412a30f..04b04225e7 100644 --- a/build-logic/src/main/kotlin/publishing/PublishingHelperPlugin.kt +++ b/build-logic/src/main/kotlin/publishing/PublishingHelperPlugin.kt @@ -133,11 +133,6 @@ constructor(private val softwareComponentFactory: SoftwareComponentFactory) : Pl suppressPomMetadataWarningsFor("testFixturesApiElements") suppressPomMetadataWarningsFor("testFixturesRuntimeElements") - - if (project.tasks.findByName("createPolarisSparkJar") != null) { - // if the project contains spark client jar, also publish the jar to maven - artifact(project.tasks.named("createPolarisSparkJar").get()) - } } if ( diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 972e526066..e3e6aa00f8 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -19,7 +19,10 @@ import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar -plugins { id("polaris-client") } +plugins { + id("polaris-client") + id("com.gradleup.shadow") +} // get version information val sparkMajorVersion = "3.5" @@ -112,7 +115,8 @@ dependencies { } } -tasks.register("createPolarisSparkJar") { +tasks.named("shadowJar") { + archiveClassifier = null isZip64 = true // include the LICENSE and NOTICE files for the shadow Jar @@ -137,13 +141,3 @@ tasks.register("createPolarisSparkJar") { relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml") relocate("org.apache.avro", "org.apache.polaris.shaded.org.apache.avro") } - -tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } - -tasks.named("jar") { - // retain the default jar job, and add a classifier to avoid conflict - // with the createPolarisSparkJar. This jar is needed by the task "test", - // which can not be switched to depends on createPolarisSparkJar due to - // relocation of com.fasterxml. - archiveClassifier.set("internal") -} From 747b6f0ad531519b29fa06a1d633858f86d06a3b Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 12 Jun 2025 10:49:51 -0700 Subject: [PATCH 08/17] update doc --- plugins/spark/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/spark/README.md b/plugins/spark/README.md index c7d6bc876b..d200a0681c 100644 --- a/plugins/spark/README.md +++ b/plugins/spark/README.md @@ -29,12 +29,12 @@ Right now, the plugin only provides support for Spark 3.5, Scala version 2.12 an and depends on iceberg-spark-runtime 1.9.0. # Build Plugin Jar -A task createPolarisSparkJar is added to build a jar for the Polaris Spark plugin, the jar is named as: +A shadowJar task is added to build a jar for the Polaris Spark plugin, the jar is named as: `polaris-spark-_--bundle.jar`. For example: `polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar`. -- `./gradlew :polaris-spark-3.5_2.12:createPolarisSparkJar` -- build jar for Spark 3.5 with Scala version 2.12. -- `./gradlew :polaris-spark-3.5_2.13:createPolarisSparkJar` -- build jar for Spark 3.5 with Scala version 2.13. +- `./gradlew :polaris-spark-3.5_2.12:shadowJar` -- build jar for Spark 3.5 with Scala version 2.12. +- `./gradlew :polaris-spark-3.5_2.13:shadowJar` -- build jar for Spark 3.5 with Scala version 2.13. The result jar is located at plugins/spark/v3.5/build//libs after the build. From df336148959595d78cabe717a279f88f98abaf9b Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 12 Jun 2025 11:23:58 -0700 Subject: [PATCH 09/17] update build and readme --- plugins/spark/README.md | 4 ++-- plugins/spark/v3.5/spark/build.gradle.kts | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/plugins/spark/README.md b/plugins/spark/README.md index d200a0681c..d4efb405c4 100644 --- a/plugins/spark/README.md +++ b/plugins/spark/README.md @@ -67,12 +67,12 @@ bin/spark-shell \ ``` Assume the path to the built Spark client jar is -`/polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar` +`/polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar` and the name of the catalog is `polaris`. The cli command will look like following: ```shell bin/spark-shell \ ---jars /polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar \ +--jars /polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar \ --packages org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1 \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension \ --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \ diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index e3e6aa00f8..cb0ee9e4b0 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -141,3 +141,5 @@ tasks.named("shadowJar") { relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml") relocate("org.apache.avro", "org.apache.polaris.shaded.org.apache.avro") } + +tasks.withType(Jar::class).named("sourcesJar") { dependsOn("shadowJar") } From c0115317f15ab0b28f783446fdd78a1f0e3b7b59 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 12 Jun 2025 11:47:45 -0700 Subject: [PATCH 10/17] add back jr --- plugins/spark/v3.5/spark/build.gradle.kts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index cb0ee9e4b0..443caa4714 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -143,3 +143,11 @@ tasks.named("shadowJar") { } tasks.withType(Jar::class).named("sourcesJar") { dependsOn("shadowJar") } + +tasks.named("jar") { + // retain the default jar job, and add a classifier to avoid conflict + // with the createPolarisSparkJar. This jar is needed by the task "test", + // which can not be switched to depends on createPolarisSparkJar due to + // relocation of com.fasterxml. + archiveClassifier.set("internal") +} From 7aa6a26958c39a7b786a5cede4545d31d45eb19d Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 12 Jun 2025 12:12:43 -0700 Subject: [PATCH 11/17] udpate dependency --- plugins/spark/v3.5/spark/build.gradle.kts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 443caa4714..c770274e75 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -142,7 +142,10 @@ tasks.named("shadowJar") { relocate("org.apache.avro", "org.apache.polaris.shaded.org.apache.avro") } -tasks.withType(Jar::class).named("sourcesJar") { dependsOn("shadowJar") } +// ensure the shadowJar job is run for both `assemble` and `build` task +tasks.named("assemble") { dependsOn("shadowJar") } + +tasks.named("build") { dependsOn("shadowJar") } tasks.named("jar") { // retain the default jar job, and add a classifier to avoid conflict From 5979e2bcda603d3e56ea87a6a844059200acda99 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 13 Jun 2025 10:13:33 -0700 Subject: [PATCH 12/17] add change --- plugins/spark/v3.5/spark/build.gradle.kts | 6 ++++-- site/content/in-dev/unreleased/polaris-spark-client.md | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index c770274e75..f82bdcca9a 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -129,6 +129,8 @@ tasks.named("shadowJar") { from(sourceSets.main.get().output) configurations = listOf(project.configurations.runtimeClasspath.get()) + mergeServiceFiles() + // Optimization: Minimize the JAR (remove unused classes from dependencies) // The iceberg-spark-runtime plugin is always packaged along with our polaris-spark plugin, // therefore excluded from the optimization. @@ -149,8 +151,8 @@ tasks.named("build") { dependsOn("shadowJar") } tasks.named("jar") { // retain the default jar job, and add a classifier to avoid conflict - // with the createPolarisSparkJar. This jar is needed by the task "test", - // which can not be switched to depends on createPolarisSparkJar due to + // with the shadowJar task. This jar is needed by the task "test", + // which can not be switched to depends on shadow Jar task due to // relocation of com.fasterxml. archiveClassifier.set("internal") } diff --git a/site/content/in-dev/unreleased/polaris-spark-client.md b/site/content/in-dev/unreleased/polaris-spark-client.md index 2c41c0cfd9..a4321c15a9 100644 --- a/site/content/in-dev/unreleased/polaris-spark-client.md +++ b/site/content/in-dev/unreleased/polaris-spark-client.md @@ -134,15 +134,16 @@ The Polaris Spark client today is not designed to be used with Iceberg Spark cli there is no guarantee provided if both `org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:` and `org.apache.polaris:polaris-spark-3.5_2.12:` are provided for the Spark `package` configuration. -In order to help the usage of IcebergExtension use the same client, Polaris Spark client also ships a version of -Iceberg Spark client with it, the version information is described in the following table: +However, Polaris Spark client ships a version of Iceberg Spark client along with it, and IcebergSessionExtension +can be used without adding the Iceberg Spark client to the package. +The version information is described in the following table: | Spark Client Version | Iceberg Spark Client Version | |----------------------|------------------------------| | 1.0.0 | 1.9.0 | -## Trouble Shooting -1. When starting spark, sometimes it complains it failed to download a package, for example, +## Troubleshooting +1. When starting Spark, sometimes it complains it failed to download a package, for example, `download failed: org.apache.commons#commons-compress;1.26.2!commons-compress.jar`. If that happens, you can explicitly specify the package in the `package` configuration, for example, `--package org.apache.commons:commons-compress:1.27.1` From a1f892c4428f077f058ff7a7d7314e3321c4db86 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Mon, 16 Jun 2025 12:21:53 -0700 Subject: [PATCH 13/17] update --- plugins/spark/v3.5/spark/build.gradle.kts | 10 +--------- .../in-dev/unreleased/polaris-spark-client.md | 16 ++++------------ 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index f82bdcca9a..e4b51d4b11 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -116,7 +116,7 @@ dependencies { } tasks.named("shadowJar") { - archiveClassifier = null + archiveClassifier = "bundle" isZip64 = true // include the LICENSE and NOTICE files for the shadow Jar @@ -148,11 +148,3 @@ tasks.named("shadowJar") { tasks.named("assemble") { dependsOn("shadowJar") } tasks.named("build") { dependsOn("shadowJar") } - -tasks.named("jar") { - // retain the default jar job, and add a classifier to avoid conflict - // with the shadowJar task. This jar is needed by the task "test", - // which can not be switched to depends on shadow Jar task due to - // relocation of com.fasterxml. - archiveClassifier.set("internal") -} diff --git a/site/content/in-dev/unreleased/polaris-spark-client.md b/site/content/in-dev/unreleased/polaris-spark-client.md index a4321c15a9..a34bceeced 100644 --- a/site/content/in-dev/unreleased/polaris-spark-client.md +++ b/site/content/in-dev/unreleased/polaris-spark-client.md @@ -130,20 +130,12 @@ The Polaris Spark client has the following functionality limitations: 5) For other non-Iceberg tables like csv, it is not supported. ## Iceberg Spark Client compatibility with Polaris Spark Client -The Polaris Spark client today is not designed to be used with Iceberg Spark client together. In other words, -there is no guarantee provided if both `org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:` and -`org.apache.polaris:polaris-spark-3.5_2.12:` are provided for the Spark `package` configuration. - -However, Polaris Spark client ships a version of Iceberg Spark client along with it, and IcebergSessionExtension -can be used without adding the Iceberg Spark client to the package. -The version information is described in the following table: +The Polaris Spark client today depends on a specific Iceberg client version, and the version dependency is described +in the following table: | Spark Client Version | Iceberg Spark Client Version | |----------------------|------------------------------| | 1.0.0 | 1.9.0 | -## Troubleshooting -1. When starting Spark, sometimes it complains it failed to download a package, for example, - `download failed: org.apache.commons#commons-compress;1.26.2!commons-compress.jar`. If that happens, you can explicitly - specify the package in the `package` configuration, for example, `--package org.apache.commons:commons-compress:1.27.1` - +The Iceberg dependency is automatically downloaded when the Polaris package is downloaded, so there is no need to +add the Iceberg Spark client in the `packages` configuration. From cd2a94e0a763a2d7110801f0fdd9e21a25e727c3 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Mon, 16 Jun 2025 14:30:49 -0700 Subject: [PATCH 14/17] update tests --- plugins/spark/README.md | 6 +++--- .../spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb | 2 +- plugins/spark/v3.5/regtests/run.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/spark/README.md b/plugins/spark/README.md index d4efb405c4..361cf163b1 100644 --- a/plugins/spark/README.md +++ b/plugins/spark/README.md @@ -42,7 +42,7 @@ The result jar is located at plugins/spark/v3.5/build//libs after Once the jar is built, we can manually test it with Spark and a local Polaris service. The following command starts a Polaris server for local testing, it runs on localhost:8181 with default -realm `POLARIS` and root credentials `root:secret`: +realm `POLARIS` and root credentials `root:secret`:./gr ```shell ./gradlew run ``` @@ -67,12 +67,12 @@ bin/spark-shell \ ``` Assume the path to the built Spark client jar is -`/polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar` +`/polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar` and the name of the catalog is `polaris`. The cli command will look like following: ```shell bin/spark-shell \ ---jars /polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar \ +--jars /polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar \ --packages org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1 \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension \ --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \ diff --git a/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb b/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb index 2d4d17b956..f4459b0714 100644 --- a/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb +++ b/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb @@ -266,7 +266,7 @@ "from pyspark.sql import SparkSession\n", "\n", "spark = (SparkSession.builder\n", - " .config(\"spark.jars\", \"../polaris_libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar\")\n", + " .config(\"spark.jars\", \"../polaris_libs/polaris-spark-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT-bundle.jar\")\n", " .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.2.1\")\n", " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n", " .config('spark.sql.iceberg.vectorization.enabled', 'false')\n", diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh index 447f8c83c6..acf5d1a906 100755 --- a/plugins/spark/v3.5/regtests/run.sh +++ b/plugins/spark/v3.5/regtests/run.sh @@ -70,9 +70,9 @@ SPARK_VERSION="3.5.5" for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}" + # find the project jar SPARK_DIR=${SPARK_ROOT_DIR}/spark - # find the spark client jar - JAR_PATH=$(find ${SPARK_DIR} -name "polaris-spark-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.*-SNAPSHOT.jar" -print -quit) + JAR_PATH=$(find ${SPARK_DIR} -name "polaris-spark-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.*-bundle.jar" -print -quit) echo "find jar ${JAR_PATH}" SPARK_EXISTS="TRUE" From 1fb7ccd47ff8a9d16db6f6dc025388f5000d2481 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Mon, 16 Jun 2025 14:41:30 -0700 Subject: [PATCH 15/17] remove merge service file --- plugins/spark/v3.5/spark/build.gradle.kts | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index e4b51d4b11..c328bb23e5 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -129,8 +129,6 @@ tasks.named("shadowJar") { from(sourceSets.main.get().output) configurations = listOf(project.configurations.runtimeClasspath.get()) - mergeServiceFiles() - // Optimization: Minimize the JAR (remove unused classes from dependencies) // The iceberg-spark-runtime plugin is always packaged along with our polaris-spark plugin, // therefore excluded from the optimization. From 841bcc400d1552e4494bc4550a7b26295f22b50f Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 18 Jun 2025 11:29:50 -0700 Subject: [PATCH 16/17] update readme --- plugins/spark/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/plugins/spark/README.md b/plugins/spark/README.md index 361cf163b1..f2e8a69346 100644 --- a/plugins/spark/README.md +++ b/plugins/spark/README.md @@ -38,11 +38,13 @@ A shadowJar task is added to build a jar for the Polaris Spark plugin, the jar i The result jar is located at plugins/spark/v3.5/build//libs after the build. +The shadowJar task is also executed automatically when you run the `gradlew assemble` or `gradlew build`. + # Start Spark with Local Polaris Service using built Jar Once the jar is built, we can manually test it with Spark and a local Polaris service. The following command starts a Polaris server for local testing, it runs on localhost:8181 with default -realm `POLARIS` and root credentials `root:secret`:./gr +realm `POLARIS` and root credentials `root:secret`: ```shell ./gradlew run ``` From 5ad378f1455c20e791b06c89b7d441c3bd7bf697 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 18 Jun 2025 11:33:06 -0700 Subject: [PATCH 17/17] update readme --- plugins/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/spark/README.md b/plugins/spark/README.md index f2e8a69346..3f4acc31c4 100644 --- a/plugins/spark/README.md +++ b/plugins/spark/README.md @@ -38,7 +38,7 @@ A shadowJar task is added to build a jar for the Polaris Spark plugin, the jar i The result jar is located at plugins/spark/v3.5/build//libs after the build. -The shadowJar task is also executed automatically when you run the `gradlew assemble` or `gradlew build`. +The shadowJar task is also executed automatically when you run `gradlew assemble` or `gradlew build`. # Start Spark with Local Polaris Service using built Jar Once the jar is built, we can manually test it with Spark and a local Polaris service.