From 345da2e08802cffed3212194f2abd5c74db97a8b Mon Sep 17 00:00:00 2001
From: Yong
Date: Sun, 18 May 2025 22:29:53 -0500
Subject: [PATCH] switch to use iceberg-aws-bundle jar

---
 getting-started/eclipselink/docker-compose.yml | 2 +-
 getting-started/jdbc/docker-compose.yml | 2 +-
 getting-started/spark/notebooks/SparkPolaris.ipynb | 2 +-
 plugins/spark/README.md | 10 +++++-----
 .../v3.5/getting-started/notebooks/SparkPolaris.ipynb | 4 ++--
 regtests/setup.sh | 2 +-
 regtests/t_pyspark/src/iceberg_spark.py | 4 +---
 .../in-dev/unreleased/getting-started/using-polaris.md | 4 ++--
 site/content/in-dev/unreleased/polaris-spark-client.md | 4 ++--
 9 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/getting-started/eclipselink/docker-compose.yml b/getting-started/eclipselink/docker-compose.yml
index 68a682c622..1126dae95d 100644
--- a/getting-started/eclipselink/docker-compose.yml
+++ b/getting-started/eclipselink/docker-compose.yml
@@ -76,7 +76,7 @@ services:
       retries: 15
     command: [
       /opt/spark/bin/spark-sql,
-      --packages, "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,software.amazon.awssdk:bundle:2.28.17,software.amazon.awssdk:url-connection-client:2.28.17,org.apache.iceberg:iceberg-gcp-bundle:1.9.0,org.apache.iceberg:iceberg-azure-bundle:1.9.0",
+      --packages, "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.iceberg:iceberg-aws-bundle:1.9.0,org.apache.iceberg:iceberg-gcp-bundle:1.9.0,org.apache.iceberg:iceberg-azure-bundle:1.9.0",
       --conf, "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
       --conf, "spark.sql.catalog.quickstart_catalog=org.apache.iceberg.spark.SparkCatalog",
       --conf, "spark.sql.catalog.quickstart_catalog.type=rest",
diff --git a/getting-started/jdbc/docker-compose.yml b/getting-started/jdbc/docker-compose.yml
index fbfd427ee2..7429f3a706 100644
--- a/getting-started/jdbc/docker-compose.yml
+++ b/getting-started/jdbc/docker-compose.yml
@@ -76,7 +76,7 @@ services:
       retries: 15
     command: [
       /opt/spark/bin/spark-sql,
-      --packages, "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,software.amazon.awssdk:bundle:2.28.17,software.amazon.awssdk:url-connection-client:2.28.17,org.apache.iceberg:iceberg-gcp-bundle:1.9.0,org.apache.iceberg:iceberg-azure-bundle:1.9.0",
+      --packages, "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.iceberg:iceberg-aws-bundle:1.9.0,org.apache.iceberg:iceberg-gcp-bundle:1.9.0,org.apache.iceberg:iceberg-azure-bundle:1.9.0",
       --conf, "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
       --conf, "spark.sql.catalog.polaris=org.apache.iceberg.spark.SparkCatalog",
       --conf, "spark.sql.catalog.polaris.type=rest",
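Both docker-compose changes swap the separately pinned AWS SDK jars (`software.amazon.awssdk:bundle`, `software.amazon.awssdk:url-connection-client`) for the single `org.apache.iceberg:iceberg-aws-bundle` jar, which Iceberg versions in lockstep with the Spark runtime. For reference, a minimal PySpark sketch of the same REST-catalog setup, assuming the `quickstart_catalog` name from the eclipselink compose file; the endpoint and credential values are illustrative placeholders, not values from this patch:

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config(
        "spark.jars.packages",
        # The Iceberg-managed AWS bundle replaces the pinned AWS SDK jars.
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,"
        "org.apache.iceberg:iceberg-aws-bundle:1.9.0",
    )
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.quickstart_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.quickstart_catalog.type", "rest")
    .config("spark.sql.catalog.quickstart_catalog.uri", "http://localhost:8181/api/catalog")  # assumed endpoint
    .config("spark.sql.catalog.quickstart_catalog.credential", "<client-id>:<client-secret>")  # placeholder
    .getOrCreate()
)

spark.sql("SHOW NAMESPACES IN quickstart_catalog").show()
```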
diff --git a/getting-started/spark/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
index 42d56a87ea..b3e416e80e 100644
--- a/getting-started/spark/notebooks/SparkPolaris.ipynb
+++ b/getting-started/spark/notebooks/SparkPolaris.ipynb
@@ -256,7 +256,7 @@
     "\n",
     "spark = (SparkSession.builder\n",
     "  .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.iceberg.spark.SparkSessionCatalog\")\n",
-    "  .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.hadoop:hadoop-aws:3.4.0,software.amazon.awssdk:bundle:2.23.19,software.amazon.awssdk:url-connection-client:2.23.19\")\n",
+    "  .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.iceberg:iceberg-aws-bundle:1.9.0\")\n",
     "  .config('spark.sql.iceberg.vectorization.enabled', 'false')\n",
     "\n",
     "  # Configure the 'polaris' catalog as an Iceberg rest catalog\n",
diff --git a/plugins/spark/README.md b/plugins/spark/README.md
index 60978fe179..ae4a566cd9 100644
--- a/plugins/spark/README.md
+++ b/plugins/spark/README.md
@@ -31,7 +31,7 @@ and depends on iceberg-spark-runtime 1.9.0.
 # Build Plugin Jar
 A task createPolarisSparkJar is added to build a jar for the Polaris Spark plugin. The jar is named
 `polaris-iceberg-<iceberg-version>-spark-runtime-<spark-version>_<scala-version>-<polaris-version>.jar`. For example:
-`polaris-iceberg-1.8.1-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar`.
+`polaris-iceberg-1.9.0-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar`.

 - `./gradlew :polaris-spark-3.5_2.12:createPolarisSparkJar` -- builds the jar for Spark 3.5 with Scala 2.12.
 - `./gradlew :polaris-spark-3.5_2.13:createPolarisSparkJar` -- builds the jar for Spark 3.5 with Scala 2.13.
@@ -53,7 +53,7 @@ jar, and to use the local Polaris server as a Catalog.
 ```shell
 bin/spark-shell \
 --jars <path-to-spark-client-jar> \
---packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_2.12:3.3.1 \
+--packages org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1 \
 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension \
 --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
 --conf spark.sql.catalog.<catalog-name>.warehouse=<catalog-name> \
@@ -67,13 +67,13 @@ bin/spark-shell \
 ```

 Assume the path to the built Spark client jar is
-`/polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-iceberg-1.8.1-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar`
+`/polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-iceberg-1.9.0-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar`
 and the name of the catalog is `polaris`. The CLI command will look like the following:

 ```shell
 bin/spark-shell \
---jars /polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-iceberg-1.8.1-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar \
+--jars /polaris/plugins/spark/v3.5/spark/build/2.12/libs/polaris-iceberg-1.9.0-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar \
---packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_2.12:3.3.1 \
+--packages org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1 \
 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension \
 --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
 --conf spark.sql.catalog.polaris.warehouse=<warehouse-name> \
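The README's spark-shell command can also be expressed programmatically. A sketch as a PySpark session, reusing the jar path and the `polaris` catalog name from the example above; the Polaris REST endpoint is an illustrative assumption, not part of the patch:

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Local Polaris Spark client jar built by createPolarisSparkJar.
    .config("spark.jars", "/polaris/plugins/spark/v3.5/spark/build/2.12/libs/"
            "polaris-iceberg-1.9.0-spark-runtime-3.5_2.12-0.10.0-beta-incubating-SNAPSHOT.jar")
    .config("spark.jars.packages",
            "org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1")
    .config("spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
            "io.delta.sql.DeltaSparkSessionExtension")
    # Delta tables stay in the session catalog; Iceberg tables go through Polaris.
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalog.polaris", "org.apache.polaris.spark.SparkCatalog")
    .config("spark.sql.catalog.polaris.uri", "http://localhost:8181/api/catalog")  # assumed endpoint
    .getOrCreate()
)
```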
diff --git a/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb b/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb
index ad32424b8b..cd0b02c8a0 100644
--- a/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb
+++ b/plugins/spark/v3.5/getting-started/notebooks/SparkPolaris.ipynb
@@ -266,8 +266,8 @@
     "from pyspark.sql import SparkSession\n",
     "\n",
     "spark = (SparkSession.builder\n",
-    "  .config(\"spark.jars\", \"../polaris_libs/polaris-iceberg-1.8.1-spark-runtime-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar\")\n",
-    "  .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-spark_2.12:3.2.1\")\n",
+    "  .config(\"spark.jars\", \"../polaris_libs/polaris-iceberg-1.9.0-spark-runtime-3.5_2.12-0.11.0-beta-incubating-SNAPSHOT.jar\")\n",
+    "  .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.2.1\")\n",
     "  .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n",
     "  .config('spark.sql.iceberg.vectorization.enabled', 'false')\n",
     "\n",
diff --git a/regtests/setup.sh b/regtests/setup.sh
index b2d792fa1a..5dc89ca49f 100755
--- a/regtests/setup.sh
+++ b/regtests/setup.sh
@@ -114,7 +114,7 @@ else
   cat << EOF >> ${SPARK_CONF}

 # POLARIS_TESTCONF_V5
-spark.jars.packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:${ICEBERG_VERSION},org.apache.hadoop:hadoop-aws:3.4.0,software.amazon.awssdk:bundle:2.23.19,software.amazon.awssdk:url-connection-client:2.23.19
+spark.jars.packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:${ICEBERG_VERSION},org.apache.iceberg:iceberg-aws-bundle:${ICEBERG_VERSION}
 spark.hadoop.fs.s3.impl org.apache.hadoop.fs.s3a.S3AFileSystem
 spark.hadoop.fs.AbstractFileSystem.s3.impl org.apache.hadoop.fs.s3a.S3A
 spark.sql.variable.substitute true
diff --git a/regtests/t_pyspark/src/iceberg_spark.py b/regtests/t_pyspark/src/iceberg_spark.py
index 897180d5ff..8cb20591fc 100644
--- a/regtests/t_pyspark/src/iceberg_spark.py
+++ b/regtests/t_pyspark/src/iceberg_spark.py
@@ -75,9 +75,7 @@ def __enter__(self):
     """
     packages = [
       "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0",
-      "org.apache.hadoop:hadoop-aws:3.4.0",
-      "software.amazon.awssdk:bundle:2.23.19",
-      "software.amazon.awssdk:url-connection-client:2.23.19",
+      "org.apache.iceberg:iceberg-aws-bundle:1.9.0",
     ]
     excludes = ["org.checkerframework:checker-qual", "com.google.errorprone:error_prone_annotations"]

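In `iceberg_spark.py`, the `packages` list ultimately feeds `spark.jars.packages`, which Spark expects as a single comma-separated string (as does `spark.jars.excludes`). A minimal sketch of that wiring; the list contents mirror the diff, while the session construction itself is an assumption for illustration:

```python
from pyspark.sql import SparkSession

# Package coordinates as in the updated iceberg_spark.py.
packages = [
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0",
    "org.apache.iceberg:iceberg-aws-bundle:1.9.0",
]
# Transitive dependencies excluded from resolution, as in the diff context.
excludes = [
    "org.checkerframework:checker-qual",
    "com.google.errorprone:error_prone_annotations",
]

spark = (
    SparkSession.builder
    # Spark takes both settings as comma-separated strings.
    .config("spark.jars.packages", ",".join(packages))
    .config("spark.jars.excludes", ",".join(excludes))
    .getOrCreate()
)
```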
diff --git a/site/content/in-dev/unreleased/getting-started/using-polaris.md b/site/content/in-dev/unreleased/getting-started/using-polaris.md
index d452a3522a..7713b149cb 100644
--- a/site/content/in-dev/unreleased/getting-started/using-polaris.md
+++ b/site/content/in-dev/unreleased/getting-started/using-polaris.md
@@ -154,7 +154,7 @@ _Note: the credentials provided here are those for our principal, not the root credentials._

 ```shell
 bin/spark-sql \
---packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.hadoop:hadoop-aws:3.4.0 \
+--packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.iceberg:iceberg-aws-bundle:1.9.0 \
 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
 --conf spark.sql.catalog.quickstart_catalog.warehouse=quickstart_catalog \
 --conf spark.sql.catalog.quickstart_catalog.header.X-Iceberg-Access-Delegation=vended-credentials \
@@ -170,7 +170,7 @@ bin/spark-sql \
 ```

 Similar to the CLI commands above, this configures Spark to use the Polaris server running at `localhost:8181`. If your Polaris server is running elsewhere, be sure to update the configuration appropriately.

-Finally, note that we include the `hadoop-aws` package here. If your table is using a different filesystem, be sure to include the appropriate dependency.
+Finally, note that we include the `iceberg-aws-bundle` package here. If your table is using a different filesystem, be sure to include the appropriate dependency.

 #### Using Spark SQL from a Docker container
diff --git a/site/content/in-dev/unreleased/polaris-spark-client.md b/site/content/in-dev/unreleased/polaris-spark-client.md
index 9466020a41..712082e273 100644
--- a/site/content/in-dev/unreleased/polaris-spark-client.md
+++ b/site/content/in-dev/unreleased/polaris-spark-client.md
@@ -60,7 +60,7 @@ a released Polaris Spark client.

 ```shell
 bin/spark-shell \
---packages <polaris-spark-client-package>,org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_2.12:3.3.1 \
+--packages <polaris-spark-client-package>,org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1 \
 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension \
 --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
 --conf spark.sql.catalog.<catalog-name>.warehouse=<catalog-name> \
@@ -88,7 +88,7 @@ You can also start the connection by programmatically initializing a SparkSession,
 from pyspark.sql import SparkSession

 spark = SparkSession.builder
-  .config("spark.jars.packages", "<polaris-spark-client-package>,org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-spark_2.12:3.3.1")
+  .config("spark.jars.packages", "<polaris-spark-client-package>,org.apache.iceberg:iceberg-aws-bundle:1.9.0,io.delta:delta-spark_2.12:3.3.1")
   .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
   .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension")
   .config("spark.sql.catalog.<catalog-name>", "org.apache.polaris.spark.SparkCatalog")
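With any of the client configurations above in place, a short smoke test confirms that the REST catalog resolves and that the `iceberg-aws-bundle` FileIO can write and read table data. A sketch, assuming a session already configured with a Polaris catalog named `polaris` as in the examples above; the namespace and table names are illustrative only:

```python
from pyspark.sql import SparkSession

# Assumes a session configured as in the examples above.
spark = SparkSession.builder.getOrCreate()

# Round-trip a row through the Polaris-managed catalog.
spark.sql("CREATE NAMESPACE IF NOT EXISTS polaris.smoke")
spark.sql("CREATE TABLE IF NOT EXISTS polaris.smoke.t (id BIGINT) USING iceberg")
spark.sql("INSERT INTO polaris.smoke.t VALUES (1)")
spark.sql("SELECT * FROM polaris.smoke.t").show()
```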