From 243b79a9b905599848a1d1034a74cb5fd80bd091 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Tue, 15 Apr 2025 23:36:06 -0700 Subject: [PATCH 01/25] add regtest setup --- plugins/spark/regtests/run.sh | 23 ++++ plugins/spark/regtests/setup.sh | 156 ++++++++++++++++++++++ plugins/spark/v3.5/spark/build.gradle.kts | 7 +- 3 files changed, 184 insertions(+), 2 deletions(-) create mode 100755 plugins/spark/regtests/run.sh create mode 100755 plugins/spark/regtests/setup.sh diff --git a/plugins/spark/regtests/run.sh b/plugins/spark/regtests/run.sh new file mode 100755 index 0000000000..2fdc375f0e --- /dev/null +++ b/plugins/spark/regtests/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Run without args to run all tests, or single arg for single test. +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + + diff --git a/plugins/spark/regtests/setup.sh b/plugins/spark/regtests/setup.sh new file mode 100755 index 0000000000..abb79c1e52 --- /dev/null +++ b/plugins/spark/regtests/setup.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Idempotent setup for regression tests. Run manually or let run.sh auto-run. 
+# +# Warning - first time setup may download large amounts of files +# Warning - may clobber conf/spark-defaults.conf + +set -x + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +SPARK_VERSION=3.5.5 +SCALA_VERSION=2.12 +POLARIS_CLIENT_JAR="" +while [[ $# -gt 0 ]]; do + case "$1" in + --sparkVersion) + SPARK_VERSION="$2" + shift # past argument + shift # past value + ;; + --scalaVersion) + SCALA_VERSION="$2" + shift # past argument + shift # past value + ;; + --jar) + POLARIS_CLIENT_JAR="$2" + shift # past argument + shift # past value + ;; + --) shift; + break + ;; + esac +done + +echo "SET UP FOR SPARK_VERSION=${SPARK_VERSION} SCALA_VERSION=${SCALA_VERSION}" + +if [ "$SCALA_VERSION" == "2.12" ]; then + SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3 +else + SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3-scala${SCALA_VERSION} +fi + +echo "Getting spark distribution ${SPARK_DISTRIBUTION}" + +TEST_ROOT_DIR="spark-client-tests" +mkdir ~/${TEST_ROOT_DIR} +SPARK_HOME=$(realpath ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}) +SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf" +DERBY_HOME="/tmp/derby" +echo "SPARK_HOME=${SPARK_HOME}" +echo "SPARK_CONF=${SPARK_CONF}" +export PYTHONPATH="${SPARK_HOME}/python/:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" + +# Ensure binaries are downloaded locally +echo 'Verifying Spark binaries...' +if ! [ -f ${SPARK_HOME}/bin/spark-sql ]; then + echo 'Setting up Spark...' + if [ -z "${SPARK_VERSION}" ] || [ -z "${SPARK_DISTRIBUTION}" ]; then + echo 'SPARK_VERSION or SPARK_DISTRIBUTION not set. Please set SPARK_VERSION and SPARK_DISTRIBUTION to the desired version.' + exit 1 + fi + if ! [ -f ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz ]; then + echo 'Downloading spark distro...' + wget -O ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz + if ! [ -f ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz ]; then + if [[ "${OSTYPE}" == "darwin"* ]]; then + echo "Detected OS: mac. Running 'brew install wget' to try again." + brew install wget + wget -O ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz + fi + fi + else + echo 'Found existing Spark tarball' + fi + # check if the download was successful + if ! [ -f ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz ]; then + echo 'Failed to download Spark distribution. Please check the logs.' + exit 1 + fi + tar xzvf ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz -C ~/${TEST_ROOT_DIR} + if [ $? -ne 0 ]; then + echo 'Failed to extract Spark distribution. Please check the logs.' + exit 1 + else + echo 'Extracted Spark distribution.' + rm ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz + fi + SPARK_HOME=$(realpath ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}) + SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf" +else + echo 'Verified Spark distro already installed.' +fi + +echo "SPARK_HOME=${SPARK_HOME}" +echo "SPARK_CONF=${SPARK_CONF}" + +# Ensure Spark boilerplate conf is set +echo 'Verifying Spark conf...' +if grep 'POLARIS_TESTCONF_V5' ${SPARK_CONF} 2>/dev/null; then + echo 'Verified spark conf' +else + echo 'Setting spark conf...' + # Instead of clobbering existing spark conf, just comment it all out in case it was customized carefully. 
+ sed -i 's/^/# /' ${SPARK_CONF} +cat << EOF >> ${SPARK_CONF} + +# POLARIS Spark client test conf +spark.jars $POLARIS_CLIENT_JAR +spark.jars.packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_2.12:3.2.1 +spark.hadoop.fs.s3.impl org.apache.hadoop.fs.s3a.S3AFileSystem +spark.hadoop.fs.AbstractFileSystem.s3.impl org.apache.hadoop.fs.s3a.S3A +spark.sql.variable.substitute true + +spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME} + +spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension +spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog +spark.sql.catalog.polaris.type=rest +spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog +spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials +spark.sql.catalog.polaris.client.region=us-west-2 +spark.sql.sources.useV1SourceList='' +EOF + echo 'Success!' +fi + +# cleanup derby home if existed +if [ -d "${DERBY_HOME}" ]; then + echo "Directory ${DERBY_HOME} exists. Deleting it..." + rm -rf "${DERBY_HOME}" +fi + +echo "Launch spark-sql at ${SPARK_HOME}/bin/spark-sql" +# bootstrap dependencies so that future queries don't need to wait for the downloads. +# this is mostly useful for building the Docker image with all needed dependencies +${SPARK_HOME}/bin/spark-sql -e "SELECT 1" diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index ddf27ce1f9..87d6348f65 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -18,6 +18,7 @@ */ import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar +import com.github.jengelman.gradle.plugins.shadow.transformers.ServiceFileTransformer plugins { id("polaris-client") @@ -122,13 +123,12 @@ tasks.register("checkNoDisallowedImports") { tasks.named("check") { dependsOn("checkNoDisallowedImports") } tasks.register("createPolarisSparkJar") { + archiveClassifier = null archiveBaseName = "polaris-iceberg-${icebergVersion}-spark-runtime-${sparkMajorVersion}_${scalaVersion}" isZip64 = true - mergeServiceFiles() - // pack both the source code and dependencies from(sourceSets.main.get().output) @@ -143,6 +143,9 @@ tasks.register("createPolarisSparkJar") { } relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml.jackson") + + mergeServiceFiles() + exclude("META-INF/*.RSA", "META-INF/*.DSA", "META-INF/*.SF") } tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } From 8f9d3acf59e0a479e89b3e30af0d23ad16eb0ea9 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 17 Apr 2025 10:22:56 -0700 Subject: [PATCH 02/25] add change --- .github/workflows/spark_client_regtests.yml | 61 +++++++++++ build.gradle.kts | 2 + plugins/spark/regtests/Dockerfile | 48 +++++++++ plugins/spark/regtests/README.md | 89 ++++++++++++++++ plugins/spark/regtests/docker-compose.yml | 73 ++++++++++++++ plugins/spark/regtests/run.sh | 106 +++++++++++++++++++- plugins/spark/regtests/setup.sh | 31 +++--- plugins/spark/regtests/spark_sql.ref | 17 ++++ plugins/spark/regtests/spark_sql.sh | 54 ++++++++++ plugins/spark/v3.5/spark/build.gradle.kts | 28 +++++- 10 files changed, 487 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/spark_client_regtests.yml create mode 100644 plugins/spark/regtests/Dockerfile create mode 100755 plugins/spark/regtests/README.md create mode 100644 plugins/spark/regtests/docker-compose.yml create mode 100755 
plugins/spark/regtests/spark_sql.ref create mode 100755 plugins/spark/regtests/spark_sql.sh diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml new file mode 100644 index 0000000000..35bd2c2860 --- /dev/null +++ b/.github/workflows/spark_client_regtests.yml @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: Regression Tests +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + regtest: + + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - uses: actions/checkout@v4 + + - name: Set up JDK 21 + uses: actions/setup-java@v4 + with: + java-version: '21' + distribution: 'temurin' + + - name: Fix permissions + run: mkdir -p regtests/output && chmod 777 regtests/output && chmod 777 regtests/t_*/ref/* + + - name: Project build + run: ./gradlew build + + - name: Image build + run: | + ./gradlew \ + :polaris-quarkus-server:assemble \ + :polaris-quarkus-server:quarkusAppPartsBuild --rerun \ + -Dquarkus.container-image.build=true + + - name: Regression Test + env: + AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}} + AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} + run: | + docker compose -f plugins/spark/regtests/docker-compose.yml up --build --exit-code-from regtest \ No newline at end of file diff --git a/build.gradle.kts b/build.gradle.kts index e39abe385d..feb6e368ca 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -125,6 +125,8 @@ tasks.named("rat").configure { excludes.add("**/kotlin-compiler*") excludes.add("**/build-logic/.kotlin/**") + + excludes.add("plugins/**/*.ref") } // Pass environment variables: diff --git a/plugins/spark/regtests/Dockerfile b/plugins/spark/regtests/Dockerfile new file mode 100644 index 0000000000..c0149aae5b --- /dev/null +++ b/plugins/spark/regtests/Dockerfile @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +FROM docker.io/apache/spark:3.5.5-java17 +ARG POLARIS_HOST=polaris +ENV POLARIS_HOST=$POLARIS_HOST +ENV SPARK_HOME=/opt/spark +ENV CURRENT_SCALA_VERSIN='2.12' +ENV LANGUAGE='en_US:en' + +USER root +RUN apt update +RUN apt-get install -y diffutils wget curl +RUN mkdir -p /home/spark && \ + chown -R spark /home/spark && \ + mkdir -p /tmp/polaris-regtests && \ + chown -R spark /tmp/polaris-regtests +RUN mkdir /opt/spark/conf && chmod -R 777 /opt/spark/conf + +USER spark + +WORKDIR /home/spark/polaris + +COPY --chown=spark ./spark /home/spark/polaris/spark + +# /home/spark/regtests might not be writable in all situations, see https://github.com/apache/polaris/pull/205 +USER root +RUN chmod -R go+rwx /home/spark/polaris +RUN chmod -R 777 ./spark/regtests +USER spark + +ENTRYPOINT ["./spark/regtests/run.sh"] diff --git a/plugins/spark/regtests/README.md b/plugins/spark/regtests/README.md new file mode 100755 index 0000000000..2c60254ed0 --- /dev/null +++ b/plugins/spark/regtests/README.md @@ -0,0 +1,89 @@ + + +# End-to-end regression tests + +regtests provides basic end-to-end tests to customer abo + +Regression tests are either run in Docker, using docker-compose to orchestrate the tests, or +locally. + +## Prerequisites + +It is recommended to clean the `regtests/output` directory before running tests. This can be done by +running: + +```shell +rm -rf ./regtests/output && mkdir -p ./regtests/output && chmod -R 777 ./regtests/output +``` + +## Run Tests With Docker Compose + +Tests can be run with docker-compose using the provided `./regtests/docker-compose.yml` file, as +follows: + +```shell +./gradlew build +./gradlew \ + :polaris-quarkus-server:assemble \ + :polaris-quarkus-server:quarkusAppPartsBuild --rerun \ + -Dquarkus.container-image.build=true +docker compose -f ./plugins/spark/regtests/docker-compose.yml up --build --exit-code-from regtest +``` + +In this setup, a Polaris container will be started in a docker-compose group, using the image +previously built by the Gradle build. Then another container, including a Spark SQL shell, will run +the tests. The exit code will be the same as the exit code of the Spark container. + +This is the flow used in CI and should be done locally before pushing to GitHub to ensure that no +environmental factors contribute to the outcome of the tests. + +**Important**: if you are also using minikube, for example to test the Helm chart, you may need to +_unset_ the Docker environment that was pointing to the Minikube Docker daemon, otherwise the image +will be built by the Minikube Docker daemon and will not be available to the local Docker daemon. +This can be done by running, _before_ building the image and running the tests: + +```shell +eval $(minikube -p minikube docker-env --unset) +``` + +## Run Tests Locally + +Regression tests can be run locally as well, using the test harness. + +In this setup, a Polaris server must be running on localhost:8181 before running tests. The simplest +way to do this is to run the Polaris server in a separate terminal window: + +```shell +./gradlew run +``` + +Note: the regression tests expect Polaris to run with certain options, e.g. with support for `FILE` +storage, default realm `POLARIS` and root credentials `root:secret`; if you run the above command, +this will be the case. If you run Polaris in a different way, make sure that Polaris is configured +appropriately. + +Running the test harness will automatically run the idempotent setup script. 
From the root of the +project, just run: + +```shell +env POLARIS_HOST=localhost ./regtests/run.sh +``` \ No newline at end of file diff --git a/plugins/spark/regtests/docker-compose.yml b/plugins/spark/regtests/docker-compose.yml new file mode 100644 index 0000000000..332ed4922c --- /dev/null +++ b/plugins/spark/regtests/docker-compose.yml @@ -0,0 +1,73 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +services: + polaris: + image: apache/polaris:latest + ports: + - "8181" + - "8182" + environment: + AWS_REGION: us-west-2 + AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY + GOOGLE_APPLICATION_CREDENTIALS: $GOOGLE_APPLICATION_CREDENTIALS + AZURE_TENANT_ID: $AZURE_TENANT_ID + AZURE_CLIENT_ID: $AZURE_CLIENT_ID + AZURE_CLIENT_SECRET: $AZURE_CLIENT_SECRET + POLARIS_BOOTSTRAP_CREDENTIALS: POLARIS,root,secret + quarkus.log.file.enable: "false" + quarkus.otel.sdk.disabled: "true" + volumes: + - ./credentials:/tmp/credentials/ + healthcheck: + test: ["CMD", "curl", "http://localhost:8182/q/health"] + interval: 10s + timeout: 10s + retries: 5 + regtest: + build: + context: ../.. + dockerfile: spark/regtests/Dockerfile + args: + POLARIS_HOST: polaris + depends_on: + polaris: + condition: service_healthy + environment: + AWS_TEST_ENABLED: $AWS_TEST_ENABLED + AWS_STORAGE_BUCKET: $AWS_STORAGE_BUCKET + AWS_ROLE_ARN: $AWS_ROLE_ARN + AWS_TEST_BASE: $AWS_TEST_BASE + GCS_TEST_ENABLED: $GCS_TEST_ENABLED + GCS_TEST_BASE: $GCS_TEST_BASE + GOOGLE_APPLICATION_CREDENTIALS: $GOOGLE_APPLICATION_CREDENTIALS + AZURE_TEST_ENABLED: $AZURE_TEST_ENABLED + AZURE_TENANT_ID: $AZURE_TENANT_ID + AZURE_DFS_TEST_BASE: $AZURE_DFS_TEST_BASE + AZURE_BLOB_TEST_BASE: $AZURE_BLOB_TEST_BASE + AZURE_CLIENT_ID: $AZURE_CLIENT_ID + AZURE_CLIENT_SECRET: $AZURE_CLIENT_SECRET + AWS_CROSS_REGION_TEST_ENABLED: $AWS_CROSS_REGION_TEST_ENABLED + AWS_CROSS_REGION_BUCKET: $AWS_CROSS_REGION_BUCKET + AWS_ROLE_FOR_CROSS_REGION_BUCKET: $AWS_ROLE_FOR_CROSS_REGION_BUCKET + AWS_REGION_FOR_CROSS_REGION_TEST: $AWS_REGION_FOR_CROSS_REGION_TEST + volumes: + - ./output:/tmp/polaris-regtests/ + - ./credentials:/tmp/credentials/ diff --git a/plugins/spark/regtests/run.sh b/plugins/spark/regtests/run.sh index 2fdc375f0e..a08aaceba2 100755 --- a/plugins/spark/regtests/run.sh +++ b/plugins/spark/regtests/run.sh @@ -17,7 +17,111 @@ # specific language governing permissions and limitations # under the License. # -# Run without args to run all tests, or single arg for single test. +# Run without args to run all tests. 
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SPARK_ROOT_DIR=$(dirname ${SCRIPT_DIR}) +export SPARK_LOCAL_HOSTNAME=localhost # avoid VPN messing up driver local IP address binding +FMT_RED='\033[0;31m' +FMT_GREEN='\033[0;32m' +FMT_NC='\033[0m' +function loginfo() { + echo "$(date): ${@}" +} +function loggreen() { + echo -e "${FMT_GREEN}$(date): ${@}${FMT_NC}" +} +function logred() { + echo -e "${FMT_RED}$(date): ${@}${FMT_NC}" +} + +export AWS_ACCESS_KEY_ID='' +export AWS_SECRET_ACCESS_KEY='' + +# Allow bearer token to be provided if desired +if [[ -z "$REGTEST_ROOT_BEARER_TOKEN" ]]; then + if ! output=$(curl -X POST -H "Polaris-Realm: POLARIS" "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/oauth/tokens" \ + -d "grant_type=client_credentials" \ + -d "client_id=root" \ + -d "client_secret=secret" \ + -d "scope=PRINCIPAL_ROLE:ALL"); then + logred "Error: Failed to retrieve bearer token" + exit 1 + fi + + token=$(echo "$output" | awk -F\" '{print $4}') + + if [ "$token" == "unauthorized_client" ]; then + logred "Error: Failed to retrieve bearer token" + exit 1 + fi + + export REGTEST_ROOT_BEARER_TOKEN=$token +fi + +echo "Root bearer token: ${REGTEST_ROOT_BEARER_TOKEN}" + +SPARK_VERSION_ITEMS=("3.5 3.5.5") +SCALA_VERSIONS=("2.12" "2.13") +if [[ -n "$CURRENT_SCALA_VERSION" ]]; then + SCALA_VERSIONS=("${CURRENT_SCALA_VERSION}") +fi + +NUM_FAILURES=0 + +for SPARK_VERSION_ITEM in "${SPARK_VERSION_ITEMS[@]}"; do + set -- $SPARK_VERSION_ITEM + SPARK_MAJOR_VERSION=$1 + SPARK_VERSION=$2 + for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do + echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}" + # find the project jar + SPARK_DIR=${SPARK_ROOT_DIR}/v${SPARK_MAJOR_VERSION} + JAR_PATH=$(find ${SPARK_DIR} -name "polaris-iceberg-*.*-spark-runtime-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.jar" -print -quit) + echo "find jar ${JAR_PATH}" + + source ${SCRIPT_DIR}/setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH} + + # run the spark_sql test + loginfo "Starting test spark_sql.sh" + + # export SPARK_HOME=/Users/yzou/spark-client-tests/spark-3.5.5-bin-hadoop3 + TEST_FILE="spark_sql.sh" + TEST_SHORTNAME="spark_sql" + TEST_TMPDIR="/tmp/polaris-spark-regtests/${TEST_SHORTNAME}_${SPARK_MAJOR_VERSION}_${SCALA_VERSION}" + TEST_STDERR="${TEST_TMPDIR}/${TEST_SHORTNAME}.stderr" + TEST_STDOUT="${TEST_TMPDIR}/${TEST_SHORTNAME}.stdout" + + mkdir -p ${TEST_TMPDIR} + if (( ${VERBOSE} )); then + ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' | tee ${TEST_STDOUT} + else + ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' > ${TEST_STDOUT} + fi + loginfo "Test run concluded for ${TEST_SUITE}:${TEST_SHORTNAME}" + + TEST_REF="$(realpath ${SCRIPT_DIR})/${TEST_SHORTNAME}.ref" + if cmp --silent ${TEST_STDOUT} ${TEST_REF}; then + loggreen "Test SUCCEEDED: ${TEST_SUITE}:${TEST_SHORTNAME}" + else + logred "Test FAILED: ${TEST_SUITE}:${TEST_SHORTNAME}" + echo '#!/bin/bash' > ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh + echo "meld ${TEST_STDOUT} ${TEST_REF}" >> ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh + chmod 750 ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh + logred "To compare and fix diffs (if 'meld' installed): ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh" + logred "Or manually diff: diff ${TEST_STDOUT} ${TEST_REF}" + logred "See stderr from test run for additional diagnostics: ${TEST_STDERR}" + diff ${TEST_STDOUT} 
${TEST_REF} + NUM_FAILURES=$(( NUM_FAILURES + 1 )) + fi + export SPARK_HOME="" + done +done + +loginfo "Tests completed with ${NUM_FAILURES} failures" +if (( ${NUM_FAILURES} > 0 )); then + exit 1 +else + exit 0 +fi diff --git a/plugins/spark/regtests/setup.sh b/plugins/spark/regtests/setup.sh index abb79c1e52..c30de952d5 100755 --- a/plugins/spark/regtests/setup.sh +++ b/plugins/spark/regtests/setup.sh @@ -52,7 +52,7 @@ while [[ $# -gt 0 ]]; do esac done -echo "SET UP FOR SPARK_VERSION=${SPARK_VERSION} SCALA_VERSION=${SCALA_VERSION}" +echo "SET UP FOR SPARK_VERSION=${SPARK_VERSION} SCALA_VERSION=${SCALA_VERSION} POLARIS_CLIENT_JAR=${POLARIS_CLIENT_JAR}" if [ "$SCALA_VERSION" == "2.12" ]; then SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3 @@ -62,14 +62,11 @@ fi echo "Getting spark distribution ${SPARK_DISTRIBUTION}" -TEST_ROOT_DIR="spark-client-tests" -mkdir ~/${TEST_ROOT_DIR} -SPARK_HOME=$(realpath ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}) +if [ -z "${SPARK_HOME}" ]; then + SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION}) +fi SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf" DERBY_HOME="/tmp/derby" -echo "SPARK_HOME=${SPARK_HOME}" -echo "SPARK_CONF=${SPARK_CONF}" -export PYTHONPATH="${SPARK_HOME}/python/:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" # Ensure binaries are downloaded locally echo 'Verifying Spark binaries...' @@ -79,33 +76,33 @@ if ! [ -f ${SPARK_HOME}/bin/spark-sql ]; then echo 'SPARK_VERSION or SPARK_DISTRIBUTION not set. Please set SPARK_VERSION and SPARK_DISTRIBUTION to the desired version.' exit 1 fi - if ! [ -f ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz ]; then + if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then echo 'Downloading spark distro...' - wget -O ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz - if ! [ -f ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz ]; then + wget -O ~/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz + if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then if [[ "${OSTYPE}" == "darwin"* ]]; then echo "Detected OS: mac. Running 'brew install wget' to try again." brew install wget - wget -O ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz + wget -O ~/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz fi fi else echo 'Found existing Spark tarball' fi # check if the download was successful - if ! [ -f ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz ]; then + if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then echo 'Failed to download Spark distribution. Please check the logs.' exit 1 fi - tar xzvf ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz -C ~/${TEST_ROOT_DIR} + tar xzvf ~/${SPARK_DISTRIBUTION}.tgz -C ~/${TEST_ROOT_DIR} if [ $? -ne 0 ]; then echo 'Failed to extract Spark distribution. Please check the logs.' exit 1 else echo 'Extracted Spark distribution.' - rm ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}.tgz + rm ~/${SPARK_DISTRIBUTION}.tgz fi - SPARK_HOME=$(realpath ~/${TEST_ROOT_DIR}/${SPARK_DISTRIBUTION}) + SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION}) SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf" else echo 'Verified Spark distro already installed.' 
@@ -126,7 +123,7 @@ cat << EOF >> ${SPARK_CONF} # POLARIS Spark client test conf spark.jars $POLARIS_CLIENT_JAR -spark.jars.packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_2.12:3.2.1 +spark.jars.packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_${SCALA_VERSION}:3.2.1 spark.hadoop.fs.s3.impl org.apache.hadoop.fs.s3a.S3AFileSystem spark.hadoop.fs.AbstractFileSystem.s3.impl org.apache.hadoop.fs.s3a.S3A spark.sql.variable.substitute true @@ -154,3 +151,5 @@ echo "Launch spark-sql at ${SPARK_HOME}/bin/spark-sql" # bootstrap dependencies so that future queries don't need to wait for the downloads. # this is mostly useful for building the Docker image with all needed dependencies ${SPARK_HOME}/bin/spark-sql -e "SELECT 1" + +export SPARK_HOME=$SPARK_HOME diff --git a/plugins/spark/regtests/spark_sql.ref b/plugins/spark/regtests/spark_sql.ref new file mode 100755 index 0000000000..2a2a6203de --- /dev/null +++ b/plugins/spark/regtests/spark_sql.ref @@ -0,0 +1,17 @@ +{"defaults":{"default-base-location":"file:///tmp/spark_sql_s3_catalog"},"overrides":{"prefix":"spark_sql_catalog"},"endpoints":["GET /v1/{prefix}/namespaces","GET /v1/{prefix}/namespaces/{namespace}","HEAD /v1/{prefix}/namespaces/{namespace}","POST /v1/{prefix}/namespaces","POST /v1/{prefix}/namespaces/{namespace}/properties","DELETE /v1/{prefix}/namespaces/{namespace}","GET /v1/{prefix}/namespaces/{namespace}/tables","GET /v1/{prefix}/namespaces/{namespace}/tables/{table}","HEAD /v1/{prefix}/namespaces/{namespace}/tables/{table}","POST /v1/{prefix}/namespaces/{namespace}/tables","POST /v1/{prefix}/namespaces/{namespace}/tables/{table}","DELETE /v1/{prefix}/namespaces/{namespace}/tables/{table}","POST /v1/{prefix}/tables/rename","POST /v1/{prefix}/namespaces/{namespace}/register","POST /v1/{prefix}/namespaces/{namespace}/tables/{table}/metrics","POST /v1/{prefix}/transactions/commit","GET /v1/{prefix}/namespaces/{namespace}/views","GET /v1/{prefix}/namespaces/{namespace}/views/{view}","HEAD /v1/{prefix}/namespaces/{namespace}/views/{view}","POST /v1/{prefix}/namespaces/{namespace}/views","POST /v1/{prefix}/namespaces/{namespace}/views/{view}","DELETE /v1/{prefix}/namespaces/{namespace}/views/{view}","POST /v1/{prefix}/views/rename","POST /v1/{prefix}/transactions/commit","GET polaris/v1/{prefix}/namespaces/{namespace}/generic-tables","POST polaris/v1/{prefix}/namespaces/{namespace}/generic-tables","DELETE polaris/v1/{prefix}/namespaces/{namespace}/generic-tables/{generic-table}","GET polaris/v1/{prefix}/namespaces/{namespace}/generic-tables/{generic-table}"]} +Catalog created +spark-sql (default)> use polaris; +spark-sql ()> create namespace db1; +spark-sql ()> create namespace db2; +spark-sql ()> show namespaces; +db1 +db2 +spark-sql ()> + > create namespace db1.schema1; +spark-sql ()> show namespaces in db1; +db1.schema1 +spark-sql ()> + > drop namespace db1.schema1; +spark-sql ()> drop namespace db1; +spark-sql ()> drop namespace db2; +spark-sql ()> diff --git a/plugins/spark/regtests/spark_sql.sh b/plugins/spark/regtests/spark_sql.sh new file mode 100755 index 0000000000..3e761f912e --- /dev/null +++ b/plugins/spark/regtests/spark_sql.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN}" + +# echo "CURRENT SPARK HOME ${SPARK_HOME}" +CATALOG_NAME="spark_sql_catalog" +curl -i -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ + http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \ + -d '{"name": "spark_sql_catalog", "id": 100, "type": "INTERNAL", "readOnly": false, "properties": {"default-base-location": "file:///tmp/spark_sql_s3_catalog"}, "storageConfigInfo": {"storageType": "FILE", "allowedLocations": ["file:///tmp"]}}' > /dev/stderr + +# Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata +curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ + http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${CATALOG_NAME}/catalog-roles/catalog_admin/grants \ + -d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr + +curl -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ + "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/config?warehouse=${CATALOG_NAME}" +echo +echo "Catalog created" +cat << EOF | ${SPARK_HOME}/bin/spark-sql -S --conf spark.sql.catalog.polaris.token="${SPARK_BEARER_TOKEN}" --conf spark.sql.catalog.polaris.warehouse=${CATALOG_NAME} +use polaris; +create namespace db1; +create namespace db2; +show namespaces; + +create namespace db1.schema1; +show namespaces in db1; + +drop namespace db1.schema1; +drop namespace db1; +drop namespace db2; +EOF + +curl -i -X DELETE -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ + http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${CATALOG_NAME} > /dev/stderr diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 87d6348f65..133632edb9 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -18,11 +18,10 @@ */ import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar -import com.github.jengelman.gradle.plugins.shadow.transformers.ServiceFileTransformer plugins { id("polaris-client") - alias(libs.plugins.jandex) + // alias(libs.plugins.jandex) } // get version information @@ -39,13 +38,33 @@ val scalaLibraryVersion = } dependencies { + // TODO: Extract a polaris-rest module as a thin layer for + // client to depends on. 
implementation(project(":polaris-api-iceberg-service")) { // exclude the iceberg dependencies, use the ones pulled // by iceberg-core exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.micrometer", "*") + } + implementation(project(":polaris-api-catalog-service")) { + exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.micrometer", "*") + } + implementation(project(":polaris-core")) { + exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") } - implementation(project(":polaris-api-catalog-service")) - implementation(project(":polaris-core")) { exclude("org.apache.iceberg", "*") } implementation("org.apache.iceberg:iceberg-core:${icebergVersion}") @@ -123,7 +142,6 @@ tasks.named("check") { dependsOn("checkNoDisallowedImports") } tasks.register("createPolarisSparkJar") { - archiveClassifier = null archiveBaseName = "polaris-iceberg-${icebergVersion}-spark-runtime-${sparkMajorVersion}_${scalaVersion}" From 177605c3988a537c588288395ba742d7fecbe83d Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 17 Apr 2025 14:26:18 -0700 Subject: [PATCH 03/25] fix credentials --- plugins/spark/regtests/Dockerfile | 2 +- plugins/spark/regtests/README.md | 2 +- plugins/spark/regtests/credentials/.keep | 0 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 plugins/spark/regtests/credentials/.keep diff --git a/plugins/spark/regtests/Dockerfile b/plugins/spark/regtests/Dockerfile index c0149aae5b..a01d999897 100644 --- a/plugins/spark/regtests/Dockerfile +++ b/plugins/spark/regtests/Dockerfile @@ -21,7 +21,7 @@ FROM docker.io/apache/spark:3.5.5-java17 ARG POLARIS_HOST=polaris ENV POLARIS_HOST=$POLARIS_HOST ENV SPARK_HOME=/opt/spark -ENV CURRENT_SCALA_VERSIN='2.12' +ENV CURRENT_SCALA_VERSION='2.12' ENV LANGUAGE='en_US:en' USER root diff --git a/plugins/spark/regtests/README.md b/plugins/spark/regtests/README.md index 2c60254ed0..a00c4d4679 100755 --- a/plugins/spark/regtests/README.md +++ b/plugins/spark/regtests/README.md @@ -32,7 +32,7 @@ It is recommended to clean the `regtests/output` directory before running tests. 
running: ```shell -rm -rf ./regtests/output && mkdir -p ./regtests/output && chmod -R 777 ./regtests/output +rm -rf ./plugins/spark/regtests/output && mkdir -p ./plugins/spark/regtests/output && chmod -R 777 ./plugins/spark/regtests/output ``` ## Run Tests With Docker Compose diff --git a/plugins/spark/regtests/credentials/.keep b/plugins/spark/regtests/credentials/.keep new file mode 100644 index 0000000000..e69de29bb2 From d7e469e7b92a6a8eb7b4c2be33a7cfd8ebd4808d Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Thu, 17 Apr 2025 18:51:51 -0700 Subject: [PATCH 04/25] add docker --- plugins/spark/regtests/run.sh | 127 ------------------ plugins/spark/{ => v3.5}/regtests/Dockerfile | 4 +- plugins/spark/{ => v3.5}/regtests/README.md | 0 .../{ => v3.5}/regtests/credentials/.keep | 0 .../{ => v3.5}/regtests/docker-compose.yml | 2 +- plugins/spark/v3.5/regtests/run.sh | 123 +++++++++++++++++ plugins/spark/{ => v3.5}/regtests/setup.sh | 0 .../spark/{ => v3.5}/regtests/spark_sql.ref | 0 .../spark/{ => v3.5}/regtests/spark_sql.sh | 5 + 9 files changed, 131 insertions(+), 130 deletions(-) delete mode 100755 plugins/spark/regtests/run.sh rename plugins/spark/{ => v3.5}/regtests/Dockerfile (94%) rename plugins/spark/{ => v3.5}/regtests/README.md (100%) rename plugins/spark/{ => v3.5}/regtests/credentials/.keep (100%) rename plugins/spark/{ => v3.5}/regtests/docker-compose.yml (98%) create mode 100755 plugins/spark/v3.5/regtests/run.sh rename plugins/spark/{ => v3.5}/regtests/setup.sh (100%) rename plugins/spark/{ => v3.5}/regtests/spark_sql.ref (100%) rename plugins/spark/{ => v3.5}/regtests/spark_sql.sh (95%) diff --git a/plugins/spark/regtests/run.sh b/plugins/spark/regtests/run.sh deleted file mode 100755 index a08aaceba2..0000000000 --- a/plugins/spark/regtests/run.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Run without args to run all tests. -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -SPARK_ROOT_DIR=$(dirname ${SCRIPT_DIR}) -export SPARK_LOCAL_HOSTNAME=localhost # avoid VPN messing up driver local IP address binding - -FMT_RED='\033[0;31m' -FMT_GREEN='\033[0;32m' -FMT_NC='\033[0m' - -function loginfo() { - echo "$(date): ${@}" -} -function loggreen() { - echo -e "${FMT_GREEN}$(date): ${@}${FMT_NC}" -} -function logred() { - echo -e "${FMT_RED}$(date): ${@}${FMT_NC}" -} - -export AWS_ACCESS_KEY_ID='' -export AWS_SECRET_ACCESS_KEY='' - -# Allow bearer token to be provided if desired -if [[ -z "$REGTEST_ROOT_BEARER_TOKEN" ]]; then - if ! 
output=$(curl -X POST -H "Polaris-Realm: POLARIS" "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/oauth/tokens" \ - -d "grant_type=client_credentials" \ - -d "client_id=root" \ - -d "client_secret=secret" \ - -d "scope=PRINCIPAL_ROLE:ALL"); then - logred "Error: Failed to retrieve bearer token" - exit 1 - fi - - token=$(echo "$output" | awk -F\" '{print $4}') - - if [ "$token" == "unauthorized_client" ]; then - logred "Error: Failed to retrieve bearer token" - exit 1 - fi - - export REGTEST_ROOT_BEARER_TOKEN=$token -fi - -echo "Root bearer token: ${REGTEST_ROOT_BEARER_TOKEN}" - -SPARK_VERSION_ITEMS=("3.5 3.5.5") -SCALA_VERSIONS=("2.12" "2.13") -if [[ -n "$CURRENT_SCALA_VERSION" ]]; then - SCALA_VERSIONS=("${CURRENT_SCALA_VERSION}") -fi - -NUM_FAILURES=0 - -for SPARK_VERSION_ITEM in "${SPARK_VERSION_ITEMS[@]}"; do - set -- $SPARK_VERSION_ITEM - SPARK_MAJOR_VERSION=$1 - SPARK_VERSION=$2 - for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do - echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}" - # find the project jar - SPARK_DIR=${SPARK_ROOT_DIR}/v${SPARK_MAJOR_VERSION} - JAR_PATH=$(find ${SPARK_DIR} -name "polaris-iceberg-*.*-spark-runtime-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.jar" -print -quit) - echo "find jar ${JAR_PATH}" - - source ${SCRIPT_DIR}/setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH} - - # run the spark_sql test - loginfo "Starting test spark_sql.sh" - - # export SPARK_HOME=/Users/yzou/spark-client-tests/spark-3.5.5-bin-hadoop3 - TEST_FILE="spark_sql.sh" - TEST_SHORTNAME="spark_sql" - TEST_TMPDIR="/tmp/polaris-spark-regtests/${TEST_SHORTNAME}_${SPARK_MAJOR_VERSION}_${SCALA_VERSION}" - TEST_STDERR="${TEST_TMPDIR}/${TEST_SHORTNAME}.stderr" - TEST_STDOUT="${TEST_TMPDIR}/${TEST_SHORTNAME}.stdout" - - mkdir -p ${TEST_TMPDIR} - if (( ${VERBOSE} )); then - ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' | tee ${TEST_STDOUT} - else - ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' > ${TEST_STDOUT} - fi - loginfo "Test run concluded for ${TEST_SUITE}:${TEST_SHORTNAME}" - - TEST_REF="$(realpath ${SCRIPT_DIR})/${TEST_SHORTNAME}.ref" - if cmp --silent ${TEST_STDOUT} ${TEST_REF}; then - loggreen "Test SUCCEEDED: ${TEST_SUITE}:${TEST_SHORTNAME}" - else - logred "Test FAILED: ${TEST_SUITE}:${TEST_SHORTNAME}" - echo '#!/bin/bash' > ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh - echo "meld ${TEST_STDOUT} ${TEST_REF}" >> ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh - chmod 750 ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh - logred "To compare and fix diffs (if 'meld' installed): ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh" - logred "Or manually diff: diff ${TEST_STDOUT} ${TEST_REF}" - logred "See stderr from test run for additional diagnostics: ${TEST_STDERR}" - diff ${TEST_STDOUT} ${TEST_REF} - NUM_FAILURES=$(( NUM_FAILURES + 1 )) - fi - export SPARK_HOME="" - done -done - -loginfo "Tests completed with ${NUM_FAILURES} failures" -if (( ${NUM_FAILURES} > 0 )); then - exit 1 -else - exit 0 -fi diff --git a/plugins/spark/regtests/Dockerfile b/plugins/spark/v3.5/regtests/Dockerfile similarity index 94% rename from plugins/spark/regtests/Dockerfile rename to plugins/spark/v3.5/regtests/Dockerfile index a01d999897..a5a8cf84ac 100644 --- a/plugins/spark/regtests/Dockerfile +++ b/plugins/spark/v3.5/regtests/Dockerfile @@ -37,7 +37,7 @@ USER spark WORKDIR /home/spark/polaris -COPY --chown=spark ./spark 
/home/spark/polaris/spark +COPY --chown=spark ./v3.5 /home/spark/polaris/v3.5 # /home/spark/regtests might not be writable in all situations, see https://github.com/apache/polaris/pull/205 USER root @@ -45,4 +45,4 @@ RUN chmod -R go+rwx /home/spark/polaris RUN chmod -R 777 ./spark/regtests USER spark -ENTRYPOINT ["./spark/regtests/run.sh"] +ENTRYPOINT ["./v3.5/regtests/run.sh"] diff --git a/plugins/spark/regtests/README.md b/plugins/spark/v3.5/regtests/README.md similarity index 100% rename from plugins/spark/regtests/README.md rename to plugins/spark/v3.5/regtests/README.md diff --git a/plugins/spark/regtests/credentials/.keep b/plugins/spark/v3.5/regtests/credentials/.keep similarity index 100% rename from plugins/spark/regtests/credentials/.keep rename to plugins/spark/v3.5/regtests/credentials/.keep diff --git a/plugins/spark/regtests/docker-compose.yml b/plugins/spark/v3.5/regtests/docker-compose.yml similarity index 98% rename from plugins/spark/regtests/docker-compose.yml rename to plugins/spark/v3.5/regtests/docker-compose.yml index 332ed4922c..fd7d0562b7 100644 --- a/plugins/spark/regtests/docker-compose.yml +++ b/plugins/spark/v3.5/regtests/docker-compose.yml @@ -44,7 +44,7 @@ services: regtest: build: context: ../.. - dockerfile: spark/regtests/Dockerfile + dockerfile: v3.5/regtests/Dockerfile args: POLARIS_HOST: polaris depends_on: diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh new file mode 100755 index 0000000000..62f09c2f35 --- /dev/null +++ b/plugins/spark/v3.5/regtests/run.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Run without args to run all tests. +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SPARK_ROOT_DIR=$(dirname ${SCRIPT_DIR}) +export SPARK_LOCAL_HOSTNAME=localhost # avoid VPN messing up driver local IP address binding + +FMT_RED='\033[0;31m' +FMT_GREEN='\033[0;32m' +FMT_NC='\033[0m' + +function loginfo() { + echo "$(date): ${@}" +} +function loggreen() { + echo -e "${FMT_GREEN}$(date): ${@}${FMT_NC}" +} +function logred() { + echo -e "${FMT_RED}$(date): ${@}${FMT_NC}" +} + +export AWS_ACCESS_KEY_ID='' +export AWS_SECRET_ACCESS_KEY='' + +# Allow bearer token to be provided if desired +if [[ -z "$REGTEST_ROOT_BEARER_TOKEN" ]]; then + if ! 
output=$(curl -X POST -H "Polaris-Realm: POLARIS" "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/oauth/tokens" \ + -d "grant_type=client_credentials" \ + -d "client_id=root" \ + -d "client_secret=secret" \ + -d "scope=PRINCIPAL_ROLE:ALL"); then + logred "Error: Failed to retrieve bearer token" + exit 1 + fi + + token=$(echo "$output" | awk -F\" '{print $4}') + + if [ "$token" == "unauthorized_client" ]; then + logred "Error: Failed to retrieve bearer token" + exit 1 + fi + + export REGTEST_ROOT_BEARER_TOKEN=$token +fi + +echo "Root bearer token: ${REGTEST_ROOT_BEARER_TOKEN}" + +NUM_FAILURES=0 + +SCALA_VERSIONS=("2.12" "2.13") +if [[ -n "$CURRENT_SCALA_VERSION" ]]; then + SCALA_VERSIONS=("${CURRENT_SCALA_VERSION}") +fi +SPARK_MAJOR_VERSION="3.5" +SPARK_VERSION="3.5.5" + +for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do + echo "RUN REGRESSION TEST FOR SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}, SPARK_VERSION=${SPARK_VERSION}, SCALA_VERSION=${SCALA_VERSION}" + # find the project jar + SPARK_DIR=${SPARK_ROOT_DIR}/spark + JAR_PATH=$(find ${SPARK_DIR} -name "polaris-iceberg-*.*-spark-runtime-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.jar" -print -quit) + echo "find jar ${JAR_PATH}" + + source ${SCRIPT_DIR}/setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH} + + # run the spark_sql test + loginfo "Starting test spark_sql.sh" + + # export SPARK_HOME=/Users/yzou/spark-client-tests/spark-3.5.5-bin-hadoop3 + TEST_FILE="spark_sql.sh" + TEST_SHORTNAME="spark_sql" + TEST_TMPDIR="/tmp/polaris-spark-regtests/${TEST_SHORTNAME}_${SPARK_MAJOR_VERSION}_${SCALA_VERSION}" + TEST_STDERR="${TEST_TMPDIR}/${TEST_SHORTNAME}.stderr" + TEST_STDOUT="${TEST_TMPDIR}/${TEST_SHORTNAME}.stdout" + + mkdir -p ${TEST_TMPDIR} + if (( ${VERBOSE} )); then + ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' | tee ${TEST_STDOUT} + else + ${SCRIPT_DIR}/${TEST_FILE} 2>${TEST_STDERR} | grep -v 'loading settings' > ${TEST_STDOUT} + fi + loginfo "Test run concluded for ${TEST_SUITE}:${TEST_SHORTNAME}" + + TEST_REF="$(realpath ${SCRIPT_DIR})/${TEST_SHORTNAME}.ref" + if cmp --silent ${TEST_STDOUT} ${TEST_REF}; then + loggreen "Test SUCCEEDED: ${TEST_SUITE}:${TEST_SHORTNAME}" + else + logred "Test FAILED: ${TEST_SUITE}:${TEST_SHORTNAME}" + echo '#!/bin/bash' > ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh + echo "meld ${TEST_STDOUT} ${TEST_REF}" >> ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh + chmod 750 ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh + logred "To compare and fix diffs (if 'meld' installed): ${TEST_TMPDIR}/${TEST_SHORTNAME}.fixdiffs.sh" + logred "Or manually diff: diff ${TEST_STDOUT} ${TEST_REF}" + logred "See stderr from test run for additional diagnostics: ${TEST_STDERR}" + diff ${TEST_STDOUT} ${TEST_REF} + NUM_FAILURES=$(( NUM_FAILURES + 1 )) + fi + export SPARK_HOME="" +done + +loginfo "Tests completed with ${NUM_FAILURES} failures" +if (( ${NUM_FAILURES} > 0 )); then + exit 1 +else + exit 0 +fi diff --git a/plugins/spark/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh similarity index 100% rename from plugins/spark/regtests/setup.sh rename to plugins/spark/v3.5/regtests/setup.sh diff --git a/plugins/spark/regtests/spark_sql.ref b/plugins/spark/v3.5/regtests/spark_sql.ref similarity index 100% rename from plugins/spark/regtests/spark_sql.ref rename to plugins/spark/v3.5/regtests/spark_sql.ref diff --git a/plugins/spark/regtests/spark_sql.sh b/plugins/spark/v3.5/regtests/spark_sql.sh similarity index 95% rename from 
plugins/spark/regtests/spark_sql.sh rename to plugins/spark/v3.5/regtests/spark_sql.sh index 3e761f912e..191d4d0fa9 100755 --- a/plugins/spark/regtests/spark_sql.sh +++ b/plugins/spark/v3.5/regtests/spark_sql.sh @@ -45,6 +45,11 @@ show namespaces; create namespace db1.schema1; show namespaces in db1; +create table db1.schema1.tbl1 (col1 int); +show tables in db1; +show tables in db1.schema1; + +drop table db1.schema1.tbl1; drop namespace db1.schema1; drop namespace db1; drop namespace db2; From 802d03ede43f2213954b703be5f050d13a21610a Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 10:08:07 -0700 Subject: [PATCH 05/25] add regtests for built jars --- .github/workflows/spark_client_regtests.yml | 2 +- plugins/spark/v3.5/regtests/Dockerfile | 2 +- plugins/spark/v3.5/regtests/README.md | 20 +++++--- plugins/spark/v3.5/regtests/credentials/.keep | 0 .../spark/v3.5/regtests/docker-compose.yml | 0 plugins/spark/v3.5/regtests/run.sh | 15 +++++- plugins/spark/v3.5/regtests/setup.sh | 1 + plugins/spark/v3.5/regtests/spark_sql.ref | 48 +++++++++++++++++-- plugins/spark/v3.5/regtests/spark_sql.sh | 28 +++++++++-- 9 files changed, 100 insertions(+), 16 deletions(-) mode change 100644 => 100755 plugins/spark/v3.5/regtests/Dockerfile mode change 100644 => 100755 plugins/spark/v3.5/regtests/credentials/.keep mode change 100644 => 100755 plugins/spark/v3.5/regtests/docker-compose.yml diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index 35bd2c2860..588c9b186c 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -58,4 +58,4 @@ jobs: AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}} AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} run: | - docker compose -f plugins/spark/regtests/docker-compose.yml up --build --exit-code-from regtest \ No newline at end of file + docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest \ No newline at end of file diff --git a/plugins/spark/v3.5/regtests/Dockerfile b/plugins/spark/v3.5/regtests/Dockerfile old mode 100644 new mode 100755 index a5a8cf84ac..1620c12ae2 --- a/plugins/spark/v3.5/regtests/Dockerfile +++ b/plugins/spark/v3.5/regtests/Dockerfile @@ -42,7 +42,7 @@ COPY --chown=spark ./v3.5 /home/spark/polaris/v3.5 # /home/spark/regtests might not be writable in all situations, see https://github.com/apache/polaris/pull/205 USER root RUN chmod -R go+rwx /home/spark/polaris -RUN chmod -R 777 ./spark/regtests +RUN chmod -R 777 ./v3.5/regtests USER spark ENTRYPOINT ["./v3.5/regtests/run.sh"] diff --git a/plugins/spark/v3.5/regtests/README.md b/plugins/spark/v3.5/regtests/README.md index a00c4d4679..0f10f1e0b3 100755 --- a/plugins/spark/v3.5/regtests/README.md +++ b/plugins/spark/v3.5/regtests/README.md @@ -21,7 +21,7 @@ # End-to-end regression tests -regtests provides basic end-to-end tests to customer abo +regtests provides basic end-to-end tests for spark_sql using spark client jars. Regression tests are either run in Docker, using docker-compose to orchestrate the tests, or locally. @@ -32,12 +32,12 @@ It is recommended to clean the `regtests/output` directory before running tests. 
running: ```shell -rm -rf ./plugins/spark/regtests/output && mkdir -p ./plugins/spark/regtests/output && chmod -R 777 ./plugins/spark/regtests/output +rm -rf ./plugins/spark/v3.5/regtests/output && mkdir -p ./plugins/spark/v3.5/regtests/output && chmod -R 777 ./plugins/spark/v3.5/regtests/output ``` ## Run Tests With Docker Compose -Tests can be run with docker-compose using the provided `./regtests/docker-compose.yml` file, as +Tests can be run with docker-compose using the provided `./plugins/spark/v3.5/regtests/docker-compose.yml` file, as follows: ```shell ./gradlew build ./gradlew \ :polaris-quarkus-server:assemble \ :polaris-quarkus-server:quarkusAppPartsBuild --rerun \ -Dquarkus.container-image.build=true -docker compose -f ./plugins/spark/regtests/docker-compose.yml up --build --exit-code-from regtest +docker compose -f ./plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest ``` In this setup, a Polaris container will be started in a docker-compose group, using the image previously built by the Gradle build. Then another container, including a Spark SQL shell, will run -the tests. The exit code will be the same as the exit code of the Spark container. +the tests. The exit code will be the same as the exit code of the Spark container. +**NOTE** Docker Compose only supports testing with Scala 2.12, because no Scala 2.13 image is available +for Spark 3.5. Scala 2.13 will be supported for Spark 4.0. This is the flow used in CI and should be done locally before pushing to GitHub to ensure that no environmental factors contribute to the outcome of the tests. @@ -67,7 +69,13 @@ eval $(minikube -p minikube docker-env --unset) ## Run Tests Locally -Regression tests can be run locally as well, using the test harness. +Regression tests can be run locally as well, using the test harness. For local testing, both +Scala 2.12 and Scala 2.13 are supported. + +Before you run the tests, make sure you build the project to generate the Spark client jars. +```shell +./gradlew build +``` In this setup, a Polaris server must be running on localhost:8181 before running tests. 
The simplest way to do this is to run the Polaris server in a separate terminal window: diff --git a/plugins/spark/v3.5/regtests/credentials/.keep b/plugins/spark/v3.5/regtests/credentials/.keep old mode 100644 new mode 100755 diff --git a/plugins/spark/v3.5/regtests/docker-compose.yml b/plugins/spark/v3.5/regtests/docker-compose.yml old mode 100644 new mode 100755 diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh index 62f09c2f35..db0320ca5f 100755 --- a/plugins/spark/v3.5/regtests/run.sh +++ b/plugins/spark/v3.5/regtests/run.sh @@ -78,6 +78,11 @@ for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do JAR_PATH=$(find ${SPARK_DIR} -name "polaris-iceberg-*.*-spark-runtime-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.jar" -print -quit) echo "find jar ${JAR_PATH}" + SPARK_EXISTS="TRUE" + if [ -z "${SPARK_HOME}" ]; then + SPARK_EXISTS="FALSE" + fi + source ${SCRIPT_DIR}/setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH} # run the spark_sql test @@ -112,9 +117,17 @@ for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do diff ${TEST_STDOUT} ${TEST_REF} NUM_FAILURES=$(( NUM_FAILURES + 1 )) fi - export SPARK_HOME="" + + # clean up + if [ "${SPARK_EXISTS}" = "FALSE" ]; then + rm -rf ${SPARK_HOME} + export SPARK_HOME="" + fi done +# clean the output dir +rm -rf ${SCRIPT_DIR}/output + loginfo "Tests completed with ${NUM_FAILURES} failures" if (( ${NUM_FAILURES} > 0 )); then exit 1 diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh index c30de952d5..69fa97dce4 100755 --- a/plugins/spark/v3.5/regtests/setup.sh +++ b/plugins/spark/v3.5/regtests/setup.sh @@ -131,6 +131,7 @@ spark.sql.variable.substitute true spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME} spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension +spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog spark.sql.catalog.polaris.type=rest spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog diff --git a/plugins/spark/v3.5/regtests/spark_sql.ref b/plugins/spark/v3.5/regtests/spark_sql.ref index 2a2a6203de..5825d09318 100755 --- a/plugins/spark/v3.5/regtests/spark_sql.ref +++ b/plugins/spark/v3.5/regtests/spark_sql.ref @@ -1,4 +1,4 @@ -{"defaults":{"default-base-location":"file:///tmp/spark_sql_s3_catalog"},"overrides":{"prefix":"spark_sql_catalog"},"endpoints":["GET /v1/{prefix}/namespaces","GET /v1/{prefix}/namespaces/{namespace}","HEAD /v1/{prefix}/namespaces/{namespace}","POST /v1/{prefix}/namespaces","POST /v1/{prefix}/namespaces/{namespace}/properties","DELETE /v1/{prefix}/namespaces/{namespace}","GET /v1/{prefix}/namespaces/{namespace}/tables","GET /v1/{prefix}/namespaces/{namespace}/tables/{table}","HEAD /v1/{prefix}/namespaces/{namespace}/tables/{table}","POST /v1/{prefix}/namespaces/{namespace}/tables","POST /v1/{prefix}/namespaces/{namespace}/tables/{table}","DELETE /v1/{prefix}/namespaces/{namespace}/tables/{table}","POST /v1/{prefix}/tables/rename","POST /v1/{prefix}/namespaces/{namespace}/register","POST /v1/{prefix}/namespaces/{namespace}/tables/{table}/metrics","POST /v1/{prefix}/transactions/commit","GET /v1/{prefix}/namespaces/{namespace}/views","GET /v1/{prefix}/namespaces/{namespace}/views/{view}","HEAD /v1/{prefix}/namespaces/{namespace}/views/{view}","POST /v1/{prefix}/namespaces/{namespace}/views","POST 
/v1/{prefix}/namespaces/{namespace}/views/{view}","DELETE /v1/{prefix}/namespaces/{namespace}/views/{view}","POST /v1/{prefix}/views/rename","POST /v1/{prefix}/transactions/commit","GET polaris/v1/{prefix}/namespaces/{namespace}/generic-tables","POST polaris/v1/{prefix}/namespaces/{namespace}/generic-tables","DELETE polaris/v1/{prefix}/namespaces/{namespace}/generic-tables/{generic-table}","GET polaris/v1/{prefix}/namespaces/{namespace}/generic-tables/{generic-table}"]} +{"defaults":{"default-base-location":"file:///tmp/spark_catalog"},"overrides":{"prefix":"spark_sql_catalog"},"endpoints":["GET /v1/{prefix}/namespaces","GET /v1/{prefix}/namespaces/{namespace}","HEAD /v1/{prefix}/namespaces/{namespace}","POST /v1/{prefix}/namespaces","POST /v1/{prefix}/namespaces/{namespace}/properties","DELETE /v1/{prefix}/namespaces/{namespace}","GET /v1/{prefix}/namespaces/{namespace}/tables","GET /v1/{prefix}/namespaces/{namespace}/tables/{table}","HEAD /v1/{prefix}/namespaces/{namespace}/tables/{table}","POST /v1/{prefix}/namespaces/{namespace}/tables","POST /v1/{prefix}/namespaces/{namespace}/tables/{table}","DELETE /v1/{prefix}/namespaces/{namespace}/tables/{table}","POST /v1/{prefix}/tables/rename","POST /v1/{prefix}/namespaces/{namespace}/register","POST /v1/{prefix}/namespaces/{namespace}/tables/{table}/metrics","POST /v1/{prefix}/transactions/commit","GET /v1/{prefix}/namespaces/{namespace}/views","GET /v1/{prefix}/namespaces/{namespace}/views/{view}","HEAD /v1/{prefix}/namespaces/{namespace}/views/{view}","POST /v1/{prefix}/namespaces/{namespace}/views","POST /v1/{prefix}/namespaces/{namespace}/views/{view}","DELETE /v1/{prefix}/namespaces/{namespace}/views/{view}","POST /v1/{prefix}/views/rename","POST /v1/{prefix}/transactions/commit","GET polaris/v1/{prefix}/namespaces/{namespace}/generic-tables","POST polaris/v1/{prefix}/namespaces/{namespace}/generic-tables","DELETE polaris/v1/{prefix}/namespaces/{namespace}/generic-tables/{generic-table}","GET polaris/v1/{prefix}/namespaces/{namespace}/generic-tables/{generic-table}"]} Catalog created spark-sql (default)> use polaris; spark-sql ()> create namespace db1; @@ -11,7 +11,47 @@ spark-sql ()> spark-sql ()> show namespaces in db1; db1.schema1 spark-sql ()> - > drop namespace db1.schema1; -spark-sql ()> drop namespace db1; -spark-sql ()> drop namespace db2; + > create table db1.schema1.iceberg_tb (col1 int); +spark-sql ()> show tables in db1; +spark-sql ()> show tables in db1.schema1; +iceberg_tb spark-sql ()> + > use db1.schema1; +spark-sql (db1.schema1)> insert into iceberg_tb values (123), (234), (111); +spark-sql (db1.schema1)> select * from iceberg_tb order by col1; +111 +123 +234 +spark-sql (db1.schema1)> + > create table delta_tb1(col1 string) using delta location 'file:///tmp/spark_catalog/delta_tb1'; +spark-sql (db1.schema1)> insert into delta_tb1 values ('ab'), ('bb'), ('dd'); +spark-sql (db1.schema1)> select * from delta_tb1 order by col1; +ab +bb +dd +spark-sql (db1.schema1)> + > show tables; +iceberg_tb +delta_tb1 +spark-sql (db1.schema1)> + > use db1; +spark-sql (db1)> create table delta_tb2(col1 int) using delta location 'file:///tmp/spark_catalog/delta_tb2'; +spark-sql (db1)> insert into delta_tb2 values (1), (2), (3) order by col1; +spark-sql (db1)> select * from delta_tb2; +1 +2 +3 +spark-sql (db1)> + > show tables; +delta_tb2 +spark-sql (db1)> show tables in db1.schema1; +iceberg_tb +delta_tb1 +spark-sql (db1)> + > drop table db1.schema1.iceberg_tb; +spark-sql (db1)> drop table db1.schema1.delta_tb1; +spark-sql (db1)> drop 
namespace db1.schema1; +spark-sql (db1)> drop table delta_tb2; +spark-sql (db1)> drop namespace db1; +spark-sql (db1)> drop namespace db2; +spark-sql (db1)> diff --git a/plugins/spark/v3.5/regtests/spark_sql.sh b/plugins/spark/v3.5/regtests/spark_sql.sh index 191d4d0fa9..10c09180d9 100755 --- a/plugins/spark/v3.5/regtests/spark_sql.sh +++ b/plugins/spark/v3.5/regtests/spark_sql.sh @@ -25,7 +25,7 @@ SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN}" CATALOG_NAME="spark_sql_catalog" curl -i -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \ - -d '{"name": "spark_sql_catalog", "id": 100, "type": "INTERNAL", "readOnly": false, "properties": {"default-base-location": "file:///tmp/spark_sql_s3_catalog"}, "storageConfigInfo": {"storageType": "FILE", "allowedLocations": ["file:///tmp"]}}' > /dev/stderr + -d '{"name": "spark_sql_catalog", "id": 100, "type": "INTERNAL", "readOnly": false, "properties": {"default-base-location": "file:///tmp/spark_catalog"}, "storageConfigInfo": {"storageType": "FILE", "allowedLocations": ["file:///tmp"]}}' > /dev/stderr # Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ @@ -45,15 +45,37 @@ show namespaces; create namespace db1.schema1; show namespaces in db1; -create table db1.schema1.tbl1 (col1 int); +create table db1.schema1.iceberg_tb (col1 int); show tables in db1; show tables in db1.schema1; -drop table db1.schema1.tbl1; +use db1.schema1; +insert into iceberg_tb values (123), (234), (111); +select * from iceberg_tb order by col1; + +create table delta_tb1(col1 string) using delta location 'file:///tmp/spark_catalog/delta_tb1'; +insert into delta_tb1 values ('ab'), ('bb'), ('dd'); +select * from delta_tb1 order by col1; + +show tables; + +use db1; +create table delta_tb2(col1 int) using delta location 'file:///tmp/spark_catalog/delta_tb2'; +insert into delta_tb2 values (1), (2), (3) order by col1; +select * from delta_tb2; + +show tables; +show tables in db1.schema1; + +drop table db1.schema1.iceberg_tb; +drop table db1.schema1.delta_tb1; drop namespace db1.schema1; +drop table delta_tb2; drop namespace db1; drop namespace db2; EOF +rm -rf /tmp/spark_catalog/ + curl -i -X DELETE -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${CATALOG_NAME} > /dev/stderr From 076a1560c319b7f33376d095cd991a31c2d61b06 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 10:25:25 -0700 Subject: [PATCH 06/25] add comments --- plugins/spark/v3.5/regtests/README.md | 3 +++ plugins/spark/v3.5/spark/build.gradle.kts | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/regtests/README.md b/plugins/spark/v3.5/regtests/README.md index 0f10f1e0b3..8aa6d391be 100755 --- a/plugins/spark/v3.5/regtests/README.md +++ b/plugins/spark/v3.5/regtests/README.md @@ -26,6 +26,9 @@ regtests provides basic end-to-end tests for spark_sql using spark client jars. Regression tests are either run in Docker, using docker-compose to orchestrate the tests, or locally. +**NOTE** regtests are intended to be light-weight tests to ensure the jars can be used to start +Spark and run basic SQL commands.
Please use the integration tests for detailed testing. + ## Prerequisites It is recommended to clean the `regtests/output` directory before running tests. This can be done by diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 133632edb9..549f9cfe21 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -38,12 +38,14 @@ val scalaLibraryVersion = } dependencies { - // TODO: Extract a polaris-rest module as a thin layer for + // TODO: extract a polaris-rest module as a thin layer for // client to depends on. implementation(project(":polaris-api-iceberg-service")) { // exclude the iceberg dependencies, use the ones pulled // by iceberg-core exclude("org.apache.iceberg", "*") + // exclude all cloud and quarkus specific dependencies to avoid + // running into problems with signature files. exclude("com.azure", "*") exclude("software.amazon.awssdk", "*") exclude("io.airlift", "*") From d43e8bcdd35bbd5e63a37b93fb6172b2c4bd2328 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 10:28:39 -0700 Subject: [PATCH 07/25] remove unnecessary build --- plugins/spark/v3.5/spark/build.gradle.kts | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 549f9cfe21..ff09c5aed9 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -163,9 +163,6 @@ tasks.register("createPolarisSparkJar") { } relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml.jackson") - - mergeServiceFiles() - exclude("META-INF/*.RSA", "META-INF/*.DSA", "META-INF/*.SF") } tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } From ad76d5b6cf1cbbfef8efa4f595293a95a8ee5143 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 10:30:23 -0700 Subject: [PATCH 08/25] add back --- plugins/spark/v3.5/spark/build.gradle.kts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index ff09c5aed9..549f9cfe21 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -163,6 +163,9 @@ tasks.register("createPolarisSparkJar") { } relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml.jackson") + + mergeServiceFiles() + exclude("META-INF/*.RSA", "META-INF/*.DSA", "META-INF/*.SF") } tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } From 22e7594bc38ab175d9109e0fafa0cd97fe68e500 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 11:01:36 -0700 Subject: [PATCH 09/25] simplify jars --- plugins/spark/v3.5/spark/build.gradle.kts | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 549f9cfe21..70fc4c869b 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -48,24 +48,34 @@ dependencies { // running into problems with signature files.
exclude("com.azure", "*") exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") exclude("io.airlift", "*") exclude("io.smallrye", "*") - exclude("io.micrometer", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") } implementation(project(":polaris-api-catalog-service")) { exclude("org.apache.iceberg", "*") exclude("com.azure", "*") exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") exclude("io.airlift", "*") exclude("io.smallrye", "*") - exclude("io.micrometer", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") } implementation(project(":polaris-core")) { exclude("org.apache.iceberg", "*") exclude("com.azure", "*") exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") exclude("io.airlift", "*") exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") } implementation("org.apache.iceberg:iceberg-core:${icebergVersion}") @@ -163,9 +173,6 @@ tasks.register("createPolarisSparkJar") { } relocate("com.fasterxml", "org.apache.polaris.shaded.com.fasterxml.jackson") - - mergeServiceFiles() - exclude("META-INF/*.RSA", "META-INF/*.DSA", "META-INF/*.SF") } tasks.withType(Jar::class).named("sourcesJar") { dependsOn("createPolarisSparkJar") } From a6f137471108c628b7e9cd640c22aab9a6bcde87 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 11:12:15 -0700 Subject: [PATCH 10/25] remove dead code --- plugins/spark/v3.5/spark/build.gradle.kts | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 70fc4c869b..98729912c1 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -21,7 +21,6 @@ import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar plugins { id("polaris-client") - // alias(libs.plugins.jandex) } // get version information From a7f52b3c574dcd04717284beece46e274b41de86 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 11:16:23 -0700 Subject: [PATCH 11/25] address feedback --- plugins/spark/v3.5/regtests/run.sh | 1 - plugins/spark/v3.5/regtests/setup.sh | 1 + plugins/spark/v3.5/regtests/spark_sql.sh | 2 +- plugins/spark/v3.5/spark/build.gradle.kts | 4 +--- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh index db0320ca5f..5472a4e643 100755 --- a/plugins/spark/v3.5/regtests/run.sh +++ b/plugins/spark/v3.5/regtests/run.sh @@ -88,7 +88,6 @@ for SCALA_VERSION in "${SCALA_VERSIONS[@]}"; do # run the spark_sql test loginfo "Starting test spark_sql.sh" - # export SPARK_HOME=/Users/yzou/spark-client-tests/spark-3.5.5-bin-hadoop3 TEST_FILE="spark_sql.sh" TEST_SHORTNAME="spark_sql" TEST_TMPDIR="/tmp/polaris-spark-regtests/${TEST_SHORTNAME}_${SPARK_MAJOR_VERSION}_${SCALA_VERSION}" diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh index 69fa97dce4..a446672602 100755 --- a/plugins/spark/v3.5/regtests/setup.sh +++ b/plugins/spark/v3.5/regtests/setup.sh @@ -153,4 +153,5 @@ echo "Launch spark-sql at ${SPARK_HOME}/bin/spark-sql" # this is mostly useful for building the Docker image with all needed dependencies ${SPARK_HOME}/bin/spark-sql -e "SELECT 1" +# ensure SPARK_HOME is setup for later tests export SPARK_HOME=$SPARK_HOME diff --git a/plugins/spark/v3.5/regtests/spark_sql.sh 
b/plugins/spark/v3.5/regtests/spark_sql.sh index 10c09180d9..fe036664cd 100755 --- a/plugins/spark/v3.5/regtests/spark_sql.sh +++ b/plugins/spark/v3.5/regtests/spark_sql.sh @@ -21,7 +21,6 @@ SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN}" -# echo "CURRENT SPARK HOME ${SPARK_HOME}" CATALOG_NAME="spark_sql_catalog" curl -i -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \ @@ -75,6 +74,7 @@ drop namespace db1; drop namespace db2; EOF +# clean up the spark_catalog dir rm -rf /tmp/spark_catalog/ curl -i -X DELETE -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \ diff --git a/plugins/spark/v3.5/spark/build.gradle.kts b/plugins/spark/v3.5/spark/build.gradle.kts index 98729912c1..5ce7e73c05 100644 --- a/plugins/spark/v3.5/spark/build.gradle.kts +++ b/plugins/spark/v3.5/spark/build.gradle.kts @@ -19,9 +19,7 @@ import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar -plugins { - id("polaris-client") -} +plugins { id("polaris-client") } // get version information val sparkMajorVersion = "3.5" From e12453c08c1933185c3a152ddae76eda79f0cdaf Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Fri, 18 Apr 2025 14:43:22 -0700 Subject: [PATCH 12/25] update CI name to Spark Client Regression Test --- .github/workflows/spark_client_regtests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index 588c9b186c..fe5ae862d3 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -17,7 +17,7 @@ # under the License. # -name: Regression Tests +name: Spark Client Regression Tests on: push: branches: [ "main" ] From 59aff04e5d7be745c5a6a9ee195c2f483518fae4 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Tue, 22 Apr 2025 18:20:04 -0700 Subject: [PATCH 13/25] address feedback --- .github/workflows/spark_client_regtests.yml | 2 +- plugins/spark/v3.5/regtests/README.md | 2 +- plugins/spark/v3.5/regtests/setup.sh | 17 +++++++++++++++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index fe5ae862d3..815942ffe4 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -58,4 +58,4 @@ jobs: AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}} AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} run: | - docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest \ No newline at end of file + docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest diff --git a/plugins/spark/v3.5/regtests/README.md b/plugins/spark/v3.5/regtests/README.md index 8aa6d391be..2214abd094 100755 --- a/plugins/spark/v3.5/regtests/README.md +++ b/plugins/spark/v3.5/regtests/README.md @@ -96,5 +96,5 @@ Running the test harness will automatically run the idempotent setup script. 
Fro project, just run: ```shell -env POLARIS_HOST=localhost ./regtests/run.sh +env POLARIS_HOST=localhost ./plugins/spark/v3.5/regtests/run.sh ``` \ No newline at end of file diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh index a446672602..1ef0dbab7c 100755 --- a/plugins/spark/v3.5/regtests/setup.sh +++ b/plugins/spark/v3.5/regtests/setup.sh @@ -17,10 +17,20 @@ # specific language governing permissions and limitations # under the License. # -# Idempotent setup for regression tests. Run manually or let run.sh auto-run. +################################### +# Idempotent setup for spark regression tests. Run manually or let run.sh auto-run. # # Warning - first time setup may download large amounts of files # Warning - may clobber conf/spark-defaults.conf +# Warning - it will set the SPARK_HOME environment variable with the spark setup +# +# The script can be called independently like following +# ./setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH} +# Required Parameters: +# --sparkVersion : the spark version to setup +# --scalaVersion : the scala version of spark to setup +# --jar : path to the local Polaris Spark client jar +# set -x @@ -131,12 +141,15 @@ spark.sql.variable.substitute true spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME} spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension +# this configuration is needed for delta table spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog -spark.sql.catalog.polaris.type=rest spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog +# this configuration is used spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials spark.sql.catalog.polaris.client.region=us-west-2 +# configuration required to ensure DataSourceV2 load works correctly for +# different table formats spark.sql.sources.useV1SourceList='' EOF echo 'Success!' 
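For reference, the standalone `setup.sh` invocation documented in the patch above might look roughly like the sketch below. The jar search path, the chosen versions, and the final smoke-test query are illustrative assumptions (run.sh normally locates the client jar and sources setup.sh itself), and a Polaris server is still expected on localhost:8181 as described earlier.

```shell
# Sketch only -- paths and versions below are assumptions, not part of the patch.
SCALA_VERSION=2.12
# Hypothetical location; run.sh finds the client jar with a pattern like this.
JAR_PATH=$(find . -name "polaris-iceberg-*-spark-runtime-3.5_${SCALA_VERSION}-*.jar" -print -quit)
# Source the script (as run.sh does) so the exported SPARK_HOME is visible afterwards.
source ./plugins/spark/v3.5/regtests/setup.sh \
  --sparkVersion 3.5.5 --scalaVersion "${SCALA_VERSION}" --jar "${JAR_PATH}"
# The generated spark-defaults.conf registers the "polaris" catalog, so a quick check is:
${SPARK_HOME}/bin/spark-sql -e "SHOW NAMESPACES IN polaris;"
```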
From 346381187be236e2fb533c429a7a18038decdcf5 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 09:33:47 -0700 Subject: [PATCH 14/25] update comment --- plugins/spark/v3.5/regtests/setup.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh index 1ef0dbab7c..1f38fcfa15 100755 --- a/plugins/spark/v3.5/regtests/setup.sh +++ b/plugins/spark/v3.5/regtests/setup.sh @@ -145,7 +145,8 @@ spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExte spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog -# this configuration is used +# this configuration is currently only used for iceberg tables, generic tables currently +# doesn't support credential vending spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials spark.sql.catalog.polaris.client.region=us-west-2 # configuration required to ensure DataSourceV2 load works correctly for From b517f30fde742fea5b148604286a8b776a0d2563 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 09:35:29 -0700 Subject: [PATCH 15/25] udpate grammer --- plugins/spark/v3.5/regtests/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh index 1f38fcfa15..44ae6ab861 100755 --- a/plugins/spark/v3.5/regtests/setup.sh +++ b/plugins/spark/v3.5/regtests/setup.sh @@ -146,7 +146,7 @@ spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog # this configuration is currently only used for iceberg tables, generic tables currently -# doesn't support credential vending +# don't support credential vending spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials spark.sql.catalog.polaris.client.region=us-west-2 # configuration required to ensure DataSourceV2 load works correctly for From 075b4c00abdbfa4c94908dd9dcabc624617f1e9f Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 09:42:44 -0700 Subject: [PATCH 16/25] address comments --- plugins/spark/v3.5/regtests/README.md | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/plugins/spark/v3.5/regtests/README.md b/plugins/spark/v3.5/regtests/README.md index 2214abd094..75dd57a5a3 100755 --- a/plugins/spark/v3.5/regtests/README.md +++ b/plugins/spark/v3.5/regtests/README.md @@ -75,26 +75,12 @@ eval $(minikube -p minikube docker-env --unset) Regression tests can be run locally as well, using the test harness. For local testing, both Scala 2.12 and Scala 2.13 are supported. -Before you run the test, make sure you build the project to generate the Spark client jars. -```shell -./gradlew build -``` - -In this setup, a Polaris server must be running on localhost:8181 before running tests. The simplest -way to do this is to run the Polaris server in a separate terminal window: - -```shell -./gradlew run -``` +To run regression tests locally, run the following: +- `./gradlew build` -- build the Polaris project and Spark Client jars. +- `./gradlew run` -- start a Polaris server on localhost:8181. +- `env POLARIS_HOST=localhost ./plugins/spark/v3.5/regtests/run.sh` -- run regtests. 
Note: the regression tests expect Polaris to run with certain options, e.g. with support for `FILE` storage, default realm `POLARIS` and root credentials `root:secret`; if you run the above command, this will be the case. If you run Polaris in a different way, make sure that Polaris is configured appropriately. - -Running the test harness will automatically run the idempotent setup script. From the root of the -project, just run: - -```shell -env POLARIS_HOST=localhost ./plugins/spark/v3.5/regtests/run.sh -``` \ No newline at end of file From 5dc9a1e55d3a188160aeb8974f410e8061dbcd90 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 10:44:09 -0700 Subject: [PATCH 17/25] remove credentials --- plugins/spark/v3.5/regtests/docker-compose.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/plugins/spark/v3.5/regtests/docker-compose.yml b/plugins/spark/v3.5/regtests/docker-compose.yml index fd7d0562b7..0aa162c6f3 100755 --- a/plugins/spark/v3.5/regtests/docker-compose.yml +++ b/plugins/spark/v3.5/regtests/docker-compose.yml @@ -25,12 +25,6 @@ services: - "8182" environment: AWS_REGION: us-west-2 - AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY - GOOGLE_APPLICATION_CREDENTIALS: $GOOGLE_APPLICATION_CREDENTIALS - AZURE_TENANT_ID: $AZURE_TENANT_ID - AZURE_CLIENT_ID: $AZURE_CLIENT_ID - AZURE_CLIENT_SECRET: $AZURE_CLIENT_SECRET POLARIS_BOOTSTRAP_CREDENTIALS: POLARIS,root,secret quarkus.log.file.enable: "false" quarkus.otel.sdk.disabled: "true" From 9c760eb91017ada72a00b1abcaca0add37d9654e Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 10:48:48 -0700 Subject: [PATCH 18/25] remove cloud specific config --- plugins/spark/v3.5/regtests/docker-compose.yml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/plugins/spark/v3.5/regtests/docker-compose.yml b/plugins/spark/v3.5/regtests/docker-compose.yml index 0aa162c6f3..ea39f38098 100755 --- a/plugins/spark/v3.5/regtests/docker-compose.yml +++ b/plugins/spark/v3.5/regtests/docker-compose.yml @@ -44,24 +44,6 @@ services: depends_on: polaris: condition: service_healthy - environment: - AWS_TEST_ENABLED: $AWS_TEST_ENABLED - AWS_STORAGE_BUCKET: $AWS_STORAGE_BUCKET - AWS_ROLE_ARN: $AWS_ROLE_ARN - AWS_TEST_BASE: $AWS_TEST_BASE - GCS_TEST_ENABLED: $GCS_TEST_ENABLED - GCS_TEST_BASE: $GCS_TEST_BASE - GOOGLE_APPLICATION_CREDENTIALS: $GOOGLE_APPLICATION_CREDENTIALS - AZURE_TEST_ENABLED: $AZURE_TEST_ENABLED - AZURE_TENANT_ID: $AZURE_TENANT_ID - AZURE_DFS_TEST_BASE: $AZURE_DFS_TEST_BASE - AZURE_BLOB_TEST_BASE: $AZURE_BLOB_TEST_BASE - AZURE_CLIENT_ID: $AZURE_CLIENT_ID - AZURE_CLIENT_SECRET: $AZURE_CLIENT_SECRET - AWS_CROSS_REGION_TEST_ENABLED: $AWS_CROSS_REGION_TEST_ENABLED - AWS_CROSS_REGION_BUCKET: $AWS_CROSS_REGION_BUCKET - AWS_ROLE_FOR_CROSS_REGION_BUCKET: $AWS_ROLE_FOR_CROSS_REGION_BUCKET - AWS_REGION_FOR_CROSS_REGION_TEST: $AWS_REGION_FOR_CROSS_REGION_TEST volumes: - ./output:/tmp/polaris-regtests/ - ./credentials:/tmp/credentials/ From 0b752cff6bf18364df3eda1ac3c9b92adccce2c6 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 10:58:01 -0700 Subject: [PATCH 19/25] remove unused credential --- plugins/spark/v3.5/regtests/credentials/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100755 plugins/spark/v3.5/regtests/credentials/.keep diff --git a/plugins/spark/v3.5/regtests/credentials/.keep b/plugins/spark/v3.5/regtests/credentials/.keep deleted file mode 100755 index e69de29bb2..0000000000 From fd1d61e84f74b2b7929d1a4ff019fc30faa406ae Mon Sep 17 
00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 11:14:58 -0700 Subject: [PATCH 20/25] remove aws env var --- .github/workflows/spark_client_regtests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index 815942ffe4..02f197b4d0 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -54,8 +54,5 @@ jobs: -Dquarkus.container-image.build=true - name: Regression Test - env: - AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}} - AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} run: | docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest From 19360ea52c69f683fe9d5385a43e72524b36a4ac Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 11:19:47 -0700 Subject: [PATCH 21/25] add change --- .github/workflows/spark_client_regtests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index 02f197b4d0..7229c033a0 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -43,8 +43,8 @@ jobs: - name: Fix permissions run: mkdir -p regtests/output && chmod 777 regtests/output && chmod 777 regtests/t_*/ref/* - - name: Project build - run: ./gradlew build + - name: Project build without testing + run: ./gradlew assemble - name: Image build run: | From e19d178bda4555e9f1461df558af8a75b5bc3e32 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 11:32:02 -0700 Subject: [PATCH 22/25] remove credential from dcker --- plugins/spark/v3.5/regtests/docker-compose.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/spark/v3.5/regtests/docker-compose.yml b/plugins/spark/v3.5/regtests/docker-compose.yml index ea39f38098..e1ea1a8981 100755 --- a/plugins/spark/v3.5/regtests/docker-compose.yml +++ b/plugins/spark/v3.5/regtests/docker-compose.yml @@ -28,8 +28,6 @@ services: POLARIS_BOOTSTRAP_CREDENTIALS: POLARIS,root,secret quarkus.log.file.enable: "false" quarkus.otel.sdk.disabled: "true" - volumes: - - ./credentials:/tmp/credentials/ healthcheck: test: ["CMD", "curl", "http://localhost:8182/q/health"] interval: 10s @@ -46,4 +44,3 @@ services: condition: service_healthy volumes: - ./output:/tmp/polaris-regtests/ - - ./credentials:/tmp/credentials/ From 652a2db253455aa27b3fabf2f40ec2de5e6c4781 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 15:14:45 -0700 Subject: [PATCH 23/25] remove aws setting --- plugins/spark/v3.5/regtests/run.sh | 3 --- plugins/spark/v3.5/regtests/setup.sh | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/plugins/spark/v3.5/regtests/run.sh b/plugins/spark/v3.5/regtests/run.sh index 5472a4e643..d850a4465f 100755 --- a/plugins/spark/v3.5/regtests/run.sh +++ b/plugins/spark/v3.5/regtests/run.sh @@ -36,9 +36,6 @@ function logred() { echo -e "${FMT_RED}$(date): ${@}${FMT_NC}" } -export AWS_ACCESS_KEY_ID='' -export AWS_SECRET_ACCESS_KEY='' - # Allow bearer token to be provided if desired if [[ -z "$REGTEST_ROOT_BEARER_TOKEN" ]]; then if ! 
output=$(curl -X POST -H "Polaris-Realm: POLARIS" "http://${POLARIS_HOST:-localhost}:8181/api/catalog/v1/oauth/tokens" \ diff --git a/plugins/spark/v3.5/regtests/setup.sh b/plugins/spark/v3.5/regtests/setup.sh index 44ae6ab861..072b08f6d5 100755 --- a/plugins/spark/v3.5/regtests/setup.sh +++ b/plugins/spark/v3.5/regtests/setup.sh @@ -133,9 +133,7 @@ cat << EOF >> ${SPARK_CONF} # POLARIS Spark client test conf spark.jars $POLARIS_CLIENT_JAR -spark.jars.packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_${SCALA_VERSION}:3.2.1 -spark.hadoop.fs.s3.impl org.apache.hadoop.fs.s3a.S3AFileSystem -spark.hadoop.fs.AbstractFileSystem.s3.impl org.apache.hadoop.fs.s3a.S3A +spark.jars.packages io.delta:delta-spark_${SCALA_VERSION}:3.2.1 spark.sql.variable.substitute true spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME} From 8f3074cde37bd19d6f0ad643e8b377c315123c12 Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 17:04:24 -0700 Subject: [PATCH 24/25] add comment --- .github/workflows/spark_client_regtests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index 7229c033a0..5befe35a28 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -53,6 +53,10 @@ jobs: :polaris-quarkus-server:quarkusAppPartsBuild --rerun \ -Dquarkus.container-image.build=true + # NOTE: the regression test runs with spark 3.5.5 and scala 2.12 in Java 17. We also have integration + # tests runs with the existing gradle.yml, which only runs on Java 17. Since spark Java compatibility + # for 3.5 is 8, 11, and 17, we should run spark client with those compatible java versions. + # TODO: add separate spark client CI and run with Java 8, 11 and 17. - name: Regression Test run: | docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml up --build --exit-code-from regtest From f538f95dc8f3d18cdfa5e202ab02da9abdde853b Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Wed, 23 Apr 2025 17:10:34 -0700 Subject: [PATCH 25/25] fix comment --- .github/workflows/spark_client_regtests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/spark_client_regtests.yml b/.github/workflows/spark_client_regtests.yml index 5befe35a28..44e0fdca1f 100644 --- a/.github/workflows/spark_client_regtests.yml +++ b/.github/workflows/spark_client_regtests.yml @@ -54,7 +54,7 @@ jobs: -Dquarkus.container-image.build=true # NOTE: the regression test runs with spark 3.5.5 and scala 2.12 in Java 17. We also have integration - # tests runs with the existing gradle.yml, which only runs on Java 17. Since spark Java compatibility + # tests runs with the existing gradle.yml, which only runs on Java 21. Since spark Java compatibility # for 3.5 is 8, 11, and 17, we should run spark client with those compatible java versions. # TODO: add separate spark client CI and run with Java 8, 11 and 17. - name: Regression Test
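Taken together, the CI steps in the workflow above can be approximated locally with a sketch like the one below; the exact Gradle flags used by the workflow may differ slightly, so treat this as an outline rather than the canonical commands.

```shell
# Rough local equivalent of the Spark client regression CI job (assumed flags).
./gradlew assemble   # project build without running tests, as in the workflow

# Build the Polaris server image consumed by docker-compose.
./gradlew :polaris-quarkus-server:assemble \
  :polaris-quarkus-server:quarkusAppPartsBuild --rerun \
  -Dquarkus.container-image.build=true

# Run the regression tests; the command's exit code mirrors the regtest container.
docker compose -f plugins/spark/v3.5/regtests/docker-compose.yml \
  up --build --exit-code-from regtest
```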