From 564b8014544477ef39cbe9529f08df6450e017b9 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Tue, 13 May 2025 18:36:06 +0530 Subject: [PATCH 1/9] docs: Add README for Minio S3 with Polaris governance example This commit introduces the initial README.md file for the 'getting-started/minio' example. The README outlines: - Purpose of the example: Demonstrating Apache Polaris managing an Iceberg data lake in Minio S3, with a focus on governance for Spark (R/W) and Trino (R/O). - Prerequisites for running the example. - An overview of the security model, including Minio S3 users and Polaris client roles. - Detailed setup and execution steps: - Optional environment variable configuration. - Making scripts executable. - Starting services with docker-compose. - Accessing the Minio console. - Using Spark SQL for creating namespaces, tables, and inserting data. - Using Trino CLI for querying data and verifying read-only access (including expected failures for write operations). - Optional steps for accessing the Polaris API with a scoped token. - Cleanup instructions. - A brief overview of the file structure within the `getting-started/minio` directory. --- getting-started/minio/README.md | 146 ++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 getting-started/minio/README.md diff --git a/getting-started/minio/README.md b/getting-started/minio/README.md new file mode 100644 index 0000000000..c6eb10ed8e --- /dev/null +++ b/getting-started/minio/README.md @@ -0,0 +1,146 @@ +# Getting Started with Apache Polaris: Minio S3, Governance with Spark & Trino (Read-Only) + +This example demonstrates setting up Apache Polaris to manage an Iceberg data lake in Minio S3, focusing on governance. +Polaris uses Postgres for its metadata. Spark SQL is configured for read/write access to create and populate Iceberg tables. Trino is configured for **strict read-only access** to query these tables. 
Access control is enforced by Polaris, with underlying S3 permissions managed by Minio. + +**Prerequisites:** +* Docker and Docker Compose. +* `jq` installed on your host machine. +* Apache Polaris images (`apache/polaris-admin-tool:postgres-latest`, `apache/polaris:postgres-latest`) built from source with JDBC support, tagged as `postgres-latest`. Run `./gradlew :polaris-quarkus-server:assemble :polaris-quarkus-server:quarkusAppPartsBuild --rerun :polaris-quarkus-admin:assemble :polaris-quarkus-admin:quarkusAppPartsBuild --rerun -Dquarkus.container-image.tag=postgres-latest -Dquarkus.container-image.build=true` from the Polaris repository root. + +**Security Overview:** +* **Minio (S3 Storage):** + * `polaris_s3_user` (R/W): Used by Polaris service for warehouse management. + * `spark_minio_s3_user` (R/W): Used by Spark engine for data R/W operations. + * `trino_minio_s3_user` (R/O): Used by Trino engine for data read operations. +* **Polaris (Catalog & Governance):** + * `root` user: Admin access to Polaris. + * `spark_app_client`: Polaris client ID for Spark, assigned `polaris_spark_role` (R/W permissions on `minio_catalog.ns_governed`). + * `trino_app_client`: Polaris client ID for Trino, assigned `polaris_trino_role` (R/O permissions on `minio_catalog.ns_governed`). + +**Setup and Execution:** + +1. **Environment Variables (Optional):** + Create a `.env` file in this directory (`getting-started/minio/.env`) to customize credentials and ports. 
Example: + ```env + # Minio Settings + MINIO_ROOT_USER=minioadmin + MINIO_ROOT_PASSWORD=minioadmin + MINIO_API_PORT=9000 + MINIO_CONSOLE_PORT=9001 + + # Minio S3 User Credentials (used by services, created by mc) + POLARIS_S3_USER=polaris_s3_user + POLARIS_S3_PASSWORD=polaris_s3_password_val + SPARK_MINIO_S3_USER=spark_minio_s3_user + SPARK_MINIO_S3_PASSWORD=spark_minio_s3_password_val + TRINO_MINIO_S3_USER=trino_minio_s3_user + TRINO_MINIO_S3_PASSWORD=trino_minio_s3_password_val + + # Polaris Client Credentials (for Spark & Trino to auth to Polaris, created by bootstrap) + SPARK_POLARIS_CLIENT_ID=spark_app_client + SPARK_POLARIS_CLIENT_SECRET=spark_client_secret_val + TRINO_POLARIS_CLIENT_ID=trino_app_client + TRINO_POLARIS_CLIENT_SECRET=trino_client_secret_val + + # Ports + POSTGRES_MINIO_PORT=5433 + POLARIS_MINIO_API_PORT=8183 + POLARIS_MINIO_MGMT_PORT=8184 + SPARK_UI_MINIO_START_PORT=4050 + # SPARK_UI_MINIO_END_PORT=4055 # Not strictly needed if using start port only for mapping range + TRINO_MINIO_PORT=8083 + ``` + +2. **Ensure Scripts are Executable:** + ```bash + chmod +x getting-started/minio/minio-config/setup-minio.sh + chmod +x getting-started/minio/polaris-config/create-catalog-minio.sh + chmod +x getting-started/minio/polaris-config/setup-polaris-governance.sh + ``` + +3. **Start Services:** + Navigate to `getting-started/minio` and run: + ```shell + docker compose up -d --build + ``` + This will start all services, including Minio setup, Polaris bootstrap (creating `root`, `spark_app_client`, `trino_app_client` principals), Polaris catalog creation, and Polaris governance setup (creating roles and assigning grants). Check logs with `docker compose logs -f`. + +4. **Access Minio Console:** + `http://localhost:${MINIO_CONSOLE_PORT:-9001}` (default: `minioadmin`/`minioadmin`). Verify `polaris-bucket`. + +5. **Using Spark SQL (Read/Write Access):** + Attach to Spark: `docker attach spark-sql-minio-gov` (Press ENTER for prompt). 
+ The default catalog is `polaris_minio_gov`. + ```sql + -- Create a namespace governed by Polaris policies + CREATE NAMESPACE IF NOT EXISTS ns_governed + COMMENT 'Namespace for governed data access' + LOCATION 's3a://polaris-bucket/iceberg_warehouse/minio_catalog/ns_governed/'; -- Optional but good practice + + USE ns_governed; + + -- Create an Iceberg table + CREATE TABLE IF NOT EXISTS my_gov_table (id INT, name STRING, value DOUBLE) + USING iceberg + COMMENT 'Governed table for Spark R/W and Trino R/O demo' + TBLPROPERTIES ('format-version'='2'); + + -- Insert data + INSERT INTO my_gov_table VALUES (1, 'SparkRecordOne', 10.1), (2, 'SparkRecordTwo', 20.2); + + -- Select data + SELECT * FROM my_gov_table ORDER BY id; + -- Expected: Shows inserted records. + ``` + +6. **Using Trino CLI (Strict Read-Only Access):** + Access Trino CLI: `docker exec -it minio-trino-gov trino` + The Polaris catalog is mapped to `iceberg` in Trino. + ```sql + SHOW CATALOGS; + -- Expected: iceberg, system, ... + + SHOW SCHEMAS FROM iceberg; + -- Expected: information_schema, ns_governed + + SHOW TABLES FROM iceberg.ns_governed; + -- Expected: my_gov_table + + DESCRIBE iceberg.ns_governed.my_gov_table; + -- Expected: Schema of my_gov_table + + SELECT * FROM iceberg.ns_governed.my_gov_table ORDER BY id; + -- Expected: Shows records inserted by Spark. + + -- Test Read-Only: Attempt to create a table (SHOULD FAIL) + -- CREATE TABLE iceberg.ns_governed.trino_test_table (id INT) WITH (location = 's3a://polaris-bucket/iceberg_warehouse/minio_catalog/ns_governed/trino_test_table/'); + -- Expected: Error from Polaris indicating permission denied for CREATE_TABLE. + + -- Test Read-Only: Attempt to insert data (SHOULD FAIL) + -- INSERT INTO iceberg.ns_governed.my_gov_table VALUES (3, 'TrinoRecord', 30.3); + -- Expected: Error, as Trino's Polaris role and Minio S3 user are read-only. + ``` + +7. 
**Accessing Polaris API (Optional):** + Get token for `trino_app_client` (should have limited scope): + ```shell + export POLARIS_API_ENDPOINT="http://localhost:${POLARIS_MINIO_API_PORT:-8183}" + export TRINO_APP_TOKEN=$(curl -s "${POLARIS_API_ENDPOINT}/api/catalog/v1/oauth/tokens" \ + --user "${TRINO_POLARIS_CLIENT_ID:-trino_app_client}:${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d 'grant_type=client_credentials' \ + -d 'realmName=POLARIS_MINIO_REALM' | jq -r .access_token) + echo "Trino App Token: $TRINO_APP_TOKEN" + + # Try to list tables using Trino's token + curl -v "${POLARIS_API_ENDPOINT}/api/catalog/v1/minio_catalog/namespaces/ns_governed/tables" -H "Authorization: Bearer $TRINO_APP_TOKEN" + # This should succeed. + ``` + +8. **Cleanup:** + ```shell + docker compose down -v + ``` + +This set of scripts and configurations should enforce the desired access controls, with Trino having strictly read-only capabilities. \ No newline at end of file From ab66982fb0211774b905cbb2ca1fe35d614018e6 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Wed, 14 May 2025 12:28:33 +0530 Subject: [PATCH 2/9] feat: Add Minio S3 access policies for Polaris example This commit introduces the JSON policy files required for configuring Minio access control in the Polaris with Minio S3 example. These policies define permissions for different users/services interacting with the Minio bucket. The following policies have been added to `getting-started/minio/minio-config/`: 1. `polaris-s3-rw-policy.json`: Grants Read-Write (R/W) permissions to the Minio bucket. This policy is intended for the `polaris_s3_user`, which the Polaris service itself uses for managing the Iceberg warehouse (e.g., creating namespace directories, managing catalog-level S3 interactions). 2. `spark-minio-rw-policy.json`: Grants Read-Write (R/W) permissions to the Minio bucket. 
This policy is for the `spark_minio_s3_user`, which the Spark engine uses for data plane operations like reading and writing Iceberg table data and metadata files to S3. 3. `trino-minio-ro-policy.json`: Grants Read-Only (R/O) permissions to the Minio bucket. This policy is for the `trino_minio_s3_user`, which the Trino engine uses for data plane operations, specifically reading Iceberg table data and metadata files from S3. These policies will be applied to their respective Minio users by the `setup-minio.sh` script, ensuring a layered security model where Polaris governs metadata access and Minio controls direct object storage access according to the principle of least privilege for each component. --- getting-started/minio/README.md | 14 +++++++++++- .../minio-config/polaris-s3-rw-policy.json | 22 +++++++++++++++++++ .../minio-config/spark-minio-rw-policy.json | 22 +++++++++++++++++++ .../minio-config/trino-minio-ro-policy.json | 17 ++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 getting-started/minio/minio-config/polaris-s3-rw-policy.json create mode 100644 getting-started/minio/minio-config/spark-minio-rw-policy.json create mode 100644 getting-started/minio/minio-config/trino-minio-ro-policy.json diff --git a/getting-started/minio/README.md b/getting-started/minio/README.md index c6eb10ed8e..cf187ab435 100644 --- a/getting-started/minio/README.md +++ b/getting-started/minio/README.md @@ -6,7 +6,19 @@ Polaris uses Postgres for its metadata. Spark SQL is configured for read/write a **Prerequisites:** * Docker and Docker Compose. * `jq` installed on your host machine. -* Apache Polaris images (`apache/polaris-admin-tool:postgres-latest`, `apache/polaris:postgres-latest`) built from source with JDBC support, tagged as `postgres-latest`. 
Run `./gradlew :polaris-quarkus-server:assemble :polaris-quarkus-server:quarkusAppPartsBuild --rerun :polaris-quarkus-admin:assemble :polaris-quarkus-admin:quarkusAppPartsBuild --rerun -Dquarkus.container-image.tag=postgres-latest -Dquarkus.container-image.build=true` from the Polaris repository root. +* Apache Polaris images (`apache/polaris-admin-tool:postgres-latest`, `apache/polaris:postgres-latest`) built from source with JDBC support, tagged as `postgres-latest`. + +Run + +```shell + ./gradlew \ + :polaris-quarkus-server:assemble \ + :polaris-quarkus-server:quarkusAppPartsBuild --rerun \ + :polaris-quarkus-admin:assemble \ + :polaris-quarkus-admin:quarkusAppPartsBuild --rerun \ + -Dquarkus.container-image.tag=postgres-latest \ + -Dquarkus.container-image.build=true +``` **Security Overview:** * **Minio (S3 Storage):** diff --git a/getting-started/minio/minio-config/polaris-s3-rw-policy.json b/getting-started/minio/minio-config/polaris-s3-rw-policy.json new file mode 100644 index 0000000000..c26b1cfe1b --- /dev/null +++ b/getting-started/minio/minio-config/polaris-s3-rw-policy.json @@ -0,0 +1,22 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:ListBucketMultipartUploads", + "s3:AbortMultipartUpload", + "s3:ListMultipartUploadParts" + ], + "Resource": [ + "arn:aws:s3:::polaris-bucket", + "arn:aws:s3:::polaris-bucket/*" + ] + } + ] +} diff --git a/getting-started/minio/minio-config/spark-minio-rw-policy.json b/getting-started/minio/minio-config/spark-minio-rw-policy.json new file mode 100644 index 0000000000..c26b1cfe1b --- /dev/null +++ b/getting-started/minio/minio-config/spark-minio-rw-policy.json @@ -0,0 +1,22 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:GetBucketLocation", + 
"s3:ListBucketMultipartUploads", + "s3:AbortMultipartUpload", + "s3:ListMultipartUploadParts" + ], + "Resource": [ + "arn:aws:s3:::polaris-bucket", + "arn:aws:s3:::polaris-bucket/*" + ] + } + ] +} diff --git a/getting-started/minio/minio-config/trino-minio-ro-policy.json b/getting-started/minio/minio-config/trino-minio-ro-policy.json new file mode 100644 index 0000000000..b1923672fd --- /dev/null +++ b/getting-started/minio/minio-config/trino-minio-ro-policy.json @@ -0,0 +1,17 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket", + "s3:GetBucketLocation" + ], + "Resource": [ + "arn:aws:s3:::polaris-bucket", + "arn:aws:s3:::polaris-bucket/*" + ] + } + ] +} From dea5219abd6fcb83241c4b8ea7dcf6ecc4d847c8 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Wed, 14 May 2025 12:33:30 +0530 Subject: [PATCH 3/9] feat: Add setup script for Minio initialization This commit introduces the `setup-minio.sh` script located in `getting-started/minio/minio-config/`. This script is responsible for bootstrapping the Minio S3 service within the Docker Compose environment for the Polaris example. Its key functions include: - Waiting for the Minio service to become healthy and responsive. - Configuring the Minio client (`mc`) with an alias for the local Minio instance. - Creating the designated S3 bucket (`polaris-bucket`) if it doesn't already exist. - Creating Minio access policies by applying the previously defined JSON policy files: - `polaris-s3-rw-policy.json` - `spark-minio-rw-policy.json` - `trino-minio-ro-policy.json` - Creating three distinct Minio users with their respective credentials (passed as environment variables): - `polaris_s3_user` (for the Polaris service) - `spark_minio_s3_user` (for the Spark engine's data plane access) - `trino_minio_s3_user` (for the Trino engine's data plane access) - Attaching the appropriate access policies to each of these newly created Minio users. 
This script automates the necessary Minio setup steps, ensuring that the object storage is correctly configured with the required users and permissions before other services like Polaris, Spark, and Trino attempt to interact with it.
---
 .../minio/minio-config/setup-minio.sh         | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 getting-started/minio/minio-config/setup-minio.sh

diff --git a/getting-started/minio/minio-config/setup-minio.sh b/getting-started/minio/minio-config/setup-minio.sh
new file mode 100644
index 0000000000..7a1a83c8d5
--- /dev/null
+++ b/getting-started/minio/minio-config/setup-minio.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+# Bootstraps Minio for the Polaris example: waits for the server, creates the
+# polaris-bucket bucket, registers the access policies and creates one Minio
+# user per component (Polaris, Spark, Trino) with its policy attached.
+#
+# Required environment variables:
+#   MINIO_ROOT_USER, MINIO_ROOT_PASSWORD
+#   POLARIS_S3_USER, POLARIS_S3_PASSWORD
+#   SPARK_MINIO_S3_USER, SPARK_MINIO_S3_PASSWORD
+#   TRINO_MINIO_S3_USER, TRINO_MINIO_S3_PASSWORD
+set -eu
+
+# Fail early with a clear message instead of handing empty arguments to mc.
+: "${MINIO_ROOT_USER:?MINIO_ROOT_USER must be set}"
+: "${MINIO_ROOT_PASSWORD:?MINIO_ROOT_PASSWORD must be set}"
+: "${POLARIS_S3_USER:?POLARIS_S3_USER must be set}"
+: "${POLARIS_S3_PASSWORD:?POLARIS_S3_PASSWORD must be set}"
+: "${SPARK_MINIO_S3_USER:?SPARK_MINIO_S3_USER must be set}"
+: "${SPARK_MINIO_S3_PASSWORD:?SPARK_MINIO_S3_PASSWORD must be set}"
+: "${TRINO_MINIO_S3_USER:?TRINO_MINIO_S3_USER must be set}"
+: "${TRINO_MINIO_S3_PASSWORD:?TRINO_MINIO_S3_PASSWORD must be set}"
+
+echo "Waiting for Minio service to start..."
+attempt_counter=0
+max_attempts=20
+until curl -s -f http://minio:9000/minio/health/live > /dev/null; do
+  if [ "${attempt_counter}" -eq "${max_attempts}" ]; then
+    echo "Max attempts reached. Failed to connect to Minio." >&2
+    exit 1
+  fi
+  echo "Attempting to connect to Minio (${attempt_counter}/${max_attempts})..."
+  attempt_counter=$((attempt_counter+1))
+  sleep 3
+done
+echo "Minio service is live."
+
+# Credentials are quoted so that values containing spaces or glob characters
+# reach mc as a single argument.
+mc alias set myminio http://minio:9000 "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"
+mc mb myminio/polaris-bucket --ignore-existing
+
+# Create Minio policies from JSON files
+mc admin policy create myminio polaris-s3-rw-policy /config/polaris-s3-rw-policy.json
+mc admin policy create myminio spark-minio-rw-policy /config/spark-minio-rw-policy.json
+mc admin policy create myminio trino-minio-ro-policy /config/trino-minio-ro-policy.json
+
+# Create Minio user for Polaris Service (R/W)
+mc admin user add myminio "${POLARIS_S3_USER}" "${POLARIS_S3_PASSWORD}"
+mc admin policy attach myminio polaris-s3-rw-policy --user "${POLARIS_S3_USER}"
+
+# Create Minio user for Spark Engine data access (R/W)
+mc admin user add myminio "${SPARK_MINIO_S3_USER}" "${SPARK_MINIO_S3_PASSWORD}"
+mc admin policy attach myminio spark-minio-rw-policy --user "${SPARK_MINIO_S3_USER}"
+
+# Create Minio user for Trino Engine data access (R/O)
+mc admin user add myminio "${TRINO_MINIO_S3_USER}" "${TRINO_MINIO_S3_PASSWORD}"
+mc admin policy attach myminio trino-minio-ro-policy --user "${TRINO_MINIO_S3_USER}"
+
+echo "Minio setup complete: users (${POLARIS_S3_USER}, ${SPARK_MINIO_S3_USER}, ${TRINO_MINIO_S3_USER}) and policies configured."
From 3112d19b3b15507a8ac86d24c2498c181a954722 Mon Sep 17 00:00:00 2001
From: Karthic Rao
Date: Wed, 14 May 2025 12:35:22 +0530
Subject: [PATCH 4/9] feat: Add script to create Minio-backed catalog in Polaris

This commit introduces the `create-catalog-minio.sh` script, located in `getting-started/minio/polaris-config/`. The primary purpose of this script is to configure a new catalog within Apache Polaris that uses Minio S3 as its underlying storage for Iceberg table metadata and data.

Key actions performed by the script:
- Waits for the Polaris service to become healthy and responsive.
- Acquires an administrative access token for the Polaris API using root credentials for the configured realm.
- Defines the configuration for the new catalog, named `minio_catalog`.
This configuration includes: - The S3 warehouse path (e.g., `s3a://polaris-bucket/iceberg_warehouse/minio_catalog`). - S3 connection details, such as the Minio endpoint. - S3 credentials (`POLARIS_S3_USER` and its password) that Polaris will use to interact with the Minio bucket for managing the warehouse structure. - Checks if the `minio_catalog` already exists in Polaris. - If the catalog does not exist, it sends a request to the Polaris Management API to create it with the specified configuration. - Verifies that the catalog creation was successful or that the catalog already existed. This script ensures that Polaris is aware of and configured to manage the Minio S3 storage location as an Iceberg catalog, which is essential before Spark or Trino can interact with tables within this catalog via Polaris. --- .../polaris-config/create-catalog-minio.sh | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 getting-started/minio/polaris-config/create-catalog-minio.sh diff --git a/getting-started/minio/polaris-config/create-catalog-minio.sh b/getting-started/minio/polaris-config/create-catalog-minio.sh new file mode 100644 index 0000000000..9d00d23c12 --- /dev/null +++ b/getting-started/minio/polaris-config/create-catalog-minio.sh @@ -0,0 +1,94 @@ +#!/bin/sh +set -e + +POLARIS_SERVICE_URL="http://polaris:8181" +POLARIS_MGMT_API_URL="${POLARIS_SERVICE_URL}/api/management/v1/catalogs" +POLARIS_TOKEN_URL="${POLARIS_SERVICE_URL}/api/catalog/v1/oauth/tokens" +POLARIS_ADMIN_USER="root" +POLARIS_ADMIN_PASS="s3cr3t" +POLARIS_REALM="POLARIS_MINIO_REALM" + +echo "Waiting for Polaris service to be healthy..." +attempt_counter=0 +max_attempts=20 +until curl -s -f "${POLARIS_SERVICE_URL}/q/health/live" > /dev/null; do + if [ ${attempt_counter} -eq ${max_attempts} ]; then + echo "Max attempts reached. Failed to connect to Polaris health check." + exit 1 + fi + echo "Attempting to connect to Polaris (${attempt_counter}/${max_attempts})..." 
+ attempt_counter=$((attempt_counter+1)) + sleep 5 +done +echo "Polaris service is live." + +echo "Attempting to get Polaris admin token..." +ADMIN_TOKEN_RESPONSE=$(curl -s -w "%{http_code}" -X POST "${POLARIS_TOKEN_URL}" \ + --user "${POLARIS_ADMIN_USER}:${POLARIS_ADMIN_PASS}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=client_credentials" -d "scope=PRINCIPAL_ROLE:ALL" -d "realmName=${POLARIS_REALM}") + +HTTP_CODE=$(echo "$ADMIN_TOKEN_RESPONSE" | tail -n1) +TOKEN_BODY=$(echo "$ADMIN_TOKEN_RESPONSE" | sed '$d') + +if [ "$HTTP_CODE" -ne 200 ]; then + echo "Failed to get Polaris admin token. HTTP Code: $HTTP_CODE. Response:" + echo "$TOKEN_BODY" + exit 1 +fi +ADMIN_TOKEN=$(echo "$TOKEN_BODY" | jq -r .access_token) +if [ -z "$ADMIN_TOKEN" ] || [ "$ADMIN_TOKEN" = "null" ]; then echo "Failed to parse admin token"; exit 1; fi +echo "Polaris admin token obtained." + +CATALOG_NAME="minio_catalog" +BUCKET_NAME="polaris-bucket" +CATALOG_WAREHOUSE_PATH="s3a://${BUCKET_NAME}/iceberg_warehouse/${CATALOG_NAME}" + +S3_ACCESS_KEY="${POLARIS_S3_USER}" # Polaris service's S3 user +S3_SECRET_KEY="${POLARIS_S3_PASSWORD}" +S3_ENDPOINT="http://minio:9000" + +CREATE_CATALOG_PAYLOAD=$(cat < Date: Wed, 14 May 2025 12:36:36 +0530 Subject: [PATCH 5/9] feat: Implement Polaris governance setup script This commit adds the `setup-polaris-governance.sh` script to `getting-started/minio/polaris-config/`. This script is responsible for configuring the access control policies within Apache Polaris for the Minio S3 example. Key functionalities of this script include: - Ensuring the Polaris service is operational before proceeding. - Obtaining an administrative token to interact with the Polaris Management API. - Defining and creating two distinct principal roles within Polaris: - `polaris_spark_role`: Intended for Spark, with read/write capabilities. - `polaris_trino_role`: Intended for Trino, with strict read-only capabilities. 
- Assigning the pre-bootstrapped client principals (`spark_app_client` and `trino_app_client`) to their respective roles (`polaris_spark_role` and `polaris_trino_role`).
- Granting fine-grained privileges to these roles on Polaris resources (the `minio_catalog` and the `ns_governed` namespace):
    - The `polaris_spark_role` receives permissions to use the catalog, create namespaces, use namespaces, and perform full CRUD (Create, Read, Update, Delete) operations on tables and their data within `ns_governed`.
    - The `polaris_trino_role` receives permissions to use the catalog, use the `ns_governed` namespace, and read table metadata and data within that namespace. It is explicitly NOT granted any write, create, alter, or delete permissions.

This script is crucial for demonstrating Polaris's governance capabilities by centrally defining and enforcing different access levels for Spark and Trino when interacting with the Iceberg tables managed by Polaris and stored in Minio.
---
 .../setup-polaris-governance.sh               | 161 ++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 getting-started/minio/polaris-config/setup-polaris-governance.sh

diff --git a/getting-started/minio/polaris-config/setup-polaris-governance.sh b/getting-started/minio/polaris-config/setup-polaris-governance.sh
new file mode 100644
index 0000000000..d75a7cf457
--- /dev/null
+++ b/getting-started/minio/polaris-config/setup-polaris-governance.sh
@@ -0,0 +1,161 @@
+#!/bin/sh
+# Configures Polaris access control for the Minio example: creates the Spark
+# and Trino principal roles, assigns the client principals to them and issues
+# the catalog/namespace/table grant requests.
+#
+# Optional environment variables:
+#   SPARK_POLARIS_CLIENT_ID  (default: spark_app_client)
+#   TRINO_POLARIS_CLIENT_ID  (default: trino_app_client)
+#
+# Management API calls are best-effort: a call that returns an unexpected
+# status is reported and counted, but does not abort the script.
+set -e
+
+POLARIS_SERVICE_URL="http://polaris:8181"
+POLARIS_MGMT_API_URL_BASE="${POLARIS_SERVICE_URL}/api/management/v1"
+POLARIS_TOKEN_URL="${POLARIS_SERVICE_URL}/api/catalog/v1/oauth/tokens"
+POLARIS_ADMIN_USER="root"
+POLARIS_ADMIN_PASS="s3cr3t"
+POLARIS_REALM="POLARIS_MINIO_REALM"
+
+CATALOG_NAME="minio_catalog"
+NAMESPACE_NAME="ns_governed"
+
+# Polaris client IDs (assumed to be created by polaris-bootstrap-minio)
+SPARK_POLARIS_CLIENT_ID="${SPARK_POLARIS_CLIENT_ID:-spark_app_client}"
+TRINO_POLARIS_CLIENT_ID="${TRINO_POLARIS_CLIENT_ID:-trino_app_client}"
+
+# Polaris Principal Role names
+SPARK_ROLE_NAME="polaris_spark_role"
+TRINO_ROLE_NAME="polaris_trino_role"
+
+# Scratch file for API response bodies; removed on every exit path.
+API_RESPONSE_FILE=$(mktemp)
+trap 'rm -f -- "$API_RESPONSE_FILE"' EXIT
+
+# Number of management API calls that returned an unexpected status.
+api_failures=0
+
+echo "Waiting for Polaris service..."
+attempt_counter=0
+max_attempts=20
+# Without -f, curl succeeds on any HTTP response, so this loop only waits for
+# Polaris to answer HTTP requests; the status code is irrelevant here.
+until curl -s -o /dev/null "${POLARIS_TOKEN_URL}"; do
+  if [ "${attempt_counter}" -eq "${max_attempts}" ]; then
+    echo "Max attempts reached. Failed to connect to Polaris." >&2
+    exit 1
+  fi
+  echo "Attempting to connect to Polaris (${attempt_counter}/${max_attempts})..."
+  attempt_counter=$((attempt_counter+1))
+  sleep 5
+done
+echo "Polaris service is live."
+
+echo "Acquiring Polaris admin token..."
+# The "\n" in -w puts the status code on its own line, so it can be split from
+# the response body even when the body has no trailing newline.
+ADMIN_TOKEN_RESPONSE=$(curl -s -w '\n%{http_code}' -X POST "${POLARIS_TOKEN_URL}" \
+  --user "${POLARIS_ADMIN_USER}:${POLARIS_ADMIN_PASS}" \
+  -H "Content-Type: application/x-www-form-urlencoded" \
+  -d "grant_type=client_credentials" -d "scope=PRINCIPAL_ROLE:ALL" -d "realmName=${POLARIS_REALM}") || {
+  echo "Failed to reach the Polaris token endpoint." >&2
+  exit 1
+}
+HTTP_CODE=$(printf '%s\n' "$ADMIN_TOKEN_RESPONSE" | tail -n 1)
+TOKEN_BODY=$(printf '%s\n' "$ADMIN_TOKEN_RESPONSE" | sed '$d')
+if [ "$HTTP_CODE" != "200" ]; then
+  echo "Failed to get Polaris admin token. HTTP Code: $HTTP_CODE" >&2
+  exit 1
+fi
+ADMIN_TOKEN=$(printf '%s\n' "$TOKEN_BODY" | jq -r .access_token)
+if [ -z "$ADMIN_TOKEN" ] || [ "$ADMIN_TOKEN" = "null" ]; then
+  echo "Failed to parse admin token" >&2
+  exit 1
+fi
+echo "Admin token acquired."
+
+# polaris_api_call METHOD ENDPOINT PAYLOAD EXPECTED_STATUS [ALT_STATUS]
+#   METHOD           HTTP method, e.g. POST or PUT
+#   ENDPOINT         path appended to POLARIS_MGMT_API_URL_BASE
+#   PAYLOAD          JSON request body; an empty string sends no body
+#   EXPECTED_STATUS  status code that means success
+#   ALT_STATUS       second acceptable status code (default: 409)
+# Prints the request, the status code and the response body. Always returns 0;
+# an unexpected status increments api_failures instead of aborting.
+polaris_api_call() {
+  api_method="$1"
+  api_endpoint="$2"
+  api_payload="$3"
+  api_expected_primary="$4"
+  api_expected_secondary="${5:-409}" # Typically 409 Conflict for already exists
+
+  api_url="${POLARIS_MGMT_API_URL_BASE}${api_endpoint}"
+  echo "Calling: $api_method $api_url"
+
+  # Build the curl argument list once; the body is appended only when present.
+  set -- -s -o "$API_RESPONSE_FILE" -w '%{http_code}' \
+    -X "$api_method" -H "Authorization: Bearer $ADMIN_TOKEN" \
+    -H "Content-Type: application/json" "$api_url"
+  if [ -n "$api_payload" ]; then
+    echo "Payload: $api_payload"
+    set -- "$@" -d "$api_payload"
+  fi
+
+  : > "$API_RESPONSE_FILE" # never show a stale body from a previous call
+  # curl reports 000 via -w when no HTTP response arrived; the fallback keeps
+  # set -e from aborting so the failure is counted like any other.
+  api_status=$(curl "$@") || api_status="000"
+
+  echo "Response Code: $api_status. Body:"
+  cat "$API_RESPONSE_FILE"
+  if [ "$api_status" -ne "$api_expected_primary" ] && [ "$api_status" -ne "$api_expected_secondary" ]; then
+    echo "Error: API call failed. Expected $api_expected_primary or $api_expected_secondary, Got $api_status." >&2
+    api_failures=$((api_failures+1))
+  else
+    echo "API call successful or resource already exists (HTTP $api_status)."
+  fi
+  echo ""
+}
+
+# NOTE(review): the endpoint paths and payload shapes used below should be
+# confirmed against the Polaris management API spec; unexpected statuses are
+# only counted, so a mismatch shows up in the summary at the end.
+
+# 1. Create Principal Roles
+polaris_api_call "POST" "/principal-roles" "{\"name\": \"${SPARK_ROLE_NAME}\"}" 201
+polaris_api_call "POST" "/principal-roles" "{\"name\": \"${TRINO_ROLE_NAME}\"}" 201
+
+# 2. Assign Principals (Client IDs) to Roles
+# Assumes SPARK_POLARIS_CLIENT_ID and TRINO_POLARIS_CLIENT_ID are valid principal names created by bootstrap
+polaris_api_call "PUT" "/principal-roles/${SPARK_ROLE_NAME}/principals/${SPARK_POLARIS_CLIENT_ID}" "" 204 200 # 200 if already assigned
+polaris_api_call "PUT" "/principal-roles/${TRINO_ROLE_NAME}/principals/${TRINO_POLARIS_CLIENT_ID}" "" 204 200
+
+# 3. Grant Privileges to Roles
+
+# --- Spark Role Grants (R/W) ---
+# Catalog grants for Spark
+polaris_api_call "POST" "/principal-roles/${SPARK_ROLE_NAME}/catalog-grants" \
+  "{\"catalogName\": \"${CATALOG_NAME}\", \"privileges\": [\"USE_CATALOG\", \"CREATE_NAMESPACE\"]}" 201
+# Namespace grants for Spark on ns_governed
+polaris_api_call "POST" "/principal-roles/${SPARK_ROLE_NAME}/grants" \
+  "{\"grantResource\":{\"resourceType\":\"NAMESPACE\",\"identifierParts\":[\"${CATALOG_NAME}\",\"${NAMESPACE_NAME}\"]},\"privileges\":[\"USE_NAMESPACE\",\"CREATE_TABLE\",\"DROP_TABLE\",\"ALTER_TABLE\"]}" 201
+# Table grants for Spark on tables under ns_governed
+polaris_api_call "POST" "/principal-roles/${SPARK_ROLE_NAME}/grants" \
+  "{\"grantResource\":{\"resourceType\":\"TABLE\",\"identifierParts\":[\"${CATALOG_NAME}\",\"${NAMESPACE_NAME}\",\"*\"]},\"privileges\":[\"READ_TABLE_METADATA\",\"READ_TABLE_DATA\",\"WRITE_TABLE_DATA\"]}" 201
+
+# --- Trino Role Grants (R/O) ---
+# Catalog grants for Trino
+polaris_api_call "POST" "/principal-roles/${TRINO_ROLE_NAME}/catalog-grants" \
+  "{\"catalogName\": \"${CATALOG_NAME}\", \"privileges\": [\"USE_CATALOG\"]}" 201
+# Namespace grants for Trino on ns_governed
+polaris_api_call "POST" "/principal-roles/${TRINO_ROLE_NAME}/grants" \
+  "{\"grantResource\":{\"resourceType\":\"NAMESPACE\",\"identifierParts\":[\"${CATALOG_NAME}\",\"${NAMESPACE_NAME}\"]},\"privileges\":[\"USE_NAMESPACE\"]}" 201
+# Table grants for Trino on tables under ns_governed (strictly read-only)
+polaris_api_call "POST" "/principal-roles/${TRINO_ROLE_NAME}/grants" \
+  "{\"grantResource\":{\"resourceType\":\"TABLE\",\"identifierParts\":[\"${CATALOG_NAME}\",\"${NAMESPACE_NAME}\",\"*\"]},\"privileges\":[\"READ_TABLE_METADATA\",\"READ_TABLE_DATA\"]}" 201
+
+if [ "$api_failures" -gt 0 ]; then
+  echo "Warning: ${api_failures} Polaris API call(s) returned an unexpected status; review the output above." >&2
+fi
+echo "Polaris governance setup script completed."
From 2b501ab125fcf57dbb27378147f342a492129548 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Wed, 14 May 2025 14:17:23 +0530 Subject: [PATCH 6/9] feat: Configure Trino Iceberg connector for Polaris and Minio (Read-Only) This commit adds the `iceberg.properties` file to `getting-started/minio/trino-catalog/`. This file configures the Trino Iceberg connector to integrate with Apache Polaris for metadata management and Minio for data storage, specifically enforcing read-only access for Trino. Key configurations in this file include: - Setting the connector name to `iceberg`. - Defining the Iceberg catalog type as `rest`, with the URI pointing to the internal Polaris service endpoint (`http://polaris:8181/api/catalog`). - Mapping Trino's `iceberg` catalog to the `minio_catalog` defined within Polaris. - Configuring OAuth2 authentication for Trino to securely connect to Polaris, using `TRINO_POLARIS_CLIENT_ID` and `TRINO_POLARIS_CLIENT_SECRET`. These credentials correspond to a Polaris principal associated with a read-only role. - Specifying S3 connection details for Trino's data plane operations, including the Minio endpoint and credentials (`TRINO_MINIO_S3_USER` and `TRINO_MINIO_S3_PASSWORD`) that have read-only permissions at the Minio S3 level. - Enabling Hadoop filesystem support (`fs.hadoop.enabled=true`) as required for S3 interaction. This configuration ensures that Trino can discover and query Iceberg tables managed by Polaris, with data residing in Minio, while adhering to the strict read-only access policies defined both in Polaris and Minio. 
---
 .../minio/trino-catalog/iceberg.properties    | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 getting-started/minio/trino-catalog/iceberg.properties

diff --git a/getting-started/minio/trino-catalog/iceberg.properties b/getting-started/minio/trino-catalog/iceberg.properties
new file mode 100644
index 0000000000..5379e48df9
--- /dev/null
+++ b/getting-started/minio/trino-catalog/iceberg.properties
@@ -0,0 +1,33 @@
+connector.name=iceberg
+iceberg.catalog.type=rest
+iceberg.rest-catalog.uri=http://polaris:8181/api/catalog
+iceberg.rest-catalog.warehouse=minio_catalog
+
+# NOTE(review): confirm that every key below exists in the Trino version used
+# by docker-compose.yml - Trino rejects catalog properties it does not
+# recognize.
+
+# Authentication for Polaris REST catalog (Trino authenticates to Polaris)
+# ${ENV:NAME} is Trino's syntax for reading a value from an environment variable.
+iceberg.rest-catalog.security=OAUTH2
+iceberg.rest-catalog.oauth2.token-endpoint=http://polaris:8181/api/catalog/v1/oauth/tokens
+iceberg.rest-catalog.oauth2.client-id=${ENV:TRINO_POLARIS_CLIENT_ID}
+iceberg.rest-catalog.oauth2.client-secret=${ENV:TRINO_POLARIS_CLIENT_SECRET}
+iceberg.rest-catalog.oauth2.additional-parameters=realmName=POLARIS_MINIO_REALM
+
+# S3 configuration for Trino engine to access data in Minio (data plane - uses R/O Minio credentials)
+iceberg.s3.endpoint=http://minio:9000
+iceberg.s3.path-style-access=true
+iceberg.s3.aws-access-key=${ENV:TRINO_MINIO_S3_USER}
+iceberg.s3.aws-secret-key=${ENV:TRINO_MINIO_S3_PASSWORD}
+# Dummy region for Minio. Kept on its own line: a trailing "# ..." after a
+# value is not a comment in a .properties file, it becomes part of the value.
+iceberg.s3.region=us-east-1
+
+# Hive S3 settings, often used by Trino Iceberg connector
+hive.s3.endpoint=http://minio:9000
+hive.s3.path-style-access=true
+hive.s3.aws.access-key=${ENV:TRINO_MINIO_S3_USER}
+hive.s3.aws.secret-key=${ENV:TRINO_MINIO_S3_PASSWORD}
+
+fs.hadoop.enabled=true
From 7ad08d36e6cd8ff61127f89f8884bcc8898e18c7 Mon Sep 17 00:00:00 2001
From: Karthic Rao
Date: Wed, 14 May 2025 14:17:54 +0530
Subject: [PATCH 7/9] feat: Add Docker Compose setup for Polaris with Minio, Spark & Trino governance example

This commit introduces the `docker-compose.yml` file for the `getting-started/minio` example.
This file orchestrates the deployment of a multi-container environment to demonstrate Apache Polaris managing an Iceberg data lake on Minio S3, with governed access for Spark (read/write) and Trino (read-only). The Docker Compose setup includes the following services: - `minio`: Provides S3-compatible object storage. - `mc`: Minio client used to initialize buckets, users, and policies in Minio. - `postgres-minio`: PostgreSQL database instance for Polaris metadata. - `polaris-bootstrap-minio`: Bootstraps the Polaris database and creates initial principals for root admin, Spark client, and Trino client. - `polaris`: The Apache Polaris catalog server. - `polaris-setup-catalog-minio`: A utility service to create the `minio_catalog` within Polaris, configured to use the Minio backend. - `polaris-setup-governance`: A utility service to apply fine-grained access control policies (roles and grants) within Polaris for Spark and Trino. - `spark-sql-minio`: Apache Spark SQL shell, configured to interact with Polaris for R/W operations on Iceberg tables. - `trino-minio`: Trino server, configured to interact with Polaris for R/O query operations on Iceberg tables. Key aspects of this Docker Compose configuration: - Defines service dependencies (`depends_on`) to ensure a correct startup order. - Manages network configuration for inter-service communication. - Utilizes volume mounts for persistent data (Postgres, Minio) and for injecting configuration files. - Employs environment variables for passing credentials, S3 user details, Polaris client IDs/secrets, and other settings, with sensible defaults provided. - Includes health checks for critical services like Minio and Polaris. - Uses specific, recent image versions for Minio, mc, Postgres, Spark, and Trino to ensure stability and reproducibility. 
This setup provides a complete, self-contained environment to test and demonstrate the end-to-end functionality of Polaris, including its governance features, with Minio as the storage backend and Spark/Trino as data processing engines. --- getting-started/minio/docker-compose.yml | 209 +++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 getting-started/minio/docker-compose.yml diff --git a/getting-started/minio/docker-compose.yml b/getting-started/minio/docker-compose.yml new file mode 100644 index 0000000000..6d5061a4e0 --- /dev/null +++ b/getting-started/minio/docker-compose.yml @@ -0,0 +1,209 @@ +version: '3.8' + +services: + minio: + image: minio/minio:RELEASE.2024-05-03T15-18-24Z # Using a specific recent version + ports: + - "${MINIO_API_PORT:-9000}:9000" + - "${MINIO_CONSOLE_PORT:-9001}:9001" + volumes: + - minio_data:/data + environment: + MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} + # Credentials for Minio users (to be created by mc) + POLARIS_S3_USER: ${POLARIS_S3_USER:-polaris_s3_user} + POLARIS_S3_PASSWORD: ${POLARIS_S3_PASSWORD:-polaris_s3_password_val} # Changed default + SPARK_MINIO_S3_USER: ${SPARK_MINIO_S3_USER:-spark_minio_s3_user} + SPARK_MINIO_S3_PASSWORD: ${SPARK_MINIO_S3_PASSWORD:-spark_minio_s3_password_val} # Changed default + TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} + TRINO_MINIO_S3_PASSWORD: ${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} # Changed default + command: server /data --console-address ":9001" + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:9000/minio/health/live"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + mc: + image: minio/mc:RELEASE.2024-05-02T06-20-15Z # Using a specific recent version + depends_on: + minio: + condition: service_healthy + volumes: + - ./minio-config:/config + entrypoint: /bin/sh + command: /config/setup-minio.sh + environment: + MINIO_ROOT_USER: 
${MINIO_ROOT_USER:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} + POLARIS_S3_USER: ${POLARIS_S3_USER:-polaris_s3_user} + POLARIS_S3_PASSWORD: ${POLARIS_S3_PASSWORD:-polaris_s3_password_val} + SPARK_MINIO_S3_USER: ${SPARK_MINIO_S3_USER:-spark_minio_s3_user} + SPARK_MINIO_S3_PASSWORD: ${SPARK_MINIO_S3_PASSWORD:-spark_minio_s3_password_val} + TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} + TRINO_MINIO_S3_PASSWORD: ${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} + + postgres-minio: + image: postgres:17.4 # Using a specific recent version for stability + ports: + - "${POSTGRES_MINIO_PORT:-5433}:5432" + shm_size: 128mb + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: POLARIS_MINIO + POSTGRES_INITDB_ARGS: "--encoding UTF8 --data-checksums" + volumes: + - ../assets/postgres/postgresql.conf:/etc/postgresql/postgresql.conf + - postgres_minio_data:/var/lib/postgresql/data + command: ["postgres", "-c", "config_file=/etc/postgresql/postgresql.conf"] + healthcheck: + test: "pg_isready -U postgres -d POLARIS_MINIO" + interval: 5s + timeout: 2s + retries: 15 + + polaris-bootstrap-minio: + image: apache/polaris-admin-tool:postgres-latest # Assumes image built from Polaris source + depends_on: + postgres-minio: + condition: service_healthy + environment: + polaris.persistence.type: relational-jdbc + quarkus.datasource.db-kind: pgsql + quarkus.datasource.jdbc.url: jdbc:postgresql://postgres-minio:5432/POLARIS_MINIO + quarkus.datasource.username: postgres + quarkus.datasource.password: postgres + # Polaris client credentials (to be created by bootstrap) + SPARK_POLARIS_CLIENT_ID: ${SPARK_POLARIS_CLIENT_ID:-spark_app_client} + SPARK_POLARIS_CLIENT_SECRET: ${SPARK_POLARIS_CLIENT_SECRET:-spark_client_secret_val} + TRINO_POLARIS_CLIENT_ID: ${TRINO_POLARIS_CLIENT_ID:-trino_app_client} + TRINO_POLARIS_CLIENT_SECRET: ${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val} + command: > # Using > for 
multi-line command + bootstrap + --realm=POLARIS_MINIO_REALM + --credential=POLARIS_MINIO_REALM,root,s3cr3t + --credential=POLARIS_MINIO_REALM,${SPARK_POLARIS_CLIENT_ID:-spark_app_client},${SPARK_POLARIS_CLIENT_SECRET:-spark_client_secret_val} + --credential=POLARIS_MINIO_REALM,${TRINO_POLARIS_CLIENT_ID:-trino_app_client},${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val} + + polaris: + image: apache/polaris:postgres-latest # Assumes image built from Polaris source + depends_on: + polaris-bootstrap-minio: + condition: service_completed_successfully + postgres-minio: + condition: service_healthy + mc: + condition: service_completed_successfully + ports: + - "${POLARIS_MINIO_API_PORT:-8183}:8181" + - "${POLARIS_MINIO_MGMT_PORT:-8184}:8182" + environment: + polaris.persistence.type: relational-jdbc + quarkus.datasource.db-kind: pgsql + quarkus.datasource.jdbc.url: jdbc:postgresql://postgres-minio:5432/POLARIS_MINIO + quarkus.datasource.username: postgres + quarkus.datasource.password: postgres + polaris.realm-context.realms: POLARIS_MINIO_REALM + quarkus.otel.sdk.disabled: "true" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8182/q/health/live"] + interval: 5s + timeout: 3s + retries: 10 + start_period: 10s + + polaris-setup-catalog-minio: + image: alpine/curl:3.19 # Specific version + depends_on: + polaris: condition: service_healthy + volumes: + - ./polaris-config:/polaris-config + entrypoint: /bin/sh + command: '-c "apk add --no-cache jq && chmod +x /polaris-config/create-catalog-minio.sh && /polaris-config/create-catalog-minio.sh"' + environment: + POLARIS_S3_USER: ${POLARIS_S3_USER:-polaris_s3_user} + POLARIS_S3_PASSWORD: ${POLARIS_S3_PASSWORD:-polaris_s3_password_val} + + polaris-setup-governance: + image: alpine/curl:3.19 # Specific version + depends_on: + polaris-setup-catalog-minio: + condition: service_completed_successfully + volumes: + - ./polaris-config:/polaris-config + entrypoint: /bin/sh + command: '-c "apk add --no-cache jq && 
chmod +x /polaris-config/setup-polaris-governance.sh && /polaris-config/setup-polaris-governance.sh"' + environment: + SPARK_POLARIS_CLIENT_ID: ${SPARK_POLARIS_CLIENT_ID:-spark_app_client} + TRINO_POLARIS_CLIENT_ID: ${TRINO_POLARIS_CLIENT_ID:-trino_app_client} + + spark-sql-minio: + image: apache/spark:3.5.1 # Using a specific Spark 3.5.x version + container_name: spark-sql-minio-gov + depends_on: + polaris-setup-governance: + condition: service_completed_successfully + minio: condition: service_healthy + stdin_open: true + tty: true + ports: + - "${SPARK_UI_MINIO_START_PORT:-4050}-${SPARK_UI_MINIO_END_PORT:-4055}:4040-4045" + environment: + # Minio S3 credentials for Spark data plane (R/W) + AWS_ACCESS_KEY_ID: ${SPARK_MINIO_S3_USER:-spark_minio_s3_user} + AWS_SECRET_ACCESS_KEY: ${SPARK_MINIO_S3_PASSWORD:-spark_minio_s3_password_val} + # Polaris client credentials for Spark control plane + SPARK_POLARIS_CLIENT_ID_ENV: ${SPARK_POLARIS_CLIENT_ID:-spark_app_client} + SPARK_POLARIS_CLIENT_SECRET_ENV: ${SPARK_POLARIS_CLIENT_SECRET:-spark_client_secret_val} + command: [ + "/opt/spark/bin/spark-sql", + "--packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,software.amazon.awssdk:bundle:2.25.31,software.amazon.awssdk:url-connection-client:2.25.31,org.apache.hadoop:hadoop-aws:3.3.6", # Updated Iceberg, AWS SDK versions + "--conf", "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", + "--conf", "spark.sql.catalog.polaris_minio_gov=org.apache.iceberg.spark.SparkCatalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.catalog-impl=org.apache.iceberg.rest.RESTCatalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.uri=http://polaris:8181/api/catalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.type=oauth2", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.grant-type=client_credentials", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.client-id=${SPARK_POLARIS_CLIENT_ID_ENV}", + 
"--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.client-secret=${SPARK_POLARIS_CLIENT_SECRET_ENV}", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.token-endpoint=http://polaris:8181/api/catalog/v1/oauth/tokens", + "--conf", "spark.sql.catalog.polaris_minio_gov.auth.oauth2.additional-parameters=realmName=POLARIS_MINIO_REALM", + "--conf", "spark.sql.catalog.polaris_minio_gov.warehouse=minio_catalog", + "--conf", "spark.sql.catalog.polaris_minio_gov.io-impl=org.apache.iceberg.aws.s3.S3FileIO", + "--conf", "spark.hadoop.fs.s3a.endpoint=http://minio:9000", + "--conf", "spark.hadoop.fs.s3a.path.style.access=true", + "--conf", "spark.hadoop.fs.s3a.connection.ssl.enabled=false", + "--conf", "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem", + "--conf", "spark.sql.defaultCatalog=polaris_minio_gov", + "--conf", "spark.driver.extraJavaOptions=-Divy.cache.dir=/tmp -Divy.home=/tmp" + ] + + trino-minio: + image: trinodb/trino:449 # Using a specific Trino version + container_name: minio-trino-gov + depends_on: + polaris-setup-governance: + condition: service_completed_successfully + minio: condition: service_healthy + ports: + - "${TRINO_MINIO_PORT:-8083}:8080" + volumes: + - ./trino-catalog:/etc/trino/catalog + environment: + # Polaris client credentials for Trino control plane (R/O role in Polaris) + TRINO_POLARIS_CLIENT_ID: ${TRINO_POLARIS_CLIENT_ID:-trino_app_client} + TRINO_POLARIS_CLIENT_SECRET: ${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val} + # Minio S3 credentials for Trino data plane (R/O Minio user) + TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} + TRINO_MINIO_S3_PASSWORD: ${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} + +volumes: + minio_data: + postgres_minio_data: + +networks: + default: + name: polaris_minio_gov_network From 82883f5ec6173e2d13e1455c133374f7ccff3e84 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Wed, 14 May 2025 14:39:17 +0530 Subject: [PATCH 8/9] Adding default .env. 
--- getting-started/minio/.env | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 getting-started/minio/.env diff --git a/getting-started/minio/.env b/getting-started/minio/.env new file mode 100644 index 0000000000..cd95e5afd4 --- /dev/null +++ b/getting-started/minio/.env @@ -0,0 +1,42 @@ +# .env +# Default environment variables for Polaris Minio S3 example + +# Minio Root Credentials (used by minio service and mc script) +MINIO_ROOT_USER=minioadmin +MINIO_ROOT_PASSWORD=minioadmin + +# Minio S3 User Credentials (created by mc script, used by services) +POLARIS_S3_USER=polaris_s3_user +POLARIS_S3_PASSWORD=polaris_s3_password_val + +SPARK_MINIO_S3_USER=spark_minio_s3_user +SPARK_MINIO_S3_PASSWORD=spark_minio_s3_password_val + +TRINO_MINIO_S3_USER=trino_minio_s3_user +TRINO_MINIO_S3_PASSWORD=trino_minio_s3_password_val + +# Polaris Client Credentials (created by polaris-bootstrap-minio, used by Spark/Trino to authenticate to Polaris) +# These are used by polaris-bootstrap-minio and polaris-setup-governance directly +SPARK_POLARIS_CLIENT_ID=spark_app_client +SPARK_POLARIS_CLIENT_SECRET=spark_client_secret_val + +TRINO_POLARIS_CLIENT_ID=trino_app_client +TRINO_POLARIS_CLIENT_SECRET=trino_client_secret_val + +# These specific _ENV suffixed versions are referenced by the spark-sql-minio service environment block +# Setting them explicitly here to match the defaults and avoid Docker Compose warnings. +SPARK_POLARIS_CLIENT_ID_ENV=spark_app_client +SPARK_POLARIS_CLIENT_SECRET_ENV=spark_client_secret_val + +# Port Mappings (defaults used in docker-compose.yml) +MINIO_API_PORT=9000 +MINIO_CONSOLE_PORT=9001 +POSTGRES_MINIO_PORT=5433 +POLARIS_MINIO_API_PORT=8183 +POLARIS_MINIO_MGMT_PORT=8184 +SPARK_UI_MINIO_START_PORT=4050 +SPARK_UI_MINIO_END_PORT=4055 # Used in port range mapping +TRINO_MINIO_PORT=8083 + +# You can change these values if needed, but these align with the defaults +# in the docker-compose.yml and associated scripts. 
From f048bcd7847d55cf36daf28636b8d1e004549094 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Thu, 15 May 2025 09:06:34 +0530 Subject: [PATCH 9/9] refactor: Switch Polaris to in-memory store for Minio example This commit refactors the `getting-started/minio` example to configure the main Apache Polaris server to use an in-memory metastore. This simplifies the setup by removing the dependency on PostgreSQL for Polaris's own metadata, making it lighter for a getting-started experience and to isolate previous database connection issues. Key changes include: 1. **Docker Compose (`docker-compose.yml`):** * Removed the `postgres-minio` and `polaris-bootstrap-minio` services. * Updated the `polaris` service: * Removed `depends_on: postgres-minio`. * Environment variables are now configured to set `POLARIS_PERSISTENCE_TYPE` to `in-memory`. * Added `POLARIS_BOOTSTRAP_CREDENTIALS` to allow the in-memory Polaris instance to initialize with known `root` credentials. * Removed PostgreSQL-specific `QUARKUS_DATASOURCE_*` variables from its environment block, relying on values from the `.env` file for other settings. * Updated health check timings and port references. * Adjusted `depends_on` for `polaris-setup-catalog-minio` and `polaris-setup-governance` to depend directly on the `polaris` service. * Updated image tags for `minio/mc` and `minio/minio` to `latest`. * Removed `version: '3.8'` as it's obsolete. 2. **Environment File (`.env`):** * Set `POLARIS_PERSISTENCE_TYPE=in-memory`. * Added `POLARIS_BOOTSTRAP_CREDENTIALS="POLARIS_MINIO_REALM,root,s3cr3t"`. * Commented out/removed PostgreSQL specific `QUARKUS_DATASOURCE_*` variables (as they are not needed for the in-memory `polaris` service). * Ensured other necessary variables (ports, client IDs/secrets for setup scripts) are present. 3. **Minio Setup Script (`minio-config/setup-minio.sh`):** * Removed the `curl`-based health check loop, relying on Docker Compose's `depends_on: minio: condition: service_healthy`. 4. 
**Polaris Governance Script (`polaris-config/setup-polaris-governance.sh`):** * Added conceptual API calls to create `spark_app_client` and `trino_app_client` principals and their credentials using the `root` token, as these are no longer created by a dedicated bootstrap service. (Note: These API calls are illustrative and depend on actual Polaris API structure). These changes aim to provide a working "getting started" example using an in-memory Polaris server, which simplifies deployment and focuses on Polaris's interaction with Minio and its governance features for Spark and Trino. The removal of the PostgreSQL dependency for the Polaris server itself should resolve previous H2 fallback issues. --- .../jdbc/docker-compose-bootstrap-db.yml | 9 +- getting-started/jdbc/docker-compose.yml | 66 ++++++--- getting-started/minio/.env | 31 +++- getting-started/minio/docker-compose.yml | 136 ++++++++---------- .../minio/minio-config/setup-minio.sh | 14 -- .../polaris-config/create-catalog-minio.sh | 0 .../setup-polaris-governance.sh | 17 +++ 7 files changed, 147 insertions(+), 126 deletions(-) mode change 100644 => 100755 getting-started/minio/minio-config/setup-minio.sh mode change 100644 => 100755 getting-started/minio/polaris-config/create-catalog-minio.sh mode change 100644 => 100755 getting-started/minio/polaris-config/setup-polaris-governance.sh diff --git a/getting-started/jdbc/docker-compose-bootstrap-db.yml b/getting-started/jdbc/docker-compose-bootstrap-db.yml index d23235b2d5..4d74e18cba 100644 --- a/getting-started/jdbc/docker-compose-bootstrap-db.yml +++ b/getting-started/jdbc/docker-compose-bootstrap-db.yml @@ -26,11 +26,10 @@ services: - QUARKUS_DATASOURCE_JDBC_URL=${QUARKUS_DATASOURCE_JDBC_URL} - QUARKUS_DATASOURCE_USERNAME=${QUARKUS_DATASOURCE_USERNAME} - QUARKUS_DATASOURCE_PASSWORD=${QUARKUS_DATASOURCE_PASSWORD} - command: - - "bootstrap" - - "--realm=POLARIS" - - "--credential=POLARIS,root,s3cr3t" - + command:> + bootstrap +
--realm=POLARIS_MINIO_REALM + --credential=POLARIS_MINIO_REALM,root,s3cr3t polaris: depends_on: polaris-bootstrap: diff --git a/getting-started/jdbc/docker-compose.yml b/getting-started/jdbc/docker-compose.yml index fbfd427ee2..1d480764f9 100644 --- a/getting-started/jdbc/docker-compose.yml +++ b/getting-started/jdbc/docker-compose.yml @@ -21,32 +21,52 @@ services: polaris: image: apache/polaris:postgres-latest + depends_on: + postgres-minio: # Polaris server depends on PostgreSQL being healthy + condition: service_healthy + # polaris-bootstrap-minio is a setup task; polaris server doesn't need to wait for it on every start + # after the initial successful bootstrap. Other services that *use* Polaris data + # (like polaris-setup-catalog-minio) should depend on polaris: service_healthy. ports: - # API port - - "8181:8181" - # Management port (metrics and health checks) - - "8182:8182" - # Optional, allows attaching a debugger to the Polaris JVM - - "5005:5005" + # The host port is defined by POLARIS_MINIO_API_PORT from .env, container port is 8181 + - "${POLARIS_MINIO_API_PORT:-8183}:${QUARKUS_HTTP_PORT:-8181}" # Or just - "${POLARIS_MINIO_API_PORT:-8183}:8181" + # The host port is defined by POLARIS_MINIO_MGMT_PORT from .env, container port is 8182 + - "${POLARIS_MINIO_MGMT_PORT:-8184}:${QUARKUS_MANAGEMENT_PORT:-8182}" # Or just - "${POLARIS_MINIO_MGMT_PORT:-8184}:8182" environment: - - JAVA_DEBUG=true - - JAVA_DEBUG_PORT=*:5005 - - POLARIS_PERSISTENCE_TYPE=relational-jdbc - - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_RETRIES=5 - - POLARIS_PERSISTENCE_RELATIONAL_JDBC_INITIAL_DELAY_IN_MS=100 - - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_DELAY_IN_MS=5000 - - QUARKUS_DATASOURCE_DB_KIND=pgsql - - QUARKUS_DATASOURCE_JDBC_URL=${QUARKUS_DATASOURCE_JDBC_URL} - - QUARKUS_DATASOURCE_USERNAME=${QUARKUS_DATASOURCE_USERNAME} - - QUARKUS_DATASOURCE_PASSWORD=${QUARKUS_DATASOURCE_PASSWORD} - - POLARIS_REALM_CONTEXT_REALMS=POLARIS - - QUARKUS_OTEL_SDK_DISABLED=true + # These variables 
will be sourced from the .env file (or shell environment). + # Docker Compose makes them available to the container if they are defined. + - QUARKUS_DATASOURCE_DB_KIND + - QUARKUS_DATASOURCE_JDBC_URL + - QUARKUS_DATASOURCE_USERNAME + - QUARKUS_DATASOURCE_PASSWORD + + - POLARIS_PERSISTENCE_TYPE + - POLARIS_REALM_CONTEXT_REALMS + + # Optional JDBC retry settings + - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_RETRIES + - POLARIS_PERSISTENCE_RELATIONAL_JDBC_INITIAL_DELAY_IN_MS + - POLARIS_PERSISTENCE_RELATIONAL_JDBC_MAX_DELAY_IN_MS + + # Other Quarkus/App settings from .env + - QUARKUS_OTEL_SDK_DISABLED + - QUARKUS_HTTP_PORT # Tells Quarkus which port to bind to inside the container + - QUARKUS_MANAGEMENT_PORT # Tells Quarkus which management port to bind to inside the container + + # Optional: Debug logging settings (will be sourced from .env if uncommented there) + - QUARKUS_LOG_CONSOLE_LEVEL + - QUARKUS_LOG_CATEGORY_IO_SMALLRYE_CONFIG_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_APACHE_POLARIS_LEVEL + - QUARKUS_LOG_CATEGORY_IO_QUARKUS_DATASOURCE_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_AGROAL_LEVEL healthcheck: - test: ["CMD", "curl", "http://localhost:8182/q/health"] - interval: 2s - timeout: 10s - retries: 10 - start_period: 10s + # Uses the management port defined by POLARIS_MINIO_MGMT_PORT (which sets QUARKUS_MANAGEMENT_PORT for inside the container) + # The healthcheck runs INSIDE the container network, so it checks localhost:QUARKUS_MANAGEMENT_PORT (e.g. 
localhost:8182) + test: ["CMD-SHELL", "curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/live || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/ready || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health"] + interval: 10s + timeout: 5s + retries: 15 + start_period: 30s polaris-setup: image: alpine/curl diff --git a/getting-started/minio/.env b/getting-started/minio/.env index cd95e5afd4..a0f56f808d 100644 --- a/getting-started/minio/.env +++ b/getting-started/minio/.env @@ -15,8 +15,12 @@ SPARK_MINIO_S3_PASSWORD=spark_minio_s3_password_val TRINO_MINIO_S3_USER=trino_minio_s3_user TRINO_MINIO_S3_PASSWORD=trino_minio_s3_password_val -# Polaris Client Credentials (created by polaris-bootstrap-minio, used by Spark/Trino to authenticate to Polaris) -# These are used by polaris-bootstrap-minio and polaris-setup-governance directly +# Polaris Client Credentials (for Spark & Trino to auth to Polaris) +# These are used by: +# - polaris-bootstrap-minio (command to create them) +# - polaris-setup-governance (environment for script to know client IDs, and to create credentials if bootstrap doesn't) +# - spark-sql-minio (environment for Spark's Polaris catalog auth) +# - trino-minio (environment for Trino's Polaris catalog auth) SPARK_POLARIS_CLIENT_ID=spark_app_client SPARK_POLARIS_CLIENT_SECRET=spark_client_secret_val @@ -28,15 +32,32 @@ TRINO_POLARIS_CLIENT_SECRET=trino_client_secret_val SPARK_POLARIS_CLIENT_ID_ENV=spark_app_client SPARK_POLARIS_CLIENT_SECRET_ENV=spark_client_secret_val +# --- Polaris Service Specific Configuration --- +POLARIS_PERSISTENCE_TYPE=in-memory +POLARIS_REALM_CONTEXT_REALMS=POLARIS_MINIO_REALM +POLARIS_BOOTSTRAP_CREDENTIALS="POLARIS_MINIO_REALM,root,s3cr3t" # Custom root credentials for the realm +# --- Other Quarkus and Port Mappings for Services --- +QUARKUS_OTEL_SDK_DISABLED=true # For polaris service + # Port Mappings (defaults used in docker-compose.yml) MINIO_API_PORT=9000 
MINIO_CONSOLE_PORT=9001 POSTGRES_MINIO_PORT=5433 POLARIS_MINIO_API_PORT=8183 -POLARIS_MINIO_MGMT_PORT=8184 +POLARIS_MINIO_MGMT_PORT=8184 # Important for health check + SPARK_UI_MINIO_START_PORT=4050 SPARK_UI_MINIO_END_PORT=4055 # Used in port range mapping + TRINO_MINIO_PORT=8083 -# You can change these values if needed, but these align with the defaults -# in the docker-compose.yml and associated scripts. +# Quarkus HTTP/Management ports for Polaris Service (can reference variables above) +QUARKUS_HTTP_PORT=${POLARIS_MINIO_API_PORT} +QUARKUS_MANAGEMENT_PORT=${POLARIS_MINIO_MGMT_PORT} + +# --- Optional: Debug Logging for Polaris Service (uncomment if needed) --- +# QUARKUS_LOG_CONSOLE_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_IO_SMALLRYE_CONFIG_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_ORG_APACHE_POLARIS_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_IO_QUARKUS_DATASOURCE_LEVEL=DEBUG +# QUARKUS_LOG_CATEGORY_ORG_AGROAL_LEVEL=DEBUG diff --git a/getting-started/minio/docker-compose.yml b/getting-started/minio/docker-compose.yml index 6d5061a4e0..3e88396909 100644 --- a/getting-started/minio/docker-compose.yml +++ b/getting-started/minio/docker-compose.yml @@ -1,8 +1,6 @@ -version: '3.8' - services: minio: - image: minio/minio:RELEASE.2024-05-03T15-18-24Z # Using a specific recent version + image: minio/minio:latest ports: - "${MINIO_API_PORT:-9000}:9000" - "${MINIO_CONSOLE_PORT:-9001}:9001" @@ -27,7 +25,7 @@ services: start_period: 10s mc: - image: minio/mc:RELEASE.2024-05-02T06-20-15Z # Using a specific recent version + image: minio/mc:latest depends_on: minio: condition: service_healthy @@ -45,108 +43,88 @@ services: TRINO_MINIO_S3_USER: ${TRINO_MINIO_S3_USER:-trino_minio_s3_user} TRINO_MINIO_S3_PASSWORD: ${TRINO_MINIO_S3_PASSWORD:-trino_minio_s3_password_val} - postgres-minio: - image: postgres:17.4 # Using a specific recent version for stability - ports: - - "${POSTGRES_MINIO_PORT:-5433}:5432" - shm_size: 128mb - environment: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - 
POSTGRES_DB: POLARIS_MINIO - POSTGRES_INITDB_ARGS: "--encoding UTF8 --data-checksums" - volumes: - - ../assets/postgres/postgresql.conf:/etc/postgresql/postgresql.conf - - postgres_minio_data:/var/lib/postgresql/data - command: ["postgres", "-c", "config_file=/etc/postgresql/postgresql.conf"] - healthcheck: - test: "pg_isready -U postgres -d POLARIS_MINIO" - interval: 5s - timeout: 2s - retries: 15 - - polaris-bootstrap-minio: - image: apache/polaris-admin-tool:postgres-latest # Assumes image built from Polaris source - depends_on: - postgres-minio: - condition: service_healthy - environment: - polaris.persistence.type: relational-jdbc - quarkus.datasource.db-kind: pgsql - quarkus.datasource.jdbc.url: jdbc:postgresql://postgres-minio:5432/POLARIS_MINIO - quarkus.datasource.username: postgres - quarkus.datasource.password: postgres - # Polaris client credentials (to be created by bootstrap) - SPARK_POLARIS_CLIENT_ID: ${SPARK_POLARIS_CLIENT_ID:-spark_app_client} - SPARK_POLARIS_CLIENT_SECRET: ${SPARK_POLARIS_CLIENT_SECRET:-spark_client_secret_val} - TRINO_POLARIS_CLIENT_ID: ${TRINO_POLARIS_CLIENT_ID:-trino_app_client} - TRINO_POLARIS_CLIENT_SECRET: ${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val} - command: > # Using > for multi-line command - bootstrap - --realm=POLARIS_MINIO_REALM - --credential=POLARIS_MINIO_REALM,root,s3cr3t - --credential=POLARIS_MINIO_REALM,${SPARK_POLARIS_CLIENT_ID:-spark_app_client},${SPARK_POLARIS_CLIENT_SECRET:-spark_client_secret_val} - --credential=POLARIS_MINIO_REALM,${TRINO_POLARIS_CLIENT_ID:-trino_app_client},${TRINO_POLARIS_CLIENT_SECRET:-trino_client_secret_val} - polaris: - image: apache/polaris:postgres-latest # Assumes image built from Polaris source + image: apache/polaris:postgres-latest depends_on: - polaris-bootstrap-minio: - condition: service_completed_successfully - postgres-minio: + minio: # Polaris server depends on Minio being healthy condition: service_healthy - mc: - condition:
service_completed_successfully + # The polaris-bootstrap-minio service was removed with the switch to the in-memory store, + # so the server has no bootstrap task to wait for. Other services that *use* Polaris data + # (like polaris-setup-catalog-minio) should depend on polaris: service_healthy. ports: - - "${POLARIS_MINIO_API_PORT:-8183}:8181" - - "${POLARIS_MINIO_MGMT_PORT:-8184}:8182" + # The host port is defined by POLARIS_MINIO_API_PORT from .env; the container port is QUARKUS_HTTP_PORT (default 8181) + - "${POLARIS_MINIO_API_PORT:-8183}:${QUARKUS_HTTP_PORT:-8181}" # Or just - "${POLARIS_MINIO_API_PORT:-8183}:8181" + # The host port is defined by POLARIS_MINIO_MGMT_PORT from .env; the container port is QUARKUS_MANAGEMENT_PORT (default 8182) + - "${POLARIS_MINIO_MGMT_PORT:-8184}:${QUARKUS_MANAGEMENT_PORT:-8182}" # Or just - "${POLARIS_MINIO_MGMT_PORT:-8184}:8182" environment: - polaris.persistence.type: relational-jdbc - quarkus.datasource.db-kind: pgsql - quarkus.datasource.jdbc.url: jdbc:postgresql://postgres-minio:5432/POLARIS_MINIO - quarkus.datasource.username: postgres - quarkus.datasource.password: postgres - polaris.realm-context.realms: POLARIS_MINIO_REALM - quarkus.otel.sdk.disabled: "true" + # These variables will be sourced from the .env file (or shell environment). + # Docker Compose makes them available to the container if they are defined.
+ - POLARIS_PERSISTENCE_TYPE + - POLARIS_REALM_CONTEXT_REALMS + + # Other Quarkus/App settings from .env + - QUARKUS_OTEL_SDK_DISABLED + - QUARKUS_HTTP_PORT # Tells Quarkus which port to bind to inside the container + - QUARKUS_MANAGEMENT_PORT # Tells Quarkus which management port to bind to inside the container + + # Optional: Debug logging settings (will be sourced from .env if uncommented there) + - QUARKUS_LOG_CONSOLE_LEVEL + - QUARKUS_LOG_CATEGORY_IO_SMALLRYE_CONFIG_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_APACHE_POLARIS_LEVEL + - QUARKUS_LOG_CATEGORY_IO_QUARKUS_DATASOURCE_LEVEL + - QUARKUS_LOG_CATEGORY_ORG_AGROAL_LEVEL healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8182/q/health/live"] - interval: 5s - timeout: 3s - retries: 10 - start_period: 10s + # Uses the management port defined by POLARIS_MINIO_MGMT_PORT (which sets QUARKUS_MANAGEMENT_PORT for inside the container) + # The healthcheck runs INSIDE the container network, so it checks localhost:QUARKUS_MANAGEMENT_PORT (e.g. 
localhost:8182) + test: ["CMD-SHELL", "curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/live || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health/ready || curl -f http://localhost:${QUARKUS_MANAGEMENT_PORT:-8182}/q/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 12s # Start period for application init (in-memory store, no DB connection to wait for) polaris-setup-catalog-minio: - image: alpine/curl:3.19 # Specific version + image: alpine/curl:latest depends_on: - polaris: condition: service_healthy + polaris: + condition: service_healthy volumes: - ./polaris-config:/polaris-config entrypoint: /bin/sh command: '-c "apk add --no-cache jq && chmod +x /polaris-config/create-catalog-minio.sh && /polaris-config/create-catalog-minio.sh"' environment: - POLARIS_S3_USER: ${POLARIS_S3_USER:-polaris_s3_user} - POLARIS_S3_PASSWORD: ${POLARIS_S3_PASSWORD:-polaris_s3_password_val} + - POLARIS_S3_USER + - POLARIS_S3_PASSWORD + - QUARKUS_HTTP_PORT + - POLARIS_REALM_CONTEXT_REALMS polaris-setup-governance: - image: alpine/curl:3.19 # Specific version + image: alpine/curl:latest depends_on: - polaris-setup-catalog-minio: + polaris-setup-catalog-minio: # Governance setup runs only after the catalog has been created condition: service_completed_successfully + polaris: # Also ensure polaris service itself is healthy for API calls + condition: service_healthy volumes: - ./polaris-config:/polaris-config entrypoint: /bin/sh command: '-c "apk add --no-cache jq && chmod +x /polaris-config/setup-polaris-governance.sh && /polaris-config/setup-polaris-governance.sh"' environment: - SPARK_POLARIS_CLIENT_ID: ${SPARK_POLARIS_CLIENT_ID:-spark_app_client} - TRINO_POLARIS_CLIENT_ID: ${TRINO_POLARIS_CLIENT_ID:-trino_app_client} + - SPARK_POLARIS_CLIENT_ID + - SPARK_POLARIS_CLIENT_SECRET + - TRINO_POLARIS_CLIENT_ID + - TRINO_POLARIS_CLIENT_SECRET + - POLARIS_REALM_CONTEXT_REALMS + - QUARKUS_HTTP_PORT # To construct http://polaris:${QUARKUS_HTTP_PORT} +
spark-sql-minio: - image: apache/spark:3.5.1 # Using a specific Spark 3.5.x version + image: apache/spark:3.5.5-java17-python3 container_name: spark-sql-minio-gov depends_on: polaris-setup-governance: condition: service_completed_successfully - minio: condition: service_healthy + minio: + condition: service_healthy stdin_open: true tty: true ports: @@ -187,7 +165,8 @@ services: depends_on: polaris-setup-governance: condition: service_completed_successfully - minio: condition: service_healthy + minio: + condition: service_healthy ports: - "${TRINO_MINIO_PORT:-8083}:8080" volumes: @@ -202,7 +181,6 @@ services: volumes: minio_data: - postgres_minio_data: networks: default: diff --git a/getting-started/minio/minio-config/setup-minio.sh b/getting-started/minio/minio-config/setup-minio.sh old mode 100644 new mode 100755 index 7a1a83c8d5..130ba41d07 --- a/getting-started/minio/minio-config/setup-minio.sh +++ b/getting-started/minio/minio-config/setup-minio.sh @@ -1,20 +1,6 @@ #!/bin/sh set -e -echo "Waiting for Minio service to start..." -attempt_counter=0 -max_attempts=20 -until curl -s -f http://minio:9000/minio/health/live > /dev/null; do - if [ ${attempt_counter} -eq ${max_attempts} ]; then - echo "Max attempts reached. Failed to connect to Minio." - exit 1 - fi - echo "Attempting to connect to Minio (${attempt_counter}/${max_attempts})..." - attempt_counter=$((attempt_counter+1)) - sleep 3 -done -echo "Minio service is live." 
- mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} mc mb myminio/polaris-bucket --ignore-existing diff --git a/getting-started/minio/polaris-config/create-catalog-minio.sh b/getting-started/minio/polaris-config/create-catalog-minio.sh old mode 100644 new mode 100755 diff --git a/getting-started/minio/polaris-config/setup-polaris-governance.sh b/getting-started/minio/polaris-config/setup-polaris-governance.sh old mode 100644 new mode 100755 index d75a7cf457..0d013ad869 --- a/getting-started/minio/polaris-config/setup-polaris-governance.sh +++ b/getting-started/minio/polaris-config/setup-polaris-governance.sh @@ -66,6 +66,23 @@ polaris_api_call() { fi echo "" } +echo "Creating Polaris principal for Spark: ${SPARK_POLARIS_CLIENT_ID}" +# Assuming an API endpoint like /auth/principals or similar +# This might be a multi-step process: 1. Create principal, 2. Set password credential +# Example (highly conceptual, verify actual API): +polaris_api_call "POST" "/auth/principals" \ + "{\"name\": \"${SPARK_POLARIS_CLIENT_ID}\", \"realmName\": \"${POLARIS_MINIO_REALM}\"}" 201 409 "${POLARIS_AUTH_API_URL_BASE}" # 409 if exists + +polaris_api_call "PUT" "/auth/principals/${SPARK_POLARIS_CLIENT_ID}/credentials" \ + "[{\"type\": \"PASSWORD\", \"value\": \"${SPARK_POLARIS_CLIENT_SECRET}\"}]" 204 200 "${POLARIS_AUTH_API_URL_BASE}" # Using PUT to set/reset + +echo "Creating Polaris principal for Trino: ${TRINO_POLARIS_CLIENT_ID}" +polaris_api_call "POST" "/auth/principals" \ + "{\"name\": \"${TRINO_POLARIS_CLIENT_ID}\", \"realmName\": \"${POLARIS_MINIO_REALM}\"}" 201 409 "${POLARIS_AUTH_API_URL_BASE}" + +polaris_api_call "PUT" "/auth/principals/${TRINO_POLARIS_CLIENT_ID}/credentials" \ + "[{\"type\": \"PASSWORD\", \"value\": \"${TRINO_POLARIS_CLIENT_SECRET}\"}]" 204 200 "${POLARIS_AUTH_API_URL_BASE}" + # 1. Create Principal Roles polaris_api_call "POST" "/principal-roles" "{\"name\": \"${SPARK_ROLE_NAME}\"}" 201