From 67cd66621b8bc85b5ca26283a404697b62018890 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Mon, 13 Jan 2025 06:47:35 +0530 Subject: [PATCH 1/4] fix: Adding S3 Client to integartion test setup - Updates test fixture to use localhost instead of container IPs - Adds S3 client configuration for direct MinIO access - Comments out unused container IP resolution --- crates/integration_tests/src/lib.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/crates/integration_tests/src/lib.rs b/crates/integration_tests/src/lib.rs index c9311c45bd..0d32011ee6 100644 --- a/crates/integration_tests/src/lib.rs +++ b/crates/integration_tests/src/lib.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; +use aws_sdk_s3 as s3; use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; use iceberg_catalog_rest::{RestCatalog, RestCatalogConfig}; use iceberg_test_utils::docker::DockerCompose; @@ -27,6 +28,7 @@ const REST_CATALOG_PORT: u16 = 8181; pub struct TestFixture { pub _docker_compose: DockerCompose, pub rest_catalog: RestCatalog, + pub s3_client: s3::Client, } pub async fn set_test_fixture(func: &str) -> TestFixture { @@ -39,15 +41,14 @@ pub async fn set_test_fixture(func: &str) -> TestFixture { // Start docker compose docker_compose.run(); - let rest_catalog_ip = docker_compose.get_container_ip("rest"); - let minio_ip = docker_compose.get_container_ip("minio"); - + // let rest_catalog_ip = docker_compose.get_container_ip("rest"); + // let minio_ip = docker_compose.get_container_ip("minio"); let config = RestCatalogConfig::builder() - .uri(format!("http://{}:{}", rest_catalog_ip, REST_CATALOG_PORT)) + .uri(format!("http://{}:{}", "localhost", REST_CATALOG_PORT)) .props(HashMap::from([ ( S3_ENDPOINT.to_string(), - format!("http://{}:{}", minio_ip, 9000), + format!("http://{}:{}", "localhost", 9000), ), (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), @@ -56,8 +57,25 @@ pub async fn set_test_fixture(func: &str) -> TestFixture { .build(); let rest_catalog = RestCatalog::new(config); + let s3_config = s3::config::Builder::new() + .endpoint_url(format!("http://{}:{}", "localhost", 9000)) + .region(s3::config::Region::new("us-east-1")) + .behavior_version(s3::config::BehaviorVersion::latest()) + .credentials_provider(s3::config::Credentials::new( + "admin", + "password", + None, + None, + "test-minio-credentials", + )) + .force_path_style(true) + .build(); + + let s3_client = s3::Client::from_conf(s3_config); + TestFixture { _docker_compose: docker_compose, rest_catalog, + s3_client, } } From e79cef1b5587c1ce08289372a8bdacdfa4d20e56 Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Mon, 13 Jan 2025 06:55:35 +0530 Subject: [PATCH 2/4] Small amends to the integration test docker-compose Changes: - Switch MinIO image to 'latest' tag for better stability - Add port 9000 mapping for MinIO (9000:9000) - Add 'mc' as dependency for REST service to fix the race condition --- .../testdata/docker-compose.yaml | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/crates/integration_tests/testdata/docker-compose.yaml b/crates/integration_tests/testdata/docker-compose.yaml index dafd6b4978..d1ffd054ad 100644 --- a/crates/integration_tests/testdata/docker-compose.yaml +++ b/crates/integration_tests/testdata/docker-compose.yaml @@ -32,6 +32,7 @@ services: - CATALOG_S3_ENDPOINT=http://minio:9000 depends_on: - minio + - mc networks: rest_bridge: ports: @@ -40,7 +41,7 @@ services: - 8181 minio: - image: minio/minio:RELEASE.2024-03-07T00-43-48Z + image: minio/minio:latest environment: - MINIO_ROOT_USER=admin - MINIO_ROOT_PASSWORD=password @@ -51,6 +52,7 @@ services: rest_bridge: ports: - 9001:9001 + - 9000:9000 expose: - 9001 - 9000 @@ -69,17 +71,17 @@ services: networks: rest_bridge: - spark-iceberg: - build: spark/ - networks: - rest_bridge: - depends_on: - - rest - - minio - environment: - - AWS_ACCESS_KEY_ID=admin - - AWS_SECRET_ACCESS_KEY=password - - AWS_REGION=us-east-1 - links: - - rest:rest - - minio:minio + # spark-iceberg: + # build: spark/ + # networks: + # rest_bridge: + # depends_on: + # - rest + # - minio + # environment: + # - AWS_ACCESS_KEY_ID=admin + # - AWS_SECRET_ACCESS_KEY=password + # - AWS_REGION=us-east-1 + # links: + # - rest:rest + # - minio:minio From c02034e59e27fd2998feec1e135d9bcb5c29dcef Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Mon, 13 Jan 2025 06:58:20 +0530 Subject: [PATCH 3/4] Adding data. --- crates/integration_tests/Cargo.toml | 4 +++ .../testdata/docker-compose.yaml | 30 +++++++++---------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/crates/integration_tests/Cargo.toml b/crates/integration_tests/Cargo.toml index a047d75804..1b8dc564ea 100644 --- a/crates/integration_tests/Cargo.toml +++ b/crates/integration_tests/Cargo.toml @@ -33,3 +33,7 @@ iceberg-catalog-rest = { workspace = true } iceberg_test_utils = { path = "../test_utils", features = ["tests"] } parquet = { workspace = true } tokio = { workspace = true } +aws-sdk-s3 = "1.68.0" +url = "2.4.0" + + diff --git a/crates/integration_tests/testdata/docker-compose.yaml b/crates/integration_tests/testdata/docker-compose.yaml index d1ffd054ad..2db600315f 100644 --- a/crates/integration_tests/testdata/docker-compose.yaml +++ b/crates/integration_tests/testdata/docker-compose.yaml @@ -41,7 +41,7 @@ services: - 8181 minio: - image: minio/minio:latest + image: minio/minio:RELEASE.2024-12-18T13-15-44Z environment: - MINIO_ROOT_USER=admin - MINIO_ROOT_PASSWORD=password @@ -71,17 +71,17 @@ services: networks: rest_bridge: - # spark-iceberg: - # build: spark/ - # networks: - # rest_bridge: - # depends_on: - # - rest - # - minio - # environment: - # - AWS_ACCESS_KEY_ID=admin - # - AWS_SECRET_ACCESS_KEY=password - # - AWS_REGION=us-east-1 - # links: - # - rest:rest - # - minio:minio + spark-iceberg: + build: spark/ + networks: + rest_bridge: + depends_on: + - rest + - minio + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + links: + - rest:rest + - minio:minio From 839bc732825300363f438b4c27dfd132115e558f Mon Sep 17 00:00:00 2001 From: Karthic Rao Date: Mon, 13 Jan 2025 06:59:37 +0530 Subject: [PATCH 4/4] test: Add S3 partition directory verification in integration test Enhances append_partition_data_file_test.rs to verify correct partition layout in MinIO: - Add URL parsing to extract bucket and prefix from table location - Implement S3 ListObjectsV2 check for partition directory - Add debug logging for table location and data file paths - Verify objects exist in expected partition directory (id=100/) This helps diagnose the partition layout issue by confirming whether files are being written to the correct partitioned directory structure. --- .../tests/append_partition_data_file_test.rs | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/crates/integration_tests/tests/append_partition_data_file_test.rs b/crates/integration_tests/tests/append_partition_data_file_test.rs index 103021532f..5f92ff89e8 100644 --- a/crates/integration_tests/tests/append_partition_data_file_test.rs +++ b/crates/integration_tests/tests/append_partition_data_file_test.rs @@ -37,6 +37,7 @@ use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation}; use iceberg_integration_tests::set_test_fixture; use parquet::file::properties::WriterProperties; +use url::Url; #[tokio::test] async fn test_append_partition_data_file() { @@ -50,12 +51,13 @@ async fn test_append_partition_data_file() { ]), ); + println!("Creating namespace"); fixture .rest_catalog .create_namespace(ns.name(), ns.properties().clone()) .await .unwrap(); - + println!("done creating namespace"); let schema = Schema::builder() .with_schema_id(1) .with_identifier_field_ids(vec![2]) @@ -163,6 +165,55 @@ async fn test_append_partition_data_file() { assert_eq!(batches.len(), 1); assert_eq!(batches[0], batch); + // Now, let's verify that the partition directory (e.g. "id=100/") exists in MinIO. + // We can parse the S3 location from the table metadata, which is typically something like: + // "s3://my-test-bucket/iceberg/rust/t1" + let table_location = table.metadata().location().to_string(); + let table_url = Url::parse(&table_location).expect("failed to parse table location as URL"); + // Add debug prints for partition info + println!("Table location: {}", table.metadata().location()); + println!("Data file location: {:?}", data_file_valid); + // The bucket is the "host" part (my-test-bucket), the rest is the prefix + let bucket = table_url + .host_str() + .expect("no bucket found in table location"); + // The .path() starts with '/', so strip leading slash + let mut prefix = table_url.path().trim_start_matches('/').to_string(); + prefix.push_str("/data/id="); // Look in data/id= directory + prefix.push_str(&first_partition_id_value.to_string()); + prefix.push_str("/"); + + println!( + "Looking for objects in bucket '{}' with prefix '{}'", + bucket, prefix + ); + + let list_resp = fixture + .s3_client + .list_objects_v2() + .bucket(bucket) + .prefix(&prefix) + .send() + .await + .expect("failed to list objects from MinIO"); + + // Ensure we have some objects under that prefix + let objects = list_resp.contents(); + assert!( + !objects.is_empty(), + "No objects found under prefix {} - partition layout may not be correct.", + prefix + ); + + println!("Found objects under partition prefix '{}':", prefix); + for obj in objects { + println!( + " -> key={}, size={:?}", + obj.key().as_deref().unwrap_or_default(), + obj.size() + ); + } + test_schema_incompatible_partition_type( parquet_writer_builder.clone(), batch.clone(),