From c6291aa13159111fa9bcdd61366e255768882b01 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Mon, 5 May 2025 23:01:44 +0900 Subject: [PATCH 01/11] Support retrieving the latest Iceberg table on table scan (#11) * Allow resolving the current snapshot ID to use on a scan from a callback function * Use table_fn * Fix * Just pass a reference to the catalog * make public * Just take table ident * lint --- crates/integrations/datafusion/src/schema.rs | 7 ++- .../integrations/datafusion/src/table/mod.rs | 44 ++++++++++++++----- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/crates/integrations/datafusion/src/schema.rs b/crates/integrations/datafusion/src/schema.rs index 3be6da426e..4528017976 100644 --- a/crates/integrations/datafusion/src/schema.rs +++ b/crates/integrations/datafusion/src/schema.rs @@ -24,7 +24,7 @@ use datafusion::catalog::SchemaProvider; use datafusion::datasource::TableProvider; use datafusion::error::Result as DFResult; use futures::future::try_join_all; -use iceberg::{Catalog, NamespaceIdent, Result}; +use iceberg::{Catalog, NamespaceIdent, Result, TableIdent}; use crate::table::IcebergTableProvider; @@ -64,7 +64,10 @@ impl IcebergSchemaProvider { let providers = try_join_all( table_names .iter() - .map(|name| IcebergTableProvider::try_new(client.clone(), namespace.clone(), name)) + .map(|name| { + let table_ident = TableIdent::new(namespace.clone(), name.clone()); + IcebergTableProvider::try_new(client.clone(), table_ident) + }) .collect::>(), ) .await?; diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 00c9e13229..dcb3d5fc65 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -24,18 +24,18 @@ use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; use datafusion::catalog::Session; use datafusion::datasource::{TableProvider, TableType}; -use datafusion::error::Result as DFResult; +use datafusion::error::{DataFusionError, Result as DFResult}; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_plan::ExecutionPlan; use iceberg::arrow::schema_to_arrow_schema; use iceberg::table::Table; -use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; +use iceberg::{Catalog, Error, ErrorKind, Result, TableIdent}; use crate::physical_plan::scan::IcebergTableScan; /// Represents a [`TableProvider`] for the Iceberg [`Catalog`], /// managing access to a [`Table`]. -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct IcebergTableProvider { /// A table in the catalog. table: Table, @@ -43,6 +43,18 @@ pub struct IcebergTableProvider { snapshot_id: Option, /// A reference-counted arrow `Schema`. schema: ArrowSchemaRef, + /// A reference to the catalog that this table provider belongs to. + catalog: Option>, +} + +impl std::fmt::Debug for IcebergTableProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IcebergTableProvider") + .field("table", &self.table) + .field("snapshot_id", &self.snapshot_id) + .field("schema", &self.schema) + .finish_non_exhaustive() + } } impl IcebergTableProvider { @@ -51,24 +63,21 @@ impl IcebergTableProvider { table, snapshot_id: None, schema, + catalog: None, } } /// Asynchronously tries to construct a new [`IcebergTableProvider`] /// using the given client and table name to fetch an actual [`Table`] /// in the provided namespace. - pub(crate) async fn try_new( - client: Arc, - namespace: NamespaceIdent, - name: impl Into, - ) -> Result { - let ident = TableIdent::new(namespace, name.into()); - let table = client.load_table(&ident).await?; + pub async fn try_new(client: Arc, table_name: TableIdent) -> Result { + let table = client.load_table(&table_name).await?; let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); Ok(IcebergTableProvider { table, snapshot_id: None, + catalog: Some(client), schema, }) } @@ -80,6 +89,7 @@ impl IcebergTableProvider { Ok(IcebergTableProvider { table, snapshot_id: None, + catalog: None, schema, }) } @@ -104,6 +114,7 @@ impl IcebergTableProvider { Ok(IcebergTableProvider { table, snapshot_id: Some(snapshot_id), + catalog: None, schema, }) } @@ -130,8 +141,19 @@ impl TableProvider for IcebergTableProvider { filters: &[Expr], _limit: Option, ) -> DFResult> { + // Get the latest table metadata from the catalog if it exists + let table = if let Some(catalog) = &self.catalog { + catalog + .load_table(self.table.identifier()) + .await + .map_err(|e| { + DataFusionError::Execution(format!("Error getting Iceberg table metadata: {e}")) + })? + } else { + self.table.clone() + }; Ok(Arc::new(IcebergTableScan::new( - self.table.clone(), + table, self.snapshot_id, self.schema.clone(), projection, From 866d5a4a929201469bafd034e99206f6d17b001d Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 7 May 2025 12:03:50 +0900 Subject: [PATCH 02/11] Add integration test for new DataFusion table provider creation method --- crates/catalog/memory/src/catalog.rs | 13 +++++++ .../integrations/datafusion/src/table/mod.rs | 12 +----- .../tests/integration_datafusion_test.rs | 39 ++++++++++++++++++- ...-c49f-4e40-9236-a50fd0884b5d.metadata.json | 1 + 4 files changed, 52 insertions(+), 13 deletions(-) create mode 100644 crates/integrations/datafusion/tests/test_data/scan_snapshot_update/test.db/test_table/metadata/00000-754ae971-c49f-4e40-9236-a50fd0884b5d.metadata.json diff --git a/crates/catalog/memory/src/catalog.rs b/crates/catalog/memory/src/catalog.rs index cf4ad72169..65aaf9d4da 100644 --- a/crates/catalog/memory/src/catalog.rs +++ b/crates/catalog/memory/src/catalog.rs @@ -53,6 +53,19 @@ impl MemoryCatalog { warehouse_location, } } + + /// Register an existing table in the memory catalog. + pub async fn register_existing_table( + &self, + table_ident: &TableIdent, + metadata_location: String, + ) -> Result<()> { + let mut root_namespace_state = self.root_namespace_state.lock().await; + + root_namespace_state.insert_new_table(table_ident, metadata_location.clone())?; + + Ok(()) + } } #[async_trait] diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index dcb3d5fc65..c12772c4de 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -35,7 +35,7 @@ use crate::physical_plan::scan::IcebergTableScan; /// Represents a [`TableProvider`] for the Iceberg [`Catalog`], /// managing access to a [`Table`]. -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct IcebergTableProvider { /// A table in the catalog. table: Table, @@ -47,16 +47,6 @@ pub struct IcebergTableProvider { catalog: Option>, } -impl std::fmt::Debug for IcebergTableProvider { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("IcebergTableProvider") - .field("table", &self.table) - .field("snapshot_id", &self.snapshot_id) - .field("schema", &self.schema) - .finish_non_exhaustive() - } -} - impl IcebergTableProvider { pub(crate) fn new(table: Table, schema: ArrowSchemaRef) -> Self { IcebergTableProvider { diff --git a/crates/integrations/datafusion/tests/integration_datafusion_test.rs b/crates/integrations/datafusion/tests/integration_datafusion_test.rs index 715635e062..4aa105ca17 100644 --- a/crates/integrations/datafusion/tests/integration_datafusion_test.rs +++ b/crates/integrations/datafusion/tests/integration_datafusion_test.rs @@ -27,9 +27,9 @@ use datafusion::execution::context::SessionContext; use datafusion::parquet::arrow::PARQUET_FIELD_ID_META_KEY; use iceberg::io::FileIOBuilder; use iceberg::spec::{NestedField, PrimitiveType, Schema, StructType, Type}; -use iceberg::{Catalog, NamespaceIdent, Result, TableCreation}; +use iceberg::{Catalog, NamespaceIdent, Result, TableCreation, TableIdent}; use iceberg_catalog_memory::MemoryCatalog; -use iceberg_datafusion::IcebergCatalogProvider; +use iceberg_datafusion::{IcebergCatalogProvider, IcebergTableProvider}; use tempfile::TempDir; fn temp_path() -> String { @@ -295,3 +295,38 @@ async fn test_table_predict_pushdown() -> Result<()> { assert!(s.value(1).trim().contains(expected)); Ok(()) } + +#[tokio::test] +async fn test_table_scan_snapshot() -> Result<()> { + let iceberg_catalog = get_iceberg_catalog(); + let namespace = NamespaceIdent::new("test".to_string()); + set_test_namespace(&iceberg_catalog, &namespace).await?; + + let current_dir = std::env::current_dir().unwrap(); + let metadata_path = current_dir.join("tests/test_data/scan_snapshot_update/test.db/test_table/metadata/00000-754ae971-c49f-4e40-9236-a50fd0884b5d.metadata.json"); + + let table_ident = TableIdent::new(namespace, "test_table".to_string()); + iceberg_catalog + .register_existing_table(&table_ident, metadata_path.display().to_string()) + .await?; + + let client = Arc::new(iceberg_catalog); + let table = Arc::new( + IcebergTableProvider::try_new(Arc::clone(&client) as Arc, table_ident.clone()) + .await?, + ); + + let ctx = SessionContext::new(); + ctx.register_table("df_test", table) + .expect("failed to register table"); + let records = ctx + .sql("select * from df_test") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(0, records.len()); + + Ok(()) +} diff --git a/crates/integrations/datafusion/tests/test_data/scan_snapshot_update/test.db/test_table/metadata/00000-754ae971-c49f-4e40-9236-a50fd0884b5d.metadata.json b/crates/integrations/datafusion/tests/test_data/scan_snapshot_update/test.db/test_table/metadata/00000-754ae971-c49f-4e40-9236-a50fd0884b5d.metadata.json new file mode 100644 index 0000000000..5e66dfd655 --- /dev/null +++ b/crates/integrations/datafusion/tests/test_data/scan_snapshot_update/test.db/test_table/metadata/00000-754ae971-c49f-4e40-9236-a50fd0884b5d.metadata.json @@ -0,0 +1 @@ +{"location":"./tests/test_data/scan_snapshot_update/test.db/test_table","table-uuid":"e94dbe67-55ae-40e6-b0ff-8b48ebabf550","last-updated-ms":1746586823335,"last-column-id":2,"schemas":[{"type":"struct","fields":[{"id":1,"name":"id","type":"string","required":true},{"id":2,"name":"comment","type":"string","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"id"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"write.format.default":"parquet","write.parquet.compression-codec":"snappy"},"snapshots":[],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"format-version":2,"last-sequence-number":0} \ No newline at end of file From 8efab2ce4ff886e40b440886f070198f31e09407 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Fri, 9 May 2025 11:48:05 +0900 Subject: [PATCH 03/11] Adds a refresh_table_metadata function --- .../integrations/datafusion/src/table/mod.rs | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index c12772c4de..0313a7d5a5 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -24,12 +24,13 @@ use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; use datafusion::catalog::Session; use datafusion::datasource::{TableProvider, TableType}; -use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::error::Result as DFResult; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_plan::ExecutionPlan; use iceberg::arrow::schema_to_arrow_schema; use iceberg::table::Table; use iceberg::{Catalog, Error, ErrorKind, Result, TableIdent}; +use tokio::sync::RwLock; use crate::physical_plan::scan::IcebergTableScan; @@ -38,7 +39,9 @@ use crate::physical_plan::scan::IcebergTableScan; #[derive(Debug, Clone)] pub struct IcebergTableProvider { /// A table in the catalog. - table: Table, + table: Arc>, + /// The identifier to the table in the catalog. + table_identifier: TableIdent, /// Table snapshot id that will be queried via this provider. snapshot_id: Option, /// A reference-counted arrow `Schema`. @@ -49,8 +52,10 @@ pub struct IcebergTableProvider { impl IcebergTableProvider { pub(crate) fn new(table: Table, schema: ArrowSchemaRef) -> Self { + let table_identifier = table.identifier().clone(); IcebergTableProvider { - table, + table: Arc::new(RwLock::new(table)), + table_identifier, snapshot_id: None, schema, catalog: None, @@ -65,7 +70,8 @@ impl IcebergTableProvider { let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); Ok(IcebergTableProvider { - table, + table: Arc::new(RwLock::new(table)), + table_identifier: table_name, snapshot_id: None, catalog: Some(client), schema, @@ -75,9 +81,11 @@ impl IcebergTableProvider { /// Asynchronously tries to construct a new [`IcebergTableProvider`] /// using the given table. Can be used to create a table provider from an existing table regardless of the catalog implementation. pub async fn try_new_from_table(table: Table) -> Result { + let table_identifier = table.identifier().clone(); let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); Ok(IcebergTableProvider { - table, + table: Arc::new(RwLock::new(table)), + table_identifier, snapshot_id: None, catalog: None, schema, @@ -87,6 +95,7 @@ impl IcebergTableProvider { /// Asynchronously tries to construct a new [`IcebergTableProvider`] /// using a specific snapshot of the given table. Can be used to create a table provider from an existing table regardless of the catalog implementation. pub async fn try_new_from_table_snapshot(table: Table, snapshot_id: i64) -> Result { + let table_identifier = table.identifier().clone(); let snapshot = table .metadata() .snapshot_by_id(snapshot_id) @@ -102,12 +111,30 @@ impl IcebergTableProvider { let schema = snapshot.schema(table.metadata())?; let schema = Arc::new(schema_to_arrow_schema(&schema)?); Ok(IcebergTableProvider { - table, + table: Arc::new(RwLock::new(table)), + table_identifier, snapshot_id: Some(snapshot_id), catalog: None, schema, }) } + + /// Refreshes the table metadata to the latest snapshot. + /// + /// Requires that this TableProvider was created with a + /// reference to the catalog to load the updated table from. + pub async fn refresh_table_metadata(&self) -> Result { + let Some(catalog) = &self.catalog else { + return Err(Error::new(ErrorKind::Unexpected, format!("Table provider could not refresh table metadata because no catalog client was provided"))); + }; + + let updated_table = catalog.load_table(&self.table_identifier).await?; + + let mut table_guard = self.table.write().await; + *table_guard = updated_table.clone(); + + Ok(updated_table) + } } #[async_trait] @@ -132,15 +159,9 @@ impl TableProvider for IcebergTableProvider { _limit: Option, ) -> DFResult> { // Get the latest table metadata from the catalog if it exists - let table = if let Some(catalog) = &self.catalog { - catalog - .load_table(self.table.identifier()) - .await - .map_err(|e| { - DataFusionError::Execution(format!("Error getting Iceberg table metadata: {e}")) - })? - } else { - self.table.clone() + let table = match self.refresh_table_metadata().await.ok() { + Some(table) => table, + None => self.table.read().await.clone(), }; Ok(Arc::new(IcebergTableScan::new( table, From afeb0c1740a541cb9b9e4435a1487f277df1f04d Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Fri, 9 May 2025 15:59:54 +0900 Subject: [PATCH 04/11] Clippy --- crates/integrations/datafusion/src/table/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 0313a7d5a5..57b3319a33 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -125,7 +125,7 @@ impl IcebergTableProvider { /// reference to the catalog to load the updated table from. pub async fn refresh_table_metadata(&self) -> Result
{ let Some(catalog) = &self.catalog else { - return Err(Error::new(ErrorKind::Unexpected, format!("Table provider could not refresh table metadata because no catalog client was provided"))); + return Err(Error::new(ErrorKind::Unexpected, "Table provider could not refresh table metadata because no catalog client was provided".to_string())); }; let updated_table = catalog.load_table(&self.table_identifier).await?; From 48ee5849d1efb0b5f08387cf5a4a47b508af4496 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 14:55:20 +0900 Subject: [PATCH 05/11] Add a static catalog to use with create external table --- .../integrations/datafusion/src/table/mod.rs | 157 +++++++++++------ .../datafusion/src/table/static_catalog.rs | 158 ++++++++++++++++++ .../src/table/table_provider_factory.rs | 12 +- 3 files changed, 276 insertions(+), 51 deletions(-) create mode 100644 crates/integrations/datafusion/src/table/static_catalog.rs diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 57b3319a33..86eb031ad8 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +mod static_catalog; pub mod table_provider_factory; use std::any::Any; @@ -38,6 +39,8 @@ use crate::physical_plan::scan::IcebergTableScan; /// managing access to a [`Table`]. #[derive(Debug, Clone)] pub struct IcebergTableProvider { + /// A reference to the catalog that this table belongs to. + catalog: Arc, /// A table in the catalog. table: Arc>, /// The identifier to the table in the catalog. @@ -46,56 +49,34 @@ pub struct IcebergTableProvider { snapshot_id: Option, /// A reference-counted arrow `Schema`. schema: ArrowSchemaRef, - /// A reference to the catalog that this table provider belongs to. - catalog: Option>, } impl IcebergTableProvider { - pub(crate) fn new(table: Table, schema: ArrowSchemaRef) -> Self { - let table_identifier = table.identifier().clone(); - IcebergTableProvider { - table: Arc::new(RwLock::new(table)), - table_identifier, - snapshot_id: None, - schema, - catalog: None, - } - } /// Asynchronously tries to construct a new [`IcebergTableProvider`] /// using the given client and table name to fetch an actual [`Table`] /// in the provided namespace. - pub async fn try_new(client: Arc, table_name: TableIdent) -> Result { - let table = client.load_table(&table_name).await?; + pub async fn try_new(client: Arc, table_identifier: TableIdent) -> Result { + let table = client.load_table(&table_identifier).await?; let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); - Ok(IcebergTableProvider { - table: Arc::new(RwLock::new(table)), - table_identifier: table_name, - snapshot_id: None, - catalog: Some(client), - schema, - }) - } - - /// Asynchronously tries to construct a new [`IcebergTableProvider`] - /// using the given table. Can be used to create a table provider from an existing table regardless of the catalog implementation. - pub async fn try_new_from_table(table: Table) -> Result { - let table_identifier = table.identifier().clone(); - let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); Ok(IcebergTableProvider { table: Arc::new(RwLock::new(table)), table_identifier, snapshot_id: None, - catalog: None, + catalog: client, schema, }) } /// Asynchronously tries to construct a new [`IcebergTableProvider`] /// using a specific snapshot of the given table. Can be used to create a table provider from an existing table regardless of the catalog implementation. - pub async fn try_new_from_table_snapshot(table: Table, snapshot_id: i64) -> Result { - let table_identifier = table.identifier().clone(); + pub async fn try_new_from_table_snapshot( + client: Arc, + table_identifier: TableIdent, + snapshot_id: i64, + ) -> Result { + let table = client.load_table(&table_identifier).await?; let snapshot = table .metadata() .snapshot_by_id(snapshot_id) @@ -114,21 +95,14 @@ impl IcebergTableProvider { table: Arc::new(RwLock::new(table)), table_identifier, snapshot_id: Some(snapshot_id), - catalog: None, + catalog: client, schema, }) } /// Refreshes the table metadata to the latest snapshot. - /// - /// Requires that this TableProvider was created with a - /// reference to the catalog to load the updated table from. pub async fn refresh_table_metadata(&self) -> Result
{ - let Some(catalog) = &self.catalog else { - return Err(Error::new(ErrorKind::Unexpected, "Table provider could not refresh table metadata because no catalog client was provided".to_string())); - }; - - let updated_table = catalog.load_table(&self.table_identifier).await?; + let updated_table = self.catalog.load_table(&self.table_identifier).await?; let mut table_guard = self.table.write().await; *table_guard = updated_table.clone(); @@ -184,14 +158,97 @@ impl TableProvider for IcebergTableProvider { #[cfg(test)] mod tests { + use std::collections::HashMap; + use datafusion::common::Column; use datafusion::prelude::SessionContext; use iceberg::io::FileIO; use iceberg::table::{StaticTable, Table}; - use iceberg::TableIdent; + use iceberg::{Namespace, NamespaceIdent, TableCommit, TableCreation, TableIdent}; use super::*; + #[derive(Debug)] + struct TestCatalog { + table: Table, + } + + impl TestCatalog { + fn new(table: Table) -> Self { + Self { table } + } + } + + #[async_trait] + impl Catalog for TestCatalog { + async fn load_table(&self, _table_identifier: &TableIdent) -> Result
{ + Ok(self.table.clone()) + } + + async fn list_namespaces( + &self, + _parent: Option<&NamespaceIdent>, + ) -> Result> { + unimplemented!() + } + + async fn create_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> Result { + unimplemented!() + } + + async fn get_namespace(&self, _namespace: &NamespaceIdent) -> Result { + unimplemented!() + } + + async fn namespace_exists(&self, _namespace: &NamespaceIdent) -> Result { + unimplemented!() + } + + async fn update_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> Result<()> { + unimplemented!() + } + + async fn drop_namespace(&self, _namespace: &NamespaceIdent) -> Result<()> { + unimplemented!() + } + + async fn list_tables(&self, _namespace: &NamespaceIdent) -> Result> { + unimplemented!() + } + + async fn create_table( + &self, + _namespace: &NamespaceIdent, + _creation: TableCreation, + ) -> Result
{ + unimplemented!() + } + + async fn drop_table(&self, _table: &TableIdent) -> Result<()> { + unimplemented!() + } + + async fn table_exists(&self, _table: &TableIdent) -> Result { + unimplemented!() + } + + async fn rename_table(&self, _src: &TableIdent, _dest: &TableIdent) -> Result<()> { + unimplemented!() + } + + async fn update_table(&self, _commit: TableCommit) -> Result
{ + unimplemented!() + } + } + async fn get_test_table_from_metadata_file() -> Table { let metadata_file_name = "TableMetadataV2Valid.json"; let metadata_file_path = format!( @@ -214,7 +271,8 @@ mod tests { #[tokio::test] async fn test_try_new_from_table() { let table = get_test_table_from_metadata_file().await; - let table_provider = IcebergTableProvider::try_new_from_table(table.clone()) + let catalog = Arc::new(TestCatalog::new(table.clone())); + let table_provider = IcebergTableProvider::try_new(catalog, table.identifier().clone()) .await .unwrap(); let ctx = SessionContext::new(); @@ -239,10 +297,14 @@ mod tests { async fn test_try_new_from_table_snapshot() { let table = get_test_table_from_metadata_file().await; let snapshot_id = table.metadata().snapshots().next().unwrap().snapshot_id(); - let table_provider = - IcebergTableProvider::try_new_from_table_snapshot(table.clone(), snapshot_id) - .await - .unwrap(); + let catalog = Arc::new(TestCatalog::new(table.clone())); + let table_provider = IcebergTableProvider::try_new_from_table_snapshot( + catalog, + table.identifier().clone(), + snapshot_id, + ) + .await + .unwrap(); let ctx = SessionContext::new(); ctx.register_table("mytable", Arc::new(table_provider)) .unwrap(); @@ -264,7 +326,8 @@ mod tests { #[tokio::test] async fn test_physical_input_schema_consistent_with_logical_input_schema() { let table = get_test_table_from_metadata_file().await; - let table_provider = IcebergTableProvider::try_new_from_table(table.clone()) + let catalog = Arc::new(TestCatalog::new(table.clone())); + let table_provider = IcebergTableProvider::try_new(catalog, table.identifier().clone()) .await .unwrap(); let ctx = SessionContext::new(); diff --git a/crates/integrations/datafusion/src/table/static_catalog.rs b/crates/integrations/datafusion/src/table/static_catalog.rs new file mode 100644 index 0000000000..ba3df9cc1d --- /dev/null +++ b/crates/integrations/datafusion/src/table/static_catalog.rs @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A static catalog provider that implements the [`Catalog`] trait for +//! use in constructing an [`IcebergTableProvider`] from a static table. + +use std::collections::HashMap; + +use async_trait::async_trait; +use iceberg::table::Table; +use iceberg::{ + Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, + TableIdent, +}; + +#[derive(Debug)] +pub(crate) struct StaticCatalog { + table: Table, +} + +impl StaticCatalog { + pub(crate) fn new(table: Table) -> Self { + Self { table } + } +} + +#[async_trait] +impl Catalog for StaticCatalog { + async fn load_table(&self, table_identifier: &TableIdent) -> Result
{ + if self.table.identifier() != table_identifier { + return Err(Error::new( + ErrorKind::TableNotFound, + format!( + "Table with identifier {} not found in static catalog", + table_identifier + ), + )); + } + + Ok(self.table.clone()) + } + + async fn list_namespaces( + &self, + _parent: Option<&NamespaceIdent>, + ) -> Result> { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Listing namespaces is not supported in static catalog", + )) + } + + async fn create_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> Result { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Creating namespaces is not supported in static catalog", + )) + } + + async fn get_namespace(&self, _namespace: &NamespaceIdent) -> Result { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Getting namespaces is not supported in static catalog", + )) + } + + async fn namespace_exists(&self, _namespace: &NamespaceIdent) -> Result { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Checking namespace existence is not supported in static catalog", + )) + } + + async fn update_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> Result<()> { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Updating namespaces is not supported in static catalog", + )) + } + + async fn drop_namespace(&self, _namespace: &NamespaceIdent) -> Result<()> { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Dropping namespaces is not supported in static catalog", + )) + } + + async fn list_tables(&self, namespace: &NamespaceIdent) -> Result> { + if self.table.identifier().namespace() == namespace { + return Ok(vec![self.table.identifier().clone()]); + } + Err(Error::new( + ErrorKind::NamespaceNotFound, + format!("Namespace {} not found in static catalog", namespace), + )) + } + + async fn create_table( + &self, + _namespace: &NamespaceIdent, + _creation: TableCreation, + ) -> Result
{ + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Creating tables is not supported in static catalog", + )) + } + + async fn drop_table(&self, _table: &TableIdent) -> Result<()> { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Dropping tables is not supported in static catalog", + )) + } + + async fn table_exists(&self, table: &TableIdent) -> Result { + if self.table.identifier() == table { + return Ok(true); + } + Ok(false) + } + + async fn rename_table(&self, _src: &TableIdent, _dest: &TableIdent) -> Result<()> { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Renaming tables is not supported in static catalog", + )) + } + + async fn update_table(&self, _commit: TableCommit) -> Result
{ + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Updating tables is not supported in static catalog", + )) + } +} diff --git a/crates/integrations/datafusion/src/table/table_provider_factory.rs b/crates/integrations/datafusion/src/table/table_provider_factory.rs index 15a3fef688..008adbfc7f 100644 --- a/crates/integrations/datafusion/src/table/table_provider_factory.rs +++ b/crates/integrations/datafusion/src/table/table_provider_factory.rs @@ -24,12 +24,12 @@ use datafusion::catalog::{Session, TableProvider, TableProviderFactory}; use datafusion::error::Result as DFResult; use datafusion::logical_expr::CreateExternalTable; use datafusion::sql::TableReference; -use iceberg::arrow::schema_to_arrow_schema; use iceberg::io::FileIO; use iceberg::table::StaticTable; use iceberg::{Error, ErrorKind, Result, TableIdent}; use super::IcebergTableProvider; +use crate::table::static_catalog::StaticCatalog; use crate::to_datafusion_error; /// A factory that implements DataFusion's `TableProviderFactory` to create `IcebergTableProvider` instances. @@ -126,10 +126,14 @@ impl TableProviderFactory for IcebergTableProviderFactory { .map_err(to_datafusion_error)? .into_table(); - let schema = schema_to_arrow_schema(table.metadata().current_schema()) - .map_err(to_datafusion_error)?; + let table_ident = table.identifier().clone(); + let static_catalog = Arc::new(StaticCatalog::new(table)); - Ok(Arc::new(IcebergTableProvider::new(table, Arc::new(schema)))) + Ok(Arc::new( + IcebergTableProvider::try_new(static_catalog, table_ident) + .await + .map_err(to_datafusion_error)?, + )) } } From 1853ae5968602cfff53a3ebbe4c04db13438c8f9 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 15:08:46 +0900 Subject: [PATCH 06/11] fix --- crates/integrations/datafusion/src/schema.rs | 2 +- .../integrations/datafusion/src/table/mod.rs | 19 +++++++++++++++---- .../datafusion/src/table/static_catalog.rs | 11 +++++++++++ .../tests/integration_datafusion_test.rs | 2 +- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/crates/integrations/datafusion/src/schema.rs b/crates/integrations/datafusion/src/schema.rs index 660830ad92..e78844c2ad 100644 --- a/crates/integrations/datafusion/src/schema.rs +++ b/crates/integrations/datafusion/src/schema.rs @@ -116,7 +116,7 @@ impl SchemaProvider for IcebergSchemaProvider { let metadata_table_type = MetadataTableType::try_from(metadata_table_name).map_err(DataFusionError::Plan)?; if let Some(table) = self.tables.get(table_name) { - let metadata_table = table.metadata_table(metadata_table_type); + let metadata_table = table.metadata_table(metadata_table_type).await; return Ok(Some(Arc::new(metadata_table))); } else { return Ok(None); diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 4f070c62f8..5e9eb62bb7 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -32,7 +32,7 @@ use datafusion::physical_plan::ExecutionPlan; use iceberg::arrow::schema_to_arrow_schema; use iceberg::inspect::MetadataTableType; use iceberg::table::Table; -use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; +use iceberg::{Catalog, Error, ErrorKind, Result, TableIdent}; use metadata_table::IcebergMetadataTableProvider; use tokio::sync::RwLock; @@ -113,9 +113,12 @@ impl IcebergTableProvider { Ok(updated_table) } - pub(crate) fn metadata_table(&self, r#type: MetadataTableType) -> IcebergMetadataTableProvider { + pub(crate) async fn metadata_table( + &self, + r#type: MetadataTableType, + ) -> IcebergMetadataTableProvider { IcebergMetadataTableProvider { - table: self.table.clone(), + table: self.table.read().await.clone(), r#type, } } @@ -174,7 +177,7 @@ mod tests { use datafusion::prelude::SessionContext; use iceberg::io::FileIO; use iceberg::table::{StaticTable, Table}; - use iceberg::{Namespace, NamespaceIdent, TableCommit, TableCreation, TableIdent, TableIdent}; + use iceberg::{Namespace, NamespaceIdent, TableCommit, TableCreation, TableIdent}; use super::*; @@ -257,6 +260,14 @@ mod tests { async fn update_table(&self, _commit: TableCommit) -> Result
{ unimplemented!() } + + async fn register_table( + &self, + _table: &TableIdent, + _metadata_location: String, + ) -> Result
{ + unimplemented!() + } } async fn get_test_table_from_metadata_file() -> Table { diff --git a/crates/integrations/datafusion/src/table/static_catalog.rs b/crates/integrations/datafusion/src/table/static_catalog.rs index ba3df9cc1d..a4e7d9b717 100644 --- a/crates/integrations/datafusion/src/table/static_catalog.rs +++ b/crates/integrations/datafusion/src/table/static_catalog.rs @@ -155,4 +155,15 @@ impl Catalog for StaticCatalog { "Updating tables is not supported in static catalog", )) } + + async fn register_table( + &self, + _table: &TableIdent, + _metadata_location: String, + ) -> Result
{ + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Registering tables is not supported in static catalog", + )) + } } diff --git a/crates/integrations/datafusion/tests/integration_datafusion_test.rs b/crates/integrations/datafusion/tests/integration_datafusion_test.rs index b9210db04f..be0977e1ab 100644 --- a/crates/integrations/datafusion/tests/integration_datafusion_test.rs +++ b/crates/integrations/datafusion/tests/integration_datafusion_test.rs @@ -30,7 +30,6 @@ use iceberg::io::FileIOBuilder; use iceberg::spec::{NestedField, PrimitiveType, Schema, StructType, Type}; use iceberg::test_utils::check_record_batches; use iceberg::{Catalog, MemoryCatalog, NamespaceIdent, Result, TableCreation, TableIdent}; -use iceberg_catalog_memory::MemoryCatalog; use iceberg_datafusion::{IcebergCatalogProvider, IcebergTableProvider}; use tempfile::TempDir; @@ -343,6 +342,7 @@ async fn test_table_scan_snapshot() -> Result<()> { Ok(()) } +#[tokio::test] async fn test_metadata_table() -> Result<()> { let iceberg_catalog = get_iceberg_catalog(); let namespace = NamespaceIdent::new("ns".to_string()); From aadbe278fffbb4687231270e5c346356989b081f Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 15:13:40 +0900 Subject: [PATCH 07/11] fix --- .../tests/shared_tests/datafusion.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/crates/integration_tests/tests/shared_tests/datafusion.rs b/crates/integration_tests/tests/shared_tests/datafusion.rs index badb6496fc..efe9737d5e 100644 --- a/crates/integration_tests/tests/shared_tests/datafusion.rs +++ b/crates/integration_tests/tests/shared_tests/datafusion.rs @@ -24,7 +24,7 @@ use datafusion::assert_batches_eq; use datafusion::catalog::TableProvider; use datafusion::error::DataFusionError; use datafusion::prelude::SessionContext; -use iceberg::{Catalog, TableIdent}; +use iceberg::TableIdent; use iceberg_catalog_rest::RestCatalog; use iceberg_datafusion::IcebergTableProvider; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; @@ -36,17 +36,15 @@ async fn test_basic_queries() -> Result<(), DataFusionError> { let fixture = get_shared_containers(); let rest_catalog = RestCatalog::new(fixture.catalog_config.clone()); - let table = rest_catalog - .load_table(&TableIdent::from_strs(["default", "types_test"]).unwrap()) - .await - .unwrap(); - let ctx = SessionContext::new(); let table_provider = Arc::new( - IcebergTableProvider::try_new_from_table(table) - .await - .unwrap(), + IcebergTableProvider::try_new( + Arc::new(rest_catalog), + TableIdent::from_strs(["default", "types_test"]).unwrap(), + ) + .await + .unwrap(), ); let schema = table_provider.schema(); From 34690cc5bf8b78235ee3c815741c3709eb3ad764 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 15:20:26 +0900 Subject: [PATCH 08/11] fix python tests --- bindings/python/src/datafusion_table_provider.rs | 8 +++++--- .../integrations/datafusion/src/table/static_catalog.rs | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bindings/python/src/datafusion_table_provider.rs b/bindings/python/src/datafusion_table_provider.rs index b5e1bf952e..881bcfe923 100644 --- a/bindings/python/src/datafusion_table_provider.rs +++ b/bindings/python/src/datafusion_table_provider.rs @@ -20,9 +20,9 @@ use std::ffi::CString; use std::sync::Arc; use datafusion_ffi::table_provider::FFI_TableProvider; -use iceberg::TableIdent; use iceberg::io::FileIO; use iceberg::table::StaticTable; +use iceberg::TableIdent; use iceberg_datafusion::table::IcebergTableProvider; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; @@ -61,7 +61,7 @@ impl PyIcebergDataFusionTable { .map_err(|e| PyRuntimeError::new_err(format!("Failed to build FileIO: {e}")))?; let static_table = - StaticTable::from_metadata_file(&metadata_location, table_ident, file_io) + StaticTable::from_metadata_file(&metadata_location, table_ident.clone(), file_io) .await .map_err(|e| { PyRuntimeError::new_err(format!("Failed to load static table: {e}")) @@ -69,7 +69,9 @@ impl PyIcebergDataFusionTable { let table = static_table.into_table(); - IcebergTableProvider::try_new_from_table(table) + let static_catalog = Arc::new(StaticCatalog::new(table)); + + IcebergTableProvider::try_new(static_catalog, table_ident) .await .map_err(|e| { PyRuntimeError::new_err(format!("Failed to create table provider: {e}")) diff --git a/crates/integrations/datafusion/src/table/static_catalog.rs b/crates/integrations/datafusion/src/table/static_catalog.rs index a4e7d9b717..03fb5abd96 100644 --- a/crates/integrations/datafusion/src/table/static_catalog.rs +++ b/crates/integrations/datafusion/src/table/static_catalog.rs @@ -27,13 +27,14 @@ use iceberg::{ TableIdent, }; +/// Represents a static catalog that contains a single table. #[derive(Debug)] -pub(crate) struct StaticCatalog { +pub struct StaticCatalog { table: Table, } impl StaticCatalog { - pub(crate) fn new(table: Table) -> Self { + pub fn new(table: Table) -> Self { Self { table } } } From e0e040af25efd1b0cfef36838100a31d26a9eba4 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 15:22:17 +0900 Subject: [PATCH 09/11] fix python bindings fmt --- bindings/python/Cargo.lock | 1 + bindings/python/src/datafusion_table_provider.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 6ad2d624cf..6c3f79a6cc 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -2221,6 +2221,7 @@ dependencies = [ "ordered-float 4.6.0", "parquet", "rand 0.8.5", + "reqsign", "reqwest", "roaring", "rust_decimal", diff --git a/bindings/python/src/datafusion_table_provider.rs b/bindings/python/src/datafusion_table_provider.rs index 881bcfe923..3f0481fc7d 100644 --- a/bindings/python/src/datafusion_table_provider.rs +++ b/bindings/python/src/datafusion_table_provider.rs @@ -20,9 +20,9 @@ use std::ffi::CString; use std::sync::Arc; use datafusion_ffi::table_provider::FFI_TableProvider; +use iceberg::TableIdent; use iceberg::io::FileIO; use iceberg::table::StaticTable; -use iceberg::TableIdent; use iceberg_datafusion::table::IcebergTableProvider; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; From caf0e14c8a25f00871fc216a5e9fdfdec4194210 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 15:23:51 +0900 Subject: [PATCH 10/11] fix static catalog visibility --- bindings/python/src/datafusion_table_provider.rs | 3 ++- crates/integrations/datafusion/src/table/mod.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/datafusion_table_provider.rs b/bindings/python/src/datafusion_table_provider.rs index 3f0481fc7d..f10bc2dc17 100644 --- a/bindings/python/src/datafusion_table_provider.rs +++ b/bindings/python/src/datafusion_table_provider.rs @@ -20,9 +20,10 @@ use std::ffi::CString; use std::sync::Arc; use datafusion_ffi::table_provider::FFI_TableProvider; -use iceberg::TableIdent; use iceberg::io::FileIO; use iceberg::table::StaticTable; +use iceberg::TableIdent; +use iceberg_datafusion::table::static_catalog::StaticCatalog; use iceberg_datafusion::table::IcebergTableProvider; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 5e9eb62bb7..f738cc3b19 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -16,7 +16,7 @@ // under the License. pub mod metadata_table; -mod static_catalog; +pub mod static_catalog; pub mod table_provider_factory; use std::any::Any; From 38704636800f76d0599d44639ef261c54232205a Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Tue, 22 Jul 2025 15:25:45 +0900 Subject: [PATCH 11/11] fmt --- bindings/python/src/datafusion_table_provider.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/datafusion_table_provider.rs b/bindings/python/src/datafusion_table_provider.rs index f10bc2dc17..4f06a5799f 100644 --- a/bindings/python/src/datafusion_table_provider.rs +++ b/bindings/python/src/datafusion_table_provider.rs @@ -20,11 +20,11 @@ use std::ffi::CString; use std::sync::Arc; use datafusion_ffi::table_provider::FFI_TableProvider; +use iceberg::TableIdent; use iceberg::io::FileIO; use iceberg::table::StaticTable; -use iceberg::TableIdent; -use iceberg_datafusion::table::static_catalog::StaticCatalog; use iceberg_datafusion::table::IcebergTableProvider; +use iceberg_datafusion::table::static_catalog::StaticCatalog; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; use pyo3::types::PyCapsule;