From 8723ed7f505331a4fd2e177743ff4f61a0fd740d Mon Sep 17 00:00:00 2001
From: mlanhenke
Date: Wed, 6 Mar 2024 06:12:39 +0100
Subject: [PATCH 01/64] fmt members

---
 Cargo.toml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index dccc6bdf19..22bc8f90fa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,12 @@
 
 [workspace]
 resolver = "2"
-members = ["crates/catalog/*", "crates/examples", "crates/iceberg", "crates/test_utils"]
+members = [
+    "crates/catalog/*",
+    "crates/examples",
+    "crates/iceberg",
+    "crates/test_utils",
+]
 
 [workspace.package]
 version = "0.2.0"

From 342b6862cdb1ee1acd5eab354ccb882d6d0a8c81 Mon Sep 17 00:00:00 2001
From: mlanhenke
Date: Wed, 6 Mar 2024 06:43:08 +0100
Subject: [PATCH 02/64] setup basic test-infra for hms-catalog

---
 crates/catalog/hms/Cargo.toml                |  6 ++
 .../hms/testdata/hms_catalog/Dockerfile      | 36 +++++++++
 .../testdata/hms_catalog/docker-compose.yaml | 48 ++++++++++++
 .../hms/testdata/hms_catalog/entrypoint.sh   | 15 ++++
 .../hms/testdata/hms_catalog/hive-site.xml   | 51 ++++++++++++
 crates/catalog/hms/tests/hms_catalog_test.rs | 78 +++++++++++++++++++
 6 files changed, 234 insertions(+)
 create mode 100644 crates/catalog/hms/testdata/hms_catalog/Dockerfile
 create mode 100644 crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml
 create mode 100755 crates/catalog/hms/testdata/hms_catalog/entrypoint.sh
 create mode 100644 crates/catalog/hms/testdata/hms_catalog/hive-site.xml
 create mode 100644 crates/catalog/hms/tests/hms_catalog_test.rs

diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml
index f44125c5ad..22b399a78a 100644
--- a/crates/catalog/hms/Cargo.toml
+++ b/crates/catalog/hms/Cargo.toml
@@ -35,3 +35,9 @@ hive_metastore = { workspace = true }
 iceberg = { workspace = true }
 typed-builder = { workspace = true }
 volo-thrift = { workspace = true }
+log = "0.4.20"
+
+[dev-dependencies]
+iceberg_test_utils = { path = "../../test_utils", features = ["tests"] }
+port_scanner = { workspace = true }
+tokio = { workspace = true }
diff --git a/crates/catalog/hms/testdata/hms_catalog/Dockerfile b/crates/catalog/hms/testdata/hms_catalog/Dockerfile
new file mode 100644
index 0000000000..7e0f8a75ed
--- /dev/null
+++ b/crates/catalog/hms/testdata/hms_catalog/Dockerfile
@@ -0,0 +1,36 @@
+FROM openjdk:8u342-jre
+
+ENV HADOOP_VERSION=3.3.5
+ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+ENV HIVE_VERSION=3.1.3
+ENV HIVE_HOME=/opt/apache-hive-${HIVE_VERSION}-bin
+ENV PATH=$HIVE_HOME/bin:$PATH
+
+# Set classpath for S3 Access
+ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-1.12.316.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-3.3.5.jar
+
+WORKDIR /opt
+
+RUN apt-get update && apt-get install -y procps fastjar
+
+RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
+    tar -xzf hadoop-${HADOOP_VERSION}.tar.gz && \
+    rm hadoop-${HADOOP_VERSION}.tar.gz
+
+RUN wget https://downloads.apache.org/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz && \
+    tar -xzf apache-hive-${HIVE_VERSION}-bin.tar.gz && \
+    rm apache-hive-${HIVE_VERSION}-bin.tar.gz
+
+RUN cd ${HIVE_HOME}/lib && \
+    wget https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.28/mysql-connector-java-8.0.28.jar
+
+COPY ./hive-site.xml ${HIVE_HOME}/conf/hive-site.xml
+COPY ./entrypoint.sh /entrypoint.sh
+
+RUN chmod +x /entrypoint.sh
+
+EXPOSE 9083
+
+ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
\ No newline at end of file
diff --git a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml
new file mode 100644
index 0000000000..28f84c1a7b
--- /dev/null
+++ b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+version: '3.8'
+
+services:
+  minio:
+    image: minio/minio
+    ports:
+      - "9000:9000"
+      - "9001:9001"
+    environment:
+      - MINIO_ROOT_USER=admin
+      - MINIO_ROOT_PASSWORD=password
+      - MINIO_DOMAIN=minio
+    command: [ "server", "/data", "--console-address", ":9001" ]
+
+  hive-mysql:
+    image: mysql:5.7
+    ports:
+      - "3306:3306"
+    environment:
+      - MYSQL_ROOT_PASSWORD=admin
+      - MYSQL_DATABASE=metastore
+      - MYSQL_USER=hive
+      - MYSQL_PASSWORD=hive
+
+  hive-metastore:
+    image: iceberg-hms
+    build: ./
+    depends_on:
+      - hive-mysql
+    ports:
+      - "9083:9083"
diff --git a/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh b/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh
new file mode 100755
index 0000000000..6235dd2aae
--- /dev/null
+++ b/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+HIVE_VERSION=3.1.3
+HIVE_HOME=/opt/apache-hive-${HIVE_VERSION}-bin
+
+# Check if schema exists
+${HIVE_HOME}/bin/schematool -dbType mysql -info
+
+if [ $? -eq 1 ]; then
+  echo "Getting schema info failed. Probably not initialized. Initializing... in 5s"
+  sleep 5
+  ${HIVE_HOME}/bin/schematool -initSchema -dbType mysql
+fi
+
+${HIVE_HOME}/bin/hive --service metastore
diff --git a/crates/catalog/hms/testdata/hms_catalog/hive-site.xml b/crates/catalog/hms/testdata/hms_catalog/hive-site.xml
new file mode 100644
index 0000000000..cb72ee8300
--- /dev/null
+++ b/crates/catalog/hms/testdata/hms_catalog/hive-site.xml
@@ -0,0 +1,51 @@
+<configuration>
+  <property>
+    <name>metastore.thrift.uris</name>
+    <value>thrift://localhost:9083</value>
+    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
+  </property>
+  <property>
+    <name>metastore.task.threads.always</name>
+    <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
+  </property>
+  <property>
+    <name>metastore.expression.proxy</name>
+    <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>com.mysql.cj.jdbc.Driver</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:mysql://hive-mysql:3306/metastore</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionUserName</name>
+    <value>hive</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionPassword</name>
+    <value>hive</value>
+  </property>
+  <property>
+    <name>fs.s3a.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+  <property>
+    <name>fs.s3a.access.key</name>
+    <value>admin</value>
+  </property>
+  <property>
+    <name>fs.s3a.secret.key</name>
+    <value>password</value>
+  </property>
+  <property>
+    <name>fs.s3a.endpoint</name>
+    <value>http://minio:9000</value>
+  </property>
+  <property>
+    <name>fs.s3a.path.style.access</name>
+    <value>true</value>
+  </property>
+</configuration>
diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs
new file mode 100644
index 0000000000..af56c4675d
--- /dev/null
+++ b/crates/catalog/hms/tests/hms_catalog_test.rs
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Integration tests for hms catalog.
+
+use iceberg::{Catalog, NamespaceIdent};
+use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport};
+use iceberg_test_utils::docker::DockerCompose;
+use iceberg_test_utils::{normalize_test_name, set_up};
+use port_scanner::scan_port_addr;
+use tokio::time::sleep;
+
+const HMS_CATALOG_PORT: u16 = 9083;
+
+struct TestFixture {
+    _docker_compose: DockerCompose,
+    hms_catalog: HmsCatalog,
+}
+
+async fn set_test_fixture(func: &str) -> TestFixture {
+    set_up();
+
+    let docker_compose = DockerCompose::new(
+        normalize_test_name(format!("{}_{func}", module_path!())),
+        format!("{}/testdata/hms_catalog", env!("CARGO_MANIFEST_DIR")),
+    );
+
+    docker_compose.run();
+
+    let hms_catalog_ip = docker_compose.get_container_ip("hive-metastore");
+
+    let read_port = format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT);
+    loop {
+        if !scan_port_addr(&read_port) {
+            log::info!("Waiting 1s for hms catalog to be ready...");
+            sleep(std::time::Duration::from_millis(1000)).await;
+        } else {
+            break;
+        }
+    }
+
+    let config = HmsCatalogConfig::builder()
+        .address(format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT))
+        .thrift_transport(HmsThriftTransport::Buffered)
+        .build();
+
+    let hms_catalog = HmsCatalog::new(config).unwrap();
+
+    TestFixture {
+        _docker_compose: docker_compose,
+        hms_catalog,
+    }
+}
+
+#[tokio::test]
+async fn test_list_namespace() {
+    let fixture = set_test_fixture("test_list_namespace").await;
+
+    let expected = vec![NamespaceIdent::from_strs(["default"]).unwrap()];
+
+    let result = fixture.hms_catalog.list_namespaces(None).await.unwrap();
+
+    assert_eq!(expected, result)
+}
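The fixture above is easiest to read as a standalone program. A minimal sketch, not part of the patch series: the builder calls mirror the test code exactly, while the 127.0.0.1:9083 address is an assumption (port 9083 is what the Dockerfile exposes; the test resolves the container IP instead).

```rust
use iceberg::Catalog;
use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport};

#[tokio::main]
async fn main() -> iceberg::Result<()> {
    // Assumed address: a metastore started from the compose file above,
    // reachable on localhost.
    let config = HmsCatalogConfig::builder()
        .address("127.0.0.1:9083".to_string())
        .thrift_transport(HmsThriftTransport::Buffered)
        .build();
    let catalog = HmsCatalog::new(config)?;

    // At this point in the series only namespace listing is exercised
    // by the integration test above.
    let namespaces = catalog.list_namespaces(None).await?;
    println!("{:?}", namespaces); // e.g. the default HMS database
    Ok(())
}
```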
From d8ccce1dda638901986990aa755b44b18f760a7b Mon Sep 17 00:00:00 2001
From: mlanhenke
Date: Thu, 7 Mar 2024 15:01:47 +0100
Subject: [PATCH 03/64] add license

---
 .../hms/testdata/hms_catalog/Dockerfile      | 17 +++++++++++++++++
 .../hms/testdata/hms_catalog/entrypoint.sh   | 17 +++++++++++++++++
 .../hms/testdata/hms_catalog/hive-site.xml   | 19 +++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/crates/catalog/hms/testdata/hms_catalog/Dockerfile b/crates/catalog/hms/testdata/hms_catalog/Dockerfile
index 7e0f8a75ed..7c1f862669 100644
--- a/crates/catalog/hms/testdata/hms_catalog/Dockerfile
+++ b/crates/catalog/hms/testdata/hms_catalog/Dockerfile
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 FROM openjdk:8u342-jre
 
 ENV HADOOP_VERSION=3.3.5
diff --git a/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh b/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh
index 6235dd2aae..f738637813 100755
--- a/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh
+++ b/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh
@@ -1,5 +1,22 @@
 #!/bin/sh
 
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 HIVE_VERSION=3.1.3
 HIVE_HOME=/opt/apache-hive-${HIVE_VERSION}-bin
 
diff --git a/crates/catalog/hms/testdata/hms_catalog/hive-site.xml b/crates/catalog/hms/testdata/hms_catalog/hive-site.xml
index cb72ee8300..c2df65cddf 100644
--- a/crates/catalog/hms/testdata/hms_catalog/hive-site.xml
+++ b/crates/catalog/hms/testdata/hms_catalog/hive-site.xml
@@ -1,3 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
 <configuration>
   <property>
     <name>metastore.thrift.uris</name>
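The next patch wires `create_namespace` through a conversion from namespace properties to a thrift `Database`. As a reading aid (not part of the series; the standalone function is illustrative, but the key names and owner-type values come straight from the patch), the recognized properties map as sketched below:

```rust
use std::collections::HashMap;

// Illustrative only: property keys recognized by the `convert_to_database`
// helper introduced in the next patch, annotated with where each lands.
fn demo_namespace_properties() -> HashMap<String, String> {
    HashMap::from([
        // -> Database.description
        ("comment".to_string(), "my_description".to_string()),
        // -> Database.location_uri (a leading slash is added, a trailing one removed)
        ("location".to_string(), "my_location/".to_string()),
        // -> Database.owner_name; must be present whenever owner-type is set
        ("hive.metastore.database.owner".to_string(), "apache".to_string()),
        // -> Database.owner_type; accepts user | group | role, case-insensitively
        ("hive.metastore.database.owner-type".to_string(), "user".to_string()),
        // any other key -> Database.parameters
        ("key1".to_string(), "value1".to_string()),
    ])
}
```

If neither owner key is given, the conversion falls back to owner `user.name` with owner type `User`, following the Java implementation it links to.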
From ada8193380d832d1446945e7f0b780a83359d9a6 Mon Sep 17 00:00:00 2001
From: mlanhenke
Date: Thu, 7 Mar 2024 19:57:21 +0100
Subject: [PATCH 04/64] add hms create_namespace

---
 crates/catalog/hms/Cargo.toml                |   1 +
 crates/catalog/hms/src/catalog.rs            |  29 ++-
 crates/catalog/hms/src/utils.rs              | 207 ++++++++++++++++++-
 crates/catalog/hms/tests/hms_catalog_test.rs |  49 ++++-
 4 files changed, 277 insertions(+), 9 deletions(-)

diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml
index 22b399a78a..68b3aed548 100644
--- a/crates/catalog/hms/Cargo.toml
+++ b/crates/catalog/hms/Cargo.toml
@@ -36,6 +36,7 @@ iceberg = { workspace = true }
 typed-builder = { workspace = true }
 volo-thrift = { workspace = true }
 log = "0.4.20"
+pilota = "0.10.0"
 
 [dev-dependencies]
 iceberg_test_utils = { path = "../../test_utils", features = ["tests"] }
 port_scanner = { workspace = true }
 tokio = { workspace = true }
diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs
index a00aaf3370..bed3c70452 100644
--- a/crates/catalog/hms/src/catalog.rs
+++ b/crates/catalog/hms/src/catalog.rs
@@ -125,12 +125,35 @@ impl Catalog for HmsCatalog {
             .collect())
     }
 
+    /// Creates a new namespace with the given identifier and properties.
+    ///
+    /// Attempts to create a namespace defined by the `namespace`
+    /// parameter and configured with the specified `properties`.
+    ///
+    /// This function can return an error in the following situations:
+    ///
+    /// - If `hive.metastore.database.owner-type` is specified without
+    ///   `hive.metastore.database.owner`.
+    /// - Errors from `validate_namespace` if the namespace identifier does not
+    ///   meet validation criteria.
+    /// - Errors from `convert_to_database` if the properties cannot be
+    ///   successfully converted into a database configuration.
+    /// - Errors from the underlying database creation process, converted using
+    ///   `from_thrift_error`.
     async fn create_namespace(
         &self,
-        _namespace: &NamespaceIdent,
-        _properties: HashMap<String, String>,
+        namespace: &NamespaceIdent,
+        properties: HashMap<String, String>,
     ) -> Result<Namespace> {
-        todo!()
+        let database = convert_to_database(namespace, &properties)?;
+
+        self.client
+            .0
+            .create_database(database)
+            .await
+            .map_err(from_thrift_error)?;
+
+        Ok(Namespace::new(namespace.clone()))
     }
 
     async fn get_namespace(&self, _namespace: &NamespaceIdent) -> Result<Namespace> {
         todo!()
diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 1b0ff33df4..c8b1804364 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -16,10 +16,16 @@
 // under the License.
 
 use anyhow::anyhow;
-use iceberg::{Error, ErrorKind};
+use hive_metastore::{Database, PrincipalType};
+use iceberg::{Error, ErrorKind, NamespaceIdent, Result};
+use pilota::{AHashMap, FastStr};
+use std::collections::HashMap;
 use std::fmt::Debug;
 use std::io;
 
+pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner";
+pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type";
+
 /// Format a thrift error into iceberg error.
pub fn from_thrift_error(error: volo_thrift::error::ResponseError) -> Error where @@ -40,3 +46,202 @@ pub fn from_io_error(error: io::Error) -> Error { ) .with_source(error) } + +/// Converts name and properties into `hive_metastore::hms::Database` +/// after validating the `namespace` and `owner-settings`. +pub fn convert_to_database( + namespace: &NamespaceIdent, + properties: &HashMap, +) -> Result { + let name = validate_namespace(namespace)?; + validate_owner_settings(properties)?; + + let mut db = Database::default(); + let mut parameters = AHashMap::new(); + + db.name = Some(name.into()); + + for (k, v) in properties { + match k.as_str() { + "comment" => db.description = Some(v.clone().into()), + "location" => db.location_uri = Some(format_location_uri(v.clone()).into()), + HMS_DB_OWNER => db.owner_name = Some(v.clone().into()), + HMS_DB_OWNER_TYPE => { + let owner_type = match v.to_lowercase().as_str() { + "user" => PrincipalType::User, + "group" => PrincipalType::Group, + "role" => PrincipalType::Role, + _ => { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Invalid value for setting 'owner_type': {}", v), + )) + } + }; + db.owner_type = Some(owner_type); + } + _ => { + parameters.insert( + FastStr::from_string(k.clone()), + FastStr::from_string(v.clone()), + ); + } + } + } + + db.parameters = Some(parameters); + + // Set default user, if none provided + // https://github.com/apache/iceberg/blob/main/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveHadoopUtil.java#L44 + if db.owner_name.is_none() { + db.owner_name = Some("user.name".into()); + db.owner_type = Some(PrincipalType::User); + } + + Ok(db) +} + +/// Checks if provided `NamespaceIdent` is valid. +fn validate_namespace(namespace: &NamespaceIdent) -> Result { + let name = namespace.as_ref(); + + if name.len() != 1 { + return Err(Error::new( + ErrorKind::DataInvalid, + "Invalid database, hierarchical namespaces are not supported", + )); + } + + let name = name[0].clone(); + + if name.is_empty() { + return Err(Error::new( + ErrorKind::DataInvalid, + "Cannot create namespace with empty name", + )); + } + + Ok(name) +} + +/// Formats location_uri by e.g. removing trailing slashes. +fn format_location_uri(location: String) -> String { + let mut location = location; + + if !location.starts_with('/') { + location = format!("/{}", location); + } + + if location.ends_with('/') && location.len() > 1 { + location.pop(); + } + + location +} + +/// Checks if `owner-settings` are valid. +/// If `owner_type` is set, then `owner` must also be set. 
+fn validate_owner_settings(properties: &HashMap) -> Result<()> { + let owner_is_set = properties.get(HMS_DB_OWNER).is_some(); + let owner_type_is_set = properties.get(HMS_DB_OWNER_TYPE).is_some(); + + if owner_type_is_set && !owner_is_set { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Setting '{}' without setting '{}' is not allowed", + HMS_DB_OWNER_TYPE, HMS_DB_OWNER + ), + )); + } + + Ok(()) +} +#[cfg(test)] +mod tests { + use iceberg::{Namespace, NamespaceIdent}; + + use super::*; + + #[test] + fn test_validate_owner_settings() { + let valid = HashMap::from([ + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), + ]); + let invalid = HashMap::from([(HMS_DB_OWNER_TYPE.to_string(), "user".to_string())]); + + assert!(validate_owner_settings(&valid).is_ok()); + assert!(validate_owner_settings(&invalid).is_err()); + } + + #[test] + fn test_convert_to_database() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::from([ + ("comment".to_string(), "my_description".to_string()), + ("location".to_string(), "my_location".to_string()), + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), + ("key1".to_string(), "value1".to_string()), + ]); + + let db = convert_to_database(&ns, &properties)?; + + assert_eq!(db.name, Some(FastStr::from("my_namespace"))); + assert_eq!(db.description, Some(FastStr::from("my_description"))); + assert_eq!(db.owner_name, Some(FastStr::from("apache"))); + assert_eq!(db.owner_type, Some(PrincipalType::User)); + + if let Some(params) = db.parameters { + assert_eq!(params.get("key1".into()), Some(&FastStr::from("value1"))); + } + + Ok(()) + } + + #[test] + fn test_convert_to_database_with_default_user() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::new(); + + let db = convert_to_database(&ns, &properties)?; + + assert_eq!(db.name, Some(FastStr::from("my_namespace"))); + assert_eq!(db.owner_name, Some(FastStr::from("user.name"))); + assert_eq!(db.owner_type, Some(PrincipalType::User)); + + Ok(()) + } + + #[test] + fn test_validate_namespace() { + let valid_ns = Namespace::new(NamespaceIdent::new("ns".to_string())); + let empty_ns = Namespace::new(NamespaceIdent::new("".to_string())); + let hierarchical_ns = Namespace::new( + NamespaceIdent::from_vec(vec!["level1".to_string(), "level2".to_string()]).unwrap(), + ); + + let valid = validate_namespace(valid_ns.name()); + let empty = validate_namespace(empty_ns.name()); + let hierarchical = validate_namespace(hierarchical_ns.name()); + + assert!(valid.is_ok()); + assert!(empty.is_err()); + assert!(hierarchical.is_err()); + } + + #[test] + fn test_format_location_uri() { + let inputs = vec!["iceberg", "is/", "/nice/", "really/nice/", "/"]; + let outputs = vec!["/iceberg", "/is", "/nice", "/really/nice", "/"]; + + inputs + .into_iter() + .zip(outputs.into_iter()) + .for_each(|(inp, out)| { + let location = format_location_uri(inp.to_string()); + assert_eq!(location, out); + }) + } +} diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index af56c4675d..4c4cfa9182 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -17,7 +17,9 @@ //! Integration tests for hms catalog. 
-use iceberg::{Catalog, NamespaceIdent}; +use std::collections::HashMap; + +use iceberg::{Catalog, Namespace, NamespaceIdent}; use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport}; use iceberg_test_utils::docker::DockerCompose; use iceberg_test_utils::{normalize_test_name, set_up}; @@ -25,6 +27,7 @@ use port_scanner::scan_port_addr; use tokio::time::sleep; const HMS_CATALOG_PORT: u16 = 9083; +type Result = std::result::Result; struct TestFixture { _docker_compose: DockerCompose, @@ -67,12 +70,48 @@ async fn set_test_fixture(func: &str) -> TestFixture { } #[tokio::test] -async fn test_list_namespace() { +async fn test_list_namespace() -> Result<()> { let fixture = set_test_fixture("test_list_namespace").await; - let expected = vec![NamespaceIdent::from_strs(["default"]).unwrap()]; + let expected_no_parent = vec![NamespaceIdent::new("default".into())]; + let result_no_parent = fixture.hms_catalog.list_namespaces(None).await?; + + let result_with_parent = fixture + .hms_catalog + .list_namespaces(Some(&NamespaceIdent::new("parent".into()))) + .await?; - let result = fixture.hms_catalog.list_namespaces(None).await.unwrap(); + assert_eq!(expected_no_parent, result_no_parent); + assert!(result_with_parent.len() == 0); - assert_eq!(expected, result) + Ok(()) +} + +#[tokio::test] +async fn test_create_namespace() -> Result<()> { + let fixture = set_test_fixture("test_create_namespace").await; + + let ns = Namespace::new(NamespaceIdent::new("my_namespace".to_string())); + let properties = HashMap::from([ + ("comment".to_string(), "my_description".to_string()), + ("location".to_string(), "my_location".to_string()), + ( + "hive.metastore.database.owner".to_string(), + "apache".to_string(), + ), + ( + "hive.metastore.database.owner-type".to_string(), + "user".to_string(), + ), + ("key1".to_string(), "value1".to_string()), + ]); + + let result = fixture + .hms_catalog + .create_namespace(ns.name(), properties) + .await?; + + assert_eq!(result, ns); + + Ok(()) } From 315ddde71b65cd0866be581a3ccb4335a71a34b1 Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 06:01:51 +0100 Subject: [PATCH 05/64] add hms get_namespace --- crates/catalog/hms/src/catalog.rs | 26 ++++++++- crates/catalog/hms/src/utils.rs | 59 +++++++++++++++++++- crates/catalog/hms/tests/hms_catalog_test.rs | 32 ++++++++++- 3 files changed, 112 insertions(+), 5 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index bed3c70452..56fdc6a236 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -156,8 +156,30 @@ impl Catalog for HmsCatalog { Ok(Namespace::new(namespace.clone())) } - async fn get_namespace(&self, _namespace: &NamespaceIdent) -> Result { - todo!() + /// Retrieves a namespace by its identifier. + /// + /// Validates the given namespace identifier and then queries the + /// underlying database client to fetch the corresponding namespace data. + /// Constructs a `Namespace` object with the retrieved data and returns it. + /// + /// This function can return an error in any of the following situations: + /// - If the provided namespace identifier fails validation checks + /// - If there is an error querying the database, returned by + /// `from_thrift_error`. 
+ async fn get_namespace(&self, namespace: &NamespaceIdent) -> Result { + let name = validate_namespace(namespace)?; + + let db = self + .client + .0 + .get_database(name.clone().into()) + .await + .map_err(from_thrift_error)?; + + let properties = properties_from_database(&db); + let ns = Namespace::with_properties(NamespaceIdent::new(name.into()), properties); + + Ok(ns) } async fn namespace_exists(&self, _namespace: &NamespaceIdent) -> Result { diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index c8b1804364..8d55e6eae0 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -47,6 +47,41 @@ pub fn from_io_error(error: io::Error) -> Error { .with_source(error) } +/// Create and extract properties from `hive_metastore::hms::Database`. +pub fn properties_from_database(database: &Database) -> HashMap { + let mut properties = HashMap::new(); + + if let Some(description) = &database.description { + properties.insert("comment".to_string(), description.to_string()); + }; + + if let Some(location) = &database.location_uri { + properties.insert("location".to_string(), location.to_string()); + }; + + if let Some(owner) = &database.owner_name { + properties.insert(HMS_DB_OWNER.to_string(), owner.to_string()); + }; + + if let Some(owner_type) = &database.owner_type { + let value = match owner_type { + PrincipalType::User => "User", + PrincipalType::Group => "Group", + PrincipalType::Role => "Role", + }; + + properties.insert(HMS_DB_OWNER_TYPE.to_string(), value.to_string()); + }; + + if let Some(params) = &database.parameters { + params.iter().for_each(|(k, v)| { + properties.insert(k.clone().into(), v.clone().into()); + }); + }; + + properties +} + /// Converts name and properties into `hive_metastore::hms::Database` /// after validating the `namespace` and `owner-settings`. pub fn convert_to_database( @@ -102,7 +137,7 @@ pub fn convert_to_database( } /// Checks if provided `NamespaceIdent` is valid. 
-fn validate_namespace(namespace: &NamespaceIdent) -> Result { +pub fn validate_namespace(namespace: &NamespaceIdent) -> Result { let name = namespace.as_ref(); if name.len() != 1 { @@ -117,7 +152,7 @@ fn validate_namespace(namespace: &NamespaceIdent) -> Result { if name.is_empty() { return Err(Error::new( ErrorKind::DataInvalid, - "Cannot create namespace with empty name", + "Invalid database, provided namespace is empty.", )); } @@ -163,6 +198,26 @@ mod tests { use super::*; + #[test] + fn test_properties_from_database() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::from([ + ("comment".to_string(), "my_description".to_string()), + ("location".to_string(), "/my_location".to_string()), + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "User".to_string()), + ("key1".to_string(), "value1".to_string()), + ]); + + let db = convert_to_database(&ns, &properties)?; + + let expected = properties_from_database(&db); + + assert_eq!(expected, properties); + + Ok(()) + } + #[test] fn test_validate_owner_settings() { let valid = HashMap::from([ diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 4c4cfa9182..6c0f205f20 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -91,7 +91,7 @@ async fn test_list_namespace() -> Result<()> { async fn test_create_namespace() -> Result<()> { let fixture = set_test_fixture("test_create_namespace").await; - let ns = Namespace::new(NamespaceIdent::new("my_namespace".to_string())); + let ns = Namespace::new(NamespaceIdent::new("my_namespace".into())); let properties = HashMap::from([ ("comment".to_string(), "my_description".to_string()), ("location".to_string(), "my_location".to_string()), @@ -115,3 +115,33 @@ async fn test_create_namespace() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_get_namespace() -> Result<()> { + let fixture = set_test_fixture("test_get_namespace").await; + + let ns = Namespace::new(NamespaceIdent::new("default".into())); + let properties = HashMap::from([ + ( + "location".to_string(), + "file:/user/hive/warehouse".to_string(), + ), + ( + "hive.metastore.database.owner-type".to_string(), + "Role".to_string(), + ), + ("comment".to_string(), "Default Hive database".to_string()), + ( + "hive.metastore.database.owner".to_string(), + "public".to_string(), + ), + ]); + + let expected = Namespace::with_properties(NamespaceIdent::new("default".into()), properties); + + let result = fixture.hms_catalog.get_namespace(ns.name()).await?; + + assert_eq!(expected, result); + + Ok(()) +} From 05886bd6f865a5305a81b2c039d17c0c6eb68a7e Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 06:21:10 +0100 Subject: [PATCH 06/64] fix: typo --- crates/iceberg/src/catalog/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index bd579d598b..708e6bf3c4 100644 --- a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -50,7 +50,7 @@ pub trait Catalog: Debug + Sync + Send { async fn get_namespace(&self, namespace: &NamespaceIdent) -> Result; /// Check if namespace exists in catalog. - async fn namespace_exists(&self, namesace: &NamespaceIdent) -> Result; + async fn namespace_exists(&self, namespace: &NamespaceIdent) -> Result; /// Update a namespace inside the catalog. 
/// From ada8193380d832d1446945e7f0b780a83359d9a6 Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 11:29:34 +0100 Subject: [PATCH 07/64] add hms namespace_exists and drop_namespace --- crates/catalog/hms/src/catalog.rs | 54 ++++++++++++++++++-- crates/catalog/hms/tests/hms_catalog_test.rs | 44 ++++++++++++++++ 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 56fdc6a236..f6031d2c2f 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -19,6 +19,7 @@ use super::utils::*; use async_trait::async_trait; use hive_metastore::ThriftHiveMetastoreClient; use hive_metastore::ThriftHiveMetastoreClientBuilder; +use hive_metastore::ThriftHiveMetastoreGetDatabaseException; use iceberg::table::Table; use iceberg::{ Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, @@ -28,6 +29,7 @@ use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::net::ToSocketAddrs; use typed_builder::TypedBuilder; +use volo_thrift::ResponseError; /// Which variant of the thrift transport to communicate with HMS /// See: @@ -97,7 +99,6 @@ impl HmsCatalog { } } -/// Refer to for implementation details. #[async_trait] impl Catalog for HmsCatalog { /// HMS doesn't support nested namespaces. @@ -182,8 +183,36 @@ impl Catalog for HmsCatalog { Ok(ns) } - async fn namespace_exists(&self, _namespace: &NamespaceIdent) -> Result { - todo!() + /// Checks if a namespace exists within the Hive Metastore. + /// + /// Validates the namespace identifier by querying the Hive Metastore + /// to determine if the specified namespace (database) exists. + /// + /// # Returns + /// A `Result` indicating the outcome of the check: + /// - `Ok(true)` if the namespace exists. + /// - `Ok(false)` if the namespace does not exist, identified by a specific + /// `UserException` variant. + /// - `Err(...)` if an error occurs during validation or the Hive Metastore + /// query, with the error encapsulating the issue. + async fn namespace_exists(&self, namespace: &NamespaceIdent) -> Result { + let name = validate_namespace(namespace)?; + + let resp = self.client.0.get_database(name.clone().into()).await; + + match resp { + Ok(_) => Ok(true), + Err(err) => { + if let ResponseError::UserException(ThriftHiveMetastoreGetDatabaseException::O1( + _, + )) = &err + { + Ok(false) + } else { + Err(from_thrift_error(err)) + } + } + } } async fn update_namespace( @@ -194,8 +223,23 @@ impl Catalog for HmsCatalog { todo!() } - async fn drop_namespace(&self, _namespace: &NamespaceIdent) -> Result<()> { - todo!() + /// Asynchronously drops a namespace from the Hive Metastore. + /// + /// # Returns + /// A `Result<()>` indicating the outcome: + /// - `Ok(())` signifies successful namespace deletion. + /// - `Err(...)` signifies failure to drop the namespace due to validation + /// errors, connectivity issues, or Hive Metastore constraints. 
+ async fn drop_namespace(&self, namespace: &NamespaceIdent) -> Result<()> { + let name = validate_namespace(namespace)?; + + self.client + .0 + .drop_database(name.into(), false, false) + .await + .map_err(from_thrift_error)?; + + Ok(()) } async fn list_tables(&self, _namespace: &NamespaceIdent) -> Result> { diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 6c0f205f20..1d0bf97cd6 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -145,3 +145,47 @@ async fn test_get_namespace() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_namespace_exists() -> Result<()> { + let fixture = set_test_fixture("test_namespace_exists").await; + + let ns_exists = Namespace::new(NamespaceIdent::new("default".into())); + let ns_not_exists = Namespace::new(NamespaceIdent::new("not_here".into())); + + let result_exists = fixture + .hms_catalog + .namespace_exists(ns_exists.name()) + .await?; + let result_not_exists = fixture + .hms_catalog + .namespace_exists(ns_not_exists.name()) + .await?; + + assert!(result_exists); + assert!(!result_not_exists); + + Ok(()) +} + +#[tokio::test] +async fn test_drop_namespace() -> Result<()> { + let fixture = set_test_fixture("test_drop_namespace").await; + + let ns = Namespace::new(NamespaceIdent::new("delete_me".into())); + + fixture + .hms_catalog + .create_namespace(ns.name(), HashMap::new()) + .await?; + + let result = fixture.hms_catalog.namespace_exists(ns.name()).await?; + assert!(result); + + fixture.hms_catalog.drop_namespace(ns.name()).await?; + + let result = fixture.hms_catalog.namespace_exists(ns.name()).await?; + assert!(!result); + + Ok(()) +} From 00c720be9d3ae4afdd4d400088072b6d3d79b912 Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 14:41:31 +0100 Subject: [PATCH 08/64] add hms update_namespace --- crates/catalog/hms/src/catalog.rs | 34 ++++++++++++++++++-- crates/catalog/hms/tests/hms_catalog_test.rs | 22 +++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index f6031d2c2f..2c3d3def9d 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -215,12 +215,40 @@ impl Catalog for HmsCatalog { } } + /// Asynchronously updates properties of an existing namespace. + /// + /// Converts the given namespace identifier and properties into a database + /// representation and then attempts to update the corresponding namespace + /// in the Hive Metastore. + /// + /// # Returns + /// Returns `Ok(())` if the namespace update is successful. If the + /// namespace cannot be updated due to missing information or an error + /// during the update process, an `Err(...)` is returned. async fn update_namespace( &self, - _namespace: &NamespaceIdent, - _properties: HashMap, + namespace: &NamespaceIdent, + properties: HashMap, ) -> Result<()> { - todo!() + let db = convert_to_database(namespace, &properties)?; + + let name = match &db.name { + Some(name) => name, + None => { + return Err(Error::new( + ErrorKind::DataInvalid, + "Database name must be specified", + )) + } + }; + + self.client + .0 + .alter_database(name.clone(), db) + .await + .map_err(from_thrift_error)?; + + Ok(()) } /// Asynchronously drops a namespace from the Hive Metastore. 
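With `update_namespace` implemented above, the namespace half of the `Catalog` trait is fully wired up. A hedged end-to-end sketch before the corresponding test below; the `demo` namespace name, the property values, and the generic `impl Catalog` receiver are illustrative, while every call mirrors the integration tests in this series:

```rust
use std::collections::HashMap;
use iceberg::{Catalog, NamespaceIdent};

async fn namespace_lifecycle(catalog: &impl Catalog) -> iceberg::Result<()> {
    let ns = NamespaceIdent::new("demo".into());

    // Create, then confirm existence via `get_database` under the hood.
    catalog
        .create_namespace(&ns, HashMap::from([("comment".into(), "demo db".into())]))
        .await?;
    assert!(catalog.namespace_exists(&ns).await?);

    // `update_namespace` re-validates the properties and issues
    // `alter_database`, as implemented in the hunk above.
    catalog
        .update_namespace(&ns, HashMap::from([("comment".into(), "updated".into())]))
        .await?;
    assert_eq!(
        catalog.get_namespace(&ns).await?.properties().get("comment"),
        Some(&"updated".to_string())
    );

    // Drop and verify it is gone.
    catalog.drop_namespace(&ns).await?;
    assert!(!catalog.namespace_exists(&ns).await?);
    Ok(())
}
```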
diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 1d0bf97cd6..4d5f7c3596 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -168,6 +168,28 @@ async fn test_namespace_exists() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_update_namespace() -> Result<()> { + let fixture = set_test_fixture("test_update_namespace").await; + + let ns = Namespace::new(NamespaceIdent::new("default".into())); + let properties = HashMap::from([("comment".to_string(), "my_update".to_string())]); + + fixture + .hms_catalog + .update_namespace(ns.name(), properties) + .await?; + + let db = fixture.hms_catalog.get_namespace(ns.name()).await?; + + assert_eq!( + db.properties().get("comment"), + Some(&"my_update".to_string()) + ); + + Ok(()) +} + #[tokio::test] async fn test_drop_namespace() -> Result<()> { let fixture = set_test_fixture("test_drop_namespace").await; From 1b98f880808e7dbcae217d0ef1cd64cf7338aa7c Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 14:54:31 +0100 Subject: [PATCH 09/64] move fns into HmsCatalog --- crates/catalog/hms/src/catalog.rs | 278 +++++++++++++++++++++++++++++- crates/catalog/hms/src/utils.rs | 262 +--------------------------- 2 files changed, 273 insertions(+), 267 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 2c3d3def9d..1f7bf23932 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -17,6 +17,8 @@ use super::utils::*; use async_trait::async_trait; +use hive_metastore::Database; +use hive_metastore::PrincipalType; use hive_metastore::ThriftHiveMetastoreClient; use hive_metastore::ThriftHiveMetastoreClientBuilder; use hive_metastore::ThriftHiveMetastoreGetDatabaseException; @@ -25,12 +27,19 @@ use iceberg::{ Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, TableIdent, }; +use pilota::AHashMap; +use pilota::FastStr; use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::net::ToSocketAddrs; use typed_builder::TypedBuilder; use volo_thrift::ResponseError; +/// hive.metastore.database.owner setting +pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; +/// hive.metastore.database.owner-type setting +pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type"; + /// Which variant of the thrift transport to communicate with HMS /// See: #[derive(Debug, Default)] @@ -97,6 +106,154 @@ impl HmsCatalog { client: HmsClient(client), }) } + + /// Create and extract properties from `hive_metastore::hms::Database`. 
+ pub fn properties_from_database(database: &Database) -> HashMap { + let mut properties = HashMap::new(); + + if let Some(description) = &database.description { + properties.insert("comment".to_string(), description.to_string()); + }; + + if let Some(location) = &database.location_uri { + properties.insert("location".to_string(), location.to_string()); + }; + + if let Some(owner) = &database.owner_name { + properties.insert(HMS_DB_OWNER.to_string(), owner.to_string()); + }; + + if let Some(owner_type) = &database.owner_type { + let value = match owner_type { + PrincipalType::User => "User", + PrincipalType::Group => "Group", + PrincipalType::Role => "Role", + }; + + properties.insert(HMS_DB_OWNER_TYPE.to_string(), value.to_string()); + }; + + if let Some(params) = &database.parameters { + params.iter().for_each(|(k, v)| { + properties.insert(k.clone().into(), v.clone().into()); + }); + }; + + properties + } + + /// Converts name and properties into `hive_metastore::hms::Database` + /// after validating the `namespace` and `owner-settings`. + pub fn convert_to_database( + namespace: &NamespaceIdent, + properties: &HashMap, + ) -> Result { + let name = HmsCatalog::validate_namespace(namespace)?; + HmsCatalog::validate_owner_settings(properties)?; + + let mut db = Database::default(); + let mut parameters = AHashMap::new(); + + db.name = Some(name.into()); + + for (k, v) in properties { + match k.as_str() { + "comment" => db.description = Some(v.clone().into()), + "location" => { + db.location_uri = Some(HmsCatalog::format_location_uri(v.clone()).into()) + } + HMS_DB_OWNER => db.owner_name = Some(v.clone().into()), + HMS_DB_OWNER_TYPE => { + let owner_type = match v.to_lowercase().as_str() { + "user" => PrincipalType::User, + "group" => PrincipalType::Group, + "role" => PrincipalType::Role, + _ => { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Invalid value for setting 'owner_type': {}", v), + )) + } + }; + db.owner_type = Some(owner_type); + } + _ => { + parameters.insert( + FastStr::from_string(k.clone()), + FastStr::from_string(v.clone()), + ); + } + } + } + + db.parameters = Some(parameters); + + // Set default user, if none provided + // https://github.com/apache/iceberg/blob/main/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveHadoopUtil.java#L44 + if db.owner_name.is_none() { + db.owner_name = Some("user.name".into()); + db.owner_type = Some(PrincipalType::User); + } + + Ok(db) + } + + /// Checks if provided `NamespaceIdent` is valid. + pub fn validate_namespace(namespace: &NamespaceIdent) -> Result { + let name = namespace.as_ref(); + + if name.len() != 1 { + return Err(Error::new( + ErrorKind::DataInvalid, + "Invalid database, hierarchical namespaces are not supported", + )); + } + + let name = name[0].clone(); + + if name.is_empty() { + return Err(Error::new( + ErrorKind::DataInvalid, + "Invalid database, provided namespace is empty.", + )); + } + + Ok(name) + } + + /// Formats location_uri by e.g. removing trailing slashes. + fn format_location_uri(location: String) -> String { + let mut location = location; + + if !location.starts_with('/') { + location = format!("/{}", location); + } + + if location.ends_with('/') && location.len() > 1 { + location.pop(); + } + + location + } + + /// Checks if `owner-settings` are valid. + /// If `owner_type` is set, then `owner` must also be set. 
+ fn validate_owner_settings(properties: &HashMap) -> Result<()> { + let owner_is_set = properties.get(HMS_DB_OWNER).is_some(); + let owner_type_is_set = properties.get(HMS_DB_OWNER_TYPE).is_some(); + + if owner_type_is_set && !owner_is_set { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Setting '{}' without setting '{}' is not allowed", + HMS_DB_OWNER_TYPE, HMS_DB_OWNER + ), + )); + } + + Ok(()) + } } #[async_trait] @@ -146,7 +303,7 @@ impl Catalog for HmsCatalog { namespace: &NamespaceIdent, properties: HashMap, ) -> Result { - let database = convert_to_database(namespace, &properties)?; + let database = HmsCatalog::convert_to_database(namespace, &properties)?; self.client .0 @@ -168,7 +325,7 @@ impl Catalog for HmsCatalog { /// - If there is an error querying the database, returned by /// `from_thrift_error`. async fn get_namespace(&self, namespace: &NamespaceIdent) -> Result { - let name = validate_namespace(namespace)?; + let name = HmsCatalog::validate_namespace(namespace)?; let db = self .client @@ -177,7 +334,7 @@ impl Catalog for HmsCatalog { .await .map_err(from_thrift_error)?; - let properties = properties_from_database(&db); + let properties = HmsCatalog::properties_from_database(&db); let ns = Namespace::with_properties(NamespaceIdent::new(name.into()), properties); Ok(ns) @@ -196,7 +353,7 @@ impl Catalog for HmsCatalog { /// - `Err(...)` if an error occurs during validation or the Hive Metastore /// query, with the error encapsulating the issue. async fn namespace_exists(&self, namespace: &NamespaceIdent) -> Result { - let name = validate_namespace(namespace)?; + let name = HmsCatalog::validate_namespace(namespace)?; let resp = self.client.0.get_database(name.clone().into()).await; @@ -230,7 +387,7 @@ impl Catalog for HmsCatalog { namespace: &NamespaceIdent, properties: HashMap, ) -> Result<()> { - let db = convert_to_database(namespace, &properties)?; + let db = HmsCatalog::convert_to_database(namespace, &properties)?; let name = match &db.name { Some(name) => name, @@ -259,7 +416,7 @@ impl Catalog for HmsCatalog { /// - `Err(...)` signifies failure to drop the namespace due to validation /// errors, connectivity issues, or Hive Metastore constraints. 
async fn drop_namespace(&self, namespace: &NamespaceIdent) -> Result<()> { - let name = validate_namespace(namespace)?; + let name = HmsCatalog::validate_namespace(namespace)?; self.client .0 @@ -302,3 +459,112 @@ impl Catalog for HmsCatalog { todo!() } } + +#[cfg(test)] +mod tests { + use iceberg::{Namespace, NamespaceIdent}; + + use super::*; + + #[test] + fn test_properties_from_database() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::from([ + ("comment".to_string(), "my_description".to_string()), + ("location".to_string(), "/my_location".to_string()), + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "User".to_string()), + ("key1".to_string(), "value1".to_string()), + ]); + + let db = HmsCatalog::convert_to_database(&ns, &properties)?; + + let expected = HmsCatalog::properties_from_database(&db); + + assert_eq!(expected, properties); + + Ok(()) + } + + #[test] + fn test_validate_owner_settings() { + let valid = HashMap::from([ + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), + ]); + let invalid = HashMap::from([(HMS_DB_OWNER_TYPE.to_string(), "user".to_string())]); + + assert!(HmsCatalog::validate_owner_settings(&valid).is_ok()); + assert!(HmsCatalog::validate_owner_settings(&invalid).is_err()); + } + + #[test] + fn test_convert_to_database() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::from([ + ("comment".to_string(), "my_description".to_string()), + ("location".to_string(), "my_location".to_string()), + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), + ("key1".to_string(), "value1".to_string()), + ]); + + let db = HmsCatalog::convert_to_database(&ns, &properties)?; + + assert_eq!(db.name, Some(FastStr::from("my_namespace"))); + assert_eq!(db.description, Some(FastStr::from("my_description"))); + assert_eq!(db.owner_name, Some(FastStr::from("apache"))); + assert_eq!(db.owner_type, Some(PrincipalType::User)); + + if let Some(params) = db.parameters { + assert_eq!(params.get("key1".into()), Some(&FastStr::from("value1"))); + } + + Ok(()) + } + + #[test] + fn test_convert_to_database_with_default_user() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::new(); + + let db = HmsCatalog::convert_to_database(&ns, &properties)?; + + assert_eq!(db.name, Some(FastStr::from("my_namespace"))); + assert_eq!(db.owner_name, Some(FastStr::from("user.name"))); + assert_eq!(db.owner_type, Some(PrincipalType::User)); + + Ok(()) + } + + #[test] + fn test_validate_namespace() { + let valid_ns = Namespace::new(NamespaceIdent::new("ns".to_string())); + let empty_ns = Namespace::new(NamespaceIdent::new("".to_string())); + let hierarchical_ns = Namespace::new( + NamespaceIdent::from_vec(vec!["level1".to_string(), "level2".to_string()]).unwrap(), + ); + + let valid = HmsCatalog::validate_namespace(valid_ns.name()); + let empty = HmsCatalog::validate_namespace(empty_ns.name()); + let hierarchical = HmsCatalog::validate_namespace(hierarchical_ns.name()); + + assert!(valid.is_ok()); + assert!(empty.is_err()); + assert!(hierarchical.is_err()); + } + + #[test] + fn test_format_location_uri() { + let inputs = vec!["iceberg", "is/", "/nice/", "really/nice/", "/"]; + let outputs = vec!["/iceberg", "/is", "/nice", "/really/nice", "/"]; + + inputs + .into_iter() + .zip(outputs.into_iter()) + .for_each(|(inp, out)| { + 
let location = HmsCatalog::format_location_uri(inp.to_string()); + assert_eq!(location, out); + }) + } +} diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 8d55e6eae0..1b0ff33df4 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -16,16 +16,10 @@ // under the License. use anyhow::anyhow; -use hive_metastore::{Database, PrincipalType}; -use iceberg::{Error, ErrorKind, NamespaceIdent, Result}; -use pilota::{AHashMap, FastStr}; -use std::collections::HashMap; +use iceberg::{Error, ErrorKind}; use std::fmt::Debug; use std::io; -pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; -pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type"; - /// Format a thrift error into iceberg error. pub fn from_thrift_error(error: volo_thrift::error::ResponseError) -> Error where @@ -46,257 +40,3 @@ pub fn from_io_error(error: io::Error) -> Error { ) .with_source(error) } - -/// Create and extract properties from `hive_metastore::hms::Database`. -pub fn properties_from_database(database: &Database) -> HashMap { - let mut properties = HashMap::new(); - - if let Some(description) = &database.description { - properties.insert("comment".to_string(), description.to_string()); - }; - - if let Some(location) = &database.location_uri { - properties.insert("location".to_string(), location.to_string()); - }; - - if let Some(owner) = &database.owner_name { - properties.insert(HMS_DB_OWNER.to_string(), owner.to_string()); - }; - - if let Some(owner_type) = &database.owner_type { - let value = match owner_type { - PrincipalType::User => "User", - PrincipalType::Group => "Group", - PrincipalType::Role => "Role", - }; - - properties.insert(HMS_DB_OWNER_TYPE.to_string(), value.to_string()); - }; - - if let Some(params) = &database.parameters { - params.iter().for_each(|(k, v)| { - properties.insert(k.clone().into(), v.clone().into()); - }); - }; - - properties -} - -/// Converts name and properties into `hive_metastore::hms::Database` -/// after validating the `namespace` and `owner-settings`. -pub fn convert_to_database( - namespace: &NamespaceIdent, - properties: &HashMap, -) -> Result { - let name = validate_namespace(namespace)?; - validate_owner_settings(properties)?; - - let mut db = Database::default(); - let mut parameters = AHashMap::new(); - - db.name = Some(name.into()); - - for (k, v) in properties { - match k.as_str() { - "comment" => db.description = Some(v.clone().into()), - "location" => db.location_uri = Some(format_location_uri(v.clone()).into()), - HMS_DB_OWNER => db.owner_name = Some(v.clone().into()), - HMS_DB_OWNER_TYPE => { - let owner_type = match v.to_lowercase().as_str() { - "user" => PrincipalType::User, - "group" => PrincipalType::Group, - "role" => PrincipalType::Role, - _ => { - return Err(Error::new( - ErrorKind::DataInvalid, - format!("Invalid value for setting 'owner_type': {}", v), - )) - } - }; - db.owner_type = Some(owner_type); - } - _ => { - parameters.insert( - FastStr::from_string(k.clone()), - FastStr::from_string(v.clone()), - ); - } - } - } - - db.parameters = Some(parameters); - - // Set default user, if none provided - // https://github.com/apache/iceberg/blob/main/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveHadoopUtil.java#L44 - if db.owner_name.is_none() { - db.owner_name = Some("user.name".into()); - db.owner_type = Some(PrincipalType::User); - } - - Ok(db) -} - -/// Checks if provided `NamespaceIdent` is valid. 
-pub fn validate_namespace(namespace: &NamespaceIdent) -> Result { - let name = namespace.as_ref(); - - if name.len() != 1 { - return Err(Error::new( - ErrorKind::DataInvalid, - "Invalid database, hierarchical namespaces are not supported", - )); - } - - let name = name[0].clone(); - - if name.is_empty() { - return Err(Error::new( - ErrorKind::DataInvalid, - "Invalid database, provided namespace is empty.", - )); - } - - Ok(name) -} - -/// Formats location_uri by e.g. removing trailing slashes. -fn format_location_uri(location: String) -> String { - let mut location = location; - - if !location.starts_with('/') { - location = format!("/{}", location); - } - - if location.ends_with('/') && location.len() > 1 { - location.pop(); - } - - location -} - -/// Checks if `owner-settings` are valid. -/// If `owner_type` is set, then `owner` must also be set. -fn validate_owner_settings(properties: &HashMap) -> Result<()> { - let owner_is_set = properties.get(HMS_DB_OWNER).is_some(); - let owner_type_is_set = properties.get(HMS_DB_OWNER_TYPE).is_some(); - - if owner_type_is_set && !owner_is_set { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Setting '{}' without setting '{}' is not allowed", - HMS_DB_OWNER_TYPE, HMS_DB_OWNER - ), - )); - } - - Ok(()) -} -#[cfg(test)] -mod tests { - use iceberg::{Namespace, NamespaceIdent}; - - use super::*; - - #[test] - fn test_properties_from_database() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); - let properties = HashMap::from([ - ("comment".to_string(), "my_description".to_string()), - ("location".to_string(), "/my_location".to_string()), - (HMS_DB_OWNER.to_string(), "apache".to_string()), - (HMS_DB_OWNER_TYPE.to_string(), "User".to_string()), - ("key1".to_string(), "value1".to_string()), - ]); - - let db = convert_to_database(&ns, &properties)?; - - let expected = properties_from_database(&db); - - assert_eq!(expected, properties); - - Ok(()) - } - - #[test] - fn test_validate_owner_settings() { - let valid = HashMap::from([ - (HMS_DB_OWNER.to_string(), "apache".to_string()), - (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), - ]); - let invalid = HashMap::from([(HMS_DB_OWNER_TYPE.to_string(), "user".to_string())]); - - assert!(validate_owner_settings(&valid).is_ok()); - assert!(validate_owner_settings(&invalid).is_err()); - } - - #[test] - fn test_convert_to_database() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); - let properties = HashMap::from([ - ("comment".to_string(), "my_description".to_string()), - ("location".to_string(), "my_location".to_string()), - (HMS_DB_OWNER.to_string(), "apache".to_string()), - (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), - ("key1".to_string(), "value1".to_string()), - ]); - - let db = convert_to_database(&ns, &properties)?; - - assert_eq!(db.name, Some(FastStr::from("my_namespace"))); - assert_eq!(db.description, Some(FastStr::from("my_description"))); - assert_eq!(db.owner_name, Some(FastStr::from("apache"))); - assert_eq!(db.owner_type, Some(PrincipalType::User)); - - if let Some(params) = db.parameters { - assert_eq!(params.get("key1".into()), Some(&FastStr::from("value1"))); - } - - Ok(()) - } - - #[test] - fn test_convert_to_database_with_default_user() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); - let properties = HashMap::new(); - - let db = convert_to_database(&ns, &properties)?; - - assert_eq!(db.name, Some(FastStr::from("my_namespace"))); - assert_eq!(db.owner_name, Some(FastStr::from("user.name"))); - 
assert_eq!(db.owner_type, Some(PrincipalType::User)); - - Ok(()) - } - - #[test] - fn test_validate_namespace() { - let valid_ns = Namespace::new(NamespaceIdent::new("ns".to_string())); - let empty_ns = Namespace::new(NamespaceIdent::new("".to_string())); - let hierarchical_ns = Namespace::new( - NamespaceIdent::from_vec(vec!["level1".to_string(), "level2".to_string()]).unwrap(), - ); - - let valid = validate_namespace(valid_ns.name()); - let empty = validate_namespace(empty_ns.name()); - let hierarchical = validate_namespace(hierarchical_ns.name()); - - assert!(valid.is_ok()); - assert!(empty.is_err()); - assert!(hierarchical.is_err()); - } - - #[test] - fn test_format_location_uri() { - let inputs = vec!["iceberg", "is/", "/nice/", "really/nice/", "/"]; - let outputs = vec!["/iceberg", "/is", "/nice", "/really/nice", "/"]; - - inputs - .into_iter() - .zip(outputs.into_iter()) - .for_each(|(inp, out)| { - let location = format_location_uri(inp.to_string()); - assert_eq!(location, out); - }) - } -} From f82fa5d0cdc2a308006f6fea3226898369c71cfb Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 14:58:00 +0100 Subject: [PATCH 10/64] use `expose` in docker-compose --- .../hms/testdata/hms_catalog/docker-compose.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml index 28f84c1a7b..ec65baf322 100644 --- a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml +++ b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml @@ -20,9 +20,9 @@ version: '3.8' services: minio: image: minio/minio - ports: - - "9000:9000" - - "9001:9001" + expose: + - 9000 + - 9001 environment: - MINIO_ROOT_USER=admin - MINIO_ROOT_PASSWORD=password @@ -31,8 +31,8 @@ services: hive-mysql: image: mysql:5.7 - ports: - - "3306:3306" + expose: + - 3306 environment: - MYSQL_ROOT_PASSWORD=admin - MYSQL_DATABASE=metastore @@ -44,5 +44,5 @@ services: build: ./ depends_on: - hive-mysql - ports: - - "9083:9083" + expose: + - 9083 From 10b53ba84c8152be67aaafb7a77640b96cbe34ba Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 17:00:31 +0100 Subject: [PATCH 11/64] add hms list_tables --- crates/catalog/hms/src/catalog.rs | 18 ++++++++++++++++-- crates/catalog/hms/tests/hms_catalog_test.rs | 13 +++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 1f7bf23932..b7630ce0ae 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -427,8 +427,22 @@ impl Catalog for HmsCatalog { Ok(()) } - async fn list_tables(&self, _namespace: &NamespaceIdent) -> Result> { - todo!() + async fn list_tables(&self, namespace: &NamespaceIdent) -> Result> { + let name = HmsCatalog::validate_namespace(namespace)?; + + let tables = self + .client + .0 + .get_all_tables(name.clone().into()) + .await + .map_err(from_thrift_error)?; + + let tables = tables + .iter() + .map(|table| TableIdent::new(namespace.clone(), table.to_string())) + .collect(); + + Ok(tables) } async fn create_table( diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 4d5f7c3596..9d192c24fa 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -211,3 +211,16 @@ async fn test_drop_namespace() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_list_tables() -> 
Result<()> { + let fixture = set_test_fixture("test_list_tables").await; + + let ns = Namespace::new(NamespaceIdent::new("default".into())); + + let result = fixture.hms_catalog.list_tables(ns.name()).await?; + + assert_eq!(result, vec![]); + + Ok(()) +} From b35916c4810232cce796a320098dab49a43af1b0 Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 17:03:14 +0100 Subject: [PATCH 12/64] fix: clippy --- crates/catalog/hms/src/catalog.rs | 15 ++++++--------- crates/catalog/hms/tests/hms_catalog_test.rs | 2 +- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index b7630ce0ae..5ed072d819 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -335,7 +335,7 @@ impl Catalog for HmsCatalog { .map_err(from_thrift_error)?; let properties = HmsCatalog::properties_from_database(&db); - let ns = Namespace::with_properties(NamespaceIdent::new(name.into()), properties); + let ns = Namespace::with_properties(NamespaceIdent::new(name), properties); Ok(ns) } @@ -531,7 +531,7 @@ mod tests { assert_eq!(db.owner_type, Some(PrincipalType::User)); if let Some(params) = db.parameters { - assert_eq!(params.get("key1".into()), Some(&FastStr::from("value1"))); + assert_eq!(params.get("key1"), Some(&FastStr::from("value1"))); } Ok(()) @@ -573,12 +573,9 @@ mod tests { let inputs = vec!["iceberg", "is/", "/nice/", "really/nice/", "/"]; let outputs = vec!["/iceberg", "/is", "/nice", "/really/nice", "/"]; - inputs - .into_iter() - .zip(outputs.into_iter()) - .for_each(|(inp, out)| { - let location = HmsCatalog::format_location_uri(inp.to_string()); - assert_eq!(location, out); - }) + inputs.into_iter().zip(outputs).for_each(|(inp, out)| { + let location = HmsCatalog::format_location_uri(inp.to_string()); + assert_eq!(location, out); + }) } } diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 9d192c24fa..5628c094a8 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -82,7 +82,7 @@ async fn test_list_namespace() -> Result<()> { .await?; assert_eq!(expected_no_parent, result_no_parent); - assert!(result_with_parent.len() == 0); + assert!(result_with_parent.is_empty()); Ok(()) } From cfcbc17e7846506c27793def87e0fd3465ec100d Mon Sep 17 00:00:00 2001 From: mlanhenke Date: Fri, 8 Mar 2024 17:24:31 +0100 Subject: [PATCH 13/64] fix: cargo sort --- .cargo/audit.toml | 8 ++++---- crates/catalog/hms/Cargo.toml | 4 ++-- deny.toml | 26 ++++++++++++++++---------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/.cargo/audit.toml b/.cargo/audit.toml index 1d73f83b81..5db5a9d811 100644 --- a/.cargo/audit.toml +++ b/.cargo/audit.toml @@ -17,8 +17,8 @@ [advisories] ignore = [ - # rsa - # Marvin Attack: potential key recovery through timing sidechannels - # Issues: https://github.com/apache/iceberg-rust/issues/221 - "RUSTSEC-2023-0071", + # rsa + # Marvin Attack: potential key recovery through timing sidechannels + # Issues: https://github.com/apache/iceberg-rust/issues/221 + "RUSTSEC-2023-0071", ] diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml index 68b3aed548..b15e2610d6 100644 --- a/crates/catalog/hms/Cargo.toml +++ b/crates/catalog/hms/Cargo.toml @@ -33,10 +33,10 @@ anyhow = { workspace = true } async-trait = { workspace = true } hive_metastore = { workspace = true } iceberg = { workspace = true } -typed-builder = { workspace = true } -volo-thrift 
= { workspace = true } log = "0.4.20" pilota = "0.10.0" +typed-builder = { workspace = true } +volo-thrift = { workspace = true } [dev-dependencies] iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } diff --git a/deny.toml b/deny.toml index e32367948f..1d30b5f3ff 100644 --- a/deny.toml +++ b/deny.toml @@ -19,17 +19,23 @@ unlicensed = "deny" copyleft = "deny" allow = [ - "Apache-2.0", - "Apache-2.0 WITH LLVM-exception", - "MIT", - "BSD-3-Clause", - "ISC", - "CC0-1.0", + "Apache-2.0", + "Apache-2.0 WITH LLVM-exception", + "MIT", + "BSD-3-Clause", + "ISC", + "CC0-1.0", ] exceptions = [ - { allow = ["OpenSSL"], name = "ring" }, - { allow = ["Unicode-DFS-2016"], name = "unicode-ident" }, - { allow = ["Zlib"], name = "adler32" } + { allow = [ + "OpenSSL", + ], name = "ring" }, + { allow = [ + "Unicode-DFS-2016", + ], name = "unicode-ident" }, + { allow = [ + "Zlib", + ], name = "adler32" }, ] [[licenses.clarify]] @@ -42,4 +48,4 @@ name = "ring" # compiled into non-test libraries, is included below." # OpenSSL - Obviously expression = "ISC AND MIT AND OpenSSL" -license-files = [{ path = "LICENSE", hash = 0xbd0eed23 }] \ No newline at end of file +license-files = [{ path = "LICENSE", hash = 0xbd0eed23 }] From 2e320fe9f2d4e1ed052780cddb710daf4cc98343 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 12:07:23 +0100 Subject: [PATCH 14/64] fix: cargo workspace --- crates/catalog/hms/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml index b15e2610d6..ca3177ac3b 100644 --- a/crates/catalog/hms/Cargo.toml +++ b/crates/catalog/hms/Cargo.toml @@ -33,7 +33,7 @@ anyhow = { workspace = true } async-trait = { workspace = true } hive_metastore = { workspace = true } iceberg = { workspace = true } -log = "0.4.20" +log = { workspace = true } pilota = "0.10.0" typed-builder = { workspace = true } volo-thrift = { workspace = true } From ae03c852d38aac8a866a2f0ffc14163fe1be6386 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 12:19:33 +0100 Subject: [PATCH 15/64] move fns into utils + add constants --- crates/catalog/hms/src/catalog.rs | 277 +----------------------------- crates/catalog/hms/src/utils.rs | 268 ++++++++++++++++++++++++++++- 2 files changed, 274 insertions(+), 271 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 5ed072d819..de9ca57f83 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -17,8 +17,6 @@ use super::utils::*; use async_trait::async_trait; -use hive_metastore::Database; -use hive_metastore::PrincipalType; use hive_metastore::ThriftHiveMetastoreClient; use hive_metastore::ThriftHiveMetastoreClientBuilder; use hive_metastore::ThriftHiveMetastoreGetDatabaseException; @@ -27,19 +25,12 @@ use iceberg::{ Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, TableIdent, }; -use pilota::AHashMap; -use pilota::FastStr; use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::net::ToSocketAddrs; use typed_builder::TypedBuilder; use volo_thrift::ResponseError; -/// hive.metastore.database.owner setting -pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; -/// hive.metastore.database.owner-type setting -pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type"; - /// Which variant of the thrift transport to communicate with HMS /// See: #[derive(Debug, Default)] @@ -106,154 +97,6 @@ impl 
HmsCatalog { client: HmsClient(client), }) } - - /// Create and extract properties from `hive_metastore::hms::Database`. - pub fn properties_from_database(database: &Database) -> HashMap { - let mut properties = HashMap::new(); - - if let Some(description) = &database.description { - properties.insert("comment".to_string(), description.to_string()); - }; - - if let Some(location) = &database.location_uri { - properties.insert("location".to_string(), location.to_string()); - }; - - if let Some(owner) = &database.owner_name { - properties.insert(HMS_DB_OWNER.to_string(), owner.to_string()); - }; - - if let Some(owner_type) = &database.owner_type { - let value = match owner_type { - PrincipalType::User => "User", - PrincipalType::Group => "Group", - PrincipalType::Role => "Role", - }; - - properties.insert(HMS_DB_OWNER_TYPE.to_string(), value.to_string()); - }; - - if let Some(params) = &database.parameters { - params.iter().for_each(|(k, v)| { - properties.insert(k.clone().into(), v.clone().into()); - }); - }; - - properties - } - - /// Converts name and properties into `hive_metastore::hms::Database` - /// after validating the `namespace` and `owner-settings`. - pub fn convert_to_database( - namespace: &NamespaceIdent, - properties: &HashMap, - ) -> Result { - let name = HmsCatalog::validate_namespace(namespace)?; - HmsCatalog::validate_owner_settings(properties)?; - - let mut db = Database::default(); - let mut parameters = AHashMap::new(); - - db.name = Some(name.into()); - - for (k, v) in properties { - match k.as_str() { - "comment" => db.description = Some(v.clone().into()), - "location" => { - db.location_uri = Some(HmsCatalog::format_location_uri(v.clone()).into()) - } - HMS_DB_OWNER => db.owner_name = Some(v.clone().into()), - HMS_DB_OWNER_TYPE => { - let owner_type = match v.to_lowercase().as_str() { - "user" => PrincipalType::User, - "group" => PrincipalType::Group, - "role" => PrincipalType::Role, - _ => { - return Err(Error::new( - ErrorKind::DataInvalid, - format!("Invalid value for setting 'owner_type': {}", v), - )) - } - }; - db.owner_type = Some(owner_type); - } - _ => { - parameters.insert( - FastStr::from_string(k.clone()), - FastStr::from_string(v.clone()), - ); - } - } - } - - db.parameters = Some(parameters); - - // Set default user, if none provided - // https://github.com/apache/iceberg/blob/main/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveHadoopUtil.java#L44 - if db.owner_name.is_none() { - db.owner_name = Some("user.name".into()); - db.owner_type = Some(PrincipalType::User); - } - - Ok(db) - } - - /// Checks if provided `NamespaceIdent` is valid. - pub fn validate_namespace(namespace: &NamespaceIdent) -> Result { - let name = namespace.as_ref(); - - if name.len() != 1 { - return Err(Error::new( - ErrorKind::DataInvalid, - "Invalid database, hierarchical namespaces are not supported", - )); - } - - let name = name[0].clone(); - - if name.is_empty() { - return Err(Error::new( - ErrorKind::DataInvalid, - "Invalid database, provided namespace is empty.", - )); - } - - Ok(name) - } - - /// Formats location_uri by e.g. removing trailing slashes. - fn format_location_uri(location: String) -> String { - let mut location = location; - - if !location.starts_with('/') { - location = format!("/{}", location); - } - - if location.ends_with('/') && location.len() > 1 { - location.pop(); - } - - location - } - - /// Checks if `owner-settings` are valid. - /// If `owner_type` is set, then `owner` must also be set. 
- fn validate_owner_settings(properties: &HashMap) -> Result<()> { - let owner_is_set = properties.get(HMS_DB_OWNER).is_some(); - let owner_type_is_set = properties.get(HMS_DB_OWNER_TYPE).is_some(); - - if owner_type_is_set && !owner_is_set { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Setting '{}' without setting '{}' is not allowed", - HMS_DB_OWNER_TYPE, HMS_DB_OWNER - ), - )); - } - - Ok(()) - } } #[async_trait] @@ -303,7 +146,7 @@ impl Catalog for HmsCatalog { namespace: &NamespaceIdent, properties: HashMap, ) -> Result { - let database = HmsCatalog::convert_to_database(namespace, &properties)?; + let database = convert_to_database(namespace, &properties)?; self.client .0 @@ -325,7 +168,7 @@ impl Catalog for HmsCatalog { /// - If there is an error querying the database, returned by /// `from_thrift_error`. async fn get_namespace(&self, namespace: &NamespaceIdent) -> Result { - let name = HmsCatalog::validate_namespace(namespace)?; + let name = validate_namespace(namespace)?; let db = self .client @@ -334,7 +177,7 @@ impl Catalog for HmsCatalog { .await .map_err(from_thrift_error)?; - let properties = HmsCatalog::properties_from_database(&db); + let properties = properties_from_database(&db); let ns = Namespace::with_properties(NamespaceIdent::new(name), properties); Ok(ns) @@ -353,7 +196,7 @@ impl Catalog for HmsCatalog { /// - `Err(...)` if an error occurs during validation or the Hive Metastore /// query, with the error encapsulating the issue. async fn namespace_exists(&self, namespace: &NamespaceIdent) -> Result { - let name = HmsCatalog::validate_namespace(namespace)?; + let name = validate_namespace(namespace)?; let resp = self.client.0.get_database(name.clone().into()).await; @@ -387,7 +230,7 @@ impl Catalog for HmsCatalog { namespace: &NamespaceIdent, properties: HashMap, ) -> Result<()> { - let db = HmsCatalog::convert_to_database(namespace, &properties)?; + let db = convert_to_database(namespace, &properties)?; let name = match &db.name { Some(name) => name, @@ -416,7 +259,7 @@ impl Catalog for HmsCatalog { /// - `Err(...)` signifies failure to drop the namespace due to validation /// errors, connectivity issues, or Hive Metastore constraints. 
async fn drop_namespace(&self, namespace: &NamespaceIdent) -> Result<()> { - let name = HmsCatalog::validate_namespace(namespace)?; + let name = validate_namespace(namespace)?; self.client .0 @@ -428,7 +271,7 @@ impl Catalog for HmsCatalog { } async fn list_tables(&self, namespace: &NamespaceIdent) -> Result> { - let name = HmsCatalog::validate_namespace(namespace)?; + let name = validate_namespace(namespace)?; let tables = self .client @@ -473,109 +316,3 @@ impl Catalog for HmsCatalog { todo!() } } - -#[cfg(test)] -mod tests { - use iceberg::{Namespace, NamespaceIdent}; - - use super::*; - - #[test] - fn test_properties_from_database() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); - let properties = HashMap::from([ - ("comment".to_string(), "my_description".to_string()), - ("location".to_string(), "/my_location".to_string()), - (HMS_DB_OWNER.to_string(), "apache".to_string()), - (HMS_DB_OWNER_TYPE.to_string(), "User".to_string()), - ("key1".to_string(), "value1".to_string()), - ]); - - let db = HmsCatalog::convert_to_database(&ns, &properties)?; - - let expected = HmsCatalog::properties_from_database(&db); - - assert_eq!(expected, properties); - - Ok(()) - } - - #[test] - fn test_validate_owner_settings() { - let valid = HashMap::from([ - (HMS_DB_OWNER.to_string(), "apache".to_string()), - (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), - ]); - let invalid = HashMap::from([(HMS_DB_OWNER_TYPE.to_string(), "user".to_string())]); - - assert!(HmsCatalog::validate_owner_settings(&valid).is_ok()); - assert!(HmsCatalog::validate_owner_settings(&invalid).is_err()); - } - - #[test] - fn test_convert_to_database() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); - let properties = HashMap::from([ - ("comment".to_string(), "my_description".to_string()), - ("location".to_string(), "my_location".to_string()), - (HMS_DB_OWNER.to_string(), "apache".to_string()), - (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), - ("key1".to_string(), "value1".to_string()), - ]); - - let db = HmsCatalog::convert_to_database(&ns, &properties)?; - - assert_eq!(db.name, Some(FastStr::from("my_namespace"))); - assert_eq!(db.description, Some(FastStr::from("my_description"))); - assert_eq!(db.owner_name, Some(FastStr::from("apache"))); - assert_eq!(db.owner_type, Some(PrincipalType::User)); - - if let Some(params) = db.parameters { - assert_eq!(params.get("key1"), Some(&FastStr::from("value1"))); - } - - Ok(()) - } - - #[test] - fn test_convert_to_database_with_default_user() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); - let properties = HashMap::new(); - - let db = HmsCatalog::convert_to_database(&ns, &properties)?; - - assert_eq!(db.name, Some(FastStr::from("my_namespace"))); - assert_eq!(db.owner_name, Some(FastStr::from("user.name"))); - assert_eq!(db.owner_type, Some(PrincipalType::User)); - - Ok(()) - } - - #[test] - fn test_validate_namespace() { - let valid_ns = Namespace::new(NamespaceIdent::new("ns".to_string())); - let empty_ns = Namespace::new(NamespaceIdent::new("".to_string())); - let hierarchical_ns = Namespace::new( - NamespaceIdent::from_vec(vec!["level1".to_string(), "level2".to_string()]).unwrap(), - ); - - let valid = HmsCatalog::validate_namespace(valid_ns.name()); - let empty = HmsCatalog::validate_namespace(empty_ns.name()); - let hierarchical = HmsCatalog::validate_namespace(hierarchical_ns.name()); - - assert!(valid.is_ok()); - assert!(empty.is_err()); - assert!(hierarchical.is_err()); - } - - #[test] - fn 
test_format_location_uri() {
-        let inputs = vec!["iceberg", "is/", "/nice/", "really/nice/", "/"];
-        let outputs = vec!["/iceberg", "/is", "/nice", "/really/nice", "/"];
-
-        inputs.into_iter().zip(outputs).for_each(|(inp, out)| {
-            let location = HmsCatalog::format_location_uri(inp.to_string());
-            assert_eq!(location, out);
-        })
-    }
-}
diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 1b0ff33df4..a8d8f64fa4 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -16,10 +16,24 @@
 // under the License.
 
 use anyhow::anyhow;
-use iceberg::{Error, ErrorKind};
+use hive_metastore::{Database, PrincipalType};
+use iceberg::{Error, ErrorKind, NamespaceIdent, Result};
+use pilota::{AHashMap, FastStr};
+use std::collections::HashMap;
 use std::fmt::Debug;
 use std::io;
 
+/// hive.metastore.database.owner setting
+pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner";
+/// hive.metastore.database.owner default setting
+pub const HMS_DEFAULT_DB_OWNER: &str = "user.name";
+/// hive.metastore.database.owner-type setting
+pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type";
+/// hive metastore `description` property
+pub const COMMENT: &str = "comment";
+/// hive metastore `location` property
+pub const LOCATION: &str = "location";
+
 /// Format a thrift error into iceberg error.
 pub fn from_thrift_error<T>(error: volo_thrift::error::ResponseError<T>) -> Error
 where
@@ -40,3 +54,255 @@ pub fn from_io_error(error: io::Error) -> Error {
     )
     .with_source(error)
 }
+
+/// Create and extract properties from `hive_metastore::hms::Database`.
+pub fn properties_from_database(database: &Database) -> HashMap<String, String> {
+    let mut properties = HashMap::new();
+
+    if let Some(description) = &database.description {
+        properties.insert(COMMENT.to_string(), description.to_string());
+    };
+
+    if let Some(location) = &database.location_uri {
+        properties.insert(LOCATION.to_string(), location.to_string());
+    };
+
+    if let Some(owner) = &database.owner_name {
+        properties.insert(HMS_DB_OWNER.to_string(), owner.to_string());
+    };
+
+    if let Some(owner_type) = &database.owner_type {
+        let value = match owner_type {
+            PrincipalType::User => "User",
+            PrincipalType::Group => "Group",
+            PrincipalType::Role => "Role",
+        };
+
+        properties.insert(HMS_DB_OWNER_TYPE.to_string(), value.to_string());
+    };
+
+    if let Some(params) = &database.parameters {
+        params.iter().for_each(|(k, v)| {
+            properties.insert(k.clone().into(), v.clone().into());
+        });
+    };
+
+    properties
+}
+
+/// Converts name and properties into `hive_metastore::hms::Database`
+/// after validating the `namespace` and `owner-settings`.
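+///
+/// A sketch of the mapping (with hypothetical property values, not taken from a
+/// real catalog): `comment` becomes `Database.description`, `location` becomes
+/// `Database.location_uri`, the two owner settings fill `owner_name` and
+/// `owner_type`, and any remaining key such as `"key1"` lands in `Database.parameters`.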
+pub fn convert_to_database(
+    namespace: &NamespaceIdent,
+    properties: &HashMap<String, String>,
+) -> Result<Database> {
+    let name = validate_namespace(namespace)?;
+    validate_owner_settings(properties)?;
+
+    let mut db = Database::default();
+    let mut parameters = AHashMap::new();
+
+    db.name = Some(name.into());
+
+    for (k, v) in properties {
+        match k.as_str() {
+            COMMENT => db.description = Some(v.clone().into()),
+            LOCATION => db.location_uri = Some(format_location_uri(v.clone()).into()),
+            HMS_DB_OWNER => db.owner_name = Some(v.clone().into()),
+            HMS_DB_OWNER_TYPE => {
+                let owner_type = match v.to_lowercase().as_str() {
+                    "user" => PrincipalType::User,
+                    "group" => PrincipalType::Group,
+                    "role" => PrincipalType::Role,
+                    _ => {
+                        return Err(Error::new(
+                            ErrorKind::DataInvalid,
+                            format!("Invalid value for setting 'owner_type': {}", v),
+                        ))
+                    }
+                };
+                db.owner_type = Some(owner_type);
+            }
+            _ => {
+                parameters.insert(
+                    FastStr::from_string(k.clone()),
+                    FastStr::from_string(v.clone()),
+                );
+            }
+        }
+    }
+
+    db.parameters = Some(parameters);
+
+    // Set default owner, if none provided
+    // https://github.com/apache/iceberg/blob/main/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveHadoopUtil.java#L44
+    if db.owner_name.is_none() {
+        db.owner_name = Some(HMS_DEFAULT_DB_OWNER.into());
+        db.owner_type = Some(PrincipalType::User);
+    }
+
+    Ok(db)
+}
+
+/// Checks if provided `NamespaceIdent` is valid.
+pub fn validate_namespace(namespace: &NamespaceIdent) -> Result<String> {
+    let name = namespace.as_ref();
+
+    if name.len() != 1 {
+        return Err(Error::new(
+            ErrorKind::DataInvalid,
+            "Invalid database, hierarchical namespaces are not supported",
+        ));
+    }
+
+    let name = name[0].clone();
+
+    if name.is_empty() {
+        return Err(Error::new(
+            ErrorKind::DataInvalid,
+            "Invalid database, provided namespace is empty.",
+        ));
+    }
+
+    Ok(name)
+}
+
+/// Formats location_uri by e.g. removing trailing slashes.
+fn format_location_uri(location: String) -> String {
+    let mut location = location;
+
+    if !location.starts_with('/') {
+        location = format!("/{}", location);
+    }
+
+    if location.ends_with('/') && location.len() > 1 {
+        location.pop();
+    }
+
+    location
+}
+
+/// Checks if `owner-settings` are valid.
+/// If `owner_type` is set, then `owner` must also be set.
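+///
+/// A minimal sketch of the rule (hypothetical property maps):
+///
+/// ```text
+/// { "hive.metastore.database.owner-type": "user" }      => Err(DataInvalid)
+/// { "hive.metastore.database.owner": "apache",
+///   "hive.metastore.database.owner-type": "user" }      => Ok(())
+/// ```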
+fn validate_owner_settings(properties: &HashMap) -> Result<()> { + let owner_is_set = properties.get(HMS_DB_OWNER).is_some(); + let owner_type_is_set = properties.get(HMS_DB_OWNER_TYPE).is_some(); + + if owner_type_is_set && !owner_is_set { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Setting '{}' without setting '{}' is not allowed", + HMS_DB_OWNER_TYPE, HMS_DB_OWNER + ), + )); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use iceberg::{Namespace, NamespaceIdent}; + + use super::*; + + #[test] + fn test_properties_from_database() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::from([ + (COMMENT.to_string(), "my_description".to_string()), + (LOCATION.to_string(), "/my_location".to_string()), + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "User".to_string()), + ("key1".to_string(), "value1".to_string()), + ]); + + let db = convert_to_database(&ns, &properties)?; + + let expected = properties_from_database(&db); + + assert_eq!(expected, properties); + + Ok(()) + } + + #[test] + fn test_validate_owner_settings() { + let valid = HashMap::from([ + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), + ]); + let invalid = HashMap::from([(HMS_DB_OWNER_TYPE.to_string(), "user".to_string())]); + + assert!(validate_owner_settings(&valid).is_ok()); + assert!(validate_owner_settings(&invalid).is_err()); + } + + #[test] + fn test_convert_to_database() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::from([ + (COMMENT.to_string(), "my_description".to_string()), + (LOCATION.to_string(), "my_location".to_string()), + (HMS_DB_OWNER.to_string(), "apache".to_string()), + (HMS_DB_OWNER_TYPE.to_string(), "user".to_string()), + ("key1".to_string(), "value1".to_string()), + ]); + + let db = convert_to_database(&ns, &properties)?; + + assert_eq!(db.name, Some(FastStr::from("my_namespace"))); + assert_eq!(db.description, Some(FastStr::from("my_description"))); + assert_eq!(db.owner_name, Some(FastStr::from("apache"))); + assert_eq!(db.owner_type, Some(PrincipalType::User)); + + if let Some(params) = db.parameters { + assert_eq!(params.get("key1"), Some(&FastStr::from("value1"))); + } + + Ok(()) + } + + #[test] + fn test_convert_to_database_with_default_user() -> Result<()> { + let ns = NamespaceIdent::new("my_namespace".into()); + let properties = HashMap::new(); + + let db = convert_to_database(&ns, &properties)?; + + assert_eq!(db.name, Some(FastStr::from("my_namespace"))); + assert_eq!(db.owner_name, Some(FastStr::from(HMS_DEFAULT_DB_OWNER))); + assert_eq!(db.owner_type, Some(PrincipalType::User)); + + Ok(()) + } + + #[test] + fn test_validate_namespace() { + let valid_ns = Namespace::new(NamespaceIdent::new("ns".to_string())); + let empty_ns = Namespace::new(NamespaceIdent::new("".to_string())); + let hierarchical_ns = Namespace::new( + NamespaceIdent::from_vec(vec!["level1".to_string(), "level2".to_string()]).unwrap(), + ); + + let valid = validate_namespace(valid_ns.name()); + let empty = validate_namespace(empty_ns.name()); + let hierarchical = validate_namespace(hierarchical_ns.name()); + + assert!(valid.is_ok()); + assert!(empty.is_err()); + assert!(hierarchical.is_err()); + } + + #[test] + fn test_format_location_uri() { + let inputs = vec!["iceberg", "is/", "/nice/", "really/nice/", "/"]; + let outputs = vec!["/iceberg", "/is", "/nice", "/really/nice", "/"]; + + 
inputs.into_iter().zip(outputs).for_each(|(inp, out)| { + let location = format_location_uri(inp.to_string()); + assert_eq!(location, out); + }) + } +} From 17cb90467bccf5ef0a42ba9e44e16642b3fbb703 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 12:24:17 +0100 Subject: [PATCH 16/64] include database name in error msg --- crates/catalog/hms/src/utils.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index a8d8f64fa4..b7c934ea57 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -151,7 +151,10 @@ pub fn validate_namespace(namespace: &NamespaceIdent) -> Result { if name.len() != 1 { return Err(Error::new( ErrorKind::DataInvalid, - "Invalid database, hierarchical namespaces are not supported", + format!( + "Invalid database name: {:?}, hierarchical namespaces are not supported", + namespace + ), )); } From 3aef54a94637c014b15ae895dc16a0b9d609c693 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 12:26:19 +0100 Subject: [PATCH 17/64] add pilota to cargo workspace --- Cargo.toml | 1 + crates/catalog/hms/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 22bc8f90fa..c978abc868 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ murmur3 = "0.5.2" once_cell = "1" opendal = "0.45" ordered-float = "4.0.0" +pilota = "0.10.0" pretty_assertions = "1.4.0" port_scanner = "0.1.5" reqwest = { version = "^0.11", features = ["json"] } diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml index ca3177ac3b..475da7be61 100644 --- a/crates/catalog/hms/Cargo.toml +++ b/crates/catalog/hms/Cargo.toml @@ -34,7 +34,7 @@ async-trait = { workspace = true } hive_metastore = { workspace = true } iceberg = { workspace = true } log = { workspace = true } -pilota = "0.10.0" +pilota = { workspace = true } typed-builder = { workspace = true } volo-thrift = { workspace = true } From 1660f3bffc7abde6aebe36e6ee4a73b865b2d826 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 12:29:44 +0100 Subject: [PATCH 18/64] add minio version --- crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml index ec65baf322..85413a8ab8 100644 --- a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml +++ b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml @@ -19,7 +19,7 @@ version: '3.8' services: minio: - image: minio/minio + image: minio/minio:RELEASE.2024-03-07T00-43-48Z expose: - 9000 - 9001 From a33132e2c031f34acc8692b5a5b23388adf6eef9 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 16:30:47 +0100 Subject: [PATCH 19/64] change visibility to pub(crate); return namespace from conversion fn --- crates/catalog/hms/src/catalog.rs | 3 +-- crates/catalog/hms/src/utils.rs | 36 +++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index de9ca57f83..57e3824ce6 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -177,8 +177,7 @@ impl Catalog for HmsCatalog { .await .map_err(from_thrift_error)?; - let properties = properties_from_database(&db); - let ns = Namespace::with_properties(NamespaceIdent::new(name), properties); + 
let ns = convert_to_namespace(&db)?;
 
         Ok(ns)
     }
diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index b7c934ea57..02f32c6585 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -17,7 +17,7 @@
 
 use anyhow::anyhow;
 use hive_metastore::{Database, PrincipalType};
-use iceberg::{Error, ErrorKind, NamespaceIdent, Result};
+use iceberg::{Error, ErrorKind, Namespace, NamespaceIdent, Result};
 use pilota::{AHashMap, FastStr};
 use std::collections::HashMap;
 use std::fmt::Debug;
@@ -55,10 +55,20 @@ pub fn from_io_error(error: io::Error) -> Error {
     .with_source(error)
 }
 
-/// Create and extract properties from `hive_metastore::hms::Database`.
-pub fn properties_from_database(database: &Database) -> HashMap<String, String> {
+/// Returns a `Namespace` by extracting database name and properties
+/// from `hive_metastore::hms::Database`
+pub(crate) fn convert_to_namespace(database: &Database) -> Result<Namespace> {
     let mut properties = HashMap::new();
 
+    let name = if let Some(name) = &database.name {
+        name.to_string()
+    } else {
+        return Err(Error::new(
+            ErrorKind::DataInvalid,
+            "Database name must be specified",
+        ));
+    };
+
     if let Some(description) = &database.description {
         properties.insert(COMMENT.to_string(), description.to_string());
     };
@@ -87,12 +97,15 @@ pub fn properties_from_database(database: &Database) -> HashMap<String, String> {
         });
     };
 
-    properties
+    Ok(Namespace::with_properties(
+        NamespaceIdent::new(name),
+        properties,
+    ))
 }
 
 /// Converts name and properties into `hive_metastore::hms::Database`
 /// after validating the `namespace` and `owner-settings`.
-pub fn convert_to_database(
+pub(crate) fn convert_to_database(
     namespace: &NamespaceIdent,
     properties: &HashMap<String, String>,
 ) -> Result<Database> {
@@ -145,7 +158,7 @@ pub fn convert_to_database(
 }
 
 /// Checks if provided `NamespaceIdent` is valid.
-pub fn validate_namespace(namespace: &NamespaceIdent) -> Result { +pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result { let name = namespace.as_ref(); if name.len() != 1 { @@ -211,8 +224,7 @@ mod tests { use super::*; #[test] - fn test_properties_from_database() -> Result<()> { - let ns = NamespaceIdent::new("my_namespace".into()); + fn test_convert_to_namespace() -> Result<()> { let properties = HashMap::from([ (COMMENT.to_string(), "my_description".to_string()), (LOCATION.to_string(), "/my_location".to_string()), @@ -221,11 +233,13 @@ mod tests { ("key1".to_string(), "value1".to_string()), ]); - let db = convert_to_database(&ns, &properties)?; + let ident = NamespaceIdent::new("my_namespace".into()); + let db = convert_to_database(&ident, &properties)?; - let expected = properties_from_database(&db); + let expected_ns = Namespace::with_properties(ident, properties); + let result_ns = convert_to_namespace(&db)?; - assert_eq!(expected, properties); + assert_eq!(expected_ns, result_ns); Ok(()) } From 153bb6f43467ed6692a31f4c98b777d5515671a1 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 16:35:24 +0100 Subject: [PATCH 20/64] add minio version in rest-catalog docker-compose --- .../rest/testdata/rest_catalog/docker-compose.yaml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml b/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml index 5c101463fe..27c2368b99 100644 --- a/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml +++ b/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml @@ -37,7 +37,7 @@ services: - 8181 minio: - image: minio/minio + image: minio/minio:RELEASE.2024-03-07T00-43-48Z environment: - MINIO_ROOT_USER=admin - MINIO_ROOT_PASSWORD=password @@ -56,10 +56,4 @@ services: - AWS_SECRET_ACCESS_KEY=password - AWS_REGION=us-east-1 entrypoint: > - /bin/sh -c " - until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done; - /usr/bin/mc rm -r --force minio/icebergdata; - /usr/bin/mc mb minio/icebergdata; - /usr/bin/mc policy set public minio/icebergdata; - tail -f /dev/null - " \ No newline at end of file + /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; /usr/bin/mc rm -r --force minio/icebergdata; /usr/bin/mc mb minio/icebergdata; /usr/bin/mc policy set public minio/icebergdata; tail -f /dev/null " From 2b80a4bcdd470f650401a9d2aca8bed47441e937 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 16:54:29 +0100 Subject: [PATCH 21/64] fix: hms test docker infrastructure --- .../hms/testdata/hms_catalog/Dockerfile | 67 +++++++----------- .../hms/testdata/hms_catalog/core-site.xml | 53 ++++++++++++++ .../testdata/hms_catalog/docker-compose.yaml | 24 ++++--- .../hms/testdata/hms_catalog/entrypoint.sh | 32 --------- .../hms/testdata/hms_catalog/hive-site.xml | 70 ------------------- crates/catalog/hms/tests/hms_catalog_test.rs | 5 +- 6 files changed, 91 insertions(+), 160 deletions(-) create mode 100644 crates/catalog/hms/testdata/hms_catalog/core-site.xml delete mode 100755 crates/catalog/hms/testdata/hms_catalog/entrypoint.sh delete mode 100644 crates/catalog/hms/testdata/hms_catalog/hive-site.xml diff --git a/crates/catalog/hms/testdata/hms_catalog/Dockerfile b/crates/catalog/hms/testdata/hms_catalog/Dockerfile index 7c1f862669..ff8c9fae63 100644 --- a/crates/catalog/hms/testdata/hms_catalog/Dockerfile +++ b/crates/catalog/hms/testdata/hms_catalog/Dockerfile @@ -1,53 +1,34 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
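+
+# The rewritten image below is a two-stage build (sketch of the intent): the
+# first stage only downloads the hadoop-aws and AWS SDK bundle jars, which the
+# second stage copies into the upstream apache/hive image so the metastore can
+# reach MinIO via the s3a filesystem configured in core-site.xml.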
-FROM openjdk:8u342-jre
+FROM openjdk:8-jre-slim AS build
 
-ENV HADOOP_VERSION=3.3.5
-ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
-ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+RUN apt-get update -qq && apt-get -qq -y install curl
 
-ENV HIVE_VERSION=3.1.3
-ENV HIVE_HOME=/opt/apache-hive-${HIVE_VERSION}-bin
-ENV PATH=$HIVE_HOME/bin:$PATH
+ENV AWSSDK_VERSION=2.20.18
+ENV HADOOP_VERSION=3.1.0
 
-# Set classpath for S3 Access
-ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-1.12.316.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-3.3.5.jar
+RUN curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.271/aws-java-sdk-bundle-1.11.271.jar -Lo /tmp/aws-java-sdk-bundle-1.11.271.jar
+RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /tmp/hadoop-aws-${HADOOP_VERSION}.jar
 
-WORKDIR /opt
 
-RUN apt-get update && apt-get install -y procps fastjar
+FROM apache/hive:3.1.3
 
-RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
-    tar -xzf hadoop-${HADOOP_VERSION}.tar.gz && \
-    rm hadoop-${HADOOP_VERSION}.tar.gz
+ENV AWSSDK_VERSION=2.20.18
+ENV HADOOP_VERSION=3.1.0
 
-RUN wget https://downloads.apache.org/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz && \
-    tar -xzf apache-hive-${HIVE_VERSION}-bin.tar.gz && \
-    rm apache-hive-${HIVE_VERSION}-bin.tar.gz
-
-RUN cd ${HIVE_HOME}/lib && \
-    wget https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.28/mysql-connector-java-8.0.28.jar
-
-COPY ./hive-site.xml ${HIVE_HOME}/conf/hive-site.xml
-COPY ./entrypoint.sh /entrypoint.sh
-
-RUN chmod +x /entrypoint.sh
-
-EXPOSE 9083
-
-ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
\ No newline at end of file
+COPY --from=build /tmp/hadoop-aws-${HADOOP_VERSION}.jar /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar
+COPY --from=build /tmp/aws-java-sdk-bundle-1.11.271.jar /opt/hive/lib/aws-java-sdk-bundle-1.11.271.jar
+COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
\ No newline at end of file
diff --git a/crates/catalog/hms/testdata/hms_catalog/core-site.xml b/crates/catalog/hms/testdata/hms_catalog/core-site.xml
new file mode 100644
index 0000000000..53789f0f05
--- /dev/null
+++ b/crates/catalog/hms/testdata/hms_catalog/core-site.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>s3a://warehouse/hive</value>
+  </property>
+  <property>
+    <name>fs.s3a.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+  <property>
+    <name>fs.s3a.fast.upload</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.endpoint</name>
+    <value>http://minio:9000</value>
+  </property>
+  <property>
+    <name>fs.s3a.access.key</name>
+    <value>admin</value>
+  </property>
+  <property>
+    <name>fs.s3a.secret.key</name>
+    <value>password</value>
+  </property>
+  <property>
+    <name>fs.s3a.connection.ssl.enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>fs.s3a.path.style.access</name>
+    <value>true</value>
+  </property>
+</configuration>
\ No newline at end of file
diff --git a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml
index 85413a8ab8..c9605868b2 100644
--- a/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml
+++ b/crates/catalog/hms/testdata/hms_catalog/docker-compose.yaml
@@ -29,20 +29,22 @@ services:
       - MINIO_DOMAIN=minio
     command: [ "server", "/data", "--console-address", ":9001" ]
 
-  hive-mysql:
-    image: mysql:5.7
-    expose:
-      - 3306
+  mc:
+    depends_on:
+      - minio
+    image: minio/mc:RELEASE.2024-03-07T00-31-49Z
     environment:
-      - MYSQL_ROOT_PASSWORD=admin
-      - MYSQL_DATABASE=metastore
-      - MYSQL_USER=hive
-      - MYSQL_PASSWORD=hive
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+    entrypoint: >
+      /bin/sh -c " until (/usr/bin/mc config host add minio 
http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done; /usr/bin/mc mb minio/warehouse; /usr/bin/mc policy set public minio/warehouse; tail -f /dev/null " hive-metastore: - image: iceberg-hms + image: iceberg-hive-metastore build: ./ - depends_on: - - hive-mysql expose: - 9083 + environment: + SERVICE_NAME: "metastore" + SERVICE_OPTS: "-Dmetastore.warehouse.dir=s3a://warehouse/hive/" diff --git a/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh b/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh deleted file mode 100755 index f738637813..0000000000 --- a/crates/catalog/hms/testdata/hms_catalog/entrypoint.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -HIVE_VERSION=3.1.3 -HIVE_HOME=/opt/apache-hive-${HIVE_VERSION}-bin - -# Check if schema exists -${HIVE_HOME}/bin/schematool -dbType mysql -info - -if [ $? -eq 1 ]; then - echo "Getting schema info failed. Probably not initialized. Initializing...in 5s" - sleep 5 - ${HIVE_HOME}/bin/schematool -initSchema -dbType mysql -fi - -${HIVE_HOME}/bin/hive --service metastore diff --git a/crates/catalog/hms/testdata/hms_catalog/hive-site.xml b/crates/catalog/hms/testdata/hms_catalog/hive-site.xml deleted file mode 100644 index c2df65cddf..0000000000 --- a/crates/catalog/hms/testdata/hms_catalog/hive-site.xml +++ /dev/null @@ -1,70 +0,0 @@ - - - - - metastore.thrift.uris - thrift://localhost:9083 - Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. 
- - - metastore.task.threads.always - org.apache.hadoop.hive.metastore.events.EventCleanerTask - - - metastore.expression.proxy - org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy - - - javax.jdo.option.ConnectionDriverName - com.mysql.cj.jdbc.Driver - - - javax.jdo.option.ConnectionURL - jdbc:mysql://hive-mysql:3306/metastore - - - javax.jdo.option.ConnectionUserName - hive - - - javax.jdo.option.ConnectionPassword - hive - - - fs.s3a.impl - org.apache.hadoop.fs.s3a.S3AFileSystem - - - fs.s3a.access.key - admin - - - fs.s3a.secret.key - password - - - fs.s3a.endpoint - http://minio:9000 - - - fs.s3a.path.style.access - true - - diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 5628c094a8..bab83a9550 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -122,10 +122,7 @@ async fn test_get_namespace() -> Result<()> { let ns = Namespace::new(NamespaceIdent::new("default".into())); let properties = HashMap::from([ - ( - "location".to_string(), - "file:/user/hive/warehouse".to_string(), - ), + ("location".to_string(), "s3a://warehouse/hive".to_string()), ( "hive.metastore.database.owner-type".to_string(), "Role".to_string(), From bde3c985e4eedd2ee677cee9d8ed3a850cb2a4ee Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 16:54:39 +0100 Subject: [PATCH 22/64] add version to minio/mc --- crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml b/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml index 27c2368b99..0152a22ca0 100644 --- a/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml +++ b/crates/catalog/rest/testdata/rest_catalog/docker-compose.yaml @@ -50,7 +50,7 @@ services: mc: depends_on: - minio - image: minio/mc + image: minio/mc:RELEASE.2024-03-07T00-31-49Z environment: - AWS_ACCESS_KEY_ID=admin - AWS_SECRET_ACCESS_KEY=password From 8375c750efef19d5986bc2e33a9d28c5ebbb693c Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 16:59:57 +0100 Subject: [PATCH 23/64] fix: license header --- crates/catalog/hms/testdata/hms_catalog/core-site.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/catalog/hms/testdata/hms_catalog/core-site.xml b/crates/catalog/hms/testdata/hms_catalog/core-site.xml index 53789f0f05..12220704ea 100644 --- a/crates/catalog/hms/testdata/hms_catalog/core-site.xml +++ b/crates/catalog/hms/testdata/hms_catalog/core-site.xml @@ -1,5 +1,3 @@ - - + + fs.defaultFS From 7dc7adbc4095dcee9ceab9081678e4aa98c8a15c Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 9 Mar 2024 18:56:21 +0100 Subject: [PATCH 24/64] fix: core-site --- crates/catalog/hms/testdata/hms_catalog/core-site.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/catalog/hms/testdata/hms_catalog/core-site.xml b/crates/catalog/hms/testdata/hms_catalog/core-site.xml index 12220704ea..f0583a0bc0 100644 --- a/crates/catalog/hms/testdata/hms_catalog/core-site.xml +++ b/crates/catalog/hms/testdata/hms_catalog/core-site.xml @@ -15,8 +15,6 @@ limitations under the License. 
--> - - fs.defaultFS From fde11387dcde2e1bc8d36047f43cf97c998006bb Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sun, 10 Mar 2024 19:22:21 +0100 Subject: [PATCH 25/64] split utils and errors --- crates/catalog/hms/src/catalog.rs | 3 +++ crates/catalog/hms/src/error.rs | 42 +++++++++++++++++++++++++++++++ crates/catalog/hms/src/lib.rs | 1 + crates/catalog/hms/src/utils.rs | 24 ------------------ 4 files changed, 46 insertions(+), 24 deletions(-) create mode 100644 crates/catalog/hms/src/error.rs diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 57e3824ce6..61ad77a8a3 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +use crate::error::from_io_error; +use crate::error::from_thrift_error; + use super::utils::*; use async_trait::async_trait; use hive_metastore::ThriftHiveMetastoreClient; diff --git a/crates/catalog/hms/src/error.rs b/crates/catalog/hms/src/error.rs new file mode 100644 index 0000000000..1b0ff33df4 --- /dev/null +++ b/crates/catalog/hms/src/error.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use anyhow::anyhow; +use iceberg::{Error, ErrorKind}; +use std::fmt::Debug; +use std::io; + +/// Format a thrift error into iceberg error. +pub fn from_thrift_error(error: volo_thrift::error::ResponseError) -> Error +where + T: Debug, +{ + Error::new( + ErrorKind::Unexpected, + "operation failed for hitting thrift error".to_string(), + ) + .with_source(anyhow!("thrift error: {:?}", error)) +} + +/// Format an io error into iceberg error. +pub fn from_io_error(error: io::Error) -> Error { + Error::new( + ErrorKind::Unexpected, + "operation failed for hitting io error".to_string(), + ) + .with_source(error) +} diff --git a/crates/catalog/hms/src/lib.rs b/crates/catalog/hms/src/lib.rs index b75e74977a..76859bc848 100644 --- a/crates/catalog/hms/src/lib.rs +++ b/crates/catalog/hms/src/lib.rs @@ -22,4 +22,5 @@ mod catalog; pub use catalog::*; +mod error; mod utils; diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 02f32c6585..14adb3ab1d 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -15,13 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use anyhow::anyhow;
 use hive_metastore::{Database, PrincipalType};
 use iceberg::{Error, ErrorKind, Namespace, NamespaceIdent, Result};
 use pilota::{AHashMap, FastStr};
 use std::collections::HashMap;
-use std::fmt::Debug;
-use std::io;
 
 /// hive.metastore.database.owner setting
 pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner";
@@ -34,27 +31,6 @@ pub const COMMENT: &str = "comment";
 /// hive metastore `location` property
 pub const LOCATION: &str = "location";
 
-/// Format a thrift error into iceberg error.
-pub fn from_thrift_error<T>(error: volo_thrift::error::ResponseError<T>) -> Error
-where
-    T: Debug,
-{
-    Error::new(
-        ErrorKind::Unexpected,
-        "operation failed for hitting thrift error".to_string(),
-    )
-    .with_source(anyhow!("thrift error: {:?}", error))
-}
-
-/// Format an io error into iceberg error.
-pub fn from_io_error(error: io::Error) -> Error {
-    Error::new(
-        ErrorKind::Unexpected,
-        "operation failed for hitting io error".to_string(),
-    )
-    .with_source(error)
-}
-
 /// Returns a `Namespace` by extracting database name and properties
 /// from `hive_metastore::hms::Database`
 pub(crate) fn convert_to_namespace(database: &Database) -> Result<Namespace> {

From 666f8563c5bb69bd32cfeab87178af420f4e0b67 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Mon, 11 Mar 2024 13:51:26 +0100
Subject: [PATCH 26/64] add fn get_default_table_location

---
 crates/catalog/hms/src/utils.rs | 67 ++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 14adb3ab1d..61dd46858b 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -18,7 +18,7 @@
 use hive_metastore::{Database, PrincipalType};
 use iceberg::{Error, ErrorKind, Namespace, NamespaceIdent, Result};
 use pilota::{AHashMap, FastStr};
-use std::collections::HashMap;
+use std::{collections::HashMap, fmt::Display};
 
 /// hive.metastore.database.owner setting
 pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner";
@@ -30,6 +30,8 @@ pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type";
 pub const COMMENT: &str = "comment";
 /// hive metastore `location` property
 pub const LOCATION: &str = "location";
+/// hive metastore `warehouse` location property
+pub const WAREHOUSE_LOCATION: &str = "warehouse";
 
 /// Returns a `Namespace` by extracting database name and properties
 /// from `hive_metastore::hms::Database`
@@ -159,6 +161,24 @@ pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result<String> {
     Ok(name)
 }
 
+/// Gets default table location from `Namespace` properties
+pub(crate) fn get_default_table_location(
+    namespace: &Namespace,
+    table_name: impl AsRef<str> + Display,
+) -> Result<String> {
+    let properties = namespace.properties();
+    properties
+        .get(LOCATION)
+        .or_else(|| properties.get(WAREHOUSE_LOCATION))
+        .map(|location| format!("{}/{}", location, table_name))
+        .ok_or_else(|| {
+            Error::new(
+                ErrorKind::DataInvalid,
+                "No default path is set, please specify a location when creating a table",
+            )
+        })
+}
+
 /// Formats location_uri by e.g. removing trailing slashes.
fn format_location_uri(location: String) -> String { let mut location = location; @@ -199,6 +219,51 @@ mod tests { use super::*; + #[test] + fn test_get_default_table_location() -> Result<()> { + let properties = HashMap::from([(LOCATION.to_string(), "db_location".to_string())]); + + let namespace = + Namespace::with_properties(NamespaceIdent::new("default".into()), properties); + let table_name = "my_table"; + + let expected = "db_location/my_table"; + let result = get_default_table_location(&namespace, table_name)?; + + assert_eq!(expected, result); + + Ok(()) + } + + #[test] + fn test_get_default_table_location_warehouse() -> Result<()> { + let properties = HashMap::from([( + WAREHOUSE_LOCATION.to_string(), + "warehouse_location".to_string(), + )]); + + let namespace = + Namespace::with_properties(NamespaceIdent::new("default".into()), properties); + let table_name = "my_table"; + + let expected = "warehouse_location/my_table"; + let result = get_default_table_location(&namespace, table_name)?; + + assert_eq!(expected, result); + + Ok(()) + } + + #[test] + fn test_get_default_table_location_missing() { + let namespace = Namespace::new(NamespaceIdent::new("default".into())); + let table_name = "my_table"; + + let result = get_default_table_location(&namespace, table_name); + + assert!(result.is_err()); + } + #[test] fn test_convert_to_namespace() -> Result<()> { let properties = HashMap::from([ From 71c447ad5b50308931f9a1684cfd79aa19116298 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Mon, 11 Mar 2024 14:07:13 +0100 Subject: [PATCH 27/64] add fn get_metadata_location --- crates/catalog/hms/src/utils.rs | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 61dd46858b..8aaa040843 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -19,6 +19,7 @@ use hive_metastore::{Database, PrincipalType}; use iceberg::{Error, ErrorKind, Namespace, NamespaceIdent, Result}; use pilota::{AHashMap, FastStr}; use std::{collections::HashMap, fmt::Display}; +use uuid::Uuid; /// hive.metastore.database.owner setting pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; @@ -179,6 +180,27 @@ pub(crate) fn get_default_table_location( }) } +pub(crate) fn get_metadata_location( + location: impl AsRef + Display, + version: i32, +) -> Result { + if version < 0 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Table metadata version: '{}' must be a non-negative integer", + version + ), + )); + }; + + let version = format!("{:0>5}", version); + let id = Uuid::new_v4(); + let metadata_location = format!("{}/metadata/{}-{}.metadata.json", location, version, id); + + Ok(metadata_location) +} + /// Formats location_uri by e.g. removing trailing slashes. 
fn format_location_uri(location: String) -> String { let mut location = location; @@ -218,6 +240,21 @@ mod tests { use iceberg::{Namespace, NamespaceIdent}; use super::*; + #[test] + fn test_get_metadata_location() -> Result<()> { + let location = "my_base_location"; + let valid_version = 0; + let invalid_version = -1; + + let valid_result = get_metadata_location(location, valid_version)?; + let invalid_result = get_metadata_location(location, invalid_version); + + assert!(valid_result.starts_with("my_base_location/metadata/00000-")); + assert!(valid_result.ends_with(".metadata.json")); + assert!(invalid_result.is_err()); + + Ok(()) + } #[test] fn test_get_default_table_location() -> Result<()> { From 97d71ccbe6babe639be70471cb9bb6bdc2dc2300 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Mon, 11 Mar 2024 14:16:12 +0100 Subject: [PATCH 28/64] add docs --- crates/catalog/hms/src/utils.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 8aaa040843..fdda05977c 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -162,7 +162,7 @@ pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result { Ok(name) } -/// Gets default table location from `Namespace` properties +/// Get default table location from `Namespace` properties pub(crate) fn get_default_table_location( namespace: &Namespace, table_name: impl AsRef + Display, @@ -180,6 +180,7 @@ pub(crate) fn get_default_table_location( }) } +/// Get metadata location pub(crate) fn get_metadata_location( location: impl AsRef + Display, version: i32, From c64ffa6413eb470a5c8e1d230d2c63084385b8d1 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Tue, 12 Mar 2024 13:32:06 +0100 Subject: [PATCH 29/64] add HiveSchemaBuilder --- crates/catalog/hms/Cargo.toml | 4 +- crates/catalog/hms/src/lib.rs | 1 + crates/catalog/hms/src/schema.rs | 309 +++++++++++++++++++++++++++++++ 3 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 crates/catalog/hms/src/schema.rs diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml index 475da7be61..626881980c 100644 --- a/crates/catalog/hms/Cargo.toml +++ b/crates/catalog/hms/Cargo.toml @@ -35,10 +35,12 @@ hive_metastore = { workspace = true } iceberg = { workspace = true } log = { workspace = true } pilota = { workspace = true } +serde_json = { workspace = true } typed-builder = { workspace = true } +uuid = { workspace = true } volo-thrift = { workspace = true } +tokio = { workspace = true } [dev-dependencies] iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } port_scanner = { workspace = true } -tokio = { workspace = true } diff --git a/crates/catalog/hms/src/lib.rs b/crates/catalog/hms/src/lib.rs index 76859bc848..db0034d46b 100644 --- a/crates/catalog/hms/src/lib.rs +++ b/crates/catalog/hms/src/lib.rs @@ -23,4 +23,5 @@ mod catalog; pub use catalog::*; mod error; +mod schema; mod utils; diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs new file mode 100644 index 0000000000..4c8b4e04ac --- /dev/null +++ b/crates/catalog/hms/src/schema.rs @@ -0,0 +1,309 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use hive_metastore::FieldSchema; +use iceberg::spec::{NestedFieldRef, PrimitiveType, SchemaVisitor}; + +pub(crate) type HiveSchema = Vec<FieldSchema>; + +#[derive(Debug)] +pub(crate) struct HiveSchemaBuilder { + cols: HiveSchema, + context: Vec<NestedFieldRef>, +} + +impl HiveSchemaBuilder { + /// Create new `HiveSchemaBuilder` + pub fn new() -> Self { + HiveSchemaBuilder { + cols: Vec::new(), + context: Vec::new(), + } + } + + /// Get converted `HiveSchema` + pub fn schema(&self) -> HiveSchema { + self.cols.clone() + } + + /// Check if visitor is in `StructType` + fn is_inside_struct(&self) -> bool { + !self.context.is_empty() + } +} + +impl SchemaVisitor for HiveSchemaBuilder { + type T = String; + + fn schema( + &mut self, + _schema: &iceberg::spec::Schema, + value: Self::T, + ) -> iceberg::Result<Self::T> { + Ok(value) + } + + fn before_struct_field( + &mut self, + field: &iceberg::spec::NestedFieldRef, + ) -> iceberg::Result<()> { + self.context.push(field.clone()); + Ok(()) + } + + fn r#struct( + &mut self, + r#_struct: &iceberg::spec::StructType, + results: Vec<Self::T>, + ) -> iceberg::Result<Self::T> { + Ok(format!("struct<{}>", results.join(", "))) + } + + fn after_struct_field( + &mut self, + _field: &iceberg::spec::NestedFieldRef, + ) -> iceberg::Result<()> { + self.context.pop(); + Ok(()) + } + + fn field( + &mut self, + field: &iceberg::spec::NestedFieldRef, + value: Self::T, + ) -> iceberg::Result<Self::T> { + if self.is_inside_struct() { + return Ok(format!("{}:{}", field.name, value)); + } + + self.cols.push(FieldSchema { + name: Some(field.name.clone().into()), + r#type: Some(value.clone().into()), + comment: field.doc.clone().map(|doc| doc.into()), + }); + + Ok(value) + } + + fn list( + &mut self, + _list: &iceberg::spec::ListType, + value: Self::T, + ) -> iceberg::Result<Self::T> { + Ok(format!("array<{}>", value)) + } + + fn map( + &mut self, + _map: &iceberg::spec::MapType, + key_value: Self::T, + value: Self::T, + ) -> iceberg::Result<Self::T> { + Ok(format!("map<{},{}>", key_value, value)) + } + + fn primitive(&mut self, p: &iceberg::spec::PrimitiveType) -> iceberg::Result<Self::T> { + let hive_type = match p { + PrimitiveType::Boolean => "boolean".to_string(), + PrimitiveType::Int => "int".to_string(), + PrimitiveType::Long => "bigint".to_string(), + PrimitiveType::Float => "float".to_string(), + PrimitiveType::Double => "double".to_string(), + PrimitiveType::Date => "date".to_string(), + PrimitiveType::Timestamp | PrimitiveType::Timestamptz => "timestamp".to_string(), + PrimitiveType::Time | PrimitiveType::String | PrimitiveType::Uuid => { + "string".to_string() + } + PrimitiveType::Binary | PrimitiveType::Fixed(_) => "binary".to_string(), + PrimitiveType::Decimal { precision, scale } => { + format!("decimal({},{})", precision, scale) + } + }; + + Ok(hive_type) + } +} + +#[cfg(test)] +mod tests { + use iceberg::{ + spec::{visit_schema, ListType, MapType, NestedField, Schema, StructType, Type}, + Result, + }; + + use super::*; + + #[test] + fn test_schema_with_nested_maps() -> Result<()> { + let mut visitor = HiveSchemaBuilder::new(); + + let schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![NestedField::required( + 1, + "quux", + Type::Map(MapType { + key_field: NestedField::map_key_element( + 2, + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::map_value_element( + 3, + Type::Map(MapType { + key_field: NestedField::map_key_element( + 4, + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::map_value_element( + 5, + Type::Primitive(PrimitiveType::Int), + true, + ) + .into(), + }), + true, + ) + .into(), + }), + ) + .into()]) + .build()?; + + let expected = vec![FieldSchema { + name: Some("quux".into()), + r#type: Some("map<string,map<string,int>>".into()), + comment: None, + }]; + + visit_schema(&schema, &mut visitor)?; + + assert_eq!(visitor.schema(), expected); + + Ok(()) + } + + #[test] + fn test_schema_with_struct_inside_list() -> Result<()> { + let mut visitor = HiveSchemaBuilder::new(); + + let schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![NestedField::required( + 1, + "location", + Type::List(ListType { + element_field: NestedField::list_element( + 2, + Type::Struct(StructType::new(vec![ + NestedField::optional( + 3, + "latitude", + Type::Primitive(PrimitiveType::Float), + ) + .into(), + NestedField::optional( + 4, + "longitude", + Type::Primitive(PrimitiveType::Float), + ) + .into(), + ])), + true, + ) + .into(), + }), + ) + .into()]) + .build()?; + + let expected = vec![FieldSchema { + name: Some("location".into()), + r#type: Some("array<struct<latitude:float, longitude:float>>".into()), + comment: None, + }]; + + visit_schema(&schema, &mut visitor)?; + + assert_eq!(visitor.schema(), expected); + + Ok(()) + } + + #[test] + fn test_schema_with_structs() -> Result<()> { + let mut visitor = HiveSchemaBuilder::new(); + + let schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![NestedField::required( + 1, + "person", + Type::Struct(StructType::new(vec![ + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(3, "age", Type::Primitive(PrimitiveType::Int)).into(), + ])), + ) + .into()]) + .build()?; + + let expected = vec![FieldSchema { + name: Some("person".into()), + r#type: Some("struct<name:string, age:int>".into()), + comment: None, + }]; + + visit_schema(&schema, &mut visitor)?; + + assert_eq!(visitor.schema(), expected); + + Ok(()) + } + + #[test] + fn test_schema_with_simple_fields() -> Result<()> { + let mut visitor = HiveSchemaBuilder::new(); + + let schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "foo", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build()?; + + visit_schema(&schema, &mut visitor)?; + + let expected = vec![ + FieldSchema { + name: Some("foo".into()), + r#type: Some("int".into()), + comment: None, + }, + FieldSchema { + name: Some("bar".into()), + r#type: Some("string".into()), + comment: None, + }, + ]; + + assert_eq!(visitor.schema(), expected); + + Ok(()) + } +} From 1da7f9538307e04a54a8cc03b45aa46e2566d341 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Tue, 12 Mar 2024 15:34:54 +0100 Subject: [PATCH 30/64] add schema to HiveSchemaBuilder --- crates/catalog/hms/src/schema.rs | 45 ++++++++++++++------------------ 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index 4c8b4e04ac..142c0172d2
100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -16,28 +16,32 @@ // under the License. use hive_metastore::FieldSchema; -use iceberg::spec::{NestedFieldRef, PrimitiveType, SchemaVisitor}; +use iceberg::spec::{visit_schema, NestedFieldRef, PrimitiveType, Schema, SchemaVisitor}; +use iceberg::Result; pub(crate) type HiveSchema = Vec; #[derive(Debug)] pub(crate) struct HiveSchemaBuilder { + schema: Schema, cols: HiveSchema, context: Vec, } impl HiveSchemaBuilder { /// Create new `HiveSchemaBuilder` - pub fn new() -> Self { + pub fn new(schema: Schema) -> Self { HiveSchemaBuilder { + schema, cols: Vec::new(), context: Vec::new(), } } /// Get converted `HiveSchema` - pub fn schema(&self) -> HiveSchema { - self.cols.clone() + pub fn schema(&mut self) -> Result { + visit_schema(&self.schema.clone(), self)?; + Ok(self.cols.clone()) } /// Check if visitor is in `StructType` @@ -141,7 +145,7 @@ impl SchemaVisitor for HiveSchemaBuilder { #[cfg(test)] mod tests { use iceberg::{ - spec::{visit_schema, ListType, MapType, NestedField, Schema, StructType, Type}, + spec::{ListType, MapType, NestedField, Schema, StructType, Type}, Result, }; @@ -149,8 +153,6 @@ mod tests { #[test] fn test_schema_with_nested_maps() -> Result<()> { - let mut visitor = HiveSchemaBuilder::new(); - let schema = Schema::builder() .with_schema_id(1) .with_fields(vec![NestedField::required( @@ -185,23 +187,21 @@ mod tests { .into()]) .build()?; + let result = HiveSchemaBuilder::new(schema).schema()?; + let expected = vec![FieldSchema { name: Some("quux".into()), r#type: Some("map>".into()), comment: None, }]; - visit_schema(&schema, &mut visitor)?; - - assert_eq!(visitor.schema(), expected); + assert_eq!(result, expected); Ok(()) } #[test] fn test_schema_with_struct_inside_list() -> Result<()> { - let mut visitor = HiveSchemaBuilder::new(); - let schema = Schema::builder() .with_schema_id(1) .with_fields(vec![NestedField::required( @@ -232,23 +232,21 @@ mod tests { .into()]) .build()?; + let result = HiveSchemaBuilder::new(schema).schema()?; + let expected = vec![FieldSchema { name: Some("location".into()), r#type: Some("array>".into()), comment: None, }]; - visit_schema(&schema, &mut visitor)?; - - assert_eq!(visitor.schema(), expected); + assert_eq!(result, expected); Ok(()) } #[test] fn test_schema_with_structs() -> Result<()> { - let mut visitor = HiveSchemaBuilder::new(); - let schema = Schema::builder() .with_schema_id(1) .with_fields(vec![NestedField::required( @@ -262,23 +260,21 @@ mod tests { .into()]) .build()?; + let result = HiveSchemaBuilder::new(schema).schema()?; + let expected = vec![FieldSchema { name: Some("person".into()), r#type: Some("struct".into()), comment: None, }]; - visit_schema(&schema, &mut visitor)?; - - assert_eq!(visitor.schema(), expected); + assert_eq!(result, expected); Ok(()) } #[test] fn test_schema_with_simple_fields() -> Result<()> { - let mut visitor = HiveSchemaBuilder::new(); - let schema = Schema::builder() .with_schema_id(1) .with_fields(vec![ @@ -286,8 +282,7 @@ mod tests { NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), ]) .build()?; - - visit_schema(&schema, &mut visitor)?; + let result = HiveSchemaBuilder::new(schema).schema()?; let expected = vec![ FieldSchema { @@ -302,7 +297,7 @@ mod tests { }, ]; - assert_eq!(visitor.schema(), expected); + assert_eq!(result, expected); Ok(()) } From f99ed2b59be29c9cecbdbd8221cf7352b3fe8a41 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Tue, 12 Mar 2024 15:54:21 +0100 
Subject: [PATCH 31/64] add convert_to_hive_table --- crates/catalog/hms/Cargo.toml | 1 + crates/catalog/hms/src/utils.rs | 137 +++++++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 3 deletions(-) diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml index 626881980c..c70d11e35f 100644 --- a/crates/catalog/hms/Cargo.toml +++ b/crates/catalog/hms/Cargo.toml @@ -31,6 +31,7 @@ keywords = ["iceberg", "hive", "catalog"] [dependencies] anyhow = { workspace = true } async-trait = { workspace = true } +chrono = { workspace = true } hive_metastore = { workspace = true } iceberg = { workspace = true } log = { workspace = true } diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index fdda05977c..9a17410d04 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -15,12 +15,15 @@ // specific language governing permissions and limitations // under the License. -use hive_metastore::{Database, PrincipalType}; -use iceberg::{Error, ErrorKind, Namespace, NamespaceIdent, Result}; +use chrono::Utc; +use hive_metastore::{Database, PrincipalType, SerDeInfo, StorageDescriptor}; +use iceberg::{spec::Schema, Error, ErrorKind, Namespace, NamespaceIdent, Result}; use pilota::{AHashMap, FastStr}; use std::{collections::HashMap, fmt::Display}; use uuid::Uuid; +use crate::schema::HiveSchemaBuilder; + /// hive.metastore.database.owner setting pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; /// hive.metastore.database.owner default setting @@ -31,8 +34,22 @@ pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type"; pub const COMMENT: &str = "comment"; /// hive metatore `location` property pub const LOCATION: &str = "location"; +/// hive metatore `metadat_location` property +pub const METADATA_LOCATION: &str = "metadata_location"; +/// hive metatore `external` property +pub const EXTERNAL: &str = "EXTERNAL"; +/// hive metatore `external_table` property +pub const EXTERNAL_TABLE: &str = "EXTERNAL_TABLE"; +/// hive metatore `table_type` property +pub const TABLE_TYPE: &str = "table_type"; /// hive metatore `warehouse` location property pub const WAREHOUSE_LOCATION: &str = "warehouse"; +/// hive metatore `SerDeInfo` serialization_lib parameter +pub const SERIALIZATION_LIB: &str = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"; +/// hive metatore input format +pub const INPUT_FORMAT: &str = "org.apache.hadoop.mapred.FileInputFormat"; +/// hive metatore output format +pub const OUTPUT_FORMAT: &str = "org.apache.hadoop.mapred.FileOutputFormat"; /// Returns a `Namespace` by extracting database name and properties /// from `hive_metastore::hms::Database` @@ -136,6 +153,57 @@ pub(crate) fn convert_to_database( Ok(db) } +pub(crate) fn convert_to_hive_table( + db_name: String, + schema: Schema, + table_name: String, + location: String, + metadata_location: String, + properties: &HashMap, +) -> Result { + let serde_info = SerDeInfo { + serialization_lib: Some(SERIALIZATION_LIB.into()), + ..Default::default() + }; + + let hive_schema = HiveSchemaBuilder::new(schema).schema()?; + + let storage_descriptor = StorageDescriptor { + location: Some(location.into()), + cols: Some(hive_schema), + input_format: Some(INPUT_FORMAT.into()), + output_format: Some(OUTPUT_FORMAT.into()), + serde_info: Some(serde_info), + ..Default::default() + }; + + let parameters = AHashMap::from([ + (FastStr::from(EXTERNAL), FastStr::from("TRUE")), + (FastStr::from(TABLE_TYPE), FastStr::from("ICEBERG")), + ( + 
FastStr::from(METADATA_LOCATION), + FastStr::from(metadata_location), + ), + ]); + + let current_time_ms = get_current_time()?; + let owner = properties + .get("owner") + .map_or(HMS_DEFAULT_DB_OWNER.to_string(), |v| v.into()); + + Ok(hive_metastore::Table { + table_name: Some(table_name.into()), + db_name: Some(db_name.into()), + table_type: Some(EXTERNAL_TABLE.into()), + owner: Some(owner.into()), + create_time: Some(current_time_ms), + last_access_time: Some(current_time_ms), + sd: Some(storage_descriptor), + parameters: Some(parameters), + ..Default::default() + }) +} + /// Checks if provided `NamespaceIdent` is valid. pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result { let name = namespace.as_ref(); @@ -236,11 +304,74 @@ fn validate_owner_settings(properties: &HashMap) -> Result<()> { Ok(()) } +fn get_current_time() -> Result { + let now = Utc::now(); + now.timestamp().try_into().map_err(|_| { + Error::new( + ErrorKind::Unexpected, + "Current time is out of range for i32", + ) + }) +} + #[cfg(test)] mod tests { - use iceberg::{Namespace, NamespaceIdent}; + use iceberg::{ + spec::{NestedField, PrimitiveType, Type}, + Namespace, NamespaceIdent, + }; use super::*; + + #[test] + fn test_convert_to_hive_table() -> Result<()> { + let db_name = "my_db".to_string(); + let table_name = "my_table".to_string(); + let location = "s3a://warehouse/hms".to_string(); + let metadata_location = get_metadata_location(location.clone(), 0)?; + let properties = HashMap::new(); + let schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "foo", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build()?; + + let result = convert_to_hive_table( + db_name.clone(), + schema.clone(), + table_name.clone(), + location.clone(), + metadata_location, + &properties, + )?; + + let serde_info = SerDeInfo { + serialization_lib: Some(SERIALIZATION_LIB.into()), + ..Default::default() + }; + + let hive_schema = HiveSchemaBuilder::new(schema).schema()?; + + let sd = StorageDescriptor { + location: Some(location.into()), + cols: Some(hive_schema), + input_format: Some(INPUT_FORMAT.into()), + output_format: Some(OUTPUT_FORMAT.into()), + serde_info: Some(serde_info), + ..Default::default() + }; + + assert_eq!(result.db_name, Some(db_name.into())); + assert_eq!(result.table_name, Some(table_name.into())); + assert_eq!(result.table_type, Some(EXTERNAL_TABLE.into())); + assert_eq!(result.owner, Some(HMS_DEFAULT_DB_OWNER.into())); + assert_eq!(result.sd, Some(sd)); + + Ok(()) + } + #[test] fn test_get_metadata_location() -> Result<()> { let location = "my_base_location"; From 1db7021f90848a1dd1ebe90dfd41cbf7f39f084f Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Tue, 12 Mar 2024 15:55:13 +0100 Subject: [PATCH 32/64] cargo sort --- crates/catalog/hms/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/catalog/hms/Cargo.toml b/crates/catalog/hms/Cargo.toml index c70d11e35f..5a032215aa 100644 --- a/crates/catalog/hms/Cargo.toml +++ b/crates/catalog/hms/Cargo.toml @@ -37,10 +37,10 @@ iceberg = { workspace = true } log = { workspace = true } pilota = { workspace = true } serde_json = { workspace = true } +tokio = { workspace = true } typed-builder = { workspace = true } uuid = { workspace = true } volo-thrift = { workspace = true } -tokio = { workspace = true } [dev-dependencies] iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } From 
987704ea3edb42060639cd112303c476352ca912 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Tue, 12 Mar 2024 16:15:52 +0100 Subject: [PATCH 33/64] implement table_ops without TableMetadataBuilder --- crates/catalog/hms/src/catalog.rs | 192 ++++++++++++++++++++++++++++-- 1 file changed, 180 insertions(+), 12 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 61ad77a8a3..a3f8c04194 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -23,6 +23,8 @@ use async_trait::async_trait; use hive_metastore::ThriftHiveMetastoreClient; use hive_metastore::ThriftHiveMetastoreClientBuilder; use hive_metastore::ThriftHiveMetastoreGetDatabaseException; +use hive_metastore::ThriftHiveMetastoreGetTableException; +use iceberg::io::FileIO; use iceberg::table::Table; use iceberg::{ Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, @@ -31,6 +33,7 @@ use iceberg::{ use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::net::ToSocketAddrs; +use tokio::io::AsyncReadExt; use typed_builder::TypedBuilder; use volo_thrift::ResponseError; @@ -272,6 +275,15 @@ impl Catalog for HmsCatalog { Ok(()) } + /// Asynchronously lists all tables within a specified namespace. + /// + /// # Returns + /// + /// A `Result<Vec<TableIdent>>`, which is: + /// - `Ok(vec![...])` containing a vector of `TableIdent` instances, each + /// representing a table within the specified namespace. + /// - `Err(...)` if an error occurs during namespace validation or while + /// querying the database. async fn list_tables(&self, namespace: &NamespaceIdent) -> Result<Vec<TableIdent>> { let name = validate_namespace(namespace)?; @@ -292,29 +304,185 @@ impl Catalog for HmsCatalog { async fn create_table( &self, - _namespace: &NamespaceIdent, - _creation: TableCreation, + namespace: &NamespaceIdent, + creation: TableCreation, ) -> Result<Table> { - todo!() + let db_name = validate_namespace(namespace)?; + let table_name = &creation.name; + + let location = match &creation.location { + Some(location) => location.clone(), + None => { + let ns = self.get_namespace(namespace).await?; + get_default_table_location(&ns, table_name)? + } + }; + + let metadata_location = get_metadata_location(location.clone(), 0)?; + let _file_io = FileIO::from_path(&metadata_location)?.build()?; + + // TODO: Create `TableMetadata` and write to storage + // blocked by: https://github.com/apache/iceberg-rust/issues/250 + + let hive_table = convert_to_hive_table( + db_name, + creation.schema.clone(), + table_name.clone(), + location, + metadata_location, + &creation.properties, + )?; + + self.client + .0 + .create_table(hive_table) + .await + .map_err(from_thrift_error)?; + + // TODO: Create & return Result<Table>
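For context on what `convert_to_hive_table` registers here: the table lands in HMS as an EXTERNAL table whose parameters map carries the Iceberg-specific pointers. A sketch of those parameters (keys and values as defined in utils.rs; the metadata path shown is a placeholder):

    use pilota::{AHashMap, FastStr};

    fn example_parameters() -> AHashMap<FastStr, FastStr> {
        AHashMap::from([
            // Marks the table as externally managed, so HMS does not own the data.
            (FastStr::from("EXTERNAL"), FastStr::from("TRUE")),
            // Lets engines recognize this as an Iceberg table.
            (FastStr::from("table_type"), FastStr::from("ICEBERG")),
            // Points readers at the current *.metadata.json file (placeholder path).
            (
                FastStr::from("metadata_location"),
                FastStr::from("s3a://warehouse/hms/metadata/00000-aaaa.metadata.json"),
            ),
        ])
    }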
+ Err(Error::new( + ErrorKind::FeatureUnsupported, + "Table creation is not supported yet", + )) } - async fn load_table(&self, _table: &TableIdent) -> Result<Table> { - todo!() + async fn load_table(&self, table: &TableIdent) -> Result<Table> { + let dbname = validate_namespace(table.namespace())?; + let tbl_name = table.name.clone(); + + let hive_table = self + .client + .0 + .get_table(dbname.into(), tbl_name.into()) + .await + .map_err(from_thrift_error)?; + + // TODO: extract into utils + let metadata_location = match hive_table.parameters { + Some(properties) => match properties.get(METADATA_LOCATION) { + Some(location) => location.to_string(), + None => { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("No '{}' set on table", METADATA_LOCATION), + )) + } + }, + None => { + return Err(Error::new( + ErrorKind::DataInvalid, + "No 'parameters' set on table. Location of metadata is undefined", + )) + } + }; + + let file_io = FileIO::from_path(&metadata_location)?.build()?; + + // TODO: extract into utils + let mut reader = file_io.new_input(&metadata_location)?.reader().await?; + let mut metadata_str = String::new(); + reader.read_to_string(&mut metadata_str).await?; + + let _metadata = serde_json::from_str(&metadata_str)?; + + // TODO: Create `TableMetadata` + // blocked by: https://github.com/apache/iceberg-rust/issues/250 + // Create and return Result<Table>
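The shape this TODO eventually takes (a later patch in this series lands exactly this) is a plain serde round trip; a rough sketch, with `metadata_str` standing in for the JSON read via FileIO above:

    use iceberg::spec::TableMetadata;
    use iceberg::Result;

    // Sketch only: turn the metadata JSON read from `metadata_location`
    // into a typed TableMetadata.
    fn parse_metadata(metadata_str: &str) -> Result<TableMetadata> {
        // TableMetadata implements serde::Deserialize, so the stored
        // *.metadata.json maps directly onto the struct.
        let metadata = serde_json::from_str::<TableMetadata>(metadata_str)?;
        Ok(metadata)
    }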
+ Err(Error::new( + ErrorKind::FeatureUnsupported, + "Loading a table is not supported yet", + )) } - async fn drop_table(&self, _table: &TableIdent) -> Result<()> { - todo!() + /// Asynchronously drops a table from the database. + /// + /// # Errors + /// Returns an error if: + /// - The namespace provided in `table` cannot be validated + /// or does not exist. + /// - The underlying database client encounters an error while + /// attempting to drop the table. This includes scenarios where + /// the table does not exist. + /// - Any network or communication error occurs with the database backend. + async fn drop_table(&self, table: &TableIdent) -> Result<()> { + let dbname = validate_namespace(table.namespace())?; + + self.client + .0 + .drop_table(dbname.into(), table.name.clone().into(), false) + .await + .map_err(from_thrift_error)?; + + Ok(()) } - async fn stat_table(&self, _table: &TableIdent) -> Result { - todo!() + /// Asynchronously checks the existence of a specified table + /// in the database. + /// + /// # Returns + /// - `Ok(true)` if the table exists in the database. + /// - `Ok(false)` if the table does not exist in the database. + /// - `Err(...)` if an error occurs during the process + async fn stat_table(&self, table: &TableIdent) -> Result { + let dbname = validate_namespace(table.namespace())?; + let tbl_name = table.name.clone(); + + let resp = self + .client + .0 + .get_table(dbname.into(), tbl_name.into()) + .await; + + match resp { + Ok(_) => Ok(true), + Err(err) => { + if let ResponseError::UserException(ThriftHiveMetastoreGetTableException::O2(_)) = + &err + { + Ok(false) + } else { + Err(from_thrift_error(err)) + } + } + } } - async fn rename_table(&self, _src: &TableIdent, _dest: &TableIdent) -> Result<()> { - todo!() + /// Asynchronously renames a table within the database + /// or moves it between namespaces (databases). + /// + /// # Returns + /// - `Ok(())` on successful rename or move of the table. + /// - `Err(...)` if an error occurs during the process. + async fn rename_table(&self, src: &TableIdent, dest: &TableIdent) -> Result<()> { + let src_dbname = validate_namespace(src.namespace())?; + let dest_dbname = validate_namespace(dest.namespace())?; + + let src_tbl_name = src.name.clone(); + let dest_tbl_name = dest.name.clone(); + + let mut tbl = self + .client + .0 + .get_table(src_dbname.clone().into(), src_tbl_name.clone().into()) + .await + .map_err(from_thrift_error)?; + + tbl.db_name = Some(dest_dbname.into()); + tbl.table_name = Some(dest_tbl_name.into()); + + self.client + .0 + .alter_table(src_dbname.into(), src_tbl_name.into(), tbl) + .await + .map_err(from_thrift_error)?; + + Ok(()) } async fn update_table(&self, _commit: TableCommit) -> Result
{ - todo!() + Err(Error::new( + ErrorKind::FeatureUnsupported, + "Updating a table is not supported yet", + )) } } From aa98865835eb649521affd99eac3bac43b5c8f73 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Wed, 13 Mar 2024 11:16:49 +0100 Subject: [PATCH 34/64] refactor: HiveSchema fn from_iceberg --- crates/catalog/hms/src/schema.rs | 18 ++++++++---------- crates/catalog/hms/src/utils.rs | 8 ++++---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index 142c0172d2..3f83e703a6 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -23,24 +23,22 @@ pub(crate) type HiveSchema = Vec; #[derive(Debug)] pub(crate) struct HiveSchemaBuilder { - schema: Schema, cols: HiveSchema, context: Vec, } impl HiveSchemaBuilder { /// Create new `HiveSchemaBuilder` - pub fn new(schema: Schema) -> Self { + pub fn new() -> Self { HiveSchemaBuilder { - schema, cols: Vec::new(), context: Vec::new(), } } - /// Get converted `HiveSchema` - pub fn schema(&mut self) -> Result { - visit_schema(&self.schema.clone(), self)?; + /// Convert `Schema` into `HiveSchema` by applying the `SchemaVisitor` + pub fn from_iceberg(&mut self, schema: &Schema) -> Result { + visit_schema(schema, self)?; Ok(self.cols.clone()) } @@ -187,7 +185,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::new(schema).schema()?; + let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; let expected = vec![FieldSchema { name: Some("quux".into()), @@ -232,7 +230,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::new(schema).schema()?; + let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; let expected = vec![FieldSchema { name: Some("location".into()), @@ -260,7 +258,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::new(schema).schema()?; + let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; let expected = vec![FieldSchema { name: Some("person".into()), @@ -282,7 +280,7 @@ mod tests { NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), ]) .build()?; - let result = HiveSchemaBuilder::new(schema).schema()?; + let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; let expected = vec![ FieldSchema { diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 9a17410d04..a5219a7230 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -155,7 +155,7 @@ pub(crate) fn convert_to_database( pub(crate) fn convert_to_hive_table( db_name: String, - schema: Schema, + schema: &Schema, table_name: String, location: String, metadata_location: String, @@ -166,7 +166,7 @@ pub(crate) fn convert_to_hive_table( ..Default::default() }; - let hive_schema = HiveSchemaBuilder::new(schema).schema()?; + let hive_schema = HiveSchemaBuilder::new().from_iceberg(&schema)?; let storage_descriptor = StorageDescriptor { location: Some(location.into()), @@ -340,7 +340,7 @@ mod tests { let result = convert_to_hive_table( db_name.clone(), - schema.clone(), + &schema, table_name.clone(), location.clone(), metadata_location, @@ -352,7 +352,7 @@ mod tests { ..Default::default() }; - let hive_schema = HiveSchemaBuilder::new(schema).schema()?; + let hive_schema = HiveSchemaBuilder::new().from_iceberg(&schema)?; let sd = StorageDescriptor { location: Some(location.into()), From 1d1d57773d54ee9490c0afa539815d592835a785 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Wed, 13 Mar 2024 14:28:14 
+0100 Subject: [PATCH 35/64] prepare table creation without metadata --- crates/catalog/hms/src/catalog.rs | 43 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index a3f8c04194..5193d44e35 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -25,6 +25,7 @@ use hive_metastore::ThriftHiveMetastoreClientBuilder; use hive_metastore::ThriftHiveMetastoreGetDatabaseException; use hive_metastore::ThriftHiveMetastoreGetTableException; use iceberg::io::FileIO; +use iceberg::spec::TableMetadata; use iceberg::table::Table; use iceberg::{ Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, @@ -326,7 +327,7 @@ impl Catalog for HmsCatalog { let hive_table = convert_to_hive_table( db_name, - creation.schema.clone(), + &creation.schema, table_name.clone(), location, metadata_location, @@ -339,6 +340,18 @@ impl Catalog for HmsCatalog { .await .map_err(from_thrift_error)?; + // let table = Table::builder() + // .metadata(metadata) + // .metadata_location(metadata_location) + // .identifier(TableIdent::new( + // NamespaceIdent::new(db_name), + // table_name.clone(), + // )) + // .file_io(file_io) + // .build(); + + // Ok(table) + // TODO: Create & return Result
+ Err(Error::new( + ErrorKind::FeatureUnsupported, @@ -347,13 +360,12 @@ } async fn load_table(&self, table: &TableIdent) -> Result<Table> { - let dbname = validate_namespace(table.namespace())?; - let tbl_name = table.name.clone(); + let db_name = validate_namespace(table.namespace())?; let hive_table = self .client .0 - .get_table(dbname.into(), tbl_name.into()) + .get_table(db_name.clone().into(), table.name.clone().into()) .await .map_err(from_thrift_error)?; @@ -382,16 +394,19 @@ impl Catalog for HmsCatalog { let mut reader = file_io.new_input(&metadata_location)?.reader().await?; let mut metadata_str = String::new(); reader.read_to_string(&mut metadata_str).await?; - - let _metadata = serde_json::from_str(&metadata_str)?; - - // TODO: Create `TableMetadata` - // blocked by: https://github.com/apache/iceberg-rust/issues/250 - // Create and return Result<Table>
- Err(Error::new( - ErrorKind::FeatureUnsupported, - "Loading a table is not supported yet", - )) + let metadata = serde_json::from_str::(&metadata_str)?; + + let table = Table::builder() + .metadata(metadata) + .metadata_location(metadata_location) + .identifier(TableIdent::new( + NamespaceIdent::new(db_name), + table.name.clone(), + )) + .file_io(file_io) + .build(); + + Ok(table) } /// Asynchronously drops a table from the database. From 0d0ce47b1aab9bc109e56e1b553b3a9a14f86062 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Thu, 14 Mar 2024 06:01:20 +0100 Subject: [PATCH 36/64] simplify HiveSchemaBuilder --- crates/catalog/hms/src/schema.rs | 33 +++++++++++++------------------- crates/catalog/hms/src/utils.rs | 4 ++-- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index 3f83e703a6..fc565bb892 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -19,30 +19,23 @@ use hive_metastore::FieldSchema; use iceberg::spec::{visit_schema, NestedFieldRef, PrimitiveType, Schema, SchemaVisitor}; use iceberg::Result; -pub(crate) type HiveSchema = Vec; +type HiveSchema = Vec; -#[derive(Debug)] +#[derive(Debug, Default)] pub(crate) struct HiveSchemaBuilder { - cols: HiveSchema, + schema: HiveSchema, context: Vec, } impl HiveSchemaBuilder { - /// Create new `HiveSchemaBuilder` - pub fn new() -> Self { - HiveSchemaBuilder { - cols: Vec::new(), - context: Vec::new(), - } - } - /// Convert `Schema` into `HiveSchema` by applying the `SchemaVisitor` - pub fn from_iceberg(&mut self, schema: &Schema) -> Result { - visit_schema(schema, self)?; - Ok(self.cols.clone()) + pub fn from_iceberg(schema: &Schema) -> Result { + let mut visitor = Self::default(); + visit_schema(schema, &mut visitor)?; + Ok(visitor.schema.clone()) } - /// Check if visitor is in `StructType` + /// Check if is in `StructType` while traversing schema fn is_inside_struct(&self) -> bool { !self.context.is_empty() } @@ -92,7 +85,7 @@ impl SchemaVisitor for HiveSchemaBuilder { return Ok(format!("{}:{}", field.name, value)); } - self.cols.push(FieldSchema { + self.schema.push(FieldSchema { name: Some(field.name.clone().into()), r#type: Some(value.clone().into()), comment: field.doc.clone().map(|doc| doc.into()), @@ -185,7 +178,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?; let expected = vec![FieldSchema { name: Some("quux".into()), @@ -230,7 +223,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?; let expected = vec![FieldSchema { name: Some("location".into()), @@ -258,7 +251,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?; let expected = vec![FieldSchema { name: Some("person".into()), @@ -280,7 +273,7 @@ mod tests { NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), ]) .build()?; - let result = HiveSchemaBuilder::new().from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?; let expected = vec![ FieldSchema { diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index a5219a7230..91a36c9dea 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -166,7 +166,7 @@ pub(crate) fn 
convert_to_hive_table( ..Default::default() }; - let hive_schema = HiveSchemaBuilder::new().from_iceberg(&schema)?; + let hive_schema = HiveSchemaBuilder::from_iceberg(schema)?; let storage_descriptor = StorageDescriptor { location: Some(location.into()), @@ -352,7 +352,7 @@ mod tests { ..Default::default() }; - let hive_schema = HiveSchemaBuilder::new().from_iceberg(&schema)?; + let hive_schema = HiveSchemaBuilder::from_iceberg(&schema)?; let sd = StorageDescriptor { location: Some(location.into()), From 376528e571c719ed74108b718c23b1066a9ac79e Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Thu, 14 Mar 2024 06:18:24 +0100 Subject: [PATCH 37/64] refactor: use ok_or_else() --- crates/catalog/hms/src/utils.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 91a36c9dea..d692d2dcca 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -56,14 +56,11 @@ pub const OUTPUT_FORMAT: &str = "org.apache.hadoop.mapred.FileOutputFormat"; pub(crate) fn convert_to_namespace(database: &Database) -> Result { let mut properties = HashMap::new(); - let name = if let Some(name) = &database.name { - name.to_string() - } else { - return Err(Error::new( - ErrorKind::DataInvalid, - "Database name must be specified", - )); - }; + let name = database + .name + .as_ref() + .ok_or_else(|| Error::new(ErrorKind::DataInvalid, "Database name must be specified"))? + .to_string(); if let Some(description) = &database.description { properties.insert(COMMENT.to_string(), description.to_string()); From 87daa000a0531818d9b849a0f35bc4d4f411f09f Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Thu, 14 Mar 2024 07:14:14 +0100 Subject: [PATCH 38/64] simplify HiveSchemaBuilder --- crates/catalog/hms/src/schema.rs | 23 ++++++++++++++--------- crates/catalog/hms/src/utils.rs | 4 ++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index fc565bb892..fb82c18a10 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -28,11 +28,16 @@ pub(crate) struct HiveSchemaBuilder { } impl HiveSchemaBuilder { - /// Convert `Schema` into `HiveSchema` by applying the `SchemaVisitor` - pub fn from_iceberg(schema: &Schema) -> Result { - let mut visitor = Self::default(); - visit_schema(schema, &mut visitor)?; - Ok(visitor.schema.clone()) + /// Creates a new `HiveSchemaBuilder` from iceberg `Schema` + pub fn from_iceberg(schema: &Schema) -> Result { + let mut builder = Self::default(); + visit_schema(schema, &mut builder)?; + Ok(builder) + } + + /// Returns the newly converted `HiveSchema` + pub fn build(self) -> HiveSchema { + self.schema } /// Check if is in `StructType` while traversing schema @@ -178,7 +183,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let expected = vec![FieldSchema { name: Some("quux".into()), @@ -223,7 +228,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let expected = vec![FieldSchema { name: Some("location".into()), @@ -251,7 +256,7 @@ mod tests { .into()]) .build()?; - let result = HiveSchemaBuilder::from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let expected = vec![FieldSchema { name: 
Some("person".into()), @@ -273,7 +278,7 @@ mod tests { NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), ]) .build()?; - let result = HiveSchemaBuilder::from_iceberg(&schema)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let expected = vec![ FieldSchema { diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index d692d2dcca..910f6be8ac 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -163,7 +163,7 @@ pub(crate) fn convert_to_hive_table( ..Default::default() }; - let hive_schema = HiveSchemaBuilder::from_iceberg(schema)?; + let hive_schema = HiveSchemaBuilder::from_iceberg(schema)?.build(); let storage_descriptor = StorageDescriptor { location: Some(location.into()), @@ -349,7 +349,7 @@ mod tests { ..Default::default() }; - let hive_schema = HiveSchemaBuilder::from_iceberg(&schema)?; + let hive_schema = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let sd = StorageDescriptor { location: Some(location.into()), From af193e631ce3029231cd1ee5407dbe46714896dc Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Thu, 14 Mar 2024 11:15:40 +0100 Subject: [PATCH 39/64] fix visibility of consts --- crates/catalog/hms/src/utils.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs index 910f6be8ac..6729fa641e 100644 --- a/crates/catalog/hms/src/utils.rs +++ b/crates/catalog/hms/src/utils.rs @@ -25,31 +25,31 @@ use uuid::Uuid; use crate::schema::HiveSchemaBuilder; /// hive.metastore.database.owner setting -pub const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; +const HMS_DB_OWNER: &str = "hive.metastore.database.owner"; /// hive.metastore.database.owner default setting -pub const HMS_DEFAULT_DB_OWNER: &str = "user.name"; +const HMS_DEFAULT_DB_OWNER: &str = "user.name"; /// hive.metastore.database.owner-type setting -pub const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type"; +const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type"; /// hive metatore `description` property -pub const COMMENT: &str = "comment"; +const COMMENT: &str = "comment"; /// hive metatore `location` property -pub const LOCATION: &str = "location"; +const LOCATION: &str = "location"; /// hive metatore `metadat_location` property -pub const METADATA_LOCATION: &str = "metadata_location"; +pub(crate) const METADATA_LOCATION: &str = "metadata_location"; /// hive metatore `external` property -pub const EXTERNAL: &str = "EXTERNAL"; +const EXTERNAL: &str = "EXTERNAL"; /// hive metatore `external_table` property -pub const EXTERNAL_TABLE: &str = "EXTERNAL_TABLE"; +const EXTERNAL_TABLE: &str = "EXTERNAL_TABLE"; /// hive metatore `table_type` property -pub const TABLE_TYPE: &str = "table_type"; +const TABLE_TYPE: &str = "table_type"; /// hive metatore `warehouse` location property -pub const WAREHOUSE_LOCATION: &str = "warehouse"; +const WAREHOUSE_LOCATION: &str = "warehouse"; /// hive metatore `SerDeInfo` serialization_lib parameter -pub const SERIALIZATION_LIB: &str = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"; +const SERIALIZATION_LIB: &str = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"; /// hive metatore input format -pub const INPUT_FORMAT: &str = "org.apache.hadoop.mapred.FileInputFormat"; +const INPUT_FORMAT: &str = "org.apache.hadoop.mapred.FileInputFormat"; /// hive metatore output format -pub const OUTPUT_FORMAT: &str = 
"org.apache.hadoop.mapred.FileOutputFormat"; +const OUTPUT_FORMAT: &str = "org.apache.hadoop.mapred.FileOutputFormat"; /// Returns a `Namespace` by extracting database name and properties /// from `hive_metastore::hms::Database` From 7658e77f997b1ca3ceeee93af8ecec2c989c690a Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Fri, 15 Mar 2024 19:21:32 +0100 Subject: [PATCH 40/64] change serde metadata v2 --- crates/iceberg/src/spec/table_metadata.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 9893e9eeae..dce5a068f2 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -727,14 +727,10 @@ pub(super) mod _serde { .collect(), default_spec_id: v.default_spec_id, last_partition_id: v.last_partition_id, - properties: if v.properties.is_empty() { - None - } else { - Some(v.properties) - }, + properties: Some(v.properties), current_snapshot_id: v.current_snapshot_id.or(Some(-1)), snapshots: if v.snapshots.is_empty() { - None + Some(vec![]) } else { Some( v.snapshots From 63f0b222675a29a121d815a89e808b693637672d Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Fri, 15 Mar 2024 19:43:39 +0100 Subject: [PATCH 41/64] change default partition_specs and sort_orders --- crates/iceberg/src/spec/table_metadata.rs | 47 ++++++++++++++++------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index dce5a068f2..62040b0625 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -29,6 +29,7 @@ use super::{ snapshot::{Snapshot, SnapshotReference, SnapshotRetention}, PartitionSpecRef, SchemaId, SchemaRef, SnapshotRef, SortOrderRef, }; +use super::{PartitionSpec, SortOrder}; use _serde::TableMetadataEnum; @@ -297,19 +298,37 @@ impl TableMetadataBuilder { properties, } = table_creation; - if partition_spec.is_some() { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - "Can't create table with partition spec now", - )); - } + let partition_specs = match partition_spec { + Some(_) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Can't create table with partition spec now", + )) + } + None => HashMap::from([( + 0, + Arc::new(PartitionSpec { + spec_id: 0, + fields: vec![], + }), + )]), + }; - if sort_order.is_some() { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - "Can't create table with sort order now", - )); - } + let sort_orders = match sort_order { + Some(_) => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Can't create table with sort order now", + )) + } + None => HashMap::from([( + 0, + Arc::new(SortOrder { + order_id: 0, + fields: vec![], + }), + )]), + }; let table_metadata = TableMetadata { format_version: FormatVersion::V2, @@ -325,14 +344,14 @@ impl TableMetadataBuilder { last_column_id: schema.highest_field_id(), current_schema_id: schema.schema_id(), schemas: HashMap::from([(schema.schema_id(), Arc::new(schema))]), - partition_specs: Default::default(), + partition_specs, default_spec_id: 0, last_partition_id: 0, properties, current_snapshot_id: None, snapshots: Default::default(), snapshot_log: vec![], - sort_orders: Default::default(), + sort_orders, metadata_log: vec![], default_sort_order_id: 0, refs: Default::default(), From 1a11ead74618a67ed8dc4d5d4f1e7c6c808c38f0 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Fri, 15 Mar 
2024 19:49:01 +0100 Subject: [PATCH 42/64] change test --- crates/iceberg/src/spec/table_metadata.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 62040b0625..bc89cd13b8 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1690,9 +1690,27 @@ mod tests { .len(), 0 ); - assert_eq!(table_metadata.partition_specs.len(), 0); assert_eq!(table_metadata.properties.len(), 0); - assert_eq!(table_metadata.sort_orders.len(), 0); + assert_eq!( + table_metadata.partition_specs, + HashMap::from([( + 0, + Arc::new(PartitionSpec { + spec_id: 0, + fields: vec![] + }) + )]) + ); + assert_eq!( + table_metadata.sort_orders, + HashMap::from([( + 0, + Arc::new(SortOrder { + order_id: 0, + fields: vec![] + }) + )]) + ); } #[test] From a5954b9201f4237b3de3811c3dad4a26157bb7f5 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 16 Mar 2024 19:59:59 +0100 Subject: [PATCH 43/64] add create table with metadata --- crates/catalog/hms/src/catalog.rs | 52 +++++++-------- crates/catalog/hms/tests/hms_catalog_test.rs | 66 +++++++++++++++++++- 2 files changed, 91 insertions(+), 27 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index cfbcdb1a44..ad3d213259 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -25,7 +25,9 @@ use hive_metastore::ThriftHiveMetastoreClientBuilder; use hive_metastore::ThriftHiveMetastoreGetDatabaseException; use hive_metastore::ThriftHiveMetastoreGetTableException; use iceberg::io::FileIO; +use iceberg::io::FileIOBuilder; use iceberg::spec::TableMetadata; +use iceberg::spec::TableMetadataBuilder; use iceberg::table::Table; use iceberg::{ Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation, @@ -35,6 +37,7 @@ use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::net::ToSocketAddrs; use tokio::io::AsyncReadExt; +use tokio::io::AsyncWriteExt; use typed_builder::TypedBuilder; use volo_thrift::ResponseError; @@ -54,6 +57,8 @@ pub enum HmsThriftTransport { pub struct HmsCatalogConfig { address: String, thrift_transport: HmsThriftTransport, + #[builder(default, setter(prefix = "with_"))] + props: HashMap, } struct HmsClient(ThriftHiveMetastoreClient); @@ -309,29 +314,33 @@ impl Catalog for HmsCatalog { creation: TableCreation, ) -> Result
{ let db_name = validate_namespace(namespace)?; - let table_name = &creation.name; + let table_name = creation.name.clone(); let location = match &creation.location { Some(location) => location.clone(), None => { let ns = self.get_namespace(namespace).await?; - get_default_table_location(&ns, table_name)? + get_default_table_location(&ns, &table_name)? } }; - let metadata_location = get_metadata_location(location.clone(), 0)?; - let _file_io = FileIO::from_path(&metadata_location)?.build()?; + let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?; + let metadata_location = get_metadata_location(&location, 0)?; - // TODO: Create `TableMetadata` and write to storage - // blocked by: https://github.com/apache/iceberg-rust/issues/250 + let file_io = FileIOBuilder::new("s3a") + .with_props(&self.config.props) + .build()?; + let mut file = file_io.new_output(&metadata_location)?.writer().await?; + file.write_all(&serde_json::to_vec(&metadata)?).await?; + file.shutdown().await?; let hive_table = convert_to_hive_table( - db_name, - &creation.schema, + db_name.clone(), + metadata.current_schema(), table_name.clone(), location, - metadata_location, - &creation.properties, + metadata_location.clone(), + metadata.properties(), )?; self.client .0 .create_table(hive_table) .await .map_err(from_thrift_error)?; - // let table = Table::builder() - // .metadata(metadata) - // .metadata_location(metadata_location) - // .identifier(TableIdent::new( - // NamespaceIdent::new(db_name), - // table_name.clone(), - // )) - // .file_io(file_io) - // .build(); - - // Ok(table) + let table = Table::builder() + .file_io(file_io) + .metadata_location(metadata_location) + .metadata(metadata) + .identifier(TableIdent::new(NamespaceIdent::new(db_name), table_name)) + .build(); - // TODO: Create & return Result<Table>
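With this patch the create path exists end to end: metadata is built from the `TableCreation`, written to a fresh `00000-<uuid>.metadata.json`, and the HMS table is registered with its `metadata_location` parameter pointing at that file. A hedged consumer-side sketch (address, credentials, and names are placeholders; the calls mirror the integration test below):

    use std::collections::HashMap;
    use iceberg::spec::{NestedField, PrimitiveType, Schema, Type};
    use iceberg::{Catalog, NamespaceIdent, TableCreation};
    use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport};

    async fn create_example() -> iceberg::Result<()> {
        // Placeholder endpoint and empty FileIO props; real values come from
        // the docker-compose fixture (thrift address, S3 credentials).
        let config = HmsCatalogConfig::builder()
            .address("localhost:9083".to_string())
            .thrift_transport(HmsThriftTransport::Buffered)
            .with_props(HashMap::new())
            .build();
        let catalog = HmsCatalog::new(config)?;

        let schema = Schema::builder()
            .with_schema_id(0)
            .with_fields(vec![
                NestedField::required(1, "foo", Type::Primitive(PrimitiveType::Int)).into(),
            ])
            .build()?;

        let creation = TableCreation::builder()
            .location("s3a://warehouse/hive".to_string())
            .name("my_table".to_string())
            .properties(HashMap::new())
            .schema(schema)
            .build();

        let table = catalog
            .create_table(&NamespaceIdent::new("default".into()), creation)
            .await?;
        assert_eq!(table.identifier().name(), "my_table");
        Ok(())
    }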
- Err(Error::new( - ErrorKind::FeatureUnsupported, - "Table creation is not supported yet", - )) + + Ok(table) } async fn load_table(&self, table: &TableIdent) -> Result<Table>
{ diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index bab83a9550..f9e01dfa6d 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -19,7 +19,11 @@ use std::collections::HashMap; -use iceberg::{Catalog, Namespace, NamespaceIdent}; +use iceberg::io::{ + FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, +}; +use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; +use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation}; use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport}; use iceberg_test_utils::docker::DockerCompose; use iceberg_test_utils::{normalize_test_name, set_up}; @@ -27,11 +31,13 @@ use port_scanner::scan_port_addr; use tokio::time::sleep; const HMS_CATALOG_PORT: u16 = 9083; +const MINIO_PORT: u16 = 9000; type Result = std::result::Result; struct TestFixture { _docker_compose: DockerCompose, hms_catalog: HmsCatalog, + file_io: FileIO, } async fn set_test_fixture(func: &str) -> TestFixture { @@ -45,6 +51,7 @@ async fn set_test_fixture(func: &str) -> TestFixture { docker_compose.run(); let hms_catalog_ip = docker_compose.get_container_ip("hive-metastore"); + let minio_ip = docker_compose.get_container_ip("minio"); let read_port = format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT); loop { @@ -56,9 +63,25 @@ async fn set_test_fixture(func: &str) -> TestFixture { } } + let props = HashMap::from([ + ( + S3_ENDPOINT.to_string(), + format!("http://{}:{}", minio_ip, MINIO_PORT), + ), + (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), + (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), + (S3_REGION.to_string(), "us-east-1".to_string()), + ]); + + let file_io = FileIOBuilder::new("s3a") + .with_props(&props) + .build() + .unwrap(); + let config = HmsCatalogConfig::builder() .address(format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT)) .thrift_transport(HmsThriftTransport::Buffered) + .with_props(props) .build(); let hms_catalog = HmsCatalog::new(config).unwrap(); @@ -66,9 +89,50 @@ async fn set_test_fixture(func: &str) -> TestFixture { TestFixture { _docker_compose: docker_compose, hms_catalog, + file_io, } } +#[tokio::test] +async fn test_create_table() -> Result<()> { + let fixture = set_test_fixture("test_list_namespace").await; + + let namespace = Namespace::new(NamespaceIdent::new("default".into())); + + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "foo", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build()?; + + let creation = TableCreation::builder() + .location("s3a://warehouse/hive".to_string()) + .name("my_table".to_string()) + .properties(HashMap::new()) + .schema(schema) + .build(); + + let result = fixture + .hms_catalog + .create_table(namespace.name(), creation) + .await?; + + assert_eq!(result.identifier().name(), "my_table"); + assert!(result + .metadata_location() + .is_some_and(|location| location.starts_with("s3a://warehouse/hive/metadata/00000-"))); + assert!( + fixture + .file_io + .is_exist("s3a://warehouse/hive/metadata/") + .await? 
+ ); + + Ok(()) +} + #[tokio::test] async fn test_list_namespace() -> Result<()> { let fixture = set_test_fixture("test_list_namespace").await; From 3034dca11e49c7c00a1336e912a6ed2d27c2e503 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 16 Mar 2024 20:25:00 +0100 Subject: [PATCH 44/64] use FileIO::from_path --- crates/catalog/hms/src/catalog.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index ad3d213259..9d39803544 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -25,7 +25,6 @@ use hive_metastore::ThriftHiveMetastoreClientBuilder; use hive_metastore::ThriftHiveMetastoreGetDatabaseException; use hive_metastore::ThriftHiveMetastoreGetTableException; use iceberg::io::FileIO; -use iceberg::io::FileIOBuilder; use iceberg::spec::TableMetadata; use iceberg::spec::TableMetadataBuilder; use iceberg::table::Table; @@ -327,7 +326,7 @@ impl Catalog for HmsCatalog { let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?; let metadata_location = get_metadata_location(&location, 0)?; - let file_io = FileIOBuilder::new("s3a") + let file_io = FileIO::from_path(&metadata_location)? .with_props(&self.config.props) .build()?; let mut file = file_io.new_output(&metadata_location)?.writer().await?; From 4c7d91a2edfea5fdc14344df16757c0cc226a944 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 16 Mar 2024 20:43:10 +0100 Subject: [PATCH 45/64] add test_load_table --- crates/catalog/hms/tests/hms_catalog_test.rs | 48 ++++++++++++++++---- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index f9e01dfa6d..deec13a785 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -23,7 +23,7 @@ use iceberg::io::{ FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, }; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; -use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation}; +use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation, TableIdent}; use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport}; use iceberg_test_utils::docker::DockerCompose; use iceberg_test_utils::{normalize_test_name, set_up}; @@ -93,12 +93,7 @@ async fn set_test_fixture(func: &str) -> TestFixture { } } -#[tokio::test] -async fn test_create_table() -> Result<()> { - let fixture = set_test_fixture("test_list_namespace").await; - - let namespace = Namespace::new(NamespaceIdent::new("default".into())); - +fn set_table_creation(location: impl ToString, name: impl ToString) -> Result { let schema = Schema::builder() .with_schema_id(0) .with_fields(vec![ @@ -108,12 +103,47 @@ async fn test_create_table() -> Result<()> { .build()?; let creation = TableCreation::builder() - .location("s3a://warehouse/hive".to_string()) - .name("my_table".to_string()) + .location(location.to_string()) + .name(name.to_string()) .properties(HashMap::new()) .schema(schema) .build(); + Ok(creation) +} + +#[tokio::test] +async fn test_load_table() -> Result<()> { + let fixture = set_test_fixture("test_list_namespace").await; + let creation = set_table_creation("s3a://warehouse/hive", "my_table")?; + let namespace = Namespace::new(NamespaceIdent::new("default".into())); + + let expected = fixture + .hms_catalog + .create_table(namespace.name(), creation) + .await?; + + 
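Between the create call above and the load call below, everything hinges on the HMS table's parameters map: create_table stored the `metadata_location` there, and load_table reads it back. A sketch of that lookup (parameter types per the pilota-generated thrift bindings):

    use pilota::{AHashMap, FastStr};

    // Sketch: the lookup load_table performs on a fetched HMS table.
    fn metadata_location(parameters: &AHashMap<FastStr, FastStr>) -> Option<String> {
        parameters
            .get("metadata_location")
            .map(|location| location.to_string())
    }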
let result = fixture + .hms_catalog + .load_table(&TableIdent::new( + namespace.name().clone(), + "my_table".to_string(), + )) + .await?; + + assert_eq!(result.identifier(), expected.identifier()); + assert_eq!(result.metadata_location(), expected.metadata_location()); + assert_eq!(result.metadata(), expected.metadata()); + + Ok(()) +} + +#[tokio::test] +async fn test_create_table() -> Result<()> { + let fixture = set_test_fixture("test_list_namespace").await; + let creation = set_table_creation("s3a://warehouse/hive", "my_table")?; + let namespace = Namespace::new(NamespaceIdent::new("default".into())); + let result = fixture .hms_catalog .create_table(namespace.name(), creation) From e1407d5695bed0c0b39ab209b80dd1e1d84ae403 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Sat, 16 Mar 2024 20:56:43 +0100 Subject: [PATCH 46/64] small fixes + docs --- crates/catalog/hms/src/catalog.rs | 41 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 9d39803544..8deb36e029 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -184,7 +184,7 @@ impl Catalog for HmsCatalog { let db = self .client .0 - .get_database(name.clone().into()) + .get_database(name.into()) .await .map_err(from_thrift_error)?; @@ -208,7 +208,7 @@ impl Catalog for HmsCatalog { async fn namespace_exists(&self, namespace: &NamespaceIdent) -> Result { let name = validate_namespace(namespace)?; - let resp = self.client.0.get_database(name.clone().into()).await; + let resp = self.client.0.get_database(name.into()).await; match resp { Ok(_) => Ok(true), @@ -295,7 +295,7 @@ impl Catalog for HmsCatalog { let tables = self .client .0 - .get_all_tables(name.clone().into()) + .get_all_tables(name.into()) .await .map_err(from_thrift_error)?; @@ -307,6 +307,18 @@ impl Catalog for HmsCatalog { Ok(tables) } + /// Creates a new table within a specified namespace using the provided + /// table creation settings. + /// + /// # Returns + /// A `Result` wrapping a `Table` object representing the newly created + /// table. + /// + /// # Errors + /// This function may return an error in several cases, including invalid + /// namespace identifiers, failure to determine a default storage location, + /// issues generating or writing table metadata, and errors communicating + /// with the Hive Metastore. async fn create_table( &self, namespace: &NamespaceIdent, @@ -358,6 +370,18 @@ impl Catalog for HmsCatalog { Ok(table) } + /// Loads a table from the Hive Metastore and constructs a `Table` object + /// based on its metadata. + /// + /// # Returns + /// A `Result` wrapping a `Table` object that represents the loaded table. + /// + /// # Errors + /// This function may return an error in several scenarios, including: + /// - Failure to validate the namespace. + /// - Failure to retrieve the table from the Hive Metastore. + /// - Absence of metadata location information in the table's properties. + /// - Issues reading or deserializing the table's metadata file. async fn load_table(&self, table: &TableIdent) -> Result
         let db_name = validate_namespace(table.namespace())?;
 
@@ -368,7 +392,6 @@ impl Catalog for HmsCatalog {
         .await
         .map_err(from_thrift_error)?;
 
-        // TODO: extract into utils
         let metadata_location = match hive_table.parameters {
             Some(properties) => match properties.get(METADATA_LOCATION) {
                 Some(location) => location.to_string(),
@@ -387,22 +410,22 @@ impl Catalog for HmsCatalog {
             }
         };
 
-        let file_io = FileIO::from_path(&metadata_location)?.build()?;
-
-        // TODO: extract into utils
+        let file_io = FileIO::from_path(&metadata_location)?
+            .with_props(&self.config.props)
+            .build()?;
         let mut reader = file_io.new_input(&metadata_location)?.reader().await?;
         let mut metadata_str = String::new();
         reader.read_to_string(&mut metadata_str).await?;
         let metadata = serde_json::from_str::<TableMetadata>(&metadata_str)?;
 
         let table = Table::builder()
-            .metadata(metadata)
+            .file_io(file_io)
             .metadata_location(metadata_location)
+            .metadata(metadata)
             .identifier(TableIdent::new(
                 NamespaceIdent::new(db_name),
                 table.name.clone(),
             ))
-            .file_io(file_io)
            .build();
 
         Ok(table)
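The doc comments added in the patch above describe a write-then-read round trip: `create_table` serializes the initial metadata file to storage and registers its location with HMS, and `load_table` resolves that location back into a `Table`. A condensed, hedged sketch of that flow from the caller's side — it assumes a configured catalog and running metastore, mirroring the test fixture in this series:

```rust
use std::collections::HashMap;

use iceberg::spec::{NestedField, PrimitiveType, Schema, Type};
use iceberg::{Catalog, NamespaceIdent, TableCreation, TableIdent};

async fn round_trip(catalog: &impl Catalog) -> iceberg::Result<()> {
    let ns = NamespaceIdent::new("default".into());
    let schema = Schema::builder()
        .with_schema_id(0)
        .with_fields(vec![NestedField::required(
            1,
            "foo",
            Type::Primitive(PrimitiveType::Int),
        )
        .into()])
        .build()?;
    let creation = TableCreation::builder()
        .location("s3a://warehouse/hive".to_string())
        .name("my_table".to_string())
        .properties(HashMap::new())
        .schema(schema)
        .build();

    // Writes `.../metadata/00000-<uuid>.metadata.json` and registers the
    // table in HMS with a `metadata_location` property pointing at it ...
    let created = catalog.create_table(&ns, creation).await?;

    // ... which load_table then reads back and deserializes.
    let loaded = catalog
        .load_table(&TableIdent::new(ns, "my_table".to_string()))
        .await?;
    assert_eq!(created.metadata_location(), loaded.metadata_location());
    Ok(())
}
```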
From 6a4d6c862774c5cae70d38d899a470248b2fe6c8 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Sat, 16 Mar 2024 20:58:55 +0100
Subject: [PATCH 47/64] rename

---
 crates/catalog/hms/src/catalog.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs
index 8deb36e029..cadd554419 100644
--- a/crates/catalog/hms/src/catalog.rs
+++ b/crates/catalog/hms/src/catalog.rs
@@ -442,11 +442,11 @@ impl Catalog for HmsCatalog {
     /// the table does not exist.
     /// - Any network or communication error occurs with the database backend.
     async fn drop_table(&self, table: &TableIdent) -> Result<()> {
-        let dbname = validate_namespace(table.namespace())?;
+        let db_name = validate_namespace(table.namespace())?;
 
         self.client
             .0
-            .drop_table(dbname.into(), table.name.clone().into(), false)
+            .drop_table(db_name.into(), table.name.clone().into(), false)
             .await
             .map_err(from_thrift_error)?;
 
@@ -461,13 +461,13 @@ impl Catalog for HmsCatalog {
     /// - `Ok(false)` if the table does not exist in the database.
     /// - `Err(...)` if an error occurs during the process
     async fn table_exists(&self, table: &TableIdent) -> Result<bool> {
-        let dbname = validate_namespace(table.namespace())?;
-        let tbl_name = table.name.clone();
+        let db_name = validate_namespace(table.namespace())?;
+        let table_name = table.name.clone();
 
         let resp = self
             .client
             .0
-            .get_table(dbname.into(), tbl_name.into())
+            .get_table(db_name.into(), table_name.into())
             .await;
 
         match resp {

From 747cc5b4fd1d6f8ef933dc3b2c1997ea38d7aa1d Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Sun, 17 Mar 2024 19:25:29 +0100
Subject: [PATCH 48/64] extract get_metadata_location from hive_table

---
 crates/catalog/hms/src/catalog.rs | 20 ++---------
 crates/catalog/hms/src/utils.rs   | 59 +++++++++++++++++++++++++++----
 2 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs
index cadd554419..269068b0a9 100644
--- a/crates/catalog/hms/src/catalog.rs
+++ b/crates/catalog/hms/src/catalog.rs
@@ -336,7 +336,7 @@ impl Catalog for HmsCatalog {
         };
 
         let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?;
-        let metadata_location = get_metadata_location(&location, 0)?;
+        let metadata_location = create_metadata_location(&location, 0)?;
 
         let file_io = FileIO::from_path(&metadata_location)?
             .with_props(&self.config.props)
@@ -392,23 +392,7 @@ impl Catalog for HmsCatalog {
         .await
         .map_err(from_thrift_error)?;
 
-        let metadata_location = match hive_table.parameters {
-            Some(properties) => match properties.get(METADATA_LOCATION) {
-                Some(location) => location.to_string(),
-                None => {
-                    return Err(Error::new(
-                        ErrorKind::DataInvalid,
-                        format!("No '{}' set on table", METADATA_LOCATION),
-                    ))
-                }
-            },
-            None => {
-                return Err(Error::new(
-                    ErrorKind::DataInvalid,
-                    "No 'parameters' set on table. Location of metadata is undefined",
-                ))
-            }
-        };
+        let metadata_location = get_metadata_location(&hive_table.parameters)?;
 
         let file_io = FileIO::from_path(&metadata_location)?
             .with_props(&self.config.props)
             .build()?;
diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 6729fa641e..c53dbceb78 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -35,7 +35,7 @@ const COMMENT: &str = "comment";
 /// hive metatore `location` property
 const LOCATION: &str = "location";
 /// hive metatore `metadat_location` property
-pub(crate) const METADATA_LOCATION: &str = "metadata_location";
+const METADATA_LOCATION: &str = "metadata_location";
 /// hive metatore `external` property
 const EXTERNAL: &str = "EXTERNAL";
 /// hive metatore `external_table` property
@@ -245,8 +245,8 @@ pub(crate) fn get_default_table_location(
     })
 }
 
-/// Get metadata location
-pub(crate) fn get_metadata_location(
+/// Create metadata location from `location` and `version`
+pub(crate) fn create_metadata_location(
     location: impl AsRef<str> + Display,
     version: i32,
 ) -> Result<String> {
@@ -267,6 +267,29 @@ pub(crate) fn get_metadata_location(
     Ok(metadata_location)
 }
 
+/// Get metadata location from `HiveTable` parameters
+pub(crate) fn get_metadata_location(
+    parameters: &Option<AHashMap<FastStr, FastStr>>,
+) -> Result<String> {
+    match parameters {
+        Some(properties) => match properties.get(METADATA_LOCATION) {
+            Some(location) => Ok(location.to_string()),
+            None => {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!("No '{}' set on table", METADATA_LOCATION),
+                ))
+            }
+        },
+        None => {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                "No 'parameters' set on table. Location of metadata is undefined",
+            ))
+        }
+    }
+}
+
 /// Formats location_uri by e.g. removing trailing slashes.
 fn format_location_uri(location: String) -> String {
     let mut location = location;
@@ -320,12 +343,34 @@ mod tests {
 
     use super::*;
 
+    #[test]
+    fn test_get_metadata_location() -> Result<()> {
+        let params_valid = Some(AHashMap::from([(
+            FastStr::new(METADATA_LOCATION),
+            FastStr::new("my_location"),
+        )]));
+        let params_missing_key = Some(AHashMap::from([(
+            FastStr::new("not_here"),
+            FastStr::new("my_location"),
+        )]));
+
+        let result_valid = get_metadata_location(&params_valid)?;
+        let result_missing_key = get_metadata_location(&params_missing_key);
+        let result_no_params = get_metadata_location(&None);
+
+        assert_eq!(result_valid, "my_location");
+        assert!(result_missing_key.is_err());
+        assert!(result_no_params.is_err());
+
+        Ok(())
+    }
+
     #[test]
     fn test_convert_to_hive_table() -> Result<()> {
         let db_name = "my_db".to_string();
         let table_name = "my_table".to_string();
         let location = "s3a://warehouse/hms".to_string();
-        let metadata_location = get_metadata_location(location.clone(), 0)?;
+        let metadata_location = create_metadata_location(location.clone(), 0)?;
         let properties = HashMap::new();
         let schema = Schema::builder()
             .with_schema_id(1)
@@ -370,13 +415,13 @@ mod tests {
     }
 
     #[test]
-    fn test_get_metadata_location() -> Result<()> {
+    fn test_create_metadata_location() -> Result<()> {
         let location = "my_base_location";
         let valid_version = 0;
         let invalid_version = -1;
 
-        let valid_result = get_metadata_location(location, valid_version)?;
-        let invalid_result = get_metadata_location(location, invalid_version);
+        let valid_result = create_metadata_location(location, valid_version)?;
+        let invalid_result = create_metadata_location(location, invalid_version);
 
         assert!(valid_result.starts_with("my_base_location/metadata/00000-"));
         assert!(valid_result.ends_with(".metadata.json"));

From 07dd4d9e86cbc0aaf28469c7a8a0ed473b2c0300 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Sun, 17 Mar 2024 19:39:38 +0100
Subject: [PATCH 49/64] add integration tests

---
 crates/catalog/hms/tests/hms_catalog_test.rs | 63 ++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs
index deec13a785..6ecb7f07e2 100644
--- a/crates/catalog/hms/tests/hms_catalog_test.rs
+++ b/crates/catalog/hms/tests/hms_catalog_test.rs
@@ -112,6 +112,69 @@ fn set_table_creation(location: impl ToString, name: impl ToString) -> Result<TableCreation>
     Ok(creation)
 }
 
+#[tokio::test]
+async fn test_rename_table() -> Result<()> {
+    let fixture = set_test_fixture("test_list_namespace").await;
+    let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
+    let namespace = Namespace::new(NamespaceIdent::new("default".into()));
+
+    let table = fixture
+        .hms_catalog
+        .create_table(namespace.name(), creation)
+        .await?;
+
+    let dest = TableIdent::new(namespace.name().clone(), "my_table_rename".to_string());
+
+    fixture
+        .hms_catalog
+        .rename_table(table.identifier(), &dest)
+        .await?;
+
+    let result = fixture.hms_catalog.table_exists(&dest).await?;
+
+    assert!(result);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_table_exists() -> Result<()> {
+    let fixture = set_test_fixture("test_list_namespace").await;
+    let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
+    let namespace = Namespace::new(NamespaceIdent::new("default".into()));
+
+    let table = fixture
+        .hms_catalog
+        .create_table(namespace.name(), creation)
+        .await?;
+
+    let result = fixture.hms_catalog.table_exists(table.identifier()).await?;
+
+    assert!(result);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_drop_table() -> Result<()> {
+    let fixture = set_test_fixture("test_list_namespace").await;
+    let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
+    let namespace = Namespace::new(NamespaceIdent::new("default".into()));
+
+    let table = fixture
+        .hms_catalog
+        .create_table(namespace.name(), creation)
+        .await?;
+
+    fixture.hms_catalog.drop_table(table.identifier()).await?;
+
+    let result = fixture.hms_catalog.table_exists(table.identifier()).await?;
+
+    assert!(!result);
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_load_table() -> Result<()> {
     let fixture = set_test_fixture("test_list_namespace").await;

From 4fdfa33b1bf16d592c75d59c4e5b1782fa87e68d Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Sun, 17 Mar 2024 19:40:27 +0100
Subject: [PATCH 50/64] fix: clippy

---
 crates/catalog/hms/src/utils.rs | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index c53dbceb78..9f0cdfdcf7 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -274,19 +274,15 @@ pub(crate) fn get_metadata_location(
     match parameters {
         Some(properties) => match properties.get(METADATA_LOCATION) {
             Some(location) => Ok(location.to_string()),
-            None => {
-                return Err(Error::new(
-                    ErrorKind::DataInvalid,
-                    format!("No '{}' set on table", METADATA_LOCATION),
-                ))
-            }
-        },
-        None => {
-            return Err(Error::new(
+            None => Err(Error::new(
                 ErrorKind::DataInvalid,
-                "No 'parameters' set on table. Location of metadata is undefined",
-            ))
-        }
+                format!("No '{}' set on table", METADATA_LOCATION),
+            )),
+        },
+        None => Err(Error::new(
+            ErrorKind::DataInvalid,
+            "No 'parameters' set on table. Location of metadata is undefined",
+        )),
     }
 }

From 2ce1a6a401fd23faedd1a3f125923872440aa042 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Sun, 17 Mar 2024 19:48:47 +0100
Subject: [PATCH 51/64] remove whitespace

---
 crates/catalog/hms/src/utils.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 9f0cdfdcf7..8c1d1832af 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -240,7 +240,7 @@ pub(crate) fn get_default_table_location(
         .ok_or_else(|| {
             Error::new(
                 ErrorKind::DataInvalid,
-                "No default path is set, please specify a location when creating a table", 
+                "No default path is set, please specify a location when creating a table",
             )
         })
 }

From eaef1ef6fafec42859da814c46f2979c6b5e47cc Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Tue, 19 Mar 2024 10:07:02 +0100
Subject: [PATCH 52/64] fix: fixture names

---
 crates/catalog/hms/tests/hms_catalog_test.rs | 36 ++++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs
index 6ecb7f07e2..3179f67d69 100644
--- a/crates/catalog/hms/tests/hms_catalog_test.rs
+++ b/crates/catalog/hms/tests/hms_catalog_test.rs
@@ -114,7 +114,7 @@ fn set_table_creation(location: impl ToString, name: impl ToString) -> Result<TableCreation>
 
 #[tokio::test]
 async fn test_rename_table() -> Result<()> {
-    let fixture = set_test_fixture("test_list_namespace").await;
+    let fixture = set_test_fixture("test_rename_table").await;
     let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
     let namespace = Namespace::new(NamespaceIdent::new("default".into()));
 
@@ -139,7 +139,7 @@ async fn test_rename_table() -> Result<()> {
 
 #[tokio::test]
 async fn test_table_exists() -> Result<()> {
-    let fixture = set_test_fixture("test_list_namespace").await;
+    let fixture = set_test_fixture("test_table_exists").await;
     let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
     let namespace = Namespace::new(NamespaceIdent::new("default".into()));
 
@@ -157,7 +157,7 @@ async fn test_table_exists() -> Result<()> {
 
 #[tokio::test]
 async fn test_drop_table() -> Result<()> {
-    let fixture = set_test_fixture("test_list_namespace").await;
+    let fixture = set_test_fixture("test_drop_table").await;
     let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
     let namespace = Namespace::new(NamespaceIdent::new("default".into()));
 
@@ -177,7 +177,7 @@ async fn test_drop_table() -> Result<()> {
 
 #[tokio::test]
 async fn test_load_table() -> Result<()> {
-    let fixture = set_test_fixture("test_list_namespace").await;
+    let fixture = set_test_fixture("test_load_table").await;
     let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
     let namespace = Namespace::new(NamespaceIdent::new("default".into()));
 
@@ -203,7 +203,7 @@ async fn test_load_table() -> Result<()> {
 
 #[tokio::test]
 async fn test_create_table() -> Result<()> {
-    let fixture = set_test_fixture("test_list_namespace").await;
+    let fixture = set_test_fixture("test_create_table").await;
     let creation = set_table_creation("s3a://warehouse/hive", "my_table")?;
     let namespace = Namespace::new(NamespaceIdent::new("default".into()));
 
@@ -226,6 +226,19 @@ async fn test_create_table() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn test_list_tables() -> Result<()> {
+    let fixture = set_test_fixture("test_list_tables").await;
+
+    let ns = Namespace::new(NamespaceIdent::new("default".into()));
+
+    let result = fixture.hms_catalog.list_tables(ns.name()).await?;
+
+    assert_eq!(result, vec![]);
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_list_namespace() -> Result<()> {
     let fixture = set_test_fixture("test_list_namespace").await;
@@ -365,16 +378,3 @@ async fn test_drop_namespace() -> Result<()> {
 
     Ok(())
 }
-
-#[tokio::test]
-async fn test_list_tables() -> Result<()> {
-    let fixture = set_test_fixture("test_list_tables").await;
-
-    let ns = Namespace::new(NamespaceIdent::new("default".into()));
-
-    let result = fixture.hms_catalog.list_tables(ns.name()).await?;
-
-    assert_eq!(result, vec![]);
-
-    Ok(())
-}

From f814906cecb471c84f0a610ffe92af052dc91b28 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 10:53:09 +0100
Subject: [PATCH 53/64] remove builder-prefix `with`

---
 crates/catalog/hms/src/catalog.rs            | 2 +-
 crates/catalog/hms/tests/hms_catalog_test.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs
index 269068b0a9..1499fd9bc4 100644
--- a/crates/catalog/hms/src/catalog.rs
+++ b/crates/catalog/hms/src/catalog.rs
@@ -56,7 +56,7 @@ pub enum HmsThriftTransport {
 pub struct HmsCatalogConfig {
     address: String,
     thrift_transport: HmsThriftTransport,
-    #[builder(default, setter(prefix = "with_"))]
+    #[builder(default)]
     props: HashMap<String, String>,
 }
 
diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs
index 3179f67d69..9f627dd7f2 100644
--- a/crates/catalog/hms/tests/hms_catalog_test.rs
+++ b/crates/catalog/hms/tests/hms_catalog_test.rs
@@ -81,7 +81,7 @@ async fn set_test_fixture(func: &str) -> TestFixture {
     let config = HmsCatalogConfig::builder()
         .address(format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT))
         .thrift_transport(HmsThriftTransport::Buffered)
-        .with_props(props)
+        .props(props)
         .build();
 
     let hms_catalog = HmsCatalog::new(config).unwrap();
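For readers unfamiliar with `typed-builder`: the `setter(prefix = "with_")` attribute removed above is what generated the `with_props` name, so after this patch the setter is simply named after the field. A hedged sketch of catalog construction as of this point in the series (address is a placeholder; the `warehouse` field arrives in a later patch):

```rust
use std::collections::HashMap;

use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport};

fn build_catalog() -> iceberg::Result<HmsCatalog> {
    let config = HmsCatalogConfig::builder()
        .address("localhost:9083".to_string())
        .thrift_transport(HmsThriftTransport::Buffered)
        // optional thanks to #[builder(default)]
        .props(HashMap::new())
        .build();

    HmsCatalog::new(config)
}
```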
From dc054a9044031c4da560ec8af11c057f1d0c4d66 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 10:54:00 +0100
Subject: [PATCH 54/64] capitalize error msg

---
 crates/catalog/hms/src/error.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/catalog/hms/src/error.rs b/crates/catalog/hms/src/error.rs
index 1b0ff33df4..a0f393c622 100644
--- a/crates/catalog/hms/src/error.rs
+++ b/crates/catalog/hms/src/error.rs
@@ -27,7 +27,7 @@ where
 {
     Error::new(
         ErrorKind::Unexpected,
-        "operation failed for hitting thrift error".to_string(),
+        "Operation failed for hitting thrift error".to_string(),
     )
     .with_source(anyhow!("thrift error: {:?}", error))
 }
@@ -36,7 +36,7 @@ where
 pub fn from_io_error(error: io::Error) -> Error {
     Error::new(
         ErrorKind::Unexpected,
-        "operation failed for hitting io error".to_string(),
+        "Operation failed for hitting io error".to_string(),
     )
     .with_source(error)
 }

From 9557a1ccdf96282e0958eb10c2605a7e14f74809 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 11:02:53 +0100
Subject: [PATCH 55/64] remove trait bound `Display`

---
 crates/catalog/hms/src/utils.rs | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 8c1d1832af..0782d97497 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -19,7 +19,7 @@ use chrono::Utc;
 use hive_metastore::{Database, PrincipalType, SerDeInfo, StorageDescriptor};
 use iceberg::{spec::Schema, Error, ErrorKind, Namespace, NamespaceIdent, Result};
 use pilota::{AHashMap, FastStr};
-use std::{collections::HashMap, fmt::Display};
+use std::collections::HashMap;
 use uuid::Uuid;
 
 use crate::schema::HiveSchemaBuilder;
@@ -230,13 +230,13 @@ pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result<String> {
 /// Get default table location from `Namespace` properties
 pub(crate) fn get_default_table_location(
     namespace: &Namespace,
-    table_name: impl AsRef<str> + Display,
+    table_name: impl AsRef<str>,
 ) -> Result<String> {
     let properties = namespace.properties();
     properties
         .get(LOCATION)
         .or_else(|| properties.get(WAREHOUSE_LOCATION))
-        .map(|location| format!("{}/{}", location, table_name))
+        .map(|location| format!("{}/{}", location, table_name.as_ref()))
         .ok_or_else(|| {
             Error::new(
                 ErrorKind::DataInvalid,
@@ -246,10 +246,7 @@ pub(crate) fn get_default_table_location(
 }
 
 /// Create metadata location from `location` and `version`
-pub(crate) fn create_metadata_location(
-    location: impl AsRef<str> + Display,
-    version: i32,
-) -> Result<String> {
+pub(crate) fn create_metadata_location(location: impl AsRef<str>, version: i32) -> Result<String> {
     if version < 0 {
         return Err(Error::new(
             ErrorKind::DataInvalid,
@@ -262,7 +259,12 @@ pub(crate) fn create_metadata_location(
     let version = format!("{:0>5}", version);
     let id = Uuid::new_v4();
-    let metadata_location = format!("{}/metadata/{}-{}.metadata.json", location, version, id);
+    let metadata_location = format!(
+        "{}/metadata/{}-{}.metadata.json",
+        location.as_ref(),
+        version,
+        id
+    );
 
     Ok(metadata_location)
 }
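Why the extra `Display` bound was droppable in the patch above: `as_ref()` yields a `&str`, and `&str` already implements `Display`, so `format!` works with `AsRef<str>` alone. A toy stand-in for `create_metadata_location` under that single bound (the UUID is faked here for brevity):

```rust
fn metadata_path(location: impl AsRef<str>, version: i32) -> String {
    // `{:0>5}` zero-pads the version, matching the diff above.
    format!(
        "{}/metadata/{:0>5}-00000000-0000-0000-0000-000000000000.metadata.json",
        location.as_ref(),
        version
    )
}
```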
From 3cbb2229d8852e54f542643992b6aa206e227bb6 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 11:05:39 +0100
Subject: [PATCH 56/64] add const `OWNER`

---
 crates/catalog/hms/src/utils.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 0782d97497..4bb3bfff5d 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -30,6 +30,8 @@ const HMS_DB_OWNER: &str = "hive.metastore.database.owner";
 const HMS_DEFAULT_DB_OWNER: &str = "user.name";
 /// hive.metastore.database.owner-type setting
 const HMS_DB_OWNER_TYPE: &str = "hive.metastore.database.owner-type";
+/// hive metatore `owner` property
+const OWNER: &str = "owner";
 /// hive metatore `description` property
 const COMMENT: &str = "comment";
 /// hive metatore `location` property
@@ -185,7 +187,7 @@ pub(crate) fn convert_to_hive_table(
     let current_time_ms = get_current_time()?;
     let owner = properties
-        .get("owner")
+        .get(OWNER)
         .map_or(HMS_DEFAULT_DB_OWNER.to_string(), |v| v.into());
 
     Ok(hive_metastore::Table {

From af91d19dacbb6ef567b474ab7b109a5c8625bec3 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 11:45:28 +0100
Subject: [PATCH 57/64] fix: default warehouse location

---
 crates/catalog/hms/src/catalog.rs            |  4 +-
 crates/catalog/hms/src/utils.rs              | 50 +++++++++++---------
 crates/catalog/hms/tests/hms_catalog_test.rs |  1 +
 3 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs
index 1499fd9bc4..9512194d0f 100644
--- a/crates/catalog/hms/src/catalog.rs
+++ b/crates/catalog/hms/src/catalog.rs
@@ -56,6 +56,8 @@ pub enum HmsThriftTransport {
 pub struct HmsCatalogConfig {
     address: String,
     thrift_transport: HmsThriftTransport,
+    #[builder(default, setter(strip_option))]
+    warehouse: Option<String>,
     #[builder(default)]
     props: HashMap<String, String>,
 }
@@ -331,7 +333,7 @@ impl Catalog for HmsCatalog {
             Some(location) => location.clone(),
             None => {
                 let ns = self.get_namespace(namespace).await?;
-                get_default_table_location(&ns, &table_name)?
+                get_default_table_location(&ns, &table_name, self.config.warehouse.clone())?
             }
         };
 
diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 4bb3bfff5d..0653de79f1 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -44,8 +44,6 @@ const EXTERNAL: &str = "EXTERNAL";
 const EXTERNAL_TABLE: &str = "EXTERNAL_TABLE";
 /// hive metatore `table_type` property
 const TABLE_TYPE: &str = "table_type";
-/// hive metatore `warehouse` location property
-const WAREHOUSE_LOCATION: &str = "warehouse";
 /// hive metatore `SerDeInfo` serialization_lib parameter
 const SERIALIZATION_LIB: &str = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe";
 /// hive metatore input format
@@ -233,18 +231,24 @@ pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result<String> {
 pub(crate) fn get_default_table_location(
     namespace: &Namespace,
     table_name: impl AsRef<str>,
+    warehouse: Option<String>,
 ) -> Result<String> {
     let properties = namespace.properties();
-    properties
-        .get(LOCATION)
-        .or_else(|| properties.get(WAREHOUSE_LOCATION))
-        .map(|location| format!("{}/{}", location, table_name.as_ref()))
-        .ok_or_else(|| {
-            Error::new(
-                ErrorKind::DataInvalid,
-                "No default path is set, please specify a location when creating a table",
-            )
-        })
+
+    let location = match properties.get(LOCATION) {
+        Some(location) => location.to_string(),
+        None => match warehouse {
+            Some(location) => location,
+            None => {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "No default path is set, please specify a location when creating a table",
+                ))
+            }
+        },
+    };
+
+    Ok(format!("{}/{}", location, table_name.as_ref()))
 }
 
 /// Create metadata location from `location` and `version`
@@ -443,7 +447,11 @@ mod tests {
         let table_name = "my_table";
         let expected = "db_location/my_table";
 
-        let result = get_default_table_location(&namespace, table_name)?;
+        let result = get_default_table_location(
+            &namespace,
+            table_name,
+            Some("warehouse_location".to_string()),
+        )?;
 
         assert_eq!(expected, result);
 
@@ -448,17 +456,15 @@ mod tests {
 
     #[test]
     fn test_get_default_table_location_warehouse() -> Result<()> {
-        let properties = HashMap::from([(
-            WAREHOUSE_LOCATION.to_string(),
-            "warehouse_location".to_string(),
-        )]);
-
-        let namespace =
-            Namespace::with_properties(NamespaceIdent::new("default".into()), properties);
+        let namespace = Namespace::new(NamespaceIdent::new("default".into()));
         let table_name = "my_table";
         let expected = "warehouse_location/my_table";
 
-        let result = get_default_table_location(&namespace, table_name)?;
+        let result = get_default_table_location(
+            &namespace,
+            table_name,
+            Some("warehouse_location".to_string()),
+        )?;
 
         assert_eq!(expected, result);
 
@@ -470,7 +476,7 @@ mod tests {
         let namespace = Namespace::new(NamespaceIdent::new("default".into()));
         let table_name = "my_table";
 
-        let result = get_default_table_location(&namespace, table_name);
+        let result = get_default_table_location(&namespace, table_name, None);
 
         assert!(result.is_err());
     }
diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs
index 9f627dd7f2..54bc4fb3b7 100644
--- a/crates/catalog/hms/tests/hms_catalog_test.rs
+++ b/crates/catalog/hms/tests/hms_catalog_test.rs
@@ -81,6 +81,7 @@ async fn set_test_fixture(func: &str) -> TestFixture {
     let config = HmsCatalogConfig::builder()
         .address(format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT))
         .thrift_transport(HmsThriftTransport::Buffered)
+        .warehouse("s3a://warehouse/hive".to_string())
         .props(props)
         .build();

From e8a7bdfb3e1662d01091cfac50a0e6cbdff87c9e Mon Sep 17 00:00:00 2001
From: 
marvinlanhenke Date: Wed, 20 Mar 2024 11:51:17 +0100 Subject: [PATCH 58/64] add test-case `list_tables` --- crates/catalog/hms/tests/hms_catalog_test.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index 54bc4fb3b7..3ebc80d622 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -230,13 +230,23 @@ async fn test_create_table() -> Result<()> { #[tokio::test] async fn test_list_tables() -> Result<()> { let fixture = set_test_fixture("test_list_tables").await; - let ns = Namespace::new(NamespaceIdent::new("default".into())); - let result = fixture.hms_catalog.list_tables(ns.name()).await?; assert_eq!(result, vec![]); + let creation = set_table_creation("s3a://warehouse/hive", "my_table")?; + fixture + .hms_catalog + .create_table(ns.name(), creation) + .await?; + let result = fixture.hms_catalog.list_tables(ns.name()).await?; + + assert_eq!( + result, + vec![TableIdent::new(ns.name().clone(), "my_table".to_string())] + ); + Ok(()) } From a2fe37a3413a28d3a453487b3fa627a8965be21c Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Wed, 20 Mar 2024 12:03:41 +0100 Subject: [PATCH 59/64] add all primitives to test_schema --- crates/catalog/hms/src/schema.rs | 89 ++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index fb82c18a10..ab16ce7f70 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -274,23 +274,104 @@ mod tests { let schema = Schema::builder() .with_schema_id(1) .with_fields(vec![ - NestedField::required(1, "foo", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "bar", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(1, "c1", Type::Primitive(PrimitiveType::Boolean)).into(), + NestedField::required(2, "c2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(3, "c3", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(4, "c4", Type::Primitive(PrimitiveType::Float)).into(), + NestedField::required(5, "c5", Type::Primitive(PrimitiveType::Double)).into(), + NestedField::required( + 6, + "c6", + Type::Primitive(PrimitiveType::Decimal { + precision: 2, + scale: 2, + }), + ) + .into(), + NestedField::required(7, "c7", Type::Primitive(PrimitiveType::Date)).into(), + NestedField::required(8, "c8", Type::Primitive(PrimitiveType::Time)).into(), + NestedField::required(9, "c9", Type::Primitive(PrimitiveType::Timestamp)).into(), + NestedField::required(10, "c10", Type::Primitive(PrimitiveType::Timestamptz)) + .into(), + NestedField::required(11, "c11", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(12, "c12", Type::Primitive(PrimitiveType::Uuid)).into(), + NestedField::required(13, "c13", Type::Primitive(PrimitiveType::Fixed(4))).into(), + NestedField::required(14, "c14", Type::Primitive(PrimitiveType::Binary)).into(), ]) .build()?; let result = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let expected = vec![ FieldSchema { - name: Some("foo".into()), + name: Some("c1".into()), + r#type: Some("boolean".into()), + comment: None, + }, + FieldSchema { + name: Some("c2".into()), r#type: Some("int".into()), comment: None, }, FieldSchema { - name: Some("bar".into()), + name: Some("c3".into()), + r#type: Some("bigint".into()), + comment: None, + }, + FieldSchema { + 
name: Some("c4".into()), + r#type: Some("float".into()), + comment: None, + }, + FieldSchema { + name: Some("c5".into()), + r#type: Some("double".into()), + comment: None, + }, + FieldSchema { + name: Some("c6".into()), + r#type: Some("decimal(2,2)".into()), + comment: None, + }, + FieldSchema { + name: Some("c7".into()), + r#type: Some("date".into()), + comment: None, + }, + FieldSchema { + name: Some("c8".into()), r#type: Some("string".into()), comment: None, }, + FieldSchema { + name: Some("c9".into()), + r#type: Some("timestamp".into()), + comment: None, + }, + FieldSchema { + name: Some("c10".into()), + r#type: Some("timestamp".into()), + comment: None, + }, + FieldSchema { + name: Some("c11".into()), + r#type: Some("string".into()), + comment: None, + }, + FieldSchema { + name: Some("c12".into()), + r#type: Some("string".into()), + comment: None, + }, + FieldSchema { + name: Some("c13".into()), + r#type: Some("binary".into()), + comment: None, + }, + FieldSchema { + name: Some("c14".into()), + r#type: Some("binary".into()), + comment: None, + }, ]; assert_eq!(result, expected); From 2bb5e38b36acebfc639ebc072e3eaa8634cf4770 Mon Sep 17 00:00:00 2001 From: marvinlanhenke Date: Wed, 20 Mar 2024 12:09:04 +0100 Subject: [PATCH 60/64] exclude `Timestamptz` from hive conversion --- crates/catalog/hms/src/schema.rs | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index ab16ce7f70..d0e8614f4f 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -17,7 +17,7 @@ use hive_metastore::FieldSchema; use iceberg::spec::{visit_schema, NestedFieldRef, PrimitiveType, Schema, SchemaVisitor}; -use iceberg::Result; +use iceberg::{Error, ErrorKind, Result}; type HiveSchema = Vec; @@ -124,7 +124,7 @@ impl SchemaVisitor for HiveSchemaBuilder { PrimitiveType::Float => "float".to_string(), PrimitiveType::Double => "double".to_string(), PrimitiveType::Date => "date".to_string(), - PrimitiveType::Timestamp | PrimitiveType::Timestamptz => "timestamp".to_string(), + PrimitiveType::Timestamp => "timestamp".to_string(), PrimitiveType::Time | PrimitiveType::String | PrimitiveType::Uuid => { "string".to_string() } @@ -132,6 +132,12 @@ impl SchemaVisitor for HiveSchemaBuilder { PrimitiveType::Decimal { precision, scale } => { format!("decimal({},{})", precision, scale) } + _ => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Conversion from 'Timestamptz' is not supported", + )) + } }; Ok(hive_type) @@ -291,12 +297,10 @@ mod tests { NestedField::required(7, "c7", Type::Primitive(PrimitiveType::Date)).into(), NestedField::required(8, "c8", Type::Primitive(PrimitiveType::Time)).into(), NestedField::required(9, "c9", Type::Primitive(PrimitiveType::Timestamp)).into(), - NestedField::required(10, "c10", Type::Primitive(PrimitiveType::Timestamptz)) - .into(), - NestedField::required(11, "c11", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(12, "c12", Type::Primitive(PrimitiveType::Uuid)).into(), - NestedField::required(13, "c13", Type::Primitive(PrimitiveType::Fixed(4))).into(), - NestedField::required(14, "c14", Type::Primitive(PrimitiveType::Binary)).into(), + NestedField::required(10, "c10", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(11, "c11", Type::Primitive(PrimitiveType::Uuid)).into(), + NestedField::required(12, "c12", Type::Primitive(PrimitiveType::Fixed(4))).into(), + NestedField::required(13, "c13", 
Type::Primitive(PrimitiveType::Binary)).into(),
         ])
         .build()?;
         let result = HiveSchemaBuilder::from_iceberg(&schema)?.build();
@@ -349,7 +353,7 @@ mod tests {
         },
         FieldSchema {
             name: Some("c10".into()),
-            r#type: Some("timestamp".into()),
+            r#type: Some("string".into()),
             comment: None,
         },
         FieldSchema {
             name: Some("c11".into()),
             r#type: Some("string".into()),
             comment: None,
         },
         FieldSchema {
             name: Some("c12".into()),
-            r#type: Some("string".into()),
-            comment: None,
-        },
-        FieldSchema {
-            name: Some("c13".into()),
             r#type: Some("binary".into()),
             comment: None,
         },
         FieldSchema {
-            name: Some("c14".into()),
+            name: Some("c13".into()),
             r#type: Some("binary".into()),
             comment: None,
         },

From 8c519eb6068bbc65146b2a34d38650c281c3ce21 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 12:13:51 +0100
Subject: [PATCH 61/64] remove Self::T from schema

---
 crates/catalog/hms/src/schema.rs | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs
index d0e8614f4f..1a822f06c8 100644
--- a/crates/catalog/hms/src/schema.rs
+++ b/crates/catalog/hms/src/schema.rs
@@ -52,8 +52,8 @@ impl SchemaVisitor for HiveSchemaBuilder {
     fn schema(
         &mut self,
         _schema: &iceberg::spec::Schema,
-        value: Self::T,
-    ) -> iceberg::Result<Self::T> {
+        value: String,
+    ) -> iceberg::Result<String> {
         Ok(value)
     }
 
@@ -68,8 +68,8 @@ impl SchemaVisitor for HiveSchemaBuilder {
     fn r#struct(
         &mut self,
         r#_struct: &iceberg::spec::StructType,
-        results: Vec<Self::T>,
-    ) -> iceberg::Result<Self::T> {
+        results: Vec<String>,
+    ) -> iceberg::Result<String> {
         Ok(format!("struct<{}>", results.join(", ")))
     }
 
@@ -84,8 +84,8 @@ impl SchemaVisitor for HiveSchemaBuilder {
     fn field(
         &mut self,
         field: &iceberg::spec::NestedFieldRef,
-        value: Self::T,
-    ) -> iceberg::Result<Self::T> {
+        value: String,
+    ) -> iceberg::Result<String> {
         if self.is_inside_struct() {
             return Ok(format!("{}:{}", field.name, value));
         }
@@ -99,24 +99,20 @@ impl SchemaVisitor for HiveSchemaBuilder {
         Ok(value)
     }
 
-    fn list(
-        &mut self,
-        _list: &iceberg::spec::ListType,
-        value: Self::T,
-    ) -> iceberg::Result<Self::T> {
+    fn list(&mut self, _list: &iceberg::spec::ListType, value: String) -> iceberg::Result<String> {
         Ok(format!("array<{}>", value))
     }
 
     fn map(
         &mut self,
         _map: &iceberg::spec::MapType,
-        key_value: Self::T,
-        value: Self::T,
-    ) -> iceberg::Result<Self::T> {
+        key_value: String,
+        value: String,
+    ) -> iceberg::Result<String> {
         Ok(format!("map<{},{}>", key_value, value))
     }
 
-    fn primitive(&mut self, p: &iceberg::spec::PrimitiveType) -> iceberg::Result<Self::T> {
+    fn primitive(&mut self, p: &iceberg::spec::PrimitiveType) -> iceberg::Result<String> {
         let hive_type = match p {
             PrimitiveType::Boolean => "boolean".to_string(),
             PrimitiveType::Int => "int".to_string(),
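Context for the substitution above: it is only possible because the impl pins the visitor's associated type (presumably `type T = String;` in `HiveSchemaBuilder`; that declaration is not shown in these hunks). To illustrate the trait shape being relied on, here is a toy visitor with a different `T`, its callback signatures mirrored from the diff — hedged, since the exact trait surface is inferred from this series:

```rust
use iceberg::spec::{
    ListType, MapType, NestedFieldRef, PrimitiveType, Schema, SchemaVisitor, StructType,
};
use iceberg::Result;

// Counts primitive leaves; `visit_schema(&schema, &mut PrimitiveCounter)?`
// would thread the `usize` results through these callbacks.
struct PrimitiveCounter;

impl SchemaVisitor for PrimitiveCounter {
    type T = usize;

    fn schema(&mut self, _schema: &Schema, value: usize) -> Result<usize> {
        Ok(value)
    }
    fn field(&mut self, _field: &NestedFieldRef, value: usize) -> Result<usize> {
        Ok(value)
    }
    fn r#struct(&mut self, _struct: &StructType, results: Vec<usize>) -> Result<usize> {
        Ok(results.iter().sum())
    }
    fn list(&mut self, _list: &ListType, value: usize) -> Result<usize> {
        Ok(value)
    }
    fn map(&mut self, _map: &MapType, key_value: usize, value: usize) -> Result<usize> {
        Ok(key_value + value)
    }
    fn primitive(&mut self, _p: &PrimitiveType) -> Result<usize> {
        Ok(1)
    }
}
```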
From c7652686ffd8d23ff14e3881a44c473c8f6664c7 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 12:54:16 +0100
Subject: [PATCH 62/64] remove context

---
 crates/catalog/hms/src/schema.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs
index 1a822f06c8..2e9f9a68f2 100644
--- a/crates/catalog/hms/src/schema.rs
+++ b/crates/catalog/hms/src/schema.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use hive_metastore::FieldSchema;
-use iceberg::spec::{visit_schema, NestedFieldRef, PrimitiveType, Schema, SchemaVisitor};
+use iceberg::spec::{visit_schema, PrimitiveType, Schema, SchemaVisitor};
 use iceberg::{Error, ErrorKind, Result};
 
 type HiveSchema = Vec<FieldSchema>;
@@ -24,7 +24,7 @@ type HiveSchema = Vec<FieldSchema>;
 #[derive(Debug, Default)]
 pub(crate) struct HiveSchemaBuilder {
     schema: HiveSchema,
-    context: Vec<NestedFieldRef>,
+    depth: usize,
 }
 
 impl HiveSchemaBuilder {
@@ -42,7 +42,7 @@ impl HiveSchemaBuilder {
 
     /// Check if is in `StructType` while traversing schema
     fn is_inside_struct(&self) -> bool {
-        !self.context.is_empty()
+        self.depth > 0
     }
 }
 
@@ -59,9 +59,9 @@ impl SchemaVisitor for HiveSchemaBuilder {
 
     fn before_struct_field(
         &mut self,
-        field: &iceberg::spec::NestedFieldRef,
+        _field: &iceberg::spec::NestedFieldRef,
     ) -> iceberg::Result<()> {
-        self.context.push(field.clone());
+        self.depth += 1;
         Ok(())
     }
 
@@ -77,7 +77,7 @@ impl SchemaVisitor for HiveSchemaBuilder {
         &mut self,
         _field: &iceberg::spec::NestedFieldRef,
     ) -> iceberg::Result<()> {
-        self.context.pop();
+        self.depth -= 1;
         Ok(())
     }
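A note on the design choice above: the visitor only ever asks *whether* it is inside a struct, never *which* struct, so pushing and cloning the full `NestedFieldRef` context was wasted work; a plain counter carries the same information. A self-contained illustration of the invariant the before/after hooks maintain:

```rust
// `before_struct_field`/`after_struct_field` bracket each nested field, so
// `depth > 0` holds exactly while the traversal is inside a struct.
#[derive(Default)]
struct StructNesting {
    depth: usize,
}

impl StructNesting {
    fn enter(&mut self) {
        self.depth += 1;
    }
    fn leave(&mut self) {
        self.depth -= 1;
    }
    fn is_inside_struct(&self) -> bool {
        self.depth > 0
    }
}

fn main() {
    let mut n = StructNesting::default();
    assert!(!n.is_inside_struct());
    n.enter(); // entering a struct field: render children as "name:type"
    assert!(n.is_inside_struct());
    n.leave(); // back at the top level: children become FieldSchema entries
    assert!(!n.is_inside_struct());
}
```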
From 98a2f278b9de3899bdad853f2509206227755219 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Wed, 20 Mar 2024 15:18:25 +0100
Subject: [PATCH 63/64] keep file_io in HmsCatalog

---
 crates/catalog/hms/src/catalog.rs            | 33 +++++++++------
 crates/catalog/hms/src/utils.rs              | 40 ++++----------------
 crates/catalog/hms/tests/hms_catalog_test.rs | 14 ++-----
 3 files changed, 30 insertions(+), 57 deletions(-)

diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs
index 9512194d0f..4ca66bd21a 100644
--- a/crates/catalog/hms/src/catalog.rs
+++ b/crates/catalog/hms/src/catalog.rs
@@ -56,8 +56,7 @@ pub enum HmsThriftTransport {
 pub struct HmsCatalogConfig {
     address: String,
     thrift_transport: HmsThriftTransport,
-    #[builder(default, setter(strip_option))]
-    warehouse: Option<String>,
+    warehouse: String,
     #[builder(default)]
     props: HashMap<String, String>,
 }
@@ -68,6 +67,7 @@ struct HmsClient(ThriftHiveMetastoreClient);
 pub struct HmsCatalog {
     config: HmsCatalogConfig,
     client: HmsClient,
+    file_io: FileIO,
 }
 
 impl Debug for HmsCatalog {
@@ -105,11 +105,20 @@ impl HmsCatalog {
             .build(),
         };
 
+        let file_io = FileIO::from_path(&config.warehouse)?
+            .with_props(&config.props)
+            .build()?;
+
         Ok(Self {
             config,
             client: HmsClient(client),
+            file_io,
         })
     }
+    /// Get the catalogs `FileIO`
+    pub fn file_io(&self) -> FileIO {
+        self.file_io.clone()
+    }
 }
 
 #[async_trait]
@@ -333,17 +342,18 @@ impl Catalog for HmsCatalog {
             Some(location) => location.clone(),
             None => {
                 let ns = self.get_namespace(namespace).await?;
-                get_default_table_location(&ns, &table_name, self.config.warehouse.clone())?
+                get_default_table_location(&ns, &table_name, &self.config.warehouse)
             }
         };
 
         let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?;
         let metadata_location = create_metadata_location(&location, 0)?;
 
-        let file_io = FileIO::from_path(&metadata_location)?
-            .with_props(&self.config.props)
-            .build()?;
-        let mut file = file_io.new_output(&metadata_location)?.writer().await?;
+        let mut file = self
+            .file_io
+            .new_output(&metadata_location)?
+            .writer()
+            .await?;
         file.write_all(&serde_json::to_vec(&metadata)?).await?;
         file.shutdown().await?;
 
@@ -363,7 +373,7 @@ impl Catalog for HmsCatalog {
             .map_err(from_thrift_error)?;
 
         let table = Table::builder()
-            .file_io(file_io)
+            .file_io(self.file_io())
             .metadata_location(metadata_location)
             .metadata(metadata)
             .identifier(TableIdent::new(NamespaceIdent::new(db_name), table_name))
@@ -396,16 +406,13 @@ impl Catalog for HmsCatalog {
 
         let metadata_location = get_metadata_location(&hive_table.parameters)?;
 
-        let file_io = FileIO::from_path(&metadata_location)?
-            .with_props(&self.config.props)
-            .build()?;
-        let mut reader = file_io.new_input(&metadata_location)?.reader().await?;
+        let mut reader = self.file_io.new_input(&metadata_location)?.reader().await?;
         let mut metadata_str = String::new();
         reader.read_to_string(&mut metadata_str).await?;
         let metadata = serde_json::from_str::<TableMetadata>(&metadata_str)?;
 
         let table = Table::builder()
-            .file_io(file_io)
+            .file_io(self.file_io())
             .metadata_location(metadata_location)
             .metadata(metadata)
             .identifier(TableIdent::new(
diff --git a/crates/catalog/hms/src/utils.rs b/crates/catalog/hms/src/utils.rs
index 0653de79f1..04ee5d4b31 100644
--- a/crates/catalog/hms/src/utils.rs
+++ b/crates/catalog/hms/src/utils.rs
@@ -231,24 +231,16 @@ pub(crate) fn validate_namespace(namespace: &NamespaceIdent) -> Result<String> {
 pub(crate) fn get_default_table_location(
     namespace: &Namespace,
     table_name: impl AsRef<str>,
-    warehouse: Option<String>,
-) -> Result<String> {
+    warehouse: impl AsRef<str>,
+) -> String {
     let properties = namespace.properties();
 
     let location = match properties.get(LOCATION) {
-        Some(location) => location.to_string(),
-        None => match warehouse {
-            Some(location) => location,
-            None => {
-                return Err(Error::new(
-                    ErrorKind::DataInvalid,
-                    "No default path is set, please specify a location when creating a table",
-                ))
-            }
-        },
+        Some(location) => location,
+        None => warehouse.as_ref(),
     };
 
-    Ok(format!("{}/{}", location, table_name.as_ref()))
+    format!("{}/{}", location, table_name.as_ref())
 }
 
 /// Create metadata location from `location` and `version`
@@ -443,11 +435,7 @@ mod tests {
         let table_name = "my_table";
         let expected = "db_location/my_table";
 
-        let result = get_default_table_location(
-            &namespace,
-            table_name,
-            Some("warehouse_location".to_string()),
-        )?;
+        let result = get_default_table_location(&namespace, table_name, "warehouse_location");
 
         assert_eq!(expected, result);
 
@@ -460,17 +448,13 @@ mod tests {
 
     #[test]
     fn test_get_default_table_location_warehouse() -> Result<()> {
         let namespace = Namespace::new(NamespaceIdent::new("default".into()));
         let table_name = "my_table";
         let expected = "warehouse_location/my_table";
 
-        let result = get_default_table_location(
-            &namespace,
-            table_name,
-            Some("warehouse_location".to_string()),
-        )?;
+        let result = get_default_table_location(&namespace, table_name, "warehouse_location");
 
         assert_eq!(expected, result);
 
         Ok(())
     }
 
-    #[test]
-    fn test_get_default_table_location_missing() {
-        let namespace = Namespace::new(NamespaceIdent::new("default".into()));
-        let table_name = "my_table";
-
-        let result = get_default_table_location(&namespace, table_name, None);
-
-        assert!(result.is_err());
-    }
-
     #[test]
     fn test_convert_to_namespace() -> Result<()> {
         let properties = HashMap::from([
diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs
index 3ebc80d622..a48d0568c6 100644
--- a/crates/catalog/hms/tests/hms_catalog_test.rs
+++ b/crates/catalog/hms/tests/hms_catalog_test.rs
@@ -19,9 +19,7 @@
 
 use std::collections::HashMap;
 
-use iceberg::io::{
-    FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY,
-};
+use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY};
 use iceberg::spec::{NestedField, PrimitiveType, Schema, Type};
 use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation, TableIdent};
 use iceberg_catalog_hms::{HmsCatalog, HmsCatalogConfig, HmsThriftTransport};
@@ -37,7 +35,6 @@ type Result<T> = std::result::Result<T, iceberg::Error>;
 struct TestFixture {
     _docker_compose: DockerCompose,
     hms_catalog: HmsCatalog,
-    file_io: FileIO,
 }
 
 async fn set_test_fixture(func: &str) -> TestFixture {
@@ -73,11 +70,6 @@ async fn set_test_fixture(func: &str) -> TestFixture {
         (S3_REGION.to_string(), "us-east-1".to_string()),
     ]);
 
-    let file_io = FileIOBuilder::new("s3a")
-        .with_props(&props)
-        .build()
-        .unwrap();
-
     let config = HmsCatalogConfig::builder()
         .address(format!("{}:{}", hms_catalog_ip, HMS_CATALOG_PORT))
         .thrift_transport(HmsThriftTransport::Buffered)
@@ -90,7 +82,6 @@ async fn set_test_fixture(func: &str) -> TestFixture {
     TestFixture {
         _docker_compose: docker_compose,
         hms_catalog,
-        file_io,
     }
 }
 
@@ -219,7 +210,8 @@ async fn test_create_table() -> Result<()> {
         .is_some_and(|location| location.starts_with("s3a://warehouse/hive/metadata/00000-")));
     assert!(
         fixture
-            .file_io
+            .hms_catalog
+            .file_io()
             .is_exist("s3a://warehouse/hive/metadata/")
             .await?
     );

From 5bcbba8267d944b92cca1c3d85632e92b8998122 Mon Sep 17 00:00:00 2001
From: marvinlanhenke
Date: Thu, 21 Mar 2024 11:03:47 +0100
Subject: [PATCH 64/64] use json schema repr

---
 crates/catalog/hms/src/schema.rs | 289 ++++++++++++++++++++-----------
 1 file changed, 185 insertions(+), 104 deletions(-)

diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs
index 2e9f9a68f2..77caaf7154 100644
--- a/crates/catalog/hms/src/schema.rs
+++ b/crates/catalog/hms/src/schema.rs
@@ -142,48 +142,42 @@
 #[cfg(test)]
 mod tests {
-    use iceberg::{
-        spec::{ListType, MapType, NestedField, Schema, StructType, Type},
-        Result,
-    };
+    use iceberg::{spec::Schema, Result};
 
     use super::*;
 
     #[test]
     fn test_schema_with_nested_maps() -> Result<()> {
-        let schema = Schema::builder()
-            .with_schema_id(1)
-            .with_fields(vec![NestedField::required(
-                1,
-                "quux",
-                Type::Map(MapType {
-                    key_field: NestedField::map_key_element(
-                        2,
-                        Type::Primitive(PrimitiveType::String),
-                    )
-                    .into(),
-                    value_field: NestedField::map_value_element(
-                        3,
-                        Type::Map(MapType {
-                            key_field: NestedField::map_key_element(
-                                4,
-                                Type::Primitive(PrimitiveType::String),
-                            )
-                            .into(),
-                            value_field: NestedField::map_value_element(
-                                5,
-                                Type::Primitive(PrimitiveType::Int),
-                                true,
-                            )
-                            .into(),
-                        }),
-                        true,
-                    )
-                    .into(),
-                }),
-            )
-            .into()])
-            .build()?;
+        let record = r#"
+            {
+                "schema-id": 1,
+                "type": "struct",
+                "fields": [
+                    {
+                        "id": 1,
+                        "name": "quux",
+                        "required": true,
+                        "type": {
+                            "type": "map",
+                            "key-id": 2,
+                            "key": "string",
+                            "value-id": 3,
+                            "value-required": true,
+                            "value": {
+                                "type": "map",
+                                "key-id": 4,
+                                "key": "string",
+                                "value-id": 5,
+                                "value-required": true,
+                                "value": "int"
+                            }
+                        }
+                    }
+                ]
+            }
+        "#;
+
+        let schema = serde_json::from_str::<Schema>(record)?;
 
         let result = HiveSchemaBuilder::from_iceberg(&schema)?.build();
 
@@ -200,35 +194,43 @@
 
     #[test]
     fn test_schema_with_struct_inside_list() -> Result<()> {
-        let schema = Schema::builder()
-            .with_schema_id(1)
-            .with_fields(vec![NestedField::required(
-                1,
-                "location",
-                Type::List(ListType {
-                    element_field: NestedField::list_element(
-                        2,
-                        Type::Struct(StructType::new(vec![
-                            NestedField::optional(
-                                3,
-                                "latitude",
-                                Type::Primitive(PrimitiveType::Float),
-                            )
-                            .into(),
-                            NestedField::optional(
-                                4,
-                                "longitude",
-                                Type::Primitive(PrimitiveType::Float),
-                            )
-                            .into(),
-                        ])),
-                        true,
-                    )
-                    .into(),
-                }),
-            )
-            .into()])
-            .build()?;
+        let record = r#"
+            {
+                "schema-id": 1,
+                "type": "struct",
+                "fields": [
+                    {
+                        "id": 1,
+                        "name": "location",
+                        "required": true,
+                        "type": {
+                            "type": "list",
+                            "element-id": 2,
+                            "element-required": true,
+                            "element": {
+                                "type": "struct",
+                                "fields": [
+                                    {
+                                        "id": 3,
+                                        "name": "latitude",
+                                        "required": false,
+                                        "type": "float"
+                                    },
+                                    {
+                                        "id": 4,
+                                        "name": "longitude",
+                                        "required": false,
+                                        "type": "float"
+                                    }
+                                ]
+                            }
+                        }
+                    }
+                ]
+            }
+        "#;
+
+        let schema = serde_json::from_str::<Schema>(record)?;
 
         let result = HiveSchemaBuilder::from_iceberg(&schema)?.build();
 
@@ -245,18 +247,36 @@
 
     #[test]
     fn test_schema_with_structs() -> Result<()> {
-        let schema = Schema::builder()
-            .with_schema_id(1)
-            .with_fields(vec![NestedField::required(
-                1,
-                "person",
-                Type::Struct(StructType::new(vec![
-                    NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
-                    NestedField::optional(3, "age", Type::Primitive(PrimitiveType::Int)).into(),
-                ])),
-            )
-            .into()])
-            .build()?;
+        let record = r#"{
+            "type": "struct",
+            "schema-id": 1,
+            "fields": [
+                {
+                    "id": 1,
+                    "name": "person",
+                    "required": true,
+                    "type": {
+                        "type": "struct",
+                        "fields": [
+                            {
+                                "id": 2,
+                                "name": "name",
+                                "required": true,
+                                "type": "string"
+                            },
+                            {
+                                "id": 3,
+                                "name": "age",
+                                "required": false,
+                                "type": "int"
+                            }
+                        ]
+                    }
+                }
+            ]
+        }"#;
+
+        let schema = serde_json::from_str::<Schema>(record)?;
 
         let result = HiveSchemaBuilder::from_iceberg(&schema)?.build();
 
@@ -273,32 +293,93 @@
 
     #[test]
     fn test_schema_with_simple_fields() -> Result<()> {
-        let schema = Schema::builder()
-            .with_schema_id(1)
-            .with_fields(vec![
-                NestedField::required(1, "c1", Type::Primitive(PrimitiveType::Boolean)).into(),
-                NestedField::required(2, "c2", Type::Primitive(PrimitiveType::Int)).into(),
-                NestedField::required(3, "c3", Type::Primitive(PrimitiveType::Long)).into(),
-                NestedField::required(4, "c4", Type::Primitive(PrimitiveType::Float)).into(),
-                NestedField::required(5, "c5", Type::Primitive(PrimitiveType::Double)).into(),
-                NestedField::required(
-                    6,
-                    "c6",
-                    Type::Primitive(PrimitiveType::Decimal {
-                        precision: 2,
-                        scale: 2,
-                    }),
-                )
-                .into(),
-                NestedField::required(7, "c7", Type::Primitive(PrimitiveType::Date)).into(),
-                NestedField::required(8, "c8", Type::Primitive(PrimitiveType::Time)).into(),
-                NestedField::required(9, "c9", Type::Primitive(PrimitiveType::Timestamp)).into(),
-                NestedField::required(10, "c10", Type::Primitive(PrimitiveType::String)).into(),
-                NestedField::required(11, "c11", Type::Primitive(PrimitiveType::Uuid)).into(),
-                NestedField::required(12, "c12", Type::Primitive(PrimitiveType::Fixed(4))).into(),
-                NestedField::required(13, "c13", Type::Primitive(PrimitiveType::Binary)).into(),
-            ])
-            .build()?;
+        let record = r#"{
+            "type": "struct",
+            "schema-id": 1,
+            "fields": [
+                {
+                    "id": 1,
+                    "name": "c1",
+                    "required": true,
+                    "type": "boolean"
+                },
+                {
+                    "id": 2,
+                    "name": "c2",
+                    "required": true,
+                    "type": "int"
+                },
+                {
+                    "id": 3,
+                    "name": "c3",
+                    "required": true,
+                    "type": "long"
+                },
+                {
+                    "id": 4,
+                    "name": "c4",
+                    "required": true,
+                    "type": "float"
+                },
+                {
+                    "id": 5,
+                    "name": "c5",
+                    "required": true,
+                    "type": "double"
+                },
+                {
+                    "id": 6,
+                    "name": "c6",
+                    "required": true,
+                    "type": "decimal(2,2)"
+                },
+                {
+                    "id": 7,
+                    "name": "c7",
+                    "required": true,
"type": "date" + }, + { + "id": 8, + "name": "c8", + "required": true, + "type": "time" + }, + { + "id": 9, + "name": "c9", + "required": true, + "type": "timestamp" + }, + { + "id": 10, + "name": "c10", + "required": true, + "type": "string" + }, + { + "id": 11, + "name": "c11", + "required": true, + "type": "uuid" + }, + { + "id": 12, + "name": "c12", + "required": true, + "type": "fixed[4]" + }, + { + "id": 13, + "name": "c13", + "required": true, + "type": "binary" + } + ] + }"#; + + let schema = serde_json::from_str::(record)?; + let result = HiveSchemaBuilder::from_iceberg(&schema)?.build(); let expected = vec![