Skip to content

Commit ec127ec

Browse files
committed
Support 'entries' metadata table
1 parent 6e07faa commit ec127ec

File tree

10 files changed

+1114
-139
lines changed

10 files changed

+1114
-139
lines changed

crates/iceberg/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ uuid = { workspace = true }
8585
zstd = { workspace = true }
8686

8787
[dev-dependencies]
88+
arrow-cast = { workspace = true, features = ["prettyprint"] }
8889
ctor = { workspace = true }
8990
expect-test = { workspace = true }
9091
iceberg-catalog-memory = { workspace = true }

crates/iceberg/src/arrow/schema.rs

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,193 @@ get_parquet_stat_as_datum!(min);
821821

822822
get_parquet_stat_as_datum!(max);
823823

824+
/// Utilities to deal with [arrow_array::builder] types in the Iceberg context.
825+
pub(crate) mod builder {
826+
use arrow_array::builder::*;
827+
use arrow_array::cast::AsArray;
828+
use arrow_array::types::*;
829+
use arrow_array::{ArrayRef, Datum as ArrowDatum};
830+
use arrow_schema::{DataType, TimeUnit};
831+
use ordered_float::OrderedFloat;
832+
833+
use crate::spec::{Literal, PrimitiveLiteral};
834+
use crate::{Error, ErrorKind};
835+
836+
/// A helper wrapping [ArrayBuilder] for building arrays without declaring the inner type at
837+
/// compile-time when types are determined dynamically (e.g. based on some column type).
838+
/// A [DataType] is given at construction time which is used to later downcast the inner array
839+
/// and provided values.
840+
pub(crate) struct AnyArrayBuilder {
841+
data_type: DataType,
842+
inner: Box<dyn ArrayBuilder>,
843+
}
844+
845+
impl AnyArrayBuilder {
846+
pub(crate) fn new(data_type: &DataType) -> Self {
847+
Self {
848+
data_type: data_type.clone(),
849+
inner: make_builder(data_type, 0),
850+
}
851+
}
852+
853+
pub(crate) fn finish(&mut self) -> ArrayRef {
854+
self.inner.finish()
855+
}
856+
857+
/// Append an [[arrow_array::Datum]] value.
858+
pub(crate) fn append_datum(&mut self, value: &dyn ArrowDatum) -> crate::Result<()> {
859+
let (array, is_scalar) = value.get();
860+
assert!(is_scalar, "Can only append scalar datum");
861+
862+
match array.data_type() {
863+
DataType::Boolean => self
864+
.builder::<BooleanBuilder>()?
865+
.append_value(array.as_boolean().value(0)),
866+
DataType::Int32 => self
867+
.builder::<Int32Builder>()?
868+
.append_value(array.as_primitive::<Int32Type>().value(0)),
869+
DataType::Int64 => self
870+
.builder::<Int64Builder>()?
871+
.append_value(array.as_primitive::<Int64Type>().value(0)),
872+
DataType::Float32 => self
873+
.builder::<Float32Builder>()?
874+
.append_value(array.as_primitive::<Float32Type>().value(0)),
875+
DataType::Float64 => self
876+
.builder::<Float64Builder>()?
877+
.append_value(array.as_primitive::<Float64Type>().value(0)),
878+
DataType::Decimal128(_, _) => self
879+
.builder::<Decimal128Builder>()?
880+
.append_value(array.as_primitive::<Decimal128Type>().value(0)),
881+
DataType::Date32 => self
882+
.builder::<Date32Builder>()?
883+
.append_value(array.as_primitive::<Date32Type>().value(0)),
884+
DataType::Time64(TimeUnit::Microsecond) => self
885+
.builder::<Time64MicrosecondBuilder>()?
886+
.append_value(array.as_primitive::<Time64MicrosecondType>().value(0)),
887+
DataType::Timestamp(TimeUnit::Microsecond, _) => self
888+
.builder::<TimestampMicrosecondBuilder>()?
889+
.append_value(array.as_primitive::<TimestampMicrosecondType>().value(0)),
890+
DataType::Timestamp(TimeUnit::Nanosecond, _) => self
891+
.builder::<TimestampNanosecondBuilder>()?
892+
.append_value(array.as_primitive::<TimestampNanosecondType>().value(0)),
893+
DataType::Utf8 => self
894+
.builder::<StringBuilder>()?
895+
.append_value(array.as_string::<i32>().value(0)),
896+
DataType::FixedSizeBinary(_) => self
897+
.builder::<BinaryBuilder>()?
898+
.append_value(array.as_fixed_size_binary().value(0)),
899+
DataType::LargeBinary => self
900+
.builder::<LargeBinaryBuilder>()?
901+
.append_value(array.as_binary::<i64>().value(0)),
902+
_ => {
903+
return Err(Error::new(
904+
ErrorKind::FeatureUnsupported,
905+
format!("Cannot append data type: {:?}", array.data_type(),),
906+
));
907+
}
908+
}
909+
Ok(())
910+
}
911+
912+
/// Append a literal with the provided [DataType]. We're not solely relying on the literal to
913+
/// infer the type because [Literal] values do not specify the expected type of builder. E.g.,
914+
/// a [PrimitiveLiteral::Long] may go into an array builder for longs but also for timestamps.
915+
pub(crate) fn append_literal(&mut self, value: &Literal) -> crate::Result<()> {
916+
let Some(primitive) = value.as_primitive_literal() else {
917+
return Err(Error::new(
918+
ErrorKind::FeatureUnsupported,
919+
"Expected primitive type",
920+
));
921+
};
922+
923+
match (&self.data_type, primitive.clone()) {
924+
(DataType::Boolean, PrimitiveLiteral::Boolean(value)) => {
925+
self.builder::<BooleanBuilder>()?.append_value(value)
926+
}
927+
(DataType::Int32, PrimitiveLiteral::Int(value)) => {
928+
self.builder::<Int32Builder>()?.append_value(value)
929+
}
930+
(DataType::Int64, PrimitiveLiteral::Long(value)) => {
931+
self.builder::<Int64Builder>()?.append_value(value)
932+
}
933+
(DataType::Float32, PrimitiveLiteral::Float(OrderedFloat(value))) => {
934+
self.builder::<Float32Builder>()?.append_value(value)
935+
}
936+
(DataType::Float64, PrimitiveLiteral::Double(OrderedFloat(value))) => {
937+
self.builder::<Float64Builder>()?.append_value(value)
938+
}
939+
(DataType::Utf8, PrimitiveLiteral::String(value)) => {
940+
self.builder::<StringBuilder>()?.append_value(value)
941+
}
942+
(DataType::FixedSizeBinary(_), PrimitiveLiteral::Binary(value)) => self
943+
.builder::<FixedSizeBinaryBuilder>()?
944+
.append_value(value)?,
945+
(DataType::LargeBinary, PrimitiveLiteral::Binary(value)) => {
946+
self.builder::<LargeBinaryBuilder>()?.append_value(value)
947+
}
948+
(_, _) => {
949+
return Err(Error::new(
950+
ErrorKind::FeatureUnsupported,
951+
format!(
952+
"Builder of type {:?} does not accept literal {:?}",
953+
self.data_type, primitive
954+
),
955+
));
956+
}
957+
}
958+
959+
Ok(())
960+
}
961+
962+
/// Append a null value for the provided [DataType].
963+
pub(crate) fn append_null(&mut self) -> crate::Result<()> {
964+
match self.data_type {
965+
DataType::Boolean => self.builder::<BooleanBuilder>()?.append_null(),
966+
DataType::Int32 => self.builder::<Int32Builder>()?.append_null(),
967+
DataType::Int64 => self.builder::<Int64Builder>()?.append_null(),
968+
DataType::Float32 => self.builder::<Float32Builder>()?.append_null(),
969+
DataType::Float64 => self.builder::<Float64Builder>()?.append_null(),
970+
DataType::Decimal128(_, _) => self.builder::<Decimal128Builder>()?.append_null(),
971+
DataType::Date32 => self.builder::<Date32Builder>()?.append_null(),
972+
DataType::Time64(TimeUnit::Microsecond) => {
973+
self.builder::<Time64MicrosecondBuilder>()?.append_null()
974+
}
975+
DataType::Timestamp(TimeUnit::Microsecond, _) => {
976+
self.builder::<TimestampMicrosecondBuilder>()?.append_null()
977+
}
978+
DataType::Timestamp(TimeUnit::Nanosecond, _) => {
979+
self.builder::<TimestampNanosecondBuilder>()?.append_null()
980+
}
981+
DataType::Utf8 => self.builder::<StringBuilder>()?.append_null(),
982+
DataType::FixedSizeBinary(_) => {
983+
self.builder::<FixedSizeBinaryBuilder>()?.append_null()
984+
}
985+
DataType::LargeBinary => self.builder::<LargeBinaryBuilder>()?.append_null(),
986+
_ => {
987+
return Err(Error::new(
988+
ErrorKind::FeatureUnsupported,
989+
format!(
990+
"Cannot append null values for data type: {:?}",
991+
self.data_type
992+
),
993+
))
994+
}
995+
}
996+
Ok(())
997+
}
998+
999+
/// Cast the `inner` builder to a specific type or return [Error].
1000+
fn builder<T: ArrayBuilder>(&mut self) -> crate::Result<&mut T> {
1001+
self.inner.as_any_mut().downcast_mut::<T>().ok_or_else(|| {
1002+
Error::new(
1003+
ErrorKind::Unexpected,
1004+
"Failed to cast builder to expected type",
1005+
)
1006+
})
1007+
}
1008+
}
1009+
}
1010+
8241011
impl TryFrom<&ArrowSchema> for crate::spec::Schema {
8251012
type Error = Error;
8261013

0 commit comments

Comments
 (0)