Skip to content

Commit cb71fed

Browse files
committed
Add LargeString, LargeBinary, LargeList and FixedSizeList
1 parent b801f44 commit cb71fed

File tree

1 file changed

+115
-27
lines changed

1 file changed

+115
-27
lines changed

crates/iceberg/src/arrow.rs

Lines changed: 115 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -189,18 +189,17 @@ fn visit_type<V: ArrowSchemaVisitor>(r#type: &DataType, visitor: &mut V) -> Resu
189189
p,
190190
DataType::Boolean
191191
| DataType::Utf8
192+
| DataType::LargeUtf8
192193
| DataType::Binary
194+
| DataType::LargeBinary
193195
| DataType::FixedSizeBinary(_)
194196
) =>
195197
{
196198
visitor.primitive(p)
197199
}
198-
DataType::List(element_field) => {
199-
visitor.before_list_element(element_field)?;
200-
let value = visit_type(element_field.data_type(), visitor)?;
201-
visitor.after_list_element(element_field)?;
202-
visitor.list(r#type, value)
203-
}
200+
DataType::List(element_field) => visit_list(r#type, element_field, visitor),
201+
DataType::LargeList(element_field) => visit_list(r#type, element_field, visitor),
202+
DataType::FixedSizeList(element_field, _) => visit_list(r#type, element_field, visitor),
204203
DataType::Map(field, _) => match field.data_type() {
205204
DataType::Struct(fields) => {
206205
if fields.len() != 2 {
@@ -242,6 +241,19 @@ fn visit_type<V: ArrowSchemaVisitor>(r#type: &DataType, visitor: &mut V) -> Resu
242241
}
243242
}
244243

244+
/// Visit list types in post order.
245+
#[allow(dead_code)]
246+
fn visit_list<V: ArrowSchemaVisitor>(
247+
data_type: &DataType,
248+
element_field: &Field,
249+
visitor: &mut V,
250+
) -> Result<V::T> {
251+
visitor.before_list_element(element_field)?;
252+
let value = visit_type(element_field.data_type(), visitor)?;
253+
visitor.after_list_element(element_field)?;
254+
visitor.list(data_type, value)
255+
}
256+
245257
/// Visit struct type in post order.
246258
#[allow(dead_code)]
247259
fn visit_struct<V: ArrowSchemaVisitor>(fields: &Fields, visitor: &mut V) -> Result<V::T> {
@@ -347,26 +359,30 @@ impl ArrowSchemaVisitor for ArrowSchemaConverter {
347359
}
348360

349361
fn list(&mut self, list: &DataType, value: Self::T) -> Result<Self::T> {
350-
match list {
351-
DataType::List(element_field) => {
352-
let id = get_field_id(element_field)?;
353-
let doc = get_field_doc(element_field);
354-
let element_field = Arc::new(NestedField {
355-
id,
356-
doc,
357-
name: "element".to_string(),
358-
required: !element_field.is_nullable(),
359-
field_type: Box::new(value.clone()),
360-
initial_default: None,
361-
write_default: None,
362-
});
363-
Ok(Type::List(ListType { element_field }))
362+
let element_field = match list {
363+
DataType::List(element_field) => element_field,
364+
DataType::LargeList(element_field) => element_field,
365+
DataType::FixedSizeList(element_field, _) => element_field,
366+
_ => {
367+
return Err(Error::new(
368+
ErrorKind::DataInvalid,
369+
"List type must have list data type",
370+
))
364371
}
365-
_ => Err(Error::new(
366-
ErrorKind::DataInvalid,
367-
"List type must have list data type",
368-
)),
369-
}
372+
};
373+
374+
let id = get_field_id(element_field)?;
375+
let doc = get_field_doc(element_field);
376+
let element_field = Arc::new(NestedField {
377+
id,
378+
doc,
379+
name: "element".to_string(),
380+
required: !element_field.is_nullable(),
381+
field_type: Box::new(value.clone()),
382+
initial_default: None,
383+
write_default: None,
384+
});
385+
Ok(Type::List(ListType { element_field }))
370386
}
371387

372388
fn map(&mut self, map: &DataType, key_value: Self::T, value: Self::T) -> Result<Self::T> {
@@ -444,11 +460,11 @@ impl ArrowSchemaVisitor for ArrowSchemaConverter {
444460
{
445461
Ok(Type::Primitive(PrimitiveType::Timestamptz))
446462
}
447-
DataType::Binary => Ok(Type::Primitive(PrimitiveType::Binary)),
463+
DataType::Binary | DataType::LargeBinary => Ok(Type::Primitive(PrimitiveType::Binary)),
448464
DataType::FixedSizeBinary(width) => {
449465
Ok(Type::Primitive(PrimitiveType::Fixed(*width as u64)))
450466
}
451-
DataType::Utf8 => Ok(Type::Primitive(PrimitiveType::String)),
467+
DataType::Utf8 | DataType::LargeUtf8 => Ok(Type::Primitive(PrimitiveType::String)),
452468
_ => Err(Error::new(
453469
ErrorKind::DataInvalid,
454470
format!("Unsupported Arrow data type: {p}"),
@@ -526,6 +542,10 @@ mod tests {
526542
ARROW_FIELD_ID_KEY.to_string(),
527543
"3".to_string(),
528544
)])),
545+
Field::new("n", DataType::LargeUtf8, false).with_metadata(HashMap::from([(
546+
ARROW_FIELD_ID_KEY.to_string(),
547+
"21".to_string(),
548+
)])),
529549
Field::new("d", DataType::Timestamp(TimeUnit::Microsecond, None), true).with_metadata(
530550
HashMap::from([(ARROW_FIELD_ID_KEY.to_string(), "4".to_string())]),
531551
),
@@ -570,6 +590,10 @@ mod tests {
570590
ARROW_FIELD_ID_KEY.to_string(),
571591
"13".to_string(),
572592
)])),
593+
Field::new("o", DataType::LargeBinary, false).with_metadata(HashMap::from([(
594+
ARROW_FIELD_ID_KEY.to_string(),
595+
"22".to_string(),
596+
)])),
573597
Field::new("m", DataType::FixedSizeBinary(10), false).with_metadata(HashMap::from([(
574598
ARROW_FIELD_ID_KEY.to_string(),
575599
"11".to_string(),
@@ -588,6 +612,36 @@ mod tests {
588612
ARROW_FIELD_ID_KEY.to_string(),
589613
"14".to_string(),
590614
)])),
615+
Field::new(
616+
"large_list",
617+
DataType::LargeList(Arc::new(
618+
Field::new("element", DataType::Utf8, false).with_metadata(HashMap::from([(
619+
ARROW_FIELD_ID_KEY.to_string(),
620+
"23".to_string(),
621+
)])),
622+
)),
623+
true,
624+
)
625+
.with_metadata(HashMap::from([(
626+
ARROW_FIELD_ID_KEY.to_string(),
627+
"24".to_string(),
628+
)])),
629+
Field::new(
630+
"fixed_list",
631+
DataType::FixedSizeList(
632+
Arc::new(
633+
Field::new("element", DataType::Binary, false).with_metadata(
634+
HashMap::from([(ARROW_FIELD_ID_KEY.to_string(), "26".to_string())]),
635+
),
636+
),
637+
10,
638+
),
639+
true,
640+
)
641+
.with_metadata(HashMap::from([(
642+
ARROW_FIELD_ID_KEY.to_string(),
643+
"25".to_string(),
644+
)])),
591645
Field::new("map", map, false).with_metadata(HashMap::from([(
592646
ARROW_FIELD_ID_KEY.to_string(),
593647
"16".to_string(),
@@ -622,6 +676,12 @@ mod tests {
622676
"required":true,
623677
"type":"string"
624678
},
679+
{
680+
"id":21,
681+
"name":"n",
682+
"required":true,
683+
"type":"string"
684+
},
625685
{
626686
"id":4,
627687
"name":"d",
@@ -676,6 +736,12 @@ mod tests {
676736
"required":true,
677737
"type":"binary"
678738
},
739+
{
740+
"id":22,
741+
"name":"o",
742+
"required":true,
743+
"type":"binary"
744+
},
679745
{
680746
"id":11,
681747
"name":"m",
@@ -693,6 +759,28 @@ mod tests {
693759
"element": "int"
694760
}
695761
},
762+
{
763+
"id":24,
764+
"name":"large_list",
765+
"required": false,
766+
"type": {
767+
"type": "list",
768+
"element-id": 23,
769+
"element-required": true,
770+
"element": "string"
771+
}
772+
},
773+
{
774+
"id":25,
775+
"name":"fixed_list",
776+
"required": false,
777+
"type": {
778+
"type": "list",
779+
"element-id": 26,
780+
"element-required": true,
781+
"element": "binary"
782+
}
783+
},
696784
{
697785
"id":16,
698786
"name":"map",

0 commit comments

Comments
 (0)