@@ -18,6 +18,7 @@
 //! Manifest for Iceberg.
 use std::cmp::min;
 use std::collections::HashMap;
+use std::io::{Read, Write};
 use std::str::FromStr;
 use std::sync::Arc;
 
@@ -61,7 +62,7 @@ impl Manifest {
 
         let entries = match metadata.format_version {
             FormatVersion::V1 => {
-                let schema = manifest_schema_v1(partition_type.clone())?;
+                let schema = manifest_schema_v1(&partition_type)?;
                 let reader = AvroReader::with_schema(&schema, bs)?;
                 reader
                     .into_iter()
@@ -72,7 +73,7 @@ impl Manifest {
                     .collect::<Result<Vec<_>>>()?
             }
             FormatVersion::V2 => {
-                let schema = manifest_schema_v2(partition_type.clone())?;
+                let schema = manifest_schema_v2(&partition_type)?;
                 let reader = AvroReader::with_schema(&schema, bs)?;
                 reader
                     .into_iter()
@@ -241,8 +242,8 @@ impl ManifestWriter {
             .partition_type(&manifest.metadata.schema)?;
         let table_schema = &manifest.metadata.schema;
         let avro_schema = match manifest.metadata.format_version {
-            FormatVersion::V1 => manifest_schema_v1(partition_type.clone())?,
-            FormatVersion::V2 => manifest_schema_v2(partition_type.clone())?,
+            FormatVersion::V1 => manifest_schema_v1(&partition_type)?,
+            FormatVersion::V2 => manifest_schema_v2(&partition_type)?,
         };
         let mut avro_writer = AvroWriter::new(&avro_schema, Vec::new());
         avro_writer.add_user_metadata(
@@ -656,7 +657,39 @@ mod _const_schema {
         })
     };
 
-    pub(super) fn manifest_schema_v2(partition_type: StructType) -> Result<AvroSchema, Error> {
+    fn data_file_fields_v2(partition_type: &StructType) -> Vec<NestedFieldRef> {
+        vec![
+            CONTENT.clone(),
+            FILE_PATH.clone(),
+            FILE_FORMAT.clone(),
+            Arc::new(NestedField::required(
+                102,
+                "partition",
+                Type::Struct(partition_type.clone()),
+            )),
+            RECORD_COUNT.clone(),
+            FILE_SIZE_IN_BYTES.clone(),
+            COLUMN_SIZES.clone(),
+            VALUE_COUNTS.clone(),
+            NULL_VALUE_COUNTS.clone(),
+            NAN_VALUE_COUNTS.clone(),
+            LOWER_BOUNDS.clone(),
+            UPPER_BOUNDS.clone(),
+            KEY_METADATA.clone(),
+            SPLIT_OFFSETS.clone(),
+            EQUALITY_IDS.clone(),
+            SORT_ORDER_ID.clone(),
+        ]
+    }
+
+    pub(super) fn data_file_schema_v2(partition_type: &StructType) -> Result<AvroSchema, Error> {
+        let schema = Schema::builder()
+            .with_fields(data_file_fields_v2(partition_type))
+            .build()?;
+        schema_to_avro_schema("data_file", &schema)
+    }
+
+    pub(super) fn manifest_schema_v2(partition_type: &StructType) -> Result<AvroSchema, Error> {
         let fields = vec![
             STATUS.clone(),
             SNAPSHOT_ID_V2.clone(),
@@ -665,62 +698,52 @@ mod _const_schema {
             Arc::new(NestedField::required(
                 2,
                 "data_file",
-                Type::Struct(StructType::new(vec![
-                    CONTENT.clone(),
-                    FILE_PATH.clone(),
-                    FILE_FORMAT.clone(),
-                    Arc::new(NestedField::required(
-                        102,
-                        "partition",
-                        Type::Struct(partition_type),
-                    )),
-                    RECORD_COUNT.clone(),
-                    FILE_SIZE_IN_BYTES.clone(),
-                    COLUMN_SIZES.clone(),
-                    VALUE_COUNTS.clone(),
-                    NULL_VALUE_COUNTS.clone(),
-                    NAN_VALUE_COUNTS.clone(),
-                    LOWER_BOUNDS.clone(),
-                    UPPER_BOUNDS.clone(),
-                    KEY_METADATA.clone(),
-                    SPLIT_OFFSETS.clone(),
-                    EQUALITY_IDS.clone(),
-                    SORT_ORDER_ID.clone(),
-                ])),
+                Type::Struct(StructType::new(data_file_fields_v2(partition_type))),
             )),
         ];
         let schema = Schema::builder().with_fields(fields).build()?;
         schema_to_avro_schema("manifest_entry", &schema)
     }
 
-    pub(super) fn manifest_schema_v1(partition_type: StructType) -> Result<AvroSchema, Error> {
+    fn data_file_fields_v1(partition_type: &StructType) -> Vec<NestedFieldRef> {
+        vec![
+            FILE_PATH.clone(),
+            FILE_FORMAT.clone(),
+            Arc::new(NestedField::required(
+                102,
+                "partition",
+                Type::Struct(partition_type.clone()),
+            )),
+            RECORD_COUNT.clone(),
+            FILE_SIZE_IN_BYTES.clone(),
+            BLOCK_SIZE_IN_BYTES.clone(),
+            COLUMN_SIZES.clone(),
+            VALUE_COUNTS.clone(),
+            NULL_VALUE_COUNTS.clone(),
+            NAN_VALUE_COUNTS.clone(),
+            LOWER_BOUNDS.clone(),
+            UPPER_BOUNDS.clone(),
+            KEY_METADATA.clone(),
+            SPLIT_OFFSETS.clone(),
+            SORT_ORDER_ID.clone(),
+        ]
+    }
+
+    pub(super) fn data_file_schema_v1(partition_type: &StructType) -> Result<AvroSchema, Error> {
+        let schema = Schema::builder()
+            .with_fields(data_file_fields_v1(partition_type))
+            .build()?;
+        schema_to_avro_schema("data_file", &schema)
+    }
+
+    pub(super) fn manifest_schema_v1(partition_type: &StructType) -> Result<AvroSchema, Error> {
         let fields = vec![
             STATUS.clone(),
             SNAPSHOT_ID_V1.clone(),
             Arc::new(NestedField::required(
                 2,
                 "data_file",
-                Type::Struct(StructType::new(vec![
-                    FILE_PATH.clone(),
-                    FILE_FORMAT.clone(),
-                    Arc::new(NestedField::required(
-                        102,
-                        "partition",
-                        Type::Struct(partition_type),
-                    )),
-                    RECORD_COUNT.clone(),
-                    FILE_SIZE_IN_BYTES.clone(),
-                    BLOCK_SIZE_IN_BYTES.clone(),
-                    COLUMN_SIZES.clone(),
-                    VALUE_COUNTS.clone(),
-                    NULL_VALUE_COUNTS.clone(),
-                    NAN_VALUE_COUNTS.clone(),
-                    LOWER_BOUNDS.clone(),
-                    UPPER_BOUNDS.clone(),
-                    KEY_METADATA.clone(),
-                    SPLIT_OFFSETS.clone(),
-                    SORT_ORDER_ID.clone(),
-                ])),
+                Type::Struct(StructType::new(data_file_fields_v1(partition_type))),
             )),
         ];
         let schema = Schema::builder().with_fields(fields).build()?;
@@ -1189,6 +1212,49 @@ impl DataFile {
         self.sort_order_id
     }
 }
+
+/// Convert data files to Avro and write them to `writer`.
+/// Return the number of bytes written.
+pub fn write_data_files_to_avro<W: Write>(
+    writer: &mut W,
+    data_files: impl IntoIterator<Item = DataFile>,
+    partition_type: &StructType,
+    version: FormatVersion,
+) -> Result<usize> {
+    let avro_schema = match version {
+        FormatVersion::V1 => _const_schema::data_file_schema_v1(partition_type).unwrap(),
+        FormatVersion::V2 => _const_schema::data_file_schema_v2(partition_type).unwrap(),
+    };
+    let mut writer = AvroWriter::new(&avro_schema, writer);
+
+    for data_file in data_files {
+        let value = to_value(_serde::DataFile::try_from(data_file, partition_type, true)?)?
+            .resolve(&avro_schema)?;
+        writer.append(value)?;
+    }
+
+    Ok(writer.flush()?)
+}
+
+/// Parse data files from Avro bytes.
+pub fn read_data_files_from_avro<R: Read>(
+    reader: &mut R,
+    schema: &Schema,
+    partition_type: &StructType,
+    version: FormatVersion,
+) -> Result<Vec<DataFile>> {
+    let avro_schema = match version {
+        FormatVersion::V1 => _const_schema::data_file_schema_v1(partition_type).unwrap(),
+        FormatVersion::V2 => _const_schema::data_file_schema_v2(partition_type).unwrap(),
+    };
+
+    let reader = AvroReader::with_schema(&avro_schema, reader)?;
+    reader
+        .into_iter()
+        .map(|value| from_value::<_serde::DataFile>(&value?)?.try_into(partition_type, schema))
+        .collect::<Result<Vec<_>>>()
+}
+
 /// Type of content stored by the data file: data, equality deletes, or
 /// position deletes (all v1 files are data files)
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
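A minimal round-trip sketch composing the two new helpers, assuming an unpartitioned table (empty partition `StructType`), the V2 format, and a hypothetical `roundtrip_data_files` wrapper; it mirrors the test added at the end of this commit:

fn roundtrip_data_files(
    data_files: Vec<DataFile>,
    table_schema: &Schema,
) -> Result<Vec<DataFile>> {
    // Assumes an unpartitioned table; a partitioned table would pass its real partition type.
    let partition_type = StructType::new(vec![]);
    let mut buf = Vec::new();
    // Serialize the data files into an in-memory Avro buffer.
    write_data_files_to_avro(&mut buf, data_files, &partition_type, FormatVersion::V2)?;
    // Read them back, resolving bounds against the table schema.
    read_data_files_from_avro(
        &mut std::io::Cursor::new(buf),
        table_schema,
        &partition_type,
        FormatVersion::V2,
    )
}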
@@ -1551,6 +1617,7 @@ mod _serde {
 #[cfg(test)]
 mod tests {
     use std::fs;
+    use std::io::Cursor;
     use std::sync::Arc;
 
     use tempfile::TempDir;
@@ -2336,4 +2403,67 @@ mod tests {
         // Verify manifest
         (fs::read(path).expect("read_file must succeed"), res)
     }
+
+    #[tokio::test]
+    async fn test_data_file_serialize_deserialize() {
+        let schema = Arc::new(
+            Schema::builder()
+                .with_fields(vec![
+                    Arc::new(NestedField::optional(
+                        1,
+                        "v1",
+                        Type::Primitive(PrimitiveType::Int),
+                    )),
+                    Arc::new(NestedField::optional(
+                        2,
+                        "v2",
+                        Type::Primitive(PrimitiveType::String),
+                    )),
+                    Arc::new(NestedField::optional(
+                        3,
+                        "v3",
+                        Type::Primitive(PrimitiveType::String),
+                    )),
+                ])
+                .build()
+                .unwrap(),
+        );
+        let data_files = vec![DataFile {
+            content: DataContentType::Data,
+            file_path: "s3://testbucket/iceberg_data/iceberg_ctl/iceberg_db/iceberg_tbl/data/00000-7-45268d71-54eb-476c-b42c-942d880c04a1-00001.parquet".to_string(),
+            file_format: DataFileFormat::Parquet,
+            partition: Struct::empty(),
+            record_count: 1,
+            file_size_in_bytes: 875,
+            column_sizes: HashMap::from([(1, 47), (2, 48), (3, 52)]),
+            value_counts: HashMap::from([(1, 1), (2, 1), (3, 1)]),
+            null_value_counts: HashMap::from([(1, 0), (2, 0), (3, 0)]),
+            nan_value_counts: HashMap::new(),
+            lower_bounds: HashMap::from([(1, Datum::int(1)), (2, Datum::string("a")), (3, Datum::string("AC/DC"))]),
+            upper_bounds: HashMap::from([(1, Datum::int(1)), (2, Datum::string("a")), (3, Datum::string("AC/DC"))]),
+            key_metadata: None,
+            split_offsets: vec![4],
+            equality_ids: vec![],
+            sort_order_id: Some(0),
+        }];
+
+        let mut buffer = Vec::new();
+        let _ = write_data_files_to_avro(
+            &mut buffer,
+            data_files.clone().into_iter(),
+            &StructType::new(vec![]),
+            FormatVersion::V2,
+        )
+        .unwrap();
+
+        let actual_data_file = read_data_files_from_avro(
+            &mut Cursor::new(buffer),
+            &schema,
+            &StructType::new(vec![]),
+            FormatVersion::V2,
+        )
+        .unwrap();
+
+        assert_eq!(data_files, actual_data_file);
+    }
 }