1919
2020use std:: sync:: Arc ;
2121
22- use arrow_array:: builder:: { MapBuilder , PrimitiveBuilder , StringBuilder } ;
23- use arrow_array:: types:: { Int64Type , TimestampMillisecondType } ;
22+ use arrow_array:: builder:: {
23+ BooleanBuilder , ListBuilder , MapBuilder , PrimitiveBuilder , StringBuilder , StructBuilder ,
24+ } ;
25+ use arrow_array:: types:: { Int32Type , Int64Type , Int8Type , TimestampMillisecondType } ;
2426use arrow_array:: RecordBatch ;
25- use arrow_schema:: { DataType , Field , Schema , TimeUnit } ;
27+ use arrow_schema:: { DataType , Field , Fields , Schema , TimeUnit } ;
2628
27- use crate :: spec:: TableMetadata ;
2829use crate :: table:: Table ;
2930use crate :: Result ;
3031
@@ -45,19 +46,18 @@ impl MetadataTable {
4546
4647 /// Get the snapshots table.
4748 pub fn snapshots ( & self ) -> SnapshotsTable {
48- SnapshotsTable {
49- metadata_table : self ,
50- }
49+ SnapshotsTable { table : & self . 0 }
5150 }
5251
53- fn metadata ( & self ) -> & TableMetadata {
54- self . 0 . metadata ( )
52+ /// Get the manifests table.
53+ pub fn manifests ( & self ) -> ManifestsTable {
54+ ManifestsTable { table : & self . 0 }
5555 }
5656}
5757
5858/// Snapshots table.
5959pub struct SnapshotsTable < ' a > {
60- metadata_table : & ' a MetadataTable ,
60+ table : & ' a Table ,
6161}
6262
6363impl < ' a > SnapshotsTable < ' a > {
@@ -104,7 +104,7 @@ impl<'a> SnapshotsTable<'a> {
104104 let mut manifest_list = StringBuilder :: new ( ) ;
105105 let mut summary = MapBuilder :: new ( None , StringBuilder :: new ( ) , StringBuilder :: new ( ) ) ;
106106
107- for snapshot in self . metadata_table . metadata ( ) . snapshots ( ) {
107+ for snapshot in self . table . metadata ( ) . snapshots ( ) {
108108 committed_at. append_value ( snapshot. timestamp_ms ( ) ) ;
109109 snapshot_id. append_value ( snapshot. snapshot_id ( ) ) ;
110110 parent_id. append_option ( snapshot. parent_snapshot_id ( ) ) ;
@@ -128,6 +128,133 @@ impl<'a> SnapshotsTable<'a> {
128128 }
129129}
130130
131+ /// Manifests table.
132+ pub struct ManifestsTable < ' a > {
133+ table : & ' a Table ,
134+ }
135+
136+ impl < ' a > ManifestsTable < ' a > {
137+ fn partition_summary_fields ( & self ) -> Vec < Field > {
138+ vec ! [
139+ Field :: new( "contains_null" , DataType :: Boolean , false ) ,
140+ Field :: new( "contains_nan" , DataType :: Boolean , true ) ,
141+ Field :: new( "lower_bound" , DataType :: Utf8 , true ) ,
142+ Field :: new( "upper_bound" , DataType :: Utf8 , true ) ,
143+ ]
144+ }
145+
146+ /// Returns the schema of the manifests table.
147+ pub fn schema ( & self ) -> Schema {
148+ Schema :: new ( vec ! [
149+ Field :: new( "content" , DataType :: Int8 , false ) ,
150+ Field :: new( "path" , DataType :: Utf8 , false ) ,
151+ Field :: new( "length" , DataType :: Int64 , false ) ,
152+ Field :: new( "partition_spec_id" , DataType :: Int32 , false ) ,
153+ Field :: new( "added_snapshot_id" , DataType :: Int64 , false ) ,
154+ Field :: new( "added_data_files_count" , DataType :: Int32 , false ) ,
155+ Field :: new( "existing_data_files_count" , DataType :: Int32 , false ) ,
156+ Field :: new( "deleted_data_files_count" , DataType :: Int32 , false ) ,
157+ Field :: new( "added_delete_files_count" , DataType :: Int32 , false ) ,
158+ Field :: new( "existing_delete_files_count" , DataType :: Int32 , false ) ,
159+ Field :: new( "deleted_delete_files_count" , DataType :: Int32 , false ) ,
160+ Field :: new(
161+ "partition_summaries" ,
162+ DataType :: List ( Arc :: new( Field :: new_struct(
163+ "item" ,
164+ self . partition_summary_fields( ) ,
165+ false ,
166+ ) ) ) ,
167+ false ,
168+ ) ,
169+ ] )
170+ }
171+
172+ /// Scans the manifests table.
173+ pub async fn scan ( & self ) -> Result < RecordBatch > {
174+ let mut content = PrimitiveBuilder :: < Int8Type > :: new ( ) ;
175+ let mut path = StringBuilder :: new ( ) ;
176+ let mut length = PrimitiveBuilder :: < Int64Type > :: new ( ) ;
177+ let mut partition_spec_id = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
178+ let mut added_snapshot_id = PrimitiveBuilder :: < Int64Type > :: new ( ) ;
179+ let mut added_data_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
180+ let mut existing_data_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
181+ let mut deleted_data_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
182+ let mut added_delete_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
183+ let mut existing_delete_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
184+ let mut deleted_delete_files_count = PrimitiveBuilder :: < Int32Type > :: new ( ) ;
185+ let mut partition_summaries = ListBuilder :: new ( StructBuilder :: from_fields (
186+ Fields :: from ( self . partition_summary_fields ( ) ) ,
187+ 0 ,
188+ ) )
189+ . with_field ( Arc :: new ( Field :: new_struct (
190+ "item" ,
191+ self . partition_summary_fields ( ) ,
192+ false ,
193+ ) ) ) ;
194+
195+ if let Some ( snapshot) = self . table . metadata ( ) . current_snapshot ( ) {
196+ let manifest_list = snapshot
197+ . load_manifest_list ( self . table . file_io ( ) , & self . table . metadata_ref ( ) )
198+ . await ?;
199+ for manifest in manifest_list. entries ( ) {
200+ content. append_value ( manifest. content as i8 ) ;
201+ path. append_value ( manifest. manifest_path . clone ( ) ) ;
202+ length. append_value ( manifest. manifest_length ) ;
203+ partition_spec_id. append_value ( manifest. partition_spec_id ) ;
204+ added_snapshot_id. append_value ( manifest. added_snapshot_id ) ;
205+ added_data_files_count. append_value ( manifest. added_files_count . unwrap_or ( 0 ) as i32 ) ;
206+ existing_data_files_count
207+ . append_value ( manifest. existing_files_count . unwrap_or ( 0 ) as i32 ) ;
208+ deleted_data_files_count
209+ . append_value ( manifest. deleted_files_count . unwrap_or ( 0 ) as i32 ) ;
210+ added_delete_files_count
211+ . append_value ( manifest. added_files_count . unwrap_or ( 0 ) as i32 ) ;
212+ existing_delete_files_count
213+ . append_value ( manifest. existing_files_count . unwrap_or ( 0 ) as i32 ) ;
214+ deleted_delete_files_count
215+ . append_value ( manifest. deleted_files_count . unwrap_or ( 0 ) as i32 ) ;
216+
217+ let partition_summaries_builder = partition_summaries. values ( ) ;
218+ for summary in & manifest. partitions {
219+ partition_summaries_builder
220+ . field_builder :: < BooleanBuilder > ( 0 )
221+ . unwrap ( )
222+ . append_value ( summary. contains_null ) ;
223+ partition_summaries_builder
224+ . field_builder :: < BooleanBuilder > ( 1 )
225+ . unwrap ( )
226+ . append_option ( summary. contains_nan ) ;
227+ partition_summaries_builder
228+ . field_builder :: < StringBuilder > ( 2 )
229+ . unwrap ( )
230+ . append_option ( summary. lower_bound . as_ref ( ) . map ( |v| v. to_string ( ) ) ) ;
231+ partition_summaries_builder
232+ . field_builder :: < StringBuilder > ( 3 )
233+ . unwrap ( )
234+ . append_option ( summary. upper_bound . as_ref ( ) . map ( |v| v. to_string ( ) ) ) ;
235+ partition_summaries_builder. append ( true ) ;
236+ }
237+ partition_summaries. append ( true ) ;
238+ }
239+ }
240+
241+ Ok ( RecordBatch :: try_new ( Arc :: new ( self . schema ( ) ) , vec ! [
242+ Arc :: new( content. finish( ) ) ,
243+ Arc :: new( path. finish( ) ) ,
244+ Arc :: new( length. finish( ) ) ,
245+ Arc :: new( partition_spec_id. finish( ) ) ,
246+ Arc :: new( added_snapshot_id. finish( ) ) ,
247+ Arc :: new( added_data_files_count. finish( ) ) ,
248+ Arc :: new( existing_data_files_count. finish( ) ) ,
249+ Arc :: new( deleted_data_files_count. finish( ) ) ,
250+ Arc :: new( added_delete_files_count. finish( ) ) ,
251+ Arc :: new( existing_delete_files_count. finish( ) ) ,
252+ Arc :: new( deleted_delete_files_count. finish( ) ) ,
253+ Arc :: new( partition_summaries. finish( ) ) ,
254+ ] ) ?)
255+ }
256+ }
257+
131258#[ cfg( test) ]
132259mod tests {
133260 use expect_test:: { expect, Expect } ;
@@ -253,4 +380,106 @@ mod tests {
253380 Some ( "committed_at" ) ,
254381 ) ;
255382 }
383+
// Scans the manifests metadata table of the fixture table and compares the
// resulting schema and column data against the inline snapshots below.
384+ #[ tokio:: test]
385+ async fn test_manifests_table ( ) {
// Prepare the fixture's manifest files (see `setup_manifest_files`) before scanning.
386+ let mut fixture = TableTestFixture :: new ( ) ;
387+ fixture. setup_manifest_files ( ) . await ;
388+
// Run the scan under test; a scan error fails the test through `unwrap`.
389+ let record_batch = fixture
390+ . table
391+ . metadata_table ( )
392+ . manifests ( )
393+ . scan ( )
394+ . await
395+ . unwrap ( ) ;
396+
397+ check_record_batch (
398+ record_batch,
// Expected schema, one `Field` per column.
399+ expect ! [ [ r#"
400+ Field { name: "content", data_type: Int8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
401+ Field { name: "path", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
402+ Field { name: "length", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
403+ Field { name: "partition_spec_id", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
404+ Field { name: "added_snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
405+ Field { name: "added_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
406+ Field { name: "existing_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
407+ Field { name: "deleted_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
408+ Field { name: "added_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
409+ Field { name: "existing_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
410+ Field { name: "deleted_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
411+ Field { name: "partition_summaries", data_type: List(Field { name: "item", data_type: Struct([Field { name: "contains_null", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "contains_nan", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "lower_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "upper_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"# ] ] ,
// Expected column data for the single manifest row.
412+ expect ! [ [ r#"
413+ content: PrimitiveArray<Int8>
414+ [
415+ 0,
416+ ],
417+ path: (skipped),
418+ length: (skipped),
419+ partition_spec_id: PrimitiveArray<Int32>
420+ [
421+ 0,
422+ ],
423+ added_snapshot_id: PrimitiveArray<Int64>
424+ [
425+ 3055729675574597004,
426+ ],
427+ added_data_files_count: PrimitiveArray<Int32>
428+ [
429+ 1,
430+ ],
431+ existing_data_files_count: PrimitiveArray<Int32>
432+ [
433+ 1,
434+ ],
435+ deleted_data_files_count: PrimitiveArray<Int32>
436+ [
437+ 1,
438+ ],
439+ added_delete_files_count: PrimitiveArray<Int32>
440+ [
441+ 1,
442+ ],
443+ existing_delete_files_count: PrimitiveArray<Int32>
444+ [
445+ 1,
446+ ],
447+ deleted_delete_files_count: PrimitiveArray<Int32>
448+ [
449+ 1,
450+ ],
451+ partition_summaries: ListArray
452+ [
453+ StructArray
454+ -- validity:
455+ [
456+ valid,
457+ ]
458+ [
459+ -- child 0: "contains_null" (Boolean)
460+ BooleanArray
461+ [
462+ false,
463+ ]
464+ -- child 1: "contains_nan" (Boolean)
465+ BooleanArray
466+ [
467+ false,
468+ ]
469+ -- child 2: "lower_bound" (Utf8)
470+ StringArray
471+ [
472+ "100",
473+ ]
474+ -- child 3: "upper_bound" (Utf8)
475+ StringArray
476+ [
477+ "300",
478+ ]
479+ ],
480+ ]"# ] ] ,
// NOTE(review): "path" and "length" appear as "(skipped)" in the expected data
// above, so this list presumably names columns whose values are not compared -
// confirm against `check_record_batch`.
481+ & [ "path" , "length" ] ,
// NOTE(review): presumably the column used to order rows before comparison - confirm.
482+ Some ( "path" ) ,
483+ ) ;
484+ }
256485}
0 commit comments