Skip to content

Commit e135b3a (parent: 3c643ec)

refactor: pass an Arc of the field_ids to the FileScanTaskContext rather than cloning

File tree

1 file changed: +10 additions, −7 deletions

crates/iceberg/src/arrow/reader.rs

Lines changed: 10 additions & 7 deletions
```diff
@@ -54,7 +54,7 @@ const CONCURRENCY_LIMIT_TASKS: usize = 10;

 /// Builder to create ArrowReader
 pub struct ArrowReaderBuilder {
     batch_size: Option<usize>,
-    field_ids: Vec<usize>,
+    field_ids: Arc<Vec<usize>>,
     file_io: FileIO,
     schema: SchemaRef,
     predicate: Option<BoundPredicate>,
@@ -65,7 +65,7 @@ impl ArrowReaderBuilder {
     pub fn new(file_io: FileIO, schema: SchemaRef) -> Self {
         ArrowReaderBuilder {
             batch_size: None,
-            field_ids: vec![],
+            field_ids: Arc::new(vec![]),
             file_io,
             schema,
             predicate: None,
@@ -81,7 +81,10 @@ impl ArrowReaderBuilder {

     /// Sets the desired column projection with a list of field ids.
     pub fn with_field_ids(mut self, field_ids: impl IntoIterator<Item = usize>) -> Self {
-        self.field_ids = field_ids.into_iter().collect();
+        let field_ids = field_ids.into_iter().collect();
+        let field_ids_arc = Arc::new(field_ids);
+        self.field_ids = field_ids_arc;
+
         self
     }

@@ -106,7 +109,7 @@ impl ArrowReaderBuilder {

 /// Reads data from Parquet files
 pub struct ArrowReader {
     batch_size: Option<usize>,
-    field_ids: Vec<usize>,
+    field_ids: Arc<Vec<usize>>,
     #[allow(dead_code)]
     schema: SchemaRef,
     file_io: FileIO,
@@ -207,7 +210,7 @@ struct FileScanTaskContext {
     file_io: FileIO,
     sender: Sender<Result<RecordBatch>>,
     batch_size: Option<usize>,
-    field_ids: Vec<usize>,
+    field_ids: Arc<Vec<usize>>,
     schema: SchemaRef,
     predicate: Option<BoundPredicate>,
 }
@@ -218,7 +221,7 @@ impl FileScanTaskContext {
         file_io: FileIO,
         sender: Sender<Result<RecordBatch>>,
         batch_size: Option<usize>,
-        field_ids: Vec<usize>,
+        field_ids: Arc<Vec<usize>>,
         schema: SchemaRef,
         predicate: Option<BoundPredicate>,
     ) -> Self {
@@ -294,7 +297,7 @@ impl FileScanTaskContext {
         }

         let mut indices = vec![];
-        for field_id in &self.field_ids {
+        for field_id in self.field_ids.as_ref() {
             if let Some(col_idx) = column_map.get(&(*field_id as i32)) {
                 indices.push(*col_idx);
             } else {
```

0 commit comments

Comments (0)