From fd20555a2c5f2c22fd6071d1c4ab0eaaa94f27b9 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 26 Jun 2025 13:41:34 +0200 Subject: [PATCH 1/6] Fix RowConverter when FixedSizeList is not the last Fix `RowConverter` row decoding when there is a `FixedSizeList` element and it's not the last. --- arrow-row/src/lib.rs | 71 ++++++++++++++++++++++++++++++++++++++++++- arrow-row/src/list.rs | 1 + 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 81320420dbe5..91faea1d1775 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -758,7 +758,18 @@ impl RowConverter { // SAFETY // We have validated that the rows came from this [`RowConverter`] // and therefore must be valid - unsafe { self.convert_raw(&mut rows, validate_utf8) } + let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?; + + for (i, row) in rows.iter().enumerate() { + if !row.is_empty() { + return Err(ArrowError::InvalidArgumentError(format!( + "Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}", + codecs = &self.codecs + ))); + } + } + + Ok(result) } /// Returns an empty [`Rows`] with capacity for `row_capacity` rows with @@ -2549,6 +2560,64 @@ mod tests { assert_eq!(&back[0], &list); } + #[test] + fn test_two_fixed_size_lists() { + let mut first = FixedSizeListBuilder::new(UInt8Builder::new(), 1); + // 0: [100] + first.values().append_value(100); + first.append(true); + // 1: [101] + first.values().append_value(101); + first.append(true); + // 2: [102] + first.values().append_value(102); + first.append(true); + // 3: [null] + first.values().append_null(); + first.append(true); + // 4: null + first.values().append_null(); // MASKED + first.append(false); + let first = Arc::new(first.finish()) as ArrayRef; + let first_type = first.data_type().clone(); + + let mut second = FixedSizeListBuilder::new(UInt8Builder::new(), 1); + // 0: [200] + second.values().append_value(200); + second.append(true); + // 1: [201] + second.values().append_value(201); + second.append(true); + // 2: [202] + second.values().append_value(202); + second.append(true); + // 3: [null] + second.values().append_null(); + second.append(true); + // 4: null + second.values().append_null(); // MASKED + second.append(false); + let second = Arc::new(second.finish()) as ArrayRef; + let second_type = second.data_type().clone(); + + let converter = RowConverter::new(vec![ + SortField::new(first_type.clone()), + SortField::new(second_type.clone()), + ]) + .unwrap(); + + let rows = converter + .convert_columns(&[Arc::clone(&first), Arc::clone(&second)]) + .unwrap(); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 2); + back[0].to_data().validate_full().unwrap(); + assert_eq!(&back[0], &first); + back[1].to_data().validate_full().unwrap(); + assert_eq!(&back[1], &second); + } + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 627214dc9c46..931f9b4c51d6 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -292,6 +292,7 @@ pub unsafe fn decode_fixed_size_list( row_offset = next_offset; } } + *row = &row[row_offset..]; // Update row for the next decoder } let children = converter.convert_raw(&mut child_rows, validate_utf8)?; From 46027d024588a778018d44c17c530ed5cb4c3018 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 26 Jun 2025 14:09:04 +0200 Subject: [PATCH 2/6] Code cleanup --- arrow-row/src/list.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 931f9b4c51d6..e4b82df2867d 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -225,7 +225,6 @@ pub fn encode_fixed_size_list( data[*offset] = 0x01; *offset += 1; for child_idx in (idx * value_length)..(idx + 1) * value_length { - //dbg!(child_idx); let row = rows.row(child_idx); let end_offset = *offset + row.as_ref().len(); data[*offset..end_offset].copy_from_slice(row.as_ref()); @@ -233,12 +232,8 @@ pub fn encode_fixed_size_list( } } false => { - let null_sentinels = 1; - //+ value_length; // 1 for self + for values too - for i in 0..null_sentinels { - data[*offset + i] = null_sentinel; - } - *offset += null_sentinels; + data[*offset] = null_sentinel; + *offset += 1; } }; }) From b66ad68bffc09a2a0334f05adf7d2e6a9cf2abcc Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 27 Jun 2025 15:31:35 +0200 Subject: [PATCH 3/6] Add test row-encoding FixedSizeList with variable contents --- arrow-row/src/lib.rs | 126 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 91faea1d1775..200ab7f308da 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2618,6 +2618,132 @@ mod tests { assert_eq!(&back[1], &second); } + #[test] + fn test_fixed_size_list_with_variable_width_content() { + let mut first = FixedSizeListBuilder::new( + StructBuilder::from_fields( + vec![ + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC"))), + false, + ), + Field::new("offset_minutes", DataType::Int16, false), + Field::new("time_zone", DataType::Utf8, false), + ], + 1, + ), + 1, + ); + // 0: null + first + .values() + .field_builder::(0) + .unwrap() + .append_null(); + first + .values() + .field_builder::(1) + .unwrap() + .append_null(); + first + .values() + .field_builder::(2) + .unwrap() + .append_null(); + first.values().append(false); + first.append(false); + // 1: [null] + first + .values() + .field_builder::(0) + .unwrap() + .append_null(); + first + .values() + .field_builder::(1) + .unwrap() + .append_null(); + first + .values() + .field_builder::(2) + .unwrap() + .append_null(); + first.values().append(false); + first.append(true); + // 2: [1970-01-01 00:00:00.000000 UTC] + first + .values() + .field_builder::(0) + .unwrap() + .append_value(0); + first + .values() + .field_builder::(1) + .unwrap() + .append_value(0); + first + .values() + .field_builder::(2) + .unwrap() + .append_value("UTC"); + first.values().append(true); + first.append(true); + // 3: [2005-09-10 13:30:00.123456 Europe/Warsaw] + first + .values() + .field_builder::(0) + .unwrap() + .append_value(1126351800123456); + first + .values() + .field_builder::(1) + .unwrap() + .append_value(120); + first + .values() + .field_builder::(2) + .unwrap() + .append_value("Europe/Warsaw"); + first.values().append(true); + first.append(true); + let first = Arc::new(first.finish()) as ArrayRef; + let first_type = first.data_type().clone(); + + let mut second = FixedSizeListBuilder::new(UInt8Builder::new(), 1); + // 0: [200] + second.values().append_value(200); + second.append(true); + // 1: [201] + second.values().append_value(201); + second.append(true); + // 2: [null] + second.values().append_null(); + second.append(true); + // 3: null + second.values().append_null(); // MASKED + second.append(false); + let second = Arc::new(second.finish()) as ArrayRef; + let second_type = second.data_type().clone(); + + let converter = RowConverter::new(vec![ + SortField::new(first_type.clone()), + SortField::new(second_type.clone()), + ]) + .unwrap(); + + let rows = converter + .convert_columns(&[Arc::clone(&first), Arc::clone(&second)]) + .unwrap(); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 2); + back[0].to_data().validate_full().unwrap(); + assert_eq!(&back[0], &first); + back[1].to_data().validate_full().unwrap(); + assert_eq!(&back[1], &second); + } + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, From 5a0781d2182996cf65b8fe31b7df7b584659bf1c Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 27 Jun 2025 15:31:53 +0200 Subject: [PATCH 4/6] Protect against encoding illformed columns When columns are unequal sizes, then encoding them can panic. This can be observed by changing `test_fixed_size_list_with_variable_width_content`, eg adding one more element to the second list. What's unclear, the panic can be avoided by calling `tracker.materialized()` in `compute_lengths_fixed_size_list` (similar to how list encoding does it). However, we still won't encode the whole data passed in, so it's better to reject this upfront. --- arrow-row/src/lib.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 200ab7f308da..392292c0db19 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -690,6 +690,15 @@ impl RowConverter { columns.len() ))); } + for colum in columns.iter().skip(1) { + if colum.len() != columns[0].len() { + return Err(ArrowError::InvalidArgumentError(format!( + "RowConverter columns must all have the same length, expected {} got {}", + columns[0].len(), + colum.len() + ))); + } + } let encoders = columns .iter() From aa926c34b5a2be2c02f16dcb4298cf8d4a75466e Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 27 Jun 2025 15:36:55 +0200 Subject: [PATCH 5/6] Switch test to use String array --- arrow-row/src/lib.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 392292c0db19..59932b04f06c 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2719,19 +2719,11 @@ mod tests { let first = Arc::new(first.finish()) as ArrayRef; let first_type = first.data_type().clone(); - let mut second = FixedSizeListBuilder::new(UInt8Builder::new(), 1); - // 0: [200] - second.values().append_value(200); - second.append(true); - // 1: [201] - second.values().append_value(201); - second.append(true); - // 2: [null] - second.values().append_null(); - second.append(true); - // 3: null - second.values().append_null(); // MASKED - second.append(false); + let mut second = StringBuilder::new(); + second.append_value("somewhere near"); + second.append_null(); + second.append_value("Greenwich"); + second.append_value("Warsaw"); let second = Arc::new(second.finish()) as ArrayRef; let second_type = second.data_type().clone(); From 666bc9b715a4e5dd5f1539837e596bb38a3297e4 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 3 Jul 2025 21:34:21 +0200 Subject: [PATCH 6/6] Check in tests only Out of concern this can cause runtime penalty. --- arrow-row/src/lib.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 59932b04f06c..dac31a2fda5f 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -769,12 +769,14 @@ impl RowConverter { // and therefore must be valid let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?; - for (i, row) in rows.iter().enumerate() { - if !row.is_empty() { - return Err(ArrowError::InvalidArgumentError(format!( - "Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}", - codecs = &self.codecs - ))); + if cfg!(test) { + for (i, row) in rows.iter().enumerate() { + if !row.is_empty() { + return Err(ArrowError::InvalidArgumentError(format!( + "Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}", + codecs = &self.codecs + ))); + } } }