-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Fix RowNumberReader when not all row groups are selected
#8863
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3fbabe4
f0be36b
0bfc3c3
622bc19
c69d4d0
9dfd7c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -625,12 +625,11 @@ impl ArrowReaderOptions { | |
| pub fn with_virtual_columns(self, virtual_columns: Vec<FieldRef>) -> Self { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I also made this comment in the original PR, but maybe we could change this to `pub fn set_virtual_columns(&mut self, virtual_columns: Vec<FieldRef>) -> Result<()> {` to avoid the panic. If we squeak this in before
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Attaching a link for easier discussion: #8715 (comment). Not having the ability to chain seems like a bit of a downgrade, though users can just use … I'm fine with changing to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I was upthumbing
Meaning I'd be on board with
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I agree that sounds better to me
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| // Validate that all fields are virtual columns | ||
| for field in &virtual_columns { | ||
| if !is_virtual_column(field) { | ||
| panic!( | ||
| "Field '{}' is not a virtual column. Virtual columns must have extension type names starting with 'arrow.virtual.'", | ||
| field.name() | ||
| ); | ||
| } | ||
| assert!( | ||
| is_virtual_column(field), | ||
| "Field '{}' is not a virtual column. Virtual columns must have extension type names starting with 'arrow.virtual.'", | ||
| field.name() | ||
| ); | ||
| } | ||
| Self { | ||
| virtual_columns, | ||
|
|
@@ -5546,6 +5545,97 @@ pub(crate) mod tests { | |
| ); | ||
| } | ||
|
|
||
| /// Reads a two row group file with the `row_number` virtual column, first with the | ||
| /// default row group selection and then with the row groups requested as `[1, 0]`, | ||
| /// checking that each row number stays paired with its value. | ||
| #[test] | ||
| fn test_read_row_numbers_row_group_order() -> Result<()> { | ||
|     // 100 rows with a 50 row cap per row group, so the file holds 2 row groups | ||
|     let source = Int64Array::from_iter_values(5000..5100); | ||
|     let batch = RecordBatch::try_from_iter([("col", Arc::new(source) as ArrayRef)])?; | ||
|     let props = WriterProperties::builder() | ||
|         .set_max_row_group_size(50) | ||
|         .build(); | ||
|     let mut encoded = Vec::new(); | ||
|     let mut writer = ArrowWriter::try_new(&mut encoded, batch.schema().clone(), Some(props))?; | ||
|     // size limits are enforced after each batch, so write 10 rows at a time | ||
|     for offset in (0..100).step_by(10) { | ||
|         writer.write(&batch.slice(offset, 10))?; | ||
|     } | ||
|     writer.close()?; | ||
|     let encoded = Bytes::from(encoded); | ||
|
|
||
|     // request the virtual row number column alongside the stored data | ||
|     let options = ArrowReaderOptions::new().with_virtual_columns(vec![Arc::new( | ||
|         Field::new("row_number", ArrowDataType::Int64, false).with_extension_type(RowNumber), | ||
|     )]); | ||
|
|
||
|     // read out with normal options | ||
|     let in_order = ParquetRecordBatchReaderBuilder::try_new_with_options( | ||
|         encoded.clone(), | ||
|         options.clone(), | ||
|     )? | ||
|     .build()?; | ||
|     let expected_in_order = ValuesAndRowNumbers { | ||
|         values: (5000..5100).collect(), | ||
|         row_numbers: (0..100).collect(), | ||
|     }; | ||
|     assert_eq!( | ||
|         expected_in_order, | ||
|         ValuesAndRowNumbers::new_from_reader(in_order) | ||
|     ); | ||
|
|
||
|     // now select the row groups out of order: second group first, then the first | ||
|     let reordered = ParquetRecordBatchReaderBuilder::try_new_with_options(encoded, options)? | ||
|         .with_row_groups(vec![1, 0]) | ||
|         .build()?; | ||
|     let expected_reordered = ValuesAndRowNumbers { | ||
|         values: (5050..5100).chain(5000..5050).collect(), | ||
|         row_numbers: (50..100).chain(0..50).collect(), | ||
|     }; | ||
|     assert_eq!( | ||
|         expected_reordered, | ||
|         ValuesAndRowNumbers::new_from_reader(reordered) | ||
|     ); | ||
|
|
||
|     Ok(()) | ||
| } | ||
|
|
||
| /// Contents of the `col` and `row_number` columns gathered from every batch a | ||
| /// reader yields, kept in the order the reader produced them. | ||
| #[derive(Debug, PartialEq)] | ||
| struct ValuesAndRowNumbers { | ||
|     /// Values read from the `col` column. | ||
|     values: Vec<i64>, | ||
|     /// Values read from the `row_number` column. | ||
|     row_numbers: Vec<i64>, | ||
| } | ||
| impl ValuesAndRowNumbers { | ||
|     /// Drains `reader`, appending the `col` and `row_number` columns of each batch. | ||
|     /// | ||
|     /// # Panics | ||
|     /// | ||
|     /// Panics if a batch fails to read, if either column is missing, or if | ||
|     /// either column contains a null entry. | ||
|     fn new_from_reader(reader: ParquetRecordBatchReader) -> Self { | ||
|         let mut values = vec![]; | ||
|         let mut row_numbers = vec![]; | ||
|         for batch in reader { | ||
|             let batch = batch.expect("Could not read batch"); | ||
|             values.extend( | ||
|                 batch | ||
|                     .column_by_name("col") | ||
|                     .expect("Could not get col column") | ||
|                     .as_primitive::<arrow::datatypes::Int64Type>() | ||
|                     .iter() | ||
|                     .map(|v| v.expect("Could not get value")), | ||
|             ); | ||
|
|
||
|             // Feed the iterator straight into `extend`, as done for `values` above; | ||
|             // collecting into a temporary Vec first only adds an allocation per batch. | ||
|             row_numbers.extend( | ||
|                 batch | ||
|                     .column_by_name("row_number") | ||
|                     .expect("Could not get row_number column") | ||
|                     .as_primitive::<arrow::datatypes::Int64Type>() | ||
|                     .iter() | ||
|                     .map(|v| v.expect("Could not get row number")), | ||
|             ); | ||
|         } | ||
|         Self { | ||
|             values, | ||
|             row_numbers, | ||
|         } | ||
|     } | ||
| } | ||
|
|
||
| #[test] | ||
| #[should_panic(expected = "is not a virtual column")] | ||
| fn test_with_virtual_columns_rejects_non_virtual_fields() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tiny nit, to simplify L68 below?