|
18 | 18 | use std::collections::{HashMap, HashSet}; |
19 | 19 | use std::ops::Not; |
20 | 20 |
|
21 | | -use arrow_array::{ |
22 | | - Array, ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, |
23 | | - StringArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, |
24 | | -}; |
| 21 | +use arrow_array::{Array, Int64Array, StringArray}; |
25 | 22 | use futures::{StreamExt, TryStreamExt}; |
26 | 23 | use itertools::Itertools; |
27 | 24 | use tokio::sync::oneshot::{Receiver, channel}; |
28 | 25 |
|
29 | 26 | use super::delete_filter::DeleteFilter; |
30 | | -use crate::arrow::arrow_schema_to_schema; |
31 | 27 | use crate::arrow::delete_file_loader::BasicDeleteFileLoader; |
| 28 | +use crate::arrow::{arrow_primitive_to_literal, arrow_schema_to_schema}; |
32 | 29 | use crate::delete_vector::DeleteVector; |
33 | 30 | use crate::expr::Predicate::AlwaysTrue; |
34 | 31 | use crate::expr::{Predicate, Reference}; |
35 | 32 | use crate::io::FileIO; |
36 | 33 | use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; |
37 | | -use crate::spec::{DataContentType, Datum, NestedFieldRef, PrimitiveType, SchemaRef}; |
| 34 | +use crate::spec::{DataContentType, Datum, SchemaRef}; |
38 | 35 | use crate::{Error, ErrorKind, Result}; |
39 | 36 |
|
40 | 37 | #[derive(Clone, Debug)] |
@@ -342,8 +339,30 @@ impl CachingDeleteFileLoader { |
342 | 339 | // only use columns that are in the set of equality_ids for this delete file |
343 | 340 | .filter(|(field, value)| equality_ids.contains(&value.id)) |
344 | 341 | .map(|(column, field)| { |
345 | | - let col_as_datum_vec = arrow_array_to_datum_iterator(column, field); |
346 | | - col_as_datum_vec.map(|c| (c, field.name.to_string())) |
| 342 | + let lit_vec = arrow_primitive_to_literal(column, &field.field_type)?; |
| 343 | + |
| 344 | + let primitive_type = field.field_type.as_primitive_type().ok_or(Error::new( |
| 345 | + ErrorKind::Unexpected, |
| 346 | + "field is not a primitive type", |
| 347 | + ))?; |
| 348 | + |
| 349 | + let datum_iterator: Box<dyn ExactSizeIterator<Item = Result<Option<Datum>>>> = |
| 350 | + Box::new(lit_vec.into_iter().map(move |c| { |
| 351 | + c.map(|literal| { |
| 352 | + literal |
| 353 | + .as_primitive_literal() |
| 354 | + .map(|primitive_literal| { |
| 355 | + Datum::new(primitive_type.clone(), primitive_literal) |
| 356 | + }) |
| 357 | + .ok_or(Error::new( |
| 358 | + ErrorKind::Unexpected, |
| 359 | + "failed to convert to primitive literal", |
| 360 | + )) |
| 361 | + }) |
| 362 | + .transpose() |
| 363 | + })); |
| 364 | + |
| 365 | + Ok::<_, Error>((datum_iterator, field.name.to_string())) |
347 | 366 | }) |
348 | 367 | .try_collect()?; |
349 | 368 |
|
@@ -371,90 +390,13 @@ impl CachingDeleteFileLoader { |
371 | 390 | } |
372 | 391 | } |
373 | 392 |
|
374 | | -macro_rules! prim_to_datum { |
375 | | - ($column:ident, $arr:ty, $dat:path) => {{ |
376 | | - let arr = $column.as_any().downcast_ref::<$arr>().ok_or(Error::new( |
377 | | - ErrorKind::Unexpected, |
378 | | - format!("could not downcast ArrayRef to {}", stringify!($arr)), |
379 | | - ))?; |
380 | | - Ok(Box::new(arr.iter().map(|val| Ok(val.map($dat))))) |
381 | | - }}; |
382 | | -} |
383 | | - |
384 | | -fn eq_col_unsupported(ty: &str) -> Error { |
385 | | - Error::new( |
386 | | - ErrorKind::FeatureUnsupported, |
387 | | - format!( |
388 | | - "Equality deletes where a predicate acts upon a {} column are not yet supported", |
389 | | - ty |
390 | | - ), |
391 | | - ) |
392 | | -} |
393 | | - |
394 | | -fn arrow_array_to_datum_iterator<'a>( |
395 | | - column: &'a ArrayRef, |
396 | | - field: &NestedFieldRef, |
397 | | -) -> Result<Box<dyn ExactSizeIterator<Item = Result<Option<Datum>>> + 'a>> { |
398 | | - match field.field_type.as_primitive_type() { |
399 | | - Some(primitive_type) => match primitive_type { |
400 | | - PrimitiveType::Int => prim_to_datum!(column, Int32Array, Datum::int), |
401 | | - PrimitiveType::Boolean => { |
402 | | - prim_to_datum!(column, BooleanArray, Datum::bool) |
403 | | - } |
404 | | - PrimitiveType::Long => prim_to_datum!(column, Int64Array, Datum::long), |
405 | | - PrimitiveType::Float => { |
406 | | - prim_to_datum!(column, Float32Array, Datum::float) |
407 | | - } |
408 | | - PrimitiveType::Double => { |
409 | | - prim_to_datum!(column, Float64Array, Datum::double) |
410 | | - } |
411 | | - PrimitiveType::String => { |
412 | | - prim_to_datum!(column, StringArray, Datum::string) |
413 | | - } |
414 | | - PrimitiveType::Date => prim_to_datum!(column, Date32Array, Datum::date), |
415 | | - PrimitiveType::Timestamp => { |
416 | | - prim_to_datum!(column, TimestampMicrosecondArray, Datum::timestamp_micros) |
417 | | - } |
418 | | - PrimitiveType::Timestamptz => { |
419 | | - prim_to_datum!(column, TimestampMicrosecondArray, Datum::timestamptz_micros) |
420 | | - } |
421 | | - PrimitiveType::TimestampNs => { |
422 | | - prim_to_datum!(column, TimestampNanosecondArray, Datum::timestamp_nanos) |
423 | | - } |
424 | | - PrimitiveType::TimestamptzNs => { |
425 | | - prim_to_datum!(column, TimestampNanosecondArray, Datum::timestamptz_nanos) |
426 | | - } |
427 | | - PrimitiveType::Time => { |
428 | | - let arr = column |
429 | | - .as_any() |
430 | | - .downcast_ref::<Time64MicrosecondArray>() |
431 | | - .ok_or(Error::new( |
432 | | - ErrorKind::Unexpected, |
433 | | - "could not downcast ArrayRef to Time64MicrosecondArray", |
434 | | - ))?; |
435 | | - Ok(Box::new(arr.iter().map(|val| match val { |
436 | | - None => Ok(None), |
437 | | - Some(val) => Datum::time_micros(val).map(Some), |
438 | | - }))) |
439 | | - } |
440 | | - PrimitiveType::Decimal { .. } => Err(eq_col_unsupported("Decimal")), |
441 | | - PrimitiveType::Uuid => Err(eq_col_unsupported("Uuid")), |
442 | | - PrimitiveType::Fixed(_) => Err(eq_col_unsupported("Fixed")), |
443 | | - PrimitiveType::Binary => Err(eq_col_unsupported("Binary")), |
444 | | - }, |
445 | | - None => Err(eq_col_unsupported( |
446 | | - "non-primitive (i.e. Struct, List, or Map)", |
447 | | - )), |
448 | | - } |
449 | | -} |
450 | | - |
451 | 393 | #[cfg(test)] |
452 | 394 | mod tests { |
453 | 395 | use std::collections::HashMap; |
454 | 396 | use std::fs::File; |
455 | 397 | use std::sync::Arc; |
456 | 398 |
|
457 | | - use arrow_array::{Int64Array, RecordBatch, StringArray}; |
| 399 | + use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray}; |
458 | 400 | use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; |
459 | 401 | use parquet::basic::Compression; |
460 | 402 | use parquet::file::properties::WriterProperties; |
|
0 commit comments