Skip to content

Commit 2ec77b5

Browse files
authored
Update variant_integration test to use final approved parquet-testing data (#8325)
# Which issue does this PR close? We generally require a GitHub issue to be filed for all bug fixes and enhancements, and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. - Closes #8084 # Rationale for this change Now that we have merged the upstream parquet-variant tests: - apache/parquet-testing#91 we can test how far the Rust variant implementation is from working for all the values. This PR updates the test harness added in #8104 by @carpecodeum to use the final parquet files and the current APIs. # What changes are included in this PR? 1. Update the parquet-testing pin 2. Update the test harness to use the standard Rust test runner (`#[test]`) rather than a custom main function 3. Add links to follow-on tickets You can run this test manually like this: ```shell cargo test --all-features --test variant_integration ... running 138 tests test test_variant_integration_case_106 ... ok test test_variant_integration_case_107 ... ok test test_variant_integration_case_109 ... ok test test_variant_integration_case_110 ... ok .. test test_variant_integration_case_90 ... ok test test_variant_integration_case_91 ... ok test test_variant_integration_case_93 ... ok test test_variant_integration_case_83 - should panic ... ok test test_variant_integration_case_84 - should panic ... ok ``` # Are these changes tested? Yes, this PR consists entirely of tests. # Are there any user-facing changes? No
1 parent 7696432 commit 2ec77b5

File tree

5 files changed

+454
-1177
lines changed

5 files changed

+454
-1177
lines changed

parquet-variant-compute/src/variant_array.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,19 @@
1717

1818
//! [`VariantArray`] implementation
1919
20+
use crate::type_conversion::primitive_conversion_single_value;
2021
use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
2122
use arrow::buffer::NullBuffer;
2223
use arrow::datatypes::{
2324
Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type,
2425
UInt32Type, UInt64Type, UInt8Type,
2526
};
2627
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
28+
use parquet_variant::Uuid;
2729
use parquet_variant::Variant;
2830
use std::any::Any;
2931
use std::sync::Arc;
3032

31-
use crate::type_conversion::primitive_conversion_single_value;
32-
3333
/// An array of Parquet [`Variant`] values
3434
///
3535
/// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying
@@ -556,8 +556,15 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
556556
let value = boolean_array.value(index);
557557
Variant::from(value)
558558
}
559-
DataType::FixedSizeBinary(_) => {
559+
DataType::FixedSizeBinary(binary_len) => {
560560
let array = typed_value.as_fixed_size_binary();
561+
// Try to treat a 16-byte FixedSizeBinary value as a UUID
562+
let value = array.value(index);
563+
if *binary_len == 16 {
564+
if let Ok(uuid) = Uuid::from_slice(value) {
565+
return Variant::from(uuid);
566+
}
567+
}
561568
let value = array.value(index);
562569
Variant::from(value)
563570
}

parquet-variant/src/variant.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8};
1919
pub use self::list::VariantList;
2020
pub use self::metadata::{VariantMetadata, EMPTY_VARIANT_METADATA, EMPTY_VARIANT_METADATA_BYTES};
2121
pub use self::object::VariantObject;
22+
23+
// Publicly export types used in the API
24+
pub use half::f16;
25+
pub use uuid::Uuid;
26+
2227
use crate::decoder::{
2328
self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
2429
};
@@ -28,8 +33,6 @@ use std::ops::Deref;
2833

2934
use arrow_schema::ArrowError;
3035
use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc};
31-
use half::f16;
32-
use uuid::Uuid;
3336

3437
mod decimal;
3538
mod list;

parquet-variant/src/variant/metadata.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ impl VariantMetadataHeader {
130130
/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
131131
#[derive(Debug, Clone, PartialEq)]
132132
pub struct VariantMetadata<'m> {
133+
/// (Only) the bytes that make up this metadata instance.
133134
pub(crate) bytes: &'m [u8],
134135
header: VariantMetadataHeader,
135136
dictionary_size: u32,
@@ -332,7 +333,7 @@ impl<'m> VariantMetadata<'m> {
332333
self.header.version
333334
}
334335

335-
/// Gets an offset array entry by index.
336+
/// Gets the `i`th offset into the dictionary.
336337
///
337338
/// This offset is an index into the dictionary, at the boundary between string `i-1` and string
338339
/// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
@@ -342,6 +343,15 @@ impl<'m> VariantMetadata<'m> {
342343
self.header.offset_size.unpack_u32(bytes, i)
343344
}
344345

346+
/// Returns the total size, in bytes, of the metadata.
347+
///
348+
/// Note this value may be smaller than what was passed to [`Self::new`] or
349+
/// [`Self::try_new`] if the input was larger than necessary to encode the
350+
/// metadata dictionary.
351+
pub fn size(&self) -> usize {
352+
self.bytes.len()
353+
}
354+
345355
/// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the
346356
/// underlying bytes are [invalid].
347357
///

parquet/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ name = "encryption"
171171
required-features = ["arrow"]
172172
path = "./tests/encryption/mod.rs"
173173

174+
[[test]]
175+
name = "variant_integration"
176+
required-features = ["arrow", "variant_experimental", "serde"]
177+
path = "./tests/variant_integration.rs"
178+
174179
[[bin]]
175180
name = "parquet-read"
176181
required-features = ["cli"]

0 commit comments

Comments
 (0)