From 3ba9fe006f2f6ba11ba7ac8e486691d9d02bb1c4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 18 Jul 2025 12:38:16 -0400 Subject: [PATCH 01/14] [Variant] Strawman / infrastructure for variant_get of shredded values --- parquet-variant-compute/src/from_json.rs | 6 +- parquet-variant-compute/src/variant_array.rs | 280 +++++++++--- .../src/variant_array_builder.rs | 2 +- parquet-variant-compute/src/variant_get.rs | 180 -------- .../src/variant_get/mod.rs | 431 ++++++++++++++++++ .../src/variant_get/output/mod.rs | 87 ++++ .../src/variant_get/output/primitive.rs | 166 +++++++ .../src/variant_get/output/variant.rs | 146 ++++++ parquet-variant/src/path.rs | 2 +- 9 files changed, 1051 insertions(+), 249 deletions(-) delete mode 100644 parquet-variant-compute/src/variant_get.rs create mode 100644 parquet-variant-compute/src/variant_get/mod.rs create mode 100644 parquet-variant-compute/src/variant_get/output/mod.rs create mode 100644 parquet-variant-compute/src/variant_get/output/primitive.rs create mode 100644 parquet-variant-compute/src/variant_get/output/variant.rs diff --git a/parquet-variant-compute/src/from_json.rs b/parquet-variant-compute/src/from_json.rs index 05207d094a25..a101bf01cfda 100644 --- a/parquet-variant-compute/src/from_json.rs +++ b/parquet-variant-compute/src/from_json.rs @@ -52,7 +52,7 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result &BinaryViewArray { + match self { + ShreddingState::Unshredded { metadata, .. } => metadata, + ShreddingState::FullyShredded { metadata, .. } => metadata, + ShreddingState::PartiallyShredded { metadata, .. } => metadata, + } + } + + /// Return a reference to the value field, if present + pub fn value_field(&self) -> Option<&BinaryViewArray> { + match self { + ShreddingState::Unshredded { value, .. } => Some(value), + ShreddingState::FullyShredded { .. } => None, + ShreddingState::PartiallyShredded { value, .. } => Some(value), + } + } + + /// Return a reference to the typed_value field, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + match self { + ShreddingState::Unshredded { .. } => None, + ShreddingState::FullyShredded { typed_value, .. } => Some(typed_value), + ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), + } + } - /// Reference to the value column of inner - value_ref: ArrayRef, + /// Slice all the underlying arrays + pub fn slice(&self, offset: usize, length: usize) -> Self { + match self { + ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded { + metadata: metadata.slice(offset, length), + value: value.slice(offset, length), + }, + ShreddingState::FullyShredded { + metadata, + typed_value, + } => ShreddingState::FullyShredded { + metadata: metadata.slice(offset, length), + typed_value: typed_value.slice(offset, length), + }, + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => ShreddingState::PartiallyShredded { + metadata: metadata.slice(offset, length), + value: value.slice(offset, length), + typed_value: typed_value.slice(offset, length), + }, + } + } } impl VariantArray { @@ -79,12 +143,22 @@ impl VariantArray { /// # Errors: /// - If the `StructArray` does not contain the required fields /// - /// # Current support - /// This structure does not (yet) support the full Arrow Variant Array specification. + /// # Requirements of the `StructArray` + /// + /// 1. A required field named `metadata` which is binary, large_binary, or + /// binary_view /// - /// Only `StructArrays` with `metadata` and `value` fields that are - /// [`BinaryViewArray`] are supported. Shredded values are not currently supported - /// nor are using types other than `BinaryViewArray` + /// 2. An optional field named `value` that is binary, large_binary, or + /// binary_view + /// + /// 3. An optional field named `typed_value` which can be any primitive type + /// or be a list, large_list, list_view or struct + /// + /// NOTE: It is also permissible for the metadata field to be + /// Dictionary-Encoded, preferably (but not required) with an index type of + /// int8. + /// + /// Currently, only [`BinaryViewArray`] are supported. /// /// [`BinaryViewArray`]: arrow::array::BinaryViewArray pub fn try_new(inner: ArrayRef) -> Result { @@ -93,35 +167,64 @@ impl VariantArray { "Invalid VariantArray: requires StructArray as input".to_string(), )); }; - // Ensure the StructArray has a metadata field of BinaryView - let Some(metadata_field) = VariantArray::find_metadata_field(inner) else { + // Note the specification allows for any order so we must search by name + + // Ensure the StructArray has a metadata field of BinaryView + let Some(metadata_field) = inner.column_by_name("metadata") else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), )); }; - if metadata_field.data_type() != &DataType::BinaryView { + let Some(metadata) = metadata_field.as_binary_view_opt() else { return Err(ArrowError::NotYetImplemented(format!( "VariantArray 'metadata' field must be BinaryView, got {}", metadata_field.data_type() ))); - } - let Some(value_field) = VariantArray::find_value_field(inner) else { - return Err(ArrowError::InvalidArgumentError( - "Invalid VariantArray: StructArray must contain a 'value' field".to_string(), - )); }; - if value_field.data_type() != &DataType::BinaryView { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - value_field.data_type() - ))); - } + + // Find the value field, if present + let value_field = inner.column_by_name("value"); + let value = value_field + .map(|v| match v.as_binary_view_opt() { + Some(bv) => Ok(bv), + None => Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + v.data_type() + ))), + }) + .transpose()?; + + // Find the typed_value field, if present + let typed_value = inner.column_by_name("typed_value"); + + // Note these clones are cheap, they just bump the ref count + let inner = inner.clone(); + let metadata = metadata.clone(); + let value = value.cloned(); + let typed_value = typed_value.cloned(); + + let shredding_state = match (metadata, value, typed_value) { + (metadata, Some(value), Some(typed_value)) => ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + }, + (metadata, Some(value), None) => ShreddingState::Unshredded { metadata, value }, + (metadata, None, Some(typed_value)) => ShreddingState::FullyShredded { + metadata, + typed_value, + }, + (_metadata_field, None, None) => { + return Err(ArrowError::InvalidArgumentError(String::from( + "VariantArray has neither value nor typed_value field", + ))); + } + }; Ok(Self { - inner: inner.clone(), - metadata_ref: metadata_field, - value_ref: value_field, + inner, + shredding_state, }) } @@ -135,36 +238,87 @@ impl VariantArray { self.inner } + /// Return the shredding state of this `VariantArray` + pub fn shredding_state(&self) -> &ShreddingState { + &self.shredding_state + } + /// Return the [`Variant`] instance stored at the given row /// - /// Panics if the index is out of bounds. + /// Consistently with other Arrow arrays types, this API requires you to + /// check for nulls first using [`Self::is_valid`]. + /// + /// # Panics + /// * if the index is out of bounds + /// * if the array value is null + /// + /// If this is a shredded variant but has no value at the shredded location, it + /// will return [`Variant::Null`]. + /// + /// + /// # Performance Note + /// + /// This is certainly not the most efficient way to access values in a + /// `VariantArray`, but it is useful for testing and debugging. /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant { - let metadata = self.metadata_field().as_binary_view().value(index); - let value = self.value_field().as_binary_view().value(index); - Variant::new(metadata, value) + match &self.shredding_state { + ShreddingState::Unshredded { metadata, value } => { + Variant::new(metadata.value(index), value.value(index)) + } + ShreddingState::FullyShredded { + metadata: _, + typed_value, + } => { + if typed_value.is_null(index) { + Variant::Null + } else { + typed_value_to_variant(typed_value, index) + } + } + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => { + if typed_value.is_null(index) { + Variant::new(metadata.value(index), value.value(index)) + } else { + typed_value_to_variant(typed_value, index) + } + } + } } - fn find_metadata_field(array: &StructArray) -> Option { - array.column_by_name("metadata").cloned() + /// Return a reference to the metadata field of the [`StructArray`] + pub fn metadata_field(&self) -> &BinaryViewArray { + self.shredding_state.metadata_field() } - fn find_value_field(array: &StructArray) -> Option { - array.column_by_name("value").cloned() + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> Option<&BinaryViewArray> { + self.shredding_state.value_field() } - /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.metadata_ref + /// Return a reference to the typed_value field of the `StructArray`, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + self.shredding_state.typed_value_field() } +} - /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.value_ref +/// returns the non-null element at index as a Variant +fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant { + match typed_value.data_type() { + DataType::Int32 => { + let typed_value = typed_value.as_primitive::(); + Variant::from(typed_value.value(index)) + } + // todo other types here + _ => { + todo!(); // Unsupported typed_value type + } } } @@ -186,13 +340,11 @@ impl Array for VariantArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let slice = self.inner.slice(offset, length); - let met = self.metadata_ref.slice(offset, length); - let val = self.value_ref.slice(offset, length); + let inner = self.inner.slice(offset, length); + let shredding_state = self.shredding_state.slice(offset, length); Arc::new(Self { - inner: slice, - metadata_ref: met, - value_ref: val, + inner, + shredding_state, }) } @@ -258,7 +410,7 @@ mod test { let err = VariantArray::try_new(Arc::new(array)); assert_eq!( err.unwrap_err().to_string(), - "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field" + "Invalid argument error: VariantArray has neither value nor typed_value field" ); } diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 6a8dba06f15d..a3abae7db6ae 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -375,7 +375,7 @@ mod test { // the metadata and value fields of non shredded variants should not be null assert!(variant_array.metadata_field().nulls().is_none()); - assert!(variant_array.value_field().nulls().is_none()); + assert!(variant_array.value_field().unwrap().nulls().is_none()); let DataType::Struct(fields) = variant_array.data_type() else { panic!("Expected VariantArray to have Struct data type"); }; diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs deleted file mode 100644 index b3a3d9e41f13..000000000000 --- a/parquet-variant-compute/src/variant_get.rs +++ /dev/null @@ -1,180 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -use std::sync::Arc; - -use arrow::{ - array::{Array, ArrayRef}, - compute::CastOptions, - error::Result, -}; -use arrow_schema::{ArrowError, Field}; -use parquet_variant::VariantPath; - -use crate::{VariantArray, VariantArrayBuilder}; - -/// Returns an array with the specified path extracted from the variant values. -/// -/// The return array type depends on the `as_type` field of the options parameter -/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point -/// to the specified path. -/// 2. `as_type: Some()`: an array of the specified type is returned. -pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { - let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| { - ArrowError::InvalidArgumentError( - "expected a VariantArray as the input for variant_get".to_owned(), - ) - })?; - - if let Some(as_type) = options.as_type { - return Err(ArrowError::NotYetImplemented(format!( - "getting a {as_type} from a VariantArray is not implemented yet", - ))); - } - - let mut builder = VariantArrayBuilder::new(variant_array.len()); - for i in 0..variant_array.len() { - let new_variant = variant_array.value(i); - // TODO: perf? - let new_variant = new_variant.get_path(&options.path); - match new_variant { - // TODO: we're decoding the value and doing a copy into a variant value again. This - // copy can be much smarter. - Some(new_variant) => builder.append_variant(new_variant), - None => builder.append_null(), - } - } - - Ok(Arc::new(builder.build())) -} - -/// Controls the action of the variant_get kernel. -#[derive(Debug, Clone)] -pub struct GetOptions<'a> { - /// What path to extract - pub path: VariantPath<'a>, - /// if `as_type` is None, the returned array will itself be a VariantArray. - /// - /// if `as_type` is `Some(type)` the field is returned as the specified type. - pub as_type: Option, - /// Controls the casting behavior (e.g. error vs substituting null on cast error). - pub cast_options: CastOptions<'a>, -} - -impl<'a> GetOptions<'a> { - /// Construct options to get the specified path as a variant. - pub fn new_with_path(path: VariantPath<'a>) -> Self { - Self { - path, - as_type: None, - cast_options: Default::default(), - } - } -} - -#[cfg(test)] -mod test { - use std::sync::Arc; - - use arrow::array::{Array, ArrayRef, StringArray}; - use parquet_variant::VariantPath; - - use crate::batch_json_string_to_variant; - use crate::VariantArray; - - use super::{variant_get, GetOptions}; - - fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { - // Create input array from JSON string - let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); - let input_variant_array_ref: ArrayRef = - Arc::new(batch_json_string_to_variant(&input_array_ref).unwrap()); - - let result = - variant_get(&input_variant_array_ref, GetOptions::new_with_path(path)).unwrap(); - - // Create expected array from JSON string - let expected_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(expected_json)])); - let expected_variant_array = batch_json_string_to_variant(&expected_array_ref).unwrap(); - - let result_array: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!( - result_array.len(), - 1, - "Expected result array to have length 1" - ); - assert!( - result_array.nulls().is_none(), - "Expected no nulls in result array" - ); - let result_variant = result_array.value(0); - let expected_variant = expected_variant_array.value(0); - assert_eq!( - result_variant, expected_variant, - "Result variant does not match expected variant" - ); - } - - #[test] - fn get_primitive_variant_field() { - single_variant_get_test( - r#"{"some_field": 1234}"#, - VariantPath::from("some_field"), - "1234", - ); - } - - #[test] - fn get_primitive_variant_list_index() { - single_variant_get_test("[1234, 5678]", VariantPath::from(0), "1234"); - } - - #[test] - fn get_primitive_variant_inside_object_of_object() { - single_variant_get_test( - r#"{"top_level_field": {"inner_field": 1234}}"#, - VariantPath::from("top_level_field").join("inner_field"), - "1234", - ); - } - - #[test] - fn get_primitive_variant_inside_list_of_object() { - single_variant_get_test( - r#"[{"some_field": 1234}]"#, - VariantPath::from(0).join("some_field"), - "1234", - ); - } - - #[test] - fn get_primitive_variant_inside_object_of_list() { - single_variant_get_test( - r#"{"some_field": [1234]}"#, - VariantPath::from("some_field").join(0), - "1234", - ); - } - - #[test] - fn get_complex_variant() { - single_variant_get_test( - r#"{"top_level_field": {"inner_field": 1234}}"#, - VariantPath::from("top_level_field"), - r#"{"inner_field": 1234}"#, - ); - } -} diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs new file mode 100644 index 000000000000..a8293b53cc53 --- /dev/null +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -0,0 +1,431 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use arrow::{ + array::{Array, ArrayRef}, + compute::CastOptions, + error::Result, +}; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::VariantPath; + +use crate::variant_array::ShreddingState; +use crate::variant_get::output::instantiate_output_builder; +use crate::VariantArray; + +mod output; + +/// Returns an array with the specified path extracted from the variant values. +/// +/// The return array type depends on the `as_type` field of the options parameter +/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point +/// to the specified path. +/// 2. `as_type: Some()`: an array of the specified type is returned. +pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { + let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| { + ArrowError::InvalidArgumentError( + "expected a VariantArray as the input for variant_get".to_owned(), + ) + })?; + + // Create the output writer based on the specified output options + let output_builder = instantiate_output_builder(options.clone())?; + + // Dispatch based on the shredding state of the input variant array + // TODO make this an enum on VariantArray (e.g ShreddingState) + match variant_array.shredding_state() { + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => output_builder.partially_shredded(variant_array, metadata, value, typed_value), + ShreddingState::FullyShredded { + metadata, + typed_value, + } => output_builder.fully_shredded(variant_array, metadata, typed_value), + ShreddingState::Unshredded { metadata, value } => { + output_builder.unshredded(variant_array, metadata, value) + } + } +} + +/// Controls the action of the variant_get kernel. +#[derive(Debug, Clone, Default)] +pub struct GetOptions<'a> { + /// What path to extract + pub path: VariantPath<'a>, + /// if `as_type` is None, the returned array will itself be a VariantArray. + /// + /// if `as_type` is `Some(type)` the field is returned as the specified type. + pub as_type: Option, + /// Controls the casting behavior (e.g. error vs substituting null on cast error). + pub cast_options: CastOptions<'a>, +} + +impl<'a> GetOptions<'a> { + /// Construct default options to get the specified path as a variant. + pub fn new() -> Self { + Default::default() + } + + /// Construct options to get the specified path as a variant. + pub fn new_with_path(path: VariantPath<'a>) -> Self { + Self { + path, + as_type: None, + cast_options: Default::default(), + } + } + + /// Specify the type to return. + pub fn with_as_type(mut self, as_type: Option) -> Self { + self.as_type = as_type; + self + } + + /// Specify the cast options to use when casting to the specified type. + pub fn with_cast_options(mut self, cast_options: CastOptions<'a>) -> Self { + self.cast_options = cast_options; + self + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::array::{Array, ArrayRef, BinaryViewArray, Int32Array, StringArray, StructArray}; + use arrow::buffer::NullBuffer; + use arrow::compute::CastOptions; + use arrow_schema::{DataType, Field, FieldRef, Fields}; + use parquet_variant::{Variant, VariantPath}; + + use crate::batch_json_string_to_variant; + use crate::VariantArray; + + use super::{variant_get, GetOptions}; + + fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { + // Create input array from JSON string + let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); + let input_variant_array_ref: ArrayRef = + Arc::new(batch_json_string_to_variant(&input_array_ref).unwrap()); + + let result = + variant_get(&input_variant_array_ref, GetOptions::new_with_path(path)).unwrap(); + + // Create expected array from JSON string + let expected_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(expected_json)])); + let expected_variant_array = batch_json_string_to_variant(&expected_array_ref).unwrap(); + + let result_array: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!( + result_array.len(), + 1, + "Expected result array to have length 1" + ); + assert!( + result_array.nulls().is_none(), + "Expected no nulls in result array" + ); + let result_variant = result_array.value(0); + let expected_variant = expected_variant_array.value(0); + assert_eq!( + result_variant, expected_variant, + "Result variant does not match expected variant" + ); + } + + #[test] + fn get_primitive_variant_field() { + single_variant_get_test( + r#"{"some_field": 1234}"#, + VariantPath::from("some_field"), + "1234", + ); + } + + #[test] + fn get_primitive_variant_list_index() { + single_variant_get_test("[1234, 5678]", VariantPath::from(0), "1234"); + } + + #[test] + fn get_primitive_variant_inside_object_of_object() { + single_variant_get_test( + r#"{"top_level_field": {"inner_field": 1234}}"#, + VariantPath::from("top_level_field").join("inner_field"), + "1234", + ); + } + + #[test] + fn get_primitive_variant_inside_list_of_object() { + single_variant_get_test( + r#"[{"some_field": 1234}]"#, + VariantPath::from(0).join("some_field"), + "1234", + ); + } + + #[test] + fn get_primitive_variant_inside_object_of_list() { + single_variant_get_test( + r#"{"some_field": [1234]}"#, + VariantPath::from("some_field").join(0), + "1234", + ); + } + + #[test] + fn get_complex_variant() { + single_variant_get_test( + r#"{"top_level_field": {"inner_field": 1234}}"#, + VariantPath::from("top_level_field"), + r#"{"inner_field": 1234}"#, + ); + } + + /// Shredding: extract a value as a VariantArray + #[test] + fn get_variant_shredded_int32_as_variant() { + let array = shredded_int32_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 4); + + // Expect the values are the same as the original values + assert_eq!(result.value(0), Variant::Int32(34)); + assert!(!result.is_valid(1)); + assert_eq!(result.value(2), Variant::from("n/a")); + assert_eq!(result.value(3), Variant::Int32(100)); + } + + /// Shredding: extract a value as an Int32Array + #[test] + fn get_variant_shredded_int32_as_int32_safe_cast() { + // Extract the typed value as Int32Array + let array = shredded_int32_variant_array(); + // specify we want the typed value as Int32 + let field = Field::new("typed_value", DataType::Int32, true); + let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(34), + None, + None, // "n/a" is not an Int32 so converted to null + Some(100), + ])); + assert_eq!(&result, &expected) + } + + /// Shredding: extract a value as an Int32Array, unsafe cast (should error on "n/a") + + #[test] + fn get_variant_shredded_int32_as_int32_unsafe_cast() { + // Extract the typed value as Int32Array + let array = shredded_int32_variant_array(); + let field = Field::new("typed_value", DataType::Int32, true); + let cast_options = CastOptions { + safe: false, // unsafe cast + ..Default::default() + }; + let options = GetOptions::new() + .with_as_type(Some(FieldRef::from(field))) + .with_cast_options(cast_options); + + let err = variant_get(&array, options).unwrap_err(); + // TODO make this error message nicer (not Debug format) + assert_eq!(err.to_string(), "Cast error: Failed to extract primitive of type Int32 from variant ShortString(ShortString(\"n/a\")) at path VariantPath([])"); + } + + /// Perfect Shredding: extract the typed value as a VariantArray + #[test] + fn get_variant_perfectly_shredded_int32_as_variant() { + let array = perfectly_shredded_int32_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // Expect the values are the same as the original values + assert_eq!(result.value(0), Variant::Int32(1)); + assert_eq!(result.value(1), Variant::Int32(2)); + assert_eq!(result.value(2), Variant::Int32(3)); + } + + /// Shredding: Extract the typed value as Int32Array + #[test] + fn get_variant_perfectly_shredded_int32_as_int32() { + // Extract the typed value as Int32Array + let array = perfectly_shredded_int32_variant_array(); + // specify we want the typed value as Int32 + let field = Field::new("typed_value", DataType::Int32, true); + let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])); + assert_eq!(&result, &expected) + } + + /// Return a VariantArray that represents a perfectly "shredded" variant + /// for the following example (3 Variant::Int32 values): + /// + /// ```text + /// 1 + /// 2 + /// 3 + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// typed_value: Int32Array, + /// } + /// ``` + fn perfectly_shredded_int32_variant_array() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; + + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + let typed_value = Int32Array::from(vec![Some(1), Some(2), Some(3)]); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_field("typed_value", Arc::new(typed_value)) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } + + /// Return a VariantArray that represents a normal "shredded" variant + /// for the following example + /// + /// Based on the example from [the doc] + /// + /// [the doc]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?tab=t.0 + /// + /// ```text + /// 34 + /// null (an Arrow NULL, not a Variant::Null) + /// "n/a" (a string) + /// 100 + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// value: BinaryViewArray, + /// typed_value: Int32Array, + /// } + /// ``` + fn shredded_int32_variant_array() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, string_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.append_value("n/a"); + builder.finish() + }; + + let nulls = NullBuffer::from(vec![ + true, // row 0 non null + false, // row 1 is null + true, // row 2 non null + true, // row 3 non null + ]); + + // metadata is the same for all rows + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // about why row1 is an empty but non null, value. + let values = BinaryViewArray::from(vec![ + None, // row 0 is shredded, so no value + Some(b"" as &[u8]), // row 1 is null, so empty value (why?) + Some(&string_value), // copy the string value "N/A" + None, // row 3 is shredded, so no value + ]); + + let typed_value = Int32Array::from(vec![ + Some(34), // row 0 is shredded, so it has a value + None, // row 1 is null, so no value + None, // row 2 is a string, so no typed value + Some(100), // row 3 is shredded, so it has a value + ]); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_field("typed_value", Arc::new(typed_value)) + .with_field("value", Arc::new(values)) + .with_nulls(nulls) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } + + /// Builds struct arrays from component fields + /// + /// TODO: move to arrow crate + #[derive(Debug, Default, Clone)] + struct StructArrayBuilder { + fields: Vec, + arrays: Vec, + nulls: Option, + } + + impl StructArrayBuilder { + fn new() -> Self { + Default::default() + } + + /// Add an array to this struct array as a field with the specified name. + fn with_field(mut self, field_name: &str, array: ArrayRef) -> Self { + let field = Field::new(field_name, array.data_type().clone(), true); + self.fields.push(Arc::new(field)); + self.arrays.push(array); + self + } + + /// Set the null buffer for this struct array. + fn with_nulls(mut self, nulls: NullBuffer) -> Self { + self.nulls = Some(nulls); + self + } + + pub fn build(self) -> StructArray { + let Self { + fields, + arrays, + nulls, + } = self; + StructArray::new(Fields::from(fields), arrays, nulls) + } + } +} diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs b/parquet-variant-compute/src/variant_get/output/mod.rs new file mode 100644 index 000000000000..44b747e126c8 --- /dev/null +++ b/parquet-variant-compute/src/variant_get/output/mod.rs @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod primitive; +mod variant; + +use crate::variant_get::output::primitive::PrimitiveOutputBuilder; +use crate::variant_get::output::variant::VariantOutputBuilder; +use crate::variant_get::GetOptions; +use crate::VariantArray; +use arrow::array::{ArrayRef, BinaryViewArray}; +use arrow::datatypes::Int32Type; +use arrow::error::Result; +use arrow_schema::{ArrowError, DataType}; + +/// This trait represents something that gets the output of the variant_get kernel. +/// +/// For example, there are specializations for writing the output as a VariantArray, +/// or as a specific type (e.g. Int32Array). +/// +/// See [`instantiate_output_builder`] to create an instance of this trait. +pub(crate) trait OutputBuilder { + /// create output for a shredded variant array + fn partially_shredded( + &self, + variant_array: &VariantArray, + metadata: &BinaryViewArray, + value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> Result; + + /// output for a perfectly shredded variant array + fn fully_shredded( + &self, + variant_array: &VariantArray, + metadata: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> Result; + + /// write out an unshredded variant array + fn unshredded( + &self, + variant_array: &VariantArray, + metadata: &BinaryViewArray, + value_field: &BinaryViewArray, + ) -> Result; +} + +pub(crate) fn instantiate_output_builder<'a>( + options: GetOptions<'a>, +) -> Result> { + let GetOptions { + as_type, + path, + cast_options, + } = options; + + let Some(as_type) = as_type else { + return Ok(Box::new(VariantOutputBuilder::new(path))); + }; + + // handle typed output + match as_type.data_type() { + DataType::Int32 => Ok(Box::new(PrimitiveOutputBuilder::::new( + path, + as_type, + cast_options, + ))), + dt => Err(ArrowError::NotYetImplemented(format!( + "variant_get with as_type={dt} is not implemented yet", + ))), + } +} diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs new file mode 100644 index 000000000000..2ffc53f5a099 --- /dev/null +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::VariantArray; +use arrow::error::Result; + +use arrow::array::{ + Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, + PrimitiveArray, +}; +use arrow::compute::{cast_with_options, CastOptions}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::{Variant, VariantPath}; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Trait for Arrow primitive types that can be used in the output builder +/// +/// This just exists to add a generic way to convert from Variant to the primitive type +pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType { + /// Try to extract the primitive value from a Variant, returning None if it + /// cannot be converted + /// + /// TODO: figure out how to handle coercion/casting + fn from_variant(variant: &Variant) -> Option; +} + +/// Outputs Primitive arrays +pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> { + /// What path to extract + path: VariantPath<'a>, + /// Returned output type + as_type: FieldRef, + /// Controls the casting behavior (e.g. error vs substituting null on cast error). + cast_options: CastOptions<'a>, + /// Phantom data for the primitive type + _phantom: PhantomData, +} + +impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { + pub(super) fn new( + path: VariantPath<'a>, + as_type: FieldRef, + cast_options: CastOptions<'a>, + ) -> Self { + Self { + path, + as_type, + cast_options, + _phantom: PhantomData, + } + } +} + +impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, T> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result { + // build up the output array element by element + let mut nulls = NullBufferBuilder::new(variant_array.len()); + let mut values = Vec::with_capacity(variant_array.len()); + let typed_value = + cast_with_options(typed_value, self.as_type.data_type(), &self.cast_options)?; + // downcast to the primitive array (e.g. Int32Array, Float64Array, etc) + let typed_value = typed_value.as_primitive::(); + + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } + + // if the typed value is null, decode the variant and extract the value + if typed_value.is_null(i) { + // todo follow path + let variant = variant_array.value(i); + let Some(value) = T::from_variant(&variant) else { + if self.cast_options.safe { + // safe mode: append null if we can't convert + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } else { + return Err(ArrowError::CastError(format!( + "Failed to extract primitive of type {} from variant {:?} at path {:?}", + self.as_type.data_type(), + variant, + self.path + ))); + } + }; + + nulls.append_non_null(); + values.push(value) + } else { + // otherwise we have a typed value, so we can use it directly + nulls.append_non_null(); + values.push(typed_value.value(i)); + } + } + + let nulls = nulls.finish(); + let array = PrimitiveArray::::new(values.into(), nulls) + .with_data_type(self.as_type.data_type().clone()); + Ok(Arc::new(array)) + } + + fn fully_shredded( + &self, + _variant_array: &VariantArray, + _metadata: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result { + // if the types match exactly, we can just return the typed_value + if typed_value.data_type() == self.as_type.data_type() { + Ok(typed_value.clone()) + } else { + // TODO: try to cast the typed_value to the desired type? + Err(ArrowError::NotYetImplemented(format!( + "variant_get fully_shredded as {:?} with typed_value={:?} is not implemented yet", + self.as_type.data_type(), + typed_value.data_type() + ))) + } + } + + fn unshredded( + &self, + _variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + ) -> Result { + Err(ArrowError::NotYetImplemented(String::from( + "variant_get unshredded to primitive types is not implemented yet", + ))) + } +} + +impl ArrowPrimitiveVariant for Int32Type { + fn from_variant(variant: &Variant) -> Option { + variant.as_int32() + } +} + +// todo for other primitive types diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs new file mode 100644 index 000000000000..fd6a5ddd6211 --- /dev/null +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::{VariantArray, VariantArrayBuilder}; +use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, DataType}; +use parquet_variant::{Variant, VariantPath}; +use std::sync::Arc; + +/// Outputs VariantArrays +pub(super) struct VariantOutputBuilder<'a> { + /// What path to extract + path: VariantPath<'a>, +} + +impl<'a> VariantOutputBuilder<'a> { + pub(super) fn new(path: VariantPath<'a>) -> Self { + Self { path } + } +} + +impl<'a> OutputBuilder for VariantOutputBuilder<'a> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + // TODO(perf): can reuse the metadata field here to avoid re-creating it + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result { + // in this case dispatch on the typed_value and + // TODO macro'ize this using downcast! to handle all other primitive types + // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) + let mut array_builder = VariantArrayBuilder::new(variant_array.len()); + match typed_value.data_type() { + DataType::Int32 => { + let primitive_array = typed_value.as_primitive::(); + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + array_builder.append_null(); + continue; + } + + if typed_value.is_null(i) { + // fall back to the value (variant) field + // (TODO could copy the variant bytes directly) + let value = variant_array.value(i); + array_builder.append_variant(value); + continue; + } + + // otherwise we have a typed value, so we can use it directly + let int_value = primitive_array.value(i); + array_builder.append_variant(Variant::from(int_value)); + } + } + dt => { + return Err(ArrowError::NotYetImplemented(format!( + "variant_get fully_shredded with typed_value={dt} is not implemented yet", + ))); + } + }; + Ok(Arc::new(array_builder.build())) + } + + fn fully_shredded( + &self, + variant_array: &VariantArray, + // TODO(perf): can reuse the metadata field here to avoid re-creating it + _metadata: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result { + // in this case dispatch on the typed_value and + // TODO macro'ize this using downcast! to handle all other primitive types + // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) + let mut array_builder = VariantArrayBuilder::new(variant_array.len()); + match typed_value.data_type() { + DataType::Int32 => { + let primitive_array = typed_value.as_primitive::(); + for i in 0..variant_array.len() { + if primitive_array.is_null(i) { + array_builder.append_null(); + continue; + } + + let int_value = primitive_array.value(i); + array_builder.append_variant(Variant::from(int_value)); + } + } + dt => { + return Err(ArrowError::NotYetImplemented(format!( + "variant_get fully_shredded with typed_value={dt} is not implemented yet", + ))); + } + }; + Ok(Arc::new(array_builder.build())) + } + + fn unshredded( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + ) -> arrow::error::Result { + let mut builder = VariantArrayBuilder::new(variant_array.len()); + for i in 0..variant_array.len() { + let new_variant = variant_array.value(i); + + // TODO: perf? + let Some(new_variant) = new_variant.get_path(&self.path) else { + // path not found, append null + builder.append_null(); + continue; + }; + + // TODO: we're decoding the value and doing a copy into a variant value + // again. This can be much faster by using the _metadata and _value_field + // to avoid decoding the entire variant: + // + // 1) reuse the metadata arrays as is + // + // 2) Create a new BinaryViewArray that uses the same underlying buffers + // that the original variant used, but whose views points to a new + // offset for the new path + builder.append_variant(new_variant); + } + + Ok(Arc::new(builder.build())) + } +} diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index ddbfc5e469a4..e08d5ab52353 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -63,7 +63,7 @@ use std::{borrow::Cow, ops::Deref}; /// .join("baz"); /// assert_eq!(path[1], VariantPathElement::field("bar")); /// ``` -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Default)] pub struct VariantPath<'a>(Vec>); impl<'a> VariantPath<'a> { From f458ec555f2f1750cdcb4069d0c8b2e7d54d15bc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 5 Aug 2025 09:28:54 -0400 Subject: [PATCH 02/14] Apply suggestions from code review Co-authored-by: Samyak Sarnayak --- parquet-variant-compute/src/variant_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e47e52e595f4..f484be638cf6 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -158,7 +158,7 @@ impl VariantArray { /// Dictionary-Encoded, preferably (but not required) with an index type of /// int8. /// - /// Currently, only [`BinaryViewArray`] are supported. + /// Currently, only [`BinaryViewArray`] are supported. /// /// [`BinaryViewArray`]: arrow::array::BinaryViewArray pub fn try_new(inner: ArrayRef) -> Result { From 6b6665039a36567ce2bf6031f590fed464c0088d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 5 Aug 2025 09:34:18 -0400 Subject: [PATCH 03/14] remove completed todo comment --- parquet-variant-compute/src/variant_get/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index a8293b53cc53..2ec923ea10b9 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -45,7 +45,6 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { let output_builder = instantiate_output_builder(options.clone())?; // Dispatch based on the shredding state of the input variant array - // TODO make this an enum on VariantArray (e.g ShreddingState) match variant_array.shredding_state() { ShreddingState::PartiallyShredded { metadata, From 924155ca68d4b056e7e4d201b8182f620a17732d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 5 Aug 2025 09:34:32 -0400 Subject: [PATCH 04/14] Move validation logic to `ShreddingState::try_new` --- parquet-variant-compute/src/variant_array.rs | 204 ++++++++++--------- 1 file changed, 104 insertions(+), 100 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index f484be638cf6..fda6b1c1b33c 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -52,85 +52,6 @@ pub struct VariantArray { shredding_state: ShreddingState, } -/// Variant arrays can be shredded in one of three states, encoded here -#[derive(Debug)] -pub enum ShreddingState { - /// This variant has no typed_value field - Unshredded { - metadata: BinaryViewArray, - value: BinaryViewArray, - }, - /// This variant has a typed_value field and no value field - /// meaning it is fully shredded (aka the value is stored in typed_value) - FullyShredded { - metadata: BinaryViewArray, - typed_value: ArrayRef, - }, - /// This variant has both a value field and a typed_value field - /// meaning it is partially shredded: first the typed_value is used, and - /// if that is null, the value field is used. - PartiallyShredded { - metadata: BinaryViewArray, - value: BinaryViewArray, - typed_value: ArrayRef, - }, -} - -impl ShreddingState { - /// Return a reference to the metadata field - pub fn metadata_field(&self) -> &BinaryViewArray { - match self { - ShreddingState::Unshredded { metadata, .. } => metadata, - ShreddingState::FullyShredded { metadata, .. } => metadata, - ShreddingState::PartiallyShredded { metadata, .. } => metadata, - } - } - - /// Return a reference to the value field, if present - pub fn value_field(&self) -> Option<&BinaryViewArray> { - match self { - ShreddingState::Unshredded { value, .. } => Some(value), - ShreddingState::FullyShredded { .. } => None, - ShreddingState::PartiallyShredded { value, .. } => Some(value), - } - } - - /// Return a reference to the typed_value field, if present - pub fn typed_value_field(&self) -> Option<&ArrayRef> { - match self { - ShreddingState::Unshredded { .. } => None, - ShreddingState::FullyShredded { typed_value, .. } => Some(typed_value), - ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), - } - } - - /// Slice all the underlying arrays - pub fn slice(&self, offset: usize, length: usize) -> Self { - match self { - ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded { - metadata: metadata.slice(offset, length), - value: value.slice(offset, length), - }, - ShreddingState::FullyShredded { - metadata, - typed_value, - } => ShreddingState::FullyShredded { - metadata: metadata.slice(offset, length), - typed_value: typed_value.slice(offset, length), - }, - ShreddingState::PartiallyShredded { - metadata, - value, - typed_value, - } => ShreddingState::PartiallyShredded { - metadata: metadata.slice(offset, length), - value: value.slice(offset, length), - typed_value: typed_value.slice(offset, length), - }, - } - } -} - impl VariantArray { /// Creates a new `VariantArray` from a [`StructArray`]. /// @@ -200,27 +121,8 @@ impl VariantArray { // Note these clones are cheap, they just bump the ref count let inner = inner.clone(); - let metadata = metadata.clone(); - let value = value.cloned(); - let typed_value = typed_value.cloned(); - - let shredding_state = match (metadata, value, typed_value) { - (metadata, Some(value), Some(typed_value)) => ShreddingState::PartiallyShredded { - metadata, - value, - typed_value, - }, - (metadata, Some(value), None) => ShreddingState::Unshredded { metadata, value }, - (metadata, None, Some(typed_value)) => ShreddingState::FullyShredded { - metadata, - typed_value, - }, - (_metadata_field, None, None) => { - return Err(ArrowError::InvalidArgumentError(String::from( - "VariantArray has neither value nor typed_value field", - ))); - } - }; + let shredding_state = + ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?; Ok(Self { inner, @@ -308,6 +210,108 @@ impl VariantArray { } } +/// Variant arrays can be shredded in one of three states, encoded here +#[derive(Debug)] +pub enum ShreddingState { + /// This variant has no typed_value field + Unshredded { + metadata: BinaryViewArray, + value: BinaryViewArray, + }, + /// This variant has a typed_value field and no value field + /// meaning it is fully shredded (aka the value is stored in typed_value) + FullyShredded { + metadata: BinaryViewArray, + typed_value: ArrayRef, + }, + /// This variant has both a value field and a typed_value field + /// meaning it is partially shredded: first the typed_value is used, and + /// if that is null, the value field is used. + PartiallyShredded { + metadata: BinaryViewArray, + value: BinaryViewArray, + typed_value: ArrayRef, + }, +} + +impl ShreddingState { + /// try to create a new `ShreddingState` from the given fields + pub fn try_new( + metadata: BinaryViewArray, + value: Option, + typed_value: Option, + ) -> Result { + match (metadata, value, typed_value) { + (metadata, Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded { + metadata, + value, + typed_value, + }), + (metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }), + (metadata, None, Some(typed_value)) => Ok(Self::FullyShredded { + metadata, + typed_value, + }), + (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from( + "VariantArray has neither value nor typed_value field", + ))), + } + } + + /// Return a reference to the metadata field + pub fn metadata_field(&self) -> &BinaryViewArray { + match self { + ShreddingState::Unshredded { metadata, .. } => metadata, + ShreddingState::FullyShredded { metadata, .. } => metadata, + ShreddingState::PartiallyShredded { metadata, .. } => metadata, + } + } + + /// Return a reference to the value field, if present + pub fn value_field(&self) -> Option<&BinaryViewArray> { + match self { + ShreddingState::Unshredded { value, .. } => Some(value), + ShreddingState::FullyShredded { .. } => None, + ShreddingState::PartiallyShredded { value, .. } => Some(value), + } + } + + /// Return a reference to the typed_value field, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + match self { + ShreddingState::Unshredded { .. } => None, + ShreddingState::FullyShredded { typed_value, .. } => Some(typed_value), + ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), + } + } + + /// Slice all the underlying arrays + pub fn slice(&self, offset: usize, length: usize) -> Self { + match self { + ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded { + metadata: metadata.slice(offset, length), + value: value.slice(offset, length), + }, + ShreddingState::FullyShredded { + metadata, + typed_value, + } => ShreddingState::FullyShredded { + metadata: metadata.slice(offset, length), + typed_value: typed_value.slice(offset, length), + }, + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => ShreddingState::PartiallyShredded { + metadata: metadata.slice(offset, length), + value: value.slice(offset, length), + typed_value: typed_value.slice(offset, length), + }, + } + } +} + /// returns the non-null element at index as a Variant fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant { match typed_value.data_type() { From 5cb10687188ba0d597e049b22c3e5bb1607f36ce Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 Aug 2025 07:27:24 -0400 Subject: [PATCH 05/14] Apply suggestions from code review Co-authored-by: Ryan Johnson --- parquet-variant-compute/src/variant_array.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index fda6b1c1b33c..c2747ab63d53 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -105,15 +105,14 @@ impl VariantArray { }; // Find the value field, if present - let value_field = inner.column_by_name("value"); - let value = value_field - .map(|v| match v.as_binary_view_opt() { - Some(bv) => Ok(bv), - None => Err(ArrowError::NotYetImplemented(format!( + let value = inner + .column_by_name("value") + .map(|v| v.as_binary_view_opt.unwrap_or_else(|| { + ArrowError::NotYetImplemented(format!( "VariantArray 'value' field must be BinaryView, got {}", v.data_type() - ))), - }) + )) + })) .transpose()?; // Find the typed_value field, if present @@ -170,10 +169,7 @@ impl VariantArray { ShreddingState::Unshredded { metadata, value } => { Variant::new(metadata.value(index), value.value(index)) } - ShreddingState::FullyShredded { - metadata: _, - typed_value, - } => { + ShreddingState::FullyShredded { typed_value, .. } => { if typed_value.is_null(index) { Variant::Null } else { From 9c22f85866b006724e82e4149805bd8a7ffba138 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 Aug 2025 07:30:11 -0400 Subject: [PATCH 06/14] cleanup --- parquet-variant-compute/src/variant_array.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index c2747ab63d53..0c99757a13af 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -107,12 +107,15 @@ impl VariantArray { // Find the value field, if present let value = inner .column_by_name("value") - .map(|v| v.as_binary_view_opt.unwrap_or_else(|| { - ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - v.data_type() - )) - })) + .map(|v| { + let Some(binary_view) = v.as_binary_view_opt() else { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + v.data_type() + ))); + }; + Ok(binary_view) + }) .transpose()?; // Find the typed_value field, if present From eee199e5c641d0ea7aad59e9f53d5276669e6a4a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 Aug 2025 07:57:24 -0400 Subject: [PATCH 07/14] Avoid panics in production code --- parquet-variant-compute/src/variant_array.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 0c99757a13af..32cbfd81bc3e 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -320,7 +320,11 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant { } // todo other types here _ => { - todo!(); // Unsupported typed_value type + // We shouldn't panic in production code, but this is a + // placeholder until we implement more types + // TODO tickets: XXXX + debug_assert!(false, "Unsupported typed_value type: {:?}", typed_value.data_type()); + Variant::Null } } } From 336a0a36deb974b3159aba89d70afdea88bd3db1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 Aug 2025 08:01:54 -0400 Subject: [PATCH 08/14] comments --- parquet-variant-compute/src/variant_array.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 32cbfd81bc3e..ecb1623c6492 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -318,12 +318,17 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant { let typed_value = typed_value.as_primitive::(); Variant::from(typed_value.value(index)) } - // todo other types here + // todo other types here (note this is very similar to cast_to_variant.rs) + // so it would be great to figure out how to share this code _ => { // We shouldn't panic in production code, but this is a // placeholder until we implement more types // TODO tickets: XXXX - debug_assert!(false, "Unsupported typed_value type: {:?}", typed_value.data_type()); + debug_assert!( + false, + "Unsupported typed_value type: {:?}", + typed_value.data_type() + ); Variant::Null } } From 3e8820b7a38c22a390009bb7698393fbc9435df9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 Aug 2025 12:46:56 -0400 Subject: [PATCH 09/14] Fix up merge --- parquet-variant-compute/src/variant_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index ecb1623c6492..33cbe6ee782f 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -167,7 +167,7 @@ impl VariantArray { /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. - pub fn value(&self, index: usize) -> Variant { + pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { ShreddingState::Unshredded { metadata, value } => { Variant::new(metadata.value(index), value.value(index)) @@ -312,7 +312,7 @@ impl ShreddingState { } /// returns the non-null element at index as a Variant -fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant { +fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> { match typed_value.data_type() { DataType::Int32 => { let typed_value = typed_value.as_primitive::(); From 55a7ae6be5566cb665d03a80539cbbb1a757156c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 Aug 2025 12:47:14 -0400 Subject: [PATCH 10/14] Update parquet-variant-compute/src/variant_array.rs Co-authored-by: Ryan Johnson --- parquet-variant-compute/src/variant_array.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index ecb1623c6492..419d9d5e3dca 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -108,13 +108,12 @@ impl VariantArray { let value = inner .column_by_name("value") .map(|v| { - let Some(binary_view) = v.as_binary_view_opt() else { - return Err(ArrowError::NotYetImplemented(format!( + v.as_binary_view_opt().ok_or_else(|| { + ArrowError::NotYetImplemented(format!( "VariantArray 'value' field must be BinaryView, got {}", v.data_type() - ))); - }; - Ok(binary_view) + )); + }) }) .transpose()?; From 6e38490647093c97b2a966636d9556d7c40c60ea Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 Aug 2025 12:48:04 -0400 Subject: [PATCH 11/14] fix compilation --- parquet-variant-compute/src/variant_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index a30247e9595a..b38324ef410e 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -112,7 +112,7 @@ impl VariantArray { ArrowError::NotYetImplemented(format!( "VariantArray 'value' field must be BinaryView, got {}", v.data_type() - )); + )) }) }) .transpose()?; From 0ccd0f86669145103f8cfa478e1d3cb23474a4d5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 Aug 2025 13:20:28 -0400 Subject: [PATCH 12/14] update docs --- parquet-variant-compute/src/lib.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index aa63d17a5ef6..5433bd6b6b05 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -15,6 +15,26 @@ // specific language governing permissions and limitations // under the License. +//! [`VariantArray`] and compute kernels for the [Variant Binary Encoding] from [Apache Parquet]. +//! +//! ## Main APIs +//! - [`VariantArray`] : Represents an array of `Variant` values. +//! - [`batch_json_string_to_variant`]: Function to convert a batch of JSON strings to a `VariantArray`. +//! - [`batch_variant_to_json_string`]: Function to convert a `VariantArray` to a batch of JSON strings. +//! - [`cast_to_variant`]: Module to cast other Arrow arrays to `VariantArray`. +//! - [`variant_get`]: Module to get values from a `VariantArray` using a specified [`VariantPath`] +//! +//! ## 🚧 Work In Progress +//! +//! This crate is under active development and is not yet ready for production use. +//! If you are interested in helping, you can find more information on the GitHub [Variant issue] +//! +//! [Variant Binary Encoding]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +//! [Apache Parquet]: https://parquet.apache.org/ +//! [`VariantPath`]: parquet_variant::VariantPath +//! [Variant issue]: https://github.com/apache/arrow-rs/issues/6736 + + pub mod cast_to_variant; mod from_json; mod to_json; From f8b2df44c2d77a1047de52766f3684672aeef5cf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 Aug 2025 13:31:42 -0400 Subject: [PATCH 13/14] Rename ShreddingState to conform better to spec --- parquet-variant-compute/src/lib.rs | 4 +- parquet-variant-compute/src/variant_array.rs | 50 +++++++++++++------ .../src/variant_get/mod.rs | 2 +- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 5433bd6b6b05..de7fc720be93 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -19,6 +19,7 @@ //! //! ## Main APIs //! - [`VariantArray`] : Represents an array of `Variant` values. +//! - [`VariantArrayBuilder`]: For building [`VariantArray`] //! - [`batch_json_string_to_variant`]: Function to convert a batch of JSON strings to a `VariantArray`. //! - [`batch_variant_to_json_string`]: Function to convert a `VariantArray` to a batch of JSON strings. //! - [`cast_to_variant`]: Module to cast other Arrow arrays to `VariantArray`. @@ -34,7 +35,6 @@ //! [`VariantPath`]: parquet_variant::VariantPath //! [Variant issue]: https://github.com/apache/arrow-rs/issues/6736 - pub mod cast_to_variant; mod from_json; mod to_json; @@ -42,7 +42,7 @@ mod variant_array; mod variant_array_builder; pub mod variant_get; -pub use variant_array::VariantArray; +pub use variant_array::{ShreddingState, VariantArray}; pub use variant_array_builder::{VariantArrayBuilder, VariantArrayVariantBuilder}; pub use from_json::batch_json_string_to_variant; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index b38324ef410e..d51df550622d 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -80,8 +80,6 @@ impl VariantArray { /// int8. /// /// Currently, only [`BinaryViewArray`] are supported. - /// - /// [`BinaryViewArray`]: arrow::array::BinaryViewArray pub fn try_new(inner: ArrayRef) -> Result { let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( @@ -171,7 +169,7 @@ impl VariantArray { ShreddingState::Unshredded { metadata, value } => { Variant::new(metadata.value(index), value.value(index)) } - ShreddingState::FullyShredded { typed_value, .. } => { + ShreddingState::Typed { typed_value, .. } => { if typed_value.is_null(index) { Variant::Null } else { @@ -208,23 +206,45 @@ impl VariantArray { } } -/// Variant arrays can be shredded in one of three states, encoded here +/// Represents the shredding state of a [`VariantArray`] +/// +/// [`VariantArray`]s can be shredded according to the [Parquet Variant +/// Shredding Spec]. Shredding means that the actual value is stored in a typed +/// `typed_field` instead of the generic `value` field. +/// +/// Both value and typed_value are optional fields used together to encode a +/// single value. Values in the two fields must be interpreted according to the +/// following table (see [Parquet Variant Shredding Spec] for more details): +/// +/// | value | typed_value | Meaning | +/// |----------|--------------|---------| +/// | null | null | The value is missing; only valid for shredded object fields | +/// | non-null | null | The value is present and may be any type, including `null` | +/// | null | non-null | The value is present and is the shredded type | +/// | non-null | non-null | The value is present and is a partially shredded object | +/// +/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding #[derive(Debug)] pub enum ShreddingState { + // TODO: add missing state where there is neither value nor typed_value + // Missing { metadata: BinaryViewArray }, /// This variant has no typed_value field Unshredded { metadata: BinaryViewArray, value: BinaryViewArray, }, /// This variant has a typed_value field and no value field - /// meaning it is fully shredded (aka the value is stored in typed_value) - FullyShredded { + /// meaning it is the shredded type + Typed { metadata: BinaryViewArray, typed_value: ArrayRef, }, - /// This variant has both a value field and a typed_value field - /// meaning it is partially shredded: first the typed_value is used, and - /// if that is null, the value field is used. + /// Partially shredded: + /// * value is an object + /// * typed_value is a shredded object. + /// + /// Note the spec says "Writers must not produce data where both value and + /// typed_value are non-null, unless the Variant value is an object." PartiallyShredded { metadata: BinaryViewArray, value: BinaryViewArray, @@ -246,7 +266,7 @@ impl ShreddingState { typed_value, }), (metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }), - (metadata, None, Some(typed_value)) => Ok(Self::FullyShredded { + (metadata, None, Some(typed_value)) => Ok(Self::Typed { metadata, typed_value, }), @@ -260,7 +280,7 @@ impl ShreddingState { pub fn metadata_field(&self) -> &BinaryViewArray { match self { ShreddingState::Unshredded { metadata, .. } => metadata, - ShreddingState::FullyShredded { metadata, .. } => metadata, + ShreddingState::Typed { metadata, .. } => metadata, ShreddingState::PartiallyShredded { metadata, .. } => metadata, } } @@ -269,7 +289,7 @@ impl ShreddingState { pub fn value_field(&self) -> Option<&BinaryViewArray> { match self { ShreddingState::Unshredded { value, .. } => Some(value), - ShreddingState::FullyShredded { .. } => None, + ShreddingState::Typed { .. } => None, ShreddingState::PartiallyShredded { value, .. } => Some(value), } } @@ -278,7 +298,7 @@ impl ShreddingState { pub fn typed_value_field(&self) -> Option<&ArrayRef> { match self { ShreddingState::Unshredded { .. } => None, - ShreddingState::FullyShredded { typed_value, .. } => Some(typed_value), + ShreddingState::Typed { typed_value, .. } => Some(typed_value), ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), } } @@ -290,10 +310,10 @@ impl ShreddingState { metadata: metadata.slice(offset, length), value: value.slice(offset, length), }, - ShreddingState::FullyShredded { + ShreddingState::Typed { metadata, typed_value, - } => ShreddingState::FullyShredded { + } => ShreddingState::Typed { metadata: metadata.slice(offset, length), typed_value: typed_value.slice(offset, length), }, diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index 2ec923ea10b9..c19a9d6750ff 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -51,7 +51,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { value, typed_value, } => output_builder.partially_shredded(variant_array, metadata, value, typed_value), - ShreddingState::FullyShredded { + ShreddingState::Typed { metadata, typed_value, } => output_builder.fully_shredded(variant_array, metadata, typed_value), From 2355247e0f20f1d7fcb71bfa6f7c43d70a7d6e7d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 7 Aug 2025 13:32:36 -0400 Subject: [PATCH 14/14] Update name in output builder --- parquet-variant-compute/src/variant_get/mod.rs | 2 +- parquet-variant-compute/src/variant_get/output/mod.rs | 2 +- parquet-variant-compute/src/variant_get/output/primitive.rs | 2 +- parquet-variant-compute/src/variant_get/output/variant.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index c19a9d6750ff..cc852bbc32a2 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -54,7 +54,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { ShreddingState::Typed { metadata, typed_value, - } => output_builder.fully_shredded(variant_array, metadata, typed_value), + } => output_builder.typed(variant_array, metadata, typed_value), ShreddingState::Unshredded { metadata, value } => { output_builder.unshredded(variant_array, metadata, value) } diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs b/parquet-variant-compute/src/variant_get/output/mod.rs index 44b747e126c8..245d73cce8db 100644 --- a/parquet-variant-compute/src/variant_get/output/mod.rs +++ b/parquet-variant-compute/src/variant_get/output/mod.rs @@ -44,7 +44,7 @@ pub(crate) trait OutputBuilder { ) -> Result; /// output for a perfectly shredded variant array - fn fully_shredded( + fn typed( &self, variant_array: &VariantArray, metadata: &BinaryViewArray, diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 2ffc53f5a099..36e4221e3242 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -126,7 +126,7 @@ impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, Ok(Arc::new(array)) } - fn fully_shredded( + fn typed( &self, _variant_array: &VariantArray, _metadata: &BinaryViewArray, diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index fd6a5ddd6211..2c04111a5306 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -79,7 +79,7 @@ impl<'a> OutputBuilder for VariantOutputBuilder<'a> { Ok(Arc::new(array_builder.build())) } - fn fully_shredded( + fn typed( &self, variant_array: &VariantArray, // TODO(perf): can reuse the metadata field here to avoid re-creating it