Skip to content

Commit f4840f6

Browse files
authored
[Variant] Implement new VariantValueArrayBuilder (#8360)
# Which issue does this PR close?

- Pre-work for #8361

# Rationale for this change

There is currently no good way to populate a new variant array with variant values that reference existing metadata, but that functionality is needed when transforming existing variant data (e.g. for shredding and unshredding operations).

# What changes are included in this PR?

Add a new `VariantValueArrayBuilder` that does not try to create new metadata; instead, it wraps a `ReadOnlyMetadataBuilder` around the `VariantMetadata` instance of the `Variant` value being inserted. This takes advantage of the new generic `ParentState` capability.

NOTE: The new array builder does _not_ impl `VariantBuilderExt` because it does not have a `MetadataBuilder` instance -- the instance is created on demand as part of the insertion itself. Instead, callers can directly invoke `VariantValueArrayBuilder::parent_state()`. This approach avoids the considerable complexity of keeping an internal metadata column index in sync with whatever external indexing might produce the variant value to be appended. It also doesn't seem to matter -- I did some pathfinding of variant shredding (going from binary to shredded variant based on some target schema), and the `VariantBuilderExt` does not seem especially helpful for that code.

# Are these changes tested?

New unit tests.

# Are there any user-facing changes?

New public type: `VariantValueArrayBuilder`.
1 parent aed2f3b commit f4840f6

File tree

7 files changed

+283
-34
lines changed

7 files changed

+283
-34
lines changed

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ pub mod variant_get;
4646
mod variant_to_arrow;
4747

4848
pub use variant_array::{ShreddingState, VariantArray};
49-
pub use variant_array_builder::VariantArrayBuilder;
49+
pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};
5050

5151
pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};
5252
pub use from_json::json_to_variant;

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 240 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ use arrow_schema::{ArrowError, DataType, Field, Fields};
2323
use parquet_variant::{
2424
BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt,
2525
};
26-
use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder};
26+
use parquet_variant::{
27+
ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder,
28+
};
2729
use std::sync::Arc;
2830

2931
/// A builder for [`VariantArray`]
@@ -205,6 +207,154 @@ impl VariantBuilderExt for VariantArrayBuilder {
205207
}
206208
}
207209

210+
/// A builder for creating only the value column of a [`VariantArray`]
211+
///
212+
/// This builder is used when you have existing metadata and only need to build
213+
/// the value column. It's useful for scenarios like variant unshredding, data
214+
/// transformation, or filtering where you want to reuse existing metadata.
215+
///
216+
/// The builder produces a [`BinaryViewArray`] that can be combined with existing
217+
/// metadata to create a complete [`VariantArray`].
218+
///
219+
/// # Example:
220+
/// ```
221+
/// # use arrow::array::Array;
222+
/// # use parquet_variant::{Variant};
223+
/// # use parquet_variant_compute::VariantValueArrayBuilder;
224+
/// // Create a variant value builder for 10 rows
225+
/// let mut builder = VariantValueArrayBuilder::new(10);
226+
///
227+
/// // Append some values with their corresponding metadata, which the
228+
/// // builder takes advantage of to avoid creating new metadata.
229+
/// builder.append_value(Variant::from(42));
230+
/// builder.append_null();
231+
/// builder.append_value(Variant::from("hello"));
232+
///
233+
/// // Build the final value array
234+
/// let value_array = builder.build().unwrap();
235+
/// assert_eq!(value_array.len(), 3);
236+
/// ```
237+
#[derive(Debug)]
238+
pub struct VariantValueArrayBuilder {
239+
value_builder: ValueBuilder,
240+
value_offsets: Vec<usize>,
241+
nulls: NullBufferBuilder,
242+
}
243+
244+
impl VariantValueArrayBuilder {
245+
/// Create a new `VariantValueArrayBuilder` with the specified row capacity
246+
pub fn new(row_capacity: usize) -> Self {
247+
Self {
248+
value_builder: ValueBuilder::new(),
249+
value_offsets: Vec::with_capacity(row_capacity),
250+
nulls: NullBufferBuilder::new(row_capacity),
251+
}
252+
}
253+
254+
/// Build the final value array
255+
///
256+
/// Returns a [`BinaryViewArray`] containing the serialized variant values.
257+
/// This can be combined with existing metadata to create a complete [`VariantArray`].
258+
pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> {
259+
let value_buffer = self.value_builder.into_inner();
260+
let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets);
261+
if let Some(nulls) = self.nulls.finish() {
262+
let (views, buffers, _) = array.into_parts();
263+
array = BinaryViewArray::try_new(views, buffers, Some(nulls))?;
264+
}
265+
Ok(array)
266+
}
267+
268+
/// Append a null row to the builder
269+
///
270+
/// WARNING: It is only valid to call this method when building the `value` field of a shredded
271+
/// variant column (which is nullable). The `value` field of a binary (unshredded) variant
272+
/// column is non-nullable, and callers should instead invoke [`Self::append_value`] with
273+
/// `Variant::Null`, passing the appropriate metadata value.
274+
pub fn append_null(&mut self) {
275+
self.value_offsets.push(self.value_builder.offset());
276+
self.nulls.append_null();
277+
}
278+
279+
/// Append a variant value with its corresponding metadata
280+
///
281+
/// # Arguments
282+
/// * `value` - The variant value to append. Its own metadata (`value.metadata()`) is wrapped
283+
///   in a read-only metadata builder, so there is no separate `metadata` argument
284+
///
285+
/// # Returns
286+
/// Nothing: the signature has no `Result`, so no error is reported to the caller. The
287+
/// value's bytes are appended via `ValueBuilder::append_variant_bytes`.
288+
///
289+
/// # Example
290+
/// ```
291+
/// # use parquet_variant::Variant;
292+
/// # use parquet_variant_compute::VariantValueArrayBuilder;
293+
/// let mut builder = VariantValueArrayBuilder::new(10);
294+
/// builder.append_value(Variant::from(42));
295+
/// ```
296+
pub fn append_value(&mut self, value: Variant<'_, '_>) {
297+
let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone());
298+
ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value);
299+
}
300+
301+
/// Creates a builder-specific parent state.
302+
///
303+
/// For example, this can be useful for code that wants to copy a subset of fields from an
304+
/// object `value` as a new row of `value_array_builder`:
305+
///
306+
/// ```no_run
307+
/// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant};
308+
/// # use parquet_variant_compute::VariantValueArrayBuilder;
309+
/// # let value = Variant::Null;
310+
/// # let mut value_array_builder = VariantValueArrayBuilder::new(0);
311+
/// # fn should_keep(field_name: &str) -> bool { todo!() };
312+
/// let Variant::Object(obj) = value else {
313+
/// panic!("Not a variant object");
314+
/// };
315+
/// let mut metadata_builder = ReadOnlyMetadataBuilder::new(obj.metadata.clone());
316+
/// let state = value_array_builder.parent_state(&mut metadata_builder);
317+
/// let mut object_builder = ObjectBuilder::new(state, false);
318+
/// for (field_name, field_value) in obj.iter() {
319+
/// if should_keep(field_name) {
320+
/// object_builder.insert_bytes(field_name, field_value);
321+
/// }
322+
/// }
323+
/// object_builder.finish(); // appends the filtered object
324+
/// ```
325+
pub fn parent_state<'a>(
326+
&'a mut self,
327+
metadata_builder: &'a mut dyn MetadataBuilder,
328+
) -> ParentState<'a, ValueArrayBuilderState<'a>> {
329+
let state = ValueArrayBuilderState {
330+
value_offsets: &mut self.value_offsets,
331+
nulls: &mut self.nulls,
332+
};
333+
334+
ParentState::new(&mut self.value_builder, metadata_builder, state)
335+
}
336+
}
337+
338+
/// Builder-specific state for array building that manages array-level offsets and nulls. See
339+
/// [`BuilderSpecificState`] for details.
340+
#[derive(Debug)]
341+
pub struct ValueArrayBuilderState<'a> {
342+
value_offsets: &'a mut Vec<usize>,
343+
nulls: &'a mut NullBufferBuilder,
344+
}
345+
346+
// All changes are pending until finalized
347+
impl BuilderSpecificState for ValueArrayBuilderState<'_> {
348+
fn finish(
349+
&mut self,
350+
_metadata_builder: &mut dyn MetadataBuilder,
351+
value_builder: &mut ValueBuilder,
352+
) {
353+
self.value_offsets.push(value_builder.offset());
354+
self.nulls.append_non_null();
355+
}
356+
}
357+
208358
fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
209359
// All offsets are less than or equal to the buffer length, so we can safely cast all offsets
210360
// inside the loop below, as long as the buffer length fits in u32.
@@ -228,6 +378,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar
228378
mod test {
229379
use super::*;
230380
use arrow::array::Array;
381+
use parquet_variant::Variant;
231382

232383
/// Test that both the metadata and value buffers are non-nullable
233384
#[test]
@@ -288,4 +439,92 @@ mod test {
288439
let list = variant.as_list().expect("variant to be a list");
289440
assert_eq!(list.len(), 2);
290441
}
442+
443+
#[test]
444+
fn test_variant_value_array_builder_basic() {
445+
let mut builder = VariantValueArrayBuilder::new(10);
446+
447+
// Append some values
448+
builder.append_value(Variant::from(42i32));
449+
builder.append_null();
450+
builder.append_value(Variant::from("hello"));
451+
452+
let value_array = builder.build().unwrap();
453+
assert_eq!(value_array.len(), 3);
454+
}
455+
456+
#[test]
457+
fn test_variant_value_array_builder_with_objects() {
458+
// Populate a variant array with objects
459+
let mut builder = VariantArrayBuilder::new(3);
460+
builder
461+
.new_object()
462+
.with_field("name", "Alice")
463+
.with_field("age", 30i32)
464+
.finish();
465+
466+
builder
467+
.new_object()
468+
.with_field("name", "Bob")
469+
.with_field("age", 42i32)
470+
.with_field("city", "Wonderland")
471+
.finish();
472+
473+
builder
474+
.new_object()
475+
.with_field("name", "Charlie")
476+
.with_field("age", 1i32)
477+
.finish();
478+
479+
let array = builder.build();
480+
481+
// Copy (some of) the objects over to the value array builder
482+
//
483+
// NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only
484+
// filter or manipulate values within a row.
485+
let mut builder = VariantValueArrayBuilder::new(3);
486+
487+
// straight copy
488+
builder.append_value(array.value(0));
489+
490+
// filtering fields takes more work because we need to manually create an object builder
491+
let value = array.value(1);
492+
let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone());
493+
let state = builder.parent_state(&mut metadata_builder);
494+
ObjectBuilder::new(state, false)
495+
.with_field("name", value.get_object_field("name").unwrap())
496+
.with_field("age", value.get_object_field("age").unwrap())
497+
.finish();
498+
499+
// same bytes, but now nested and duplicated inside a list
500+
let value = array.value(2);
501+
let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone());
502+
let state = builder.parent_state(&mut metadata_builder);
503+
ListBuilder::new(state, false)
504+
.with_value(value.clone())
505+
.with_value(value.clone())
506+
.finish();
507+
508+
let array2 = VariantArray::from_parts(
509+
array.metadata_field().clone(),
510+
Some(builder.build().unwrap()),
511+
None,
512+
None,
513+
);
514+
515+
assert_eq!(array2.len(), 3);
516+
assert_eq!(array.value(0), array2.value(0));
517+
518+
assert_eq!(
519+
array.value(1).get_object_field("name"),
520+
array2.value(1).get_object_field("name")
521+
);
522+
assert_eq!(
523+
array.value(1).get_object_field("age"),
524+
array2.value(1).get_object_field("age")
525+
);
526+
527+
assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap());
528+
assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap());
529+
}
291530
}

parquet-variant-compute/src/variant_get.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,13 @@ fn shredded_get_path(
135135
let shred_basic_variant =
136136
|target: VariantArray, path: VariantPath<'_>, as_field: Option<&Field>| {
137137
let as_type = as_field.map(|f| f.data_type());
138-
let mut builder =
139-
make_variant_to_arrow_row_builder(path, as_type, cast_options, target.len())?;
138+
let mut builder = make_variant_to_arrow_row_builder(
139+
target.metadata_field(),
140+
path,
141+
as_type,
142+
cast_options,
143+
target.len(),
144+
)?;
140145
for i in 0..target.len() {
141146
if target.is_null(i) {
142147
builder.append_null()?;

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::array::{ArrayRef, PrimitiveBuilder};
18+
use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder};
1919
use arrow::compute::CastOptions;
2020
use arrow::datatypes::{self, ArrowPrimitiveType, DataType};
2121
use arrow::error::{ArrowError, Result};
2222
use parquet_variant::{Variant, VariantPath};
2323

2424
use crate::type_conversion::VariantAsPrimitive;
25-
use crate::VariantArrayBuilder;
25+
use crate::{VariantArray, VariantValueArrayBuilder};
2626

2727
use std::sync::Arc;
2828

@@ -109,7 +109,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
109109
}
110110

111111
pub(crate) fn make_variant_to_arrow_row_builder<'a>(
112-
//metadata: &BinaryViewArray,
112+
metadata: &BinaryViewArray,
113113
path: VariantPath<'a>,
114114
data_type: Option<&'a DataType>,
115115
cast_options: &'a CastOptions,
@@ -119,7 +119,10 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>(
119119

120120
let mut builder = match data_type {
121121
// If no data type was requested, build an unshredded VariantArray.
122-
None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(capacity)),
122+
None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(
123+
metadata.clone(),
124+
capacity,
125+
)),
123126
Some(DataType::Int8) => Int8(VariantToPrimitiveArrowRowBuilder::new(
124127
cast_options,
125128
capacity,
@@ -278,36 +281,40 @@ where
278281

279282
/// Builder for creating VariantArray output (for path extraction without type conversion)
280283
pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
281-
builder: VariantArrayBuilder,
284+
metadata: BinaryViewArray,
285+
builder: VariantValueArrayBuilder,
286+
nulls: NullBufferBuilder,
282287
}
283288

284289
impl VariantToBinaryVariantArrowRowBuilder {
285-
fn new(capacity: usize) -> Self {
290+
fn new(metadata: BinaryViewArray, capacity: usize) -> Self {
286291
Self {
287-
builder: VariantArrayBuilder::new(capacity),
292+
metadata,
293+
builder: VariantValueArrayBuilder::new(capacity),
294+
nulls: NullBufferBuilder::new(capacity),
288295
}
289296
}
290297
}
291298

292299
impl VariantToBinaryVariantArrowRowBuilder {
293300
fn append_null(&mut self) -> Result<()> {
294301
self.builder.append_null();
302+
self.nulls.append_null();
295303
Ok(())
296304
}
297305

298306
fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
299-
// TODO: We need a way to convert a Variant directly to bytes. In particular, we want to
300-
// just copy across the underlying value byte slice of a `Variant::Object` or
301-
// `Variant::List`, without any interaction with a `VariantMetadata` (because the shredding
302-
// spec requires us to reuse the existing metadata when unshredding).
303-
//
304-
// One could _probably_ emulate this with parquet_variant::VariantBuilder, but it would do a
305-
// lot of unnecessary work and would also create a new metadata column we don't need.
306-
self.builder.append_variant(value.clone());
307+
self.builder.append_value(value.clone());
308+
self.nulls.append_non_null();
307309
Ok(true)
308310
}
309311

310-
fn finish(self) -> Result<ArrayRef> {
311-
Ok(Arc::new(self.builder.build()))
312+
fn finish(mut self) -> Result<ArrayRef> {
313+
Ok(Arc::new(VariantArray::from_parts(
314+
self.metadata,
315+
Some(self.builder.build()?),
316+
None, // no typed_value column
317+
self.nulls.finish(),
318+
)))
312319
}
313320
}

parquet-variant/src/builder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder {
562562

563563
impl WritableMetadataBuilder {
564564
/// Upsert field name to dictionary, return its ID
565-
fn upsert_field_name(&mut self, field_name: &str) -> u32 {
565+
pub fn upsert_field_name(&mut self, field_name: &str) -> u32 {
566566
let (id, new_entry) = self.field_names.insert_full(field_name.to_string());
567567

568568
if new_entry {

0 commit comments

Comments
 (0)