Skip to content

Commit 10d9714

Browse files
authored
Introduce MAX_INLINE_VIEW_LEN constant for string/byte views (#7719)
# Which issue does this PR close? As suggested by @Dandandan in #7650 (comment): > We probably should set this as constant somewhere and use it # Rationale for this change Using a symbolic constant in the code rather than a hard coded constant makes it easier to: 1. Understand what the value means 2. Link / attach documentation to the constant to provide context # What changes are included in this PR? 1. Introduce `MAX_INLINE_VIEW_LEN` constant for string/byte views 2. Update code to use that instead of `12` # Are there any user-facing changes? A new constant
1 parent 4d3906c commit 10d9714

File tree

4 files changed

+44
-34
lines changed

4 files changed

+44
-34
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use crate::types::bytes::ByteArrayNativeType;
2222
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2323
use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
2424
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25-
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
25+
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
2626
use arrow_schema::{ArrowError, DataType};
2727
use core::str;
2828
use num::ToPrimitive;
@@ -77,8 +77,9 @@ use super::ByteArrayType;
7777
/// 0 31 63 95 127
7878
/// ```
7979
///
80-
/// * Strings with length <= 12 are stored directly in the view. See
81-
/// [`Self::inline_value`] to access the inlined prefix from a short view.
80+
/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
81+
/// the view. See [`Self::inline_value`] to access the inlined prefix from a
82+
/// short view.
8283
///
8384
/// * Strings with length > 12: The first four bytes are stored inline in the
8485
/// view and the entire string is stored in one of the buffers. See [`ByteView`]
@@ -128,6 +129,7 @@ use super::ByteArrayType;
128129
/// assert_eq!(value, "this string is also longer than 12 bytes");
129130
/// ```
130131
///
132+
/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
131133
/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
132134
///
133135
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
@@ -316,7 +318,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
316318
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
317319
let v = self.views.get_unchecked(idx);
318320
let len = *v as u32;
319-
let b = if len <= 12 {
321+
let b = if len <= MAX_INLINE_VIEW_LEN {
320322
Self::inline_value(v, len as usize)
321323
} else {
322324
let view = ByteView::from(*v);
@@ -331,10 +333,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
331333
///
332334
/// # Safety
333335
/// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
334-
/// - The `len` must be the length of the inlined value. It should never be larger than 12.
336+
/// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
335337
#[inline(always)]
336338
pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
337-
debug_assert!(len <= 12);
339+
debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
338340
std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
339341
}
340342

@@ -347,7 +349,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
347349
pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
348350
self.views.iter().map(move |v| {
349351
let len = *v as u32;
350-
if len <= 12 {
352+
if len <= MAX_INLINE_VIEW_LEN {
351353
unsafe { Self::inline_value(v, len as usize) }
352354
} else {
353355
let view = ByteView::from(*v);
@@ -371,7 +373,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
371373
return &[] as &[u8];
372374
}
373375

374-
if prefix_len <= 4 || len <= 12 {
376+
if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
375377
unsafe { StringViewArray::inline_value(v, prefix_len) }
376378
} else {
377379
let view = ByteView::from(*v);
@@ -401,7 +403,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
401403
return &[] as &[u8];
402404
}
403405

404-
if len <= 12 {
406+
if len as u32 <= MAX_INLINE_VIEW_LEN {
405407
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
406408
} else {
407409
let view = ByteView::from(*v);
@@ -495,9 +497,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
495497
self.views()
496498
.iter()
497499
.map(|v| {
498-
let len = (*v as u32) as usize;
499-
if len > 12 {
500-
len
500+
let len = *v as u32;
501+
if len > MAX_INLINE_VIEW_LEN {
502+
len as usize
501503
} else {
502504
0
503505
}
@@ -511,11 +513,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
511513
/// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
512514
///
513515
/// ByteView types give us the following two advantages, and we need to be careful not to lose them:
514-
/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view.
516+
/// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
515517
/// Meaning that reading one array element requires only one memory access
516518
/// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
517519
///
518-
/// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
520+
/// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
519521
/// thanks to the inlined 4 bytes.
520522
/// Consider equality check:
521523
/// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
@@ -525,8 +527,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
525527
/// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
526528
///
527529
/// # Order check flow
528-
/// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view.
529-
/// (2) if any of the string is larger than 12 bytes, we need to compare the full string.
530+
/// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
531+
/// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
530532
/// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
531533
/// (2.2) o.w., we need to compare the full string.
532534
///
@@ -544,7 +546,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
544546
let r_view = right.views().get_unchecked(right_idx);
545547
let r_len = *r_view as u32;
546548

547-
if l_len <= 12 && r_len <= 12 {
549+
if l_len <= MAX_INLINE_VIEW_LEN && r_len <= MAX_INLINE_VIEW_LEN {
548550
let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
549551
let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
550552
return l_data.cmp(r_data);

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use std::marker::PhantomData;
2020
use std::sync::Arc;
2121

2222
use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer};
23-
use arrow_data::ByteView;
23+
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
2424
use arrow_schema::ArrowError;
2525
use hashbrown::hash_table::Entry;
2626
use hashbrown::HashTable;
@@ -68,8 +68,8 @@ impl BlockSizeGrowthStrategy {
6868
///
6969
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
7070
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
71-
/// writes values larger than 12 bytes to the current in-progress block, with values smaller
72-
/// than 12 bytes inlined into the views. If a value is appended that will not fit in the
71+
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller
72+
/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
7373
/// in-progress block, it will be closed, and a new block of sufficient size allocated
7474
///
7575
/// # Append Views
@@ -114,7 +114,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
114114
/// Set a fixed buffer size for variable length strings
115115
///
116116
/// The block size is the size of the buffer used to store values greater
117-
/// than 12 bytes. The builder allocates new buffers when the current
117+
/// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current
118118
/// buffer is full.
119119
///
120120
/// By default the builder balances buffer size and buffer count by
@@ -221,7 +221,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
221221
} else {
222222
self.views_buffer.extend(array.views().iter().map(|v| {
223223
let mut byte_view = ByteView::from(*v);
224-
if byte_view.length > 12 {
224+
if byte_view.length > MAX_INLINE_VIEW_LEN {
225225
// Small views (<=12 bytes) are inlined, so only need to update large views
226226
byte_view.buffer_index += starting_buffer;
227227
};
@@ -289,7 +289,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
289289
pub fn get_value(&self, index: usize) -> &[u8] {
290290
let view = self.views_buffer.as_slice().get(index).unwrap();
291291
let len = *view as u32;
292-
if len <= 12 {
292+
if len <= MAX_INLINE_VIEW_LEN {
293293
// # Safety
294294
// The view is valid from the builder
295295
unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
@@ -315,7 +315,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
315315
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
316316
let v: &[u8] = value.as_ref().as_ref();
317317
let length: u32 = v.len().try_into().unwrap();
318-
if length <= 12 {
318+
if length <= MAX_INLINE_VIEW_LEN {
319319
let mut view_buffer = [0; 16];
320320
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
321321
view_buffer[4..4 + v.len()].copy_from_slice(v);

arrow-data/src/byte_view.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818
use arrow_buffer::Buffer;
1919
use arrow_schema::ArrowError;
2020

21+
/// The maximum number of bytes that can be stored inline in a byte view.
22+
///
23+
/// See [`ByteView`] and [`GenericByteViewArray`] for more information on the
24+
/// layout of the views.
25+
///
26+
/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
27+
pub const MAX_INLINE_VIEW_LEN: u32 = 12;
28+
2129
/// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and
2230
/// `BinaryViewArray`) where the length is greater than 12 bytes.
2331
///
@@ -76,15 +84,15 @@ impl ByteView {
7684
/// See example on [`ByteView`] docs
7785
///
7886
/// Notes:
79-
/// * the length should always be greater than 12 (Data less than 12
80-
/// bytes is stored as an inline view)
87+
/// * the length should always be greater than [`MAX_INLINE_VIEW_LEN`]
88+
/// (Data less than 12 bytes is stored as an inline view)
8189
/// * buffer and offset are set to `0`
8290
///
8391
/// # Panics
8492
/// If the prefix is not exactly 4 bytes
8593
#[inline]
8694
pub fn new(length: u32, prefix: &[u8]) -> Self {
87-
debug_assert!(length > 12);
95+
debug_assert!(length > MAX_INLINE_VIEW_LEN);
8896
Self {
8997
length,
9098
prefix: u32::from_le_bytes(prefix.try_into().unwrap()),
@@ -159,8 +167,8 @@ where
159167
{
160168
for (idx, v) in views.iter().enumerate() {
161169
let len = *v as u32;
162-
if len <= 12 {
163-
if len < 12 && (v >> (32 + len * 8)) != 0 {
170+
if len <= MAX_INLINE_VIEW_LEN {
171+
if len < MAX_INLINE_VIEW_LEN && (v >> (32 + len * 8)) != 0 {
164172
return Err(ArrowError::InvalidArgumentError(format!(
165173
"View at index {idx} contained non-zero padding for string of length {len}",
166174
)));

arrow-select/src/coalesce/byte_view.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use arrow_array::cast::AsArray;
2020
use arrow_array::types::ByteViewType;
2121
use arrow_array::{Array, ArrayRef, GenericByteViewArray};
2222
use arrow_buffer::{Buffer, NullBufferBuilder};
23-
use arrow_data::ByteView;
23+
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
2424
use arrow_schema::ArrowError;
2525
use std::marker::PhantomData;
2626
use std::sync::Arc;
@@ -125,7 +125,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
125125
// If there are buffers, we need to update the buffer index
126126
let updated_views = views.iter().map(|v| {
127127
let mut byte_view = ByteView::from(*v);
128-
if byte_view.length > 12 {
128+
if byte_view.length > MAX_INLINE_VIEW_LEN {
129129
// Small views (<=12 bytes) are inlined, so only need to update large views
130130
byte_view.buffer_index += starting_buffer;
131131
};
@@ -182,7 +182,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
182182
if remaining_capacity < str_len as usize {
183183
break;
184184
}
185-
if str_len > 12 {
185+
if str_len > MAX_INLINE_VIEW_LEN {
186186
remaining_capacity -= str_len as usize;
187187
}
188188
num_view_to_current += 1;
@@ -233,7 +233,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
233233
.iter()
234234
.filter_map(|v| {
235235
let b = ByteView::from(*v);
236-
if b.length > 12 {
236+
if b.length > MAX_INLINE_VIEW_LEN {
237237
Some(b.length as usize)
238238
} else {
239239
None
@@ -251,7 +251,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
251251
// Copy the views, updating the buffer index and copying the data as needed
252252
let new_views = views.iter().map(|v| {
253253
let mut b: ByteView = ByteView::from(*v);
254-
if b.length > 12 {
254+
if b.length > MAX_INLINE_VIEW_LEN {
255255
let buffer_index = b.buffer_index as usize;
256256
let buffer_offset = b.offset as usize;
257257
let str_len = b.length as usize;

0 commit comments

Comments
 (0)