Skip to content

Commit 96fd53a

Browse files
committed
Merge remote-tracking branch 'upstream/main' into issue_7743
2 parents 442402b + b269422 commit 96fd53a

File tree

17 files changed

+3201
-272
lines changed

17 files changed

+3201
-272
lines changed

arrow-arith/src/numeric.rs

Lines changed: 644 additions & 39 deletions
Large diffs are not rendered by default.

arrow-array/src/array/byte_view_array.rs

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use crate::types::bytes::ByteArrayNativeType;
2222
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2323
use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
2424
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25-
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
25+
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
2626
use arrow_schema::{ArrowError, DataType};
2727
use core::str;
2828
use num::ToPrimitive;
@@ -78,8 +78,9 @@ use super::ByteArrayType;
7878
/// 0 31 63 95 127
7979
/// ```
8080
///
81-
/// * Strings with length <= 12 are stored directly in the view. See
82-
/// [`Self::inline_value`] to access the inlined prefix from a short view.
81+
/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82+
/// the view. See [`Self::inline_value`] to access the inlined prefix from a
83+
/// short view.
8384
///
8485
/// * Strings with length > 12: The first four bytes are stored inline in the
8586
/// view and the entire string is stored in one of the buffers. See [`ByteView`]
@@ -129,6 +130,7 @@ use super::ByteArrayType;
129130
/// assert_eq!(value, "this string is also longer than 12 bytes");
130131
/// ```
131132
///
133+
/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
132134
/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
133135
///
134136
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
@@ -317,7 +319,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
317319
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
318320
let v = self.views.get_unchecked(idx);
319321
let len = *v as u32;
320-
let b = if len <= 12 {
322+
let b = if len <= MAX_INLINE_VIEW_LEN {
321323
Self::inline_value(v, len as usize)
322324
} else {
323325
let view = ByteView::from(*v);
@@ -332,10 +334,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
332334
///
333335
/// # Safety
334336
/// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
335-
/// - The `len` must be the length of the inlined value. It should never be larger than 12.
337+
/// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
336338
#[inline(always)]
337339
pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
338-
debug_assert!(len <= 12);
340+
debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
339341
std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
340342
}
341343

@@ -348,7 +350,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
348350
pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
349351
self.views.iter().map(move |v| {
350352
let len = *v as u32;
351-
if len <= 12 {
353+
if len <= MAX_INLINE_VIEW_LEN {
352354
unsafe { Self::inline_value(v, len as usize) }
353355
} else {
354356
let view = ByteView::from(*v);
@@ -372,7 +374,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
372374
return &[] as &[u8];
373375
}
374376

375-
if prefix_len <= 4 || len <= 12 {
377+
if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
376378
unsafe { StringViewArray::inline_value(v, prefix_len) }
377379
} else {
378380
let view = ByteView::from(*v);
@@ -402,7 +404,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
402404
return &[] as &[u8];
403405
}
404406

405-
if len <= 12 {
407+
if len as u32 <= MAX_INLINE_VIEW_LEN {
406408
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
407409
} else {
408410
let view = ByteView::from(*v);
@@ -496,9 +498,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
496498
self.views()
497499
.iter()
498500
.map(|v| {
499-
let len = (*v as u32) as usize;
500-
if len > 12 {
501-
len
501+
let len = *v as u32;
502+
if len > MAX_INLINE_VIEW_LEN {
503+
len as usize
502504
} else {
503505
0
504506
}
@@ -512,11 +514,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
512514
/// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
513515
///
514516
/// ByteView types give us the following two advantages, and we need to be careful not to lose them:
515-
/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view.
517+
/// (1) For string/byte of at most [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
516518
/// Meaning that reading one array element requires only one memory access
517519
/// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
518520
///
519-
/// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
521+
/// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
520522
/// thanks to the inlined 4 bytes.
521523
/// Consider equality check:
522524
/// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
@@ -526,8 +528,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
526528
/// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
527529
///
528530
/// # Order check flow
529-
/// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view.
530-
/// (2) if any of the string is larger than 12 bytes, we need to compare the full string.
531+
/// (1) if both strings are at most [`MAX_INLINE_VIEW_LEN`] bytes long, we can directly compare the data inlined to the view.
532+
/// (2) if any of the strings is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
531533
/// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
532534
/// (2.2) o.w., we need to compare the full string.
533535
///
@@ -555,7 +557,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
555557
// one of the strings is larger than 12 bytes,
556558
// we then try to compare the inlined data first
557559

558-
// Note: In theory, ByteView is only used for views larger than 12 bytes,
560+
// Note: In theory, ByteView is only used for strings which are larger than 12 bytes,
559561
// but we can still use it to get the inlined prefix for shorter strings.
560562
// The prefix is always the first 4 bytes of the view, for both short and long strings.
561563
let l_inlined_be = l_byte_view.prefix.swap_bytes();

arrow-array/src/array/fixed_size_list_array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,8 @@ impl From<ArrayData> for FixedSizeListArray {
343343
fn from(data: ArrayData) -> Self {
344344
let value_length = match data.data_type() {
345345
DataType::FixedSizeList(_, len) => *len,
346-
_ => {
347-
panic!("FixedSizeListArray data should contain a FixedSizeList data type")
346+
data_type => {
347+
panic!("FixedSizeListArray data should contain a FixedSizeList data type, got {data_type:?}")
348348
}
349349
};
350350

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use std::marker::PhantomData;
2020
use std::sync::Arc;
2121

2222
use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer};
23-
use arrow_data::ByteView;
23+
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
2424
use arrow_schema::ArrowError;
2525
use hashbrown::hash_table::Entry;
2626
use hashbrown::HashTable;
@@ -68,8 +68,8 @@ impl BlockSizeGrowthStrategy {
6868
///
6969
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
7070
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
71-
/// writes values larger than 12 bytes to the current in-progress block, with values smaller
72-
/// than 12 bytes inlined into the views. If a value is appended that will not fit in the
71+
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values no larger
72+
/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
7373
/// in-progress block, it will be closed, and a new block of sufficient size allocated
7474
///
7575
/// # Append Views
@@ -114,7 +114,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
114114
/// Set a fixed buffer size for variable length strings
115115
///
116116
/// The block size is the size of the buffer used to store values greater
117-
/// than 12 bytes. The builder allocates new buffers when the current
117+
/// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current
118118
/// buffer is full.
119119
///
120120
/// By default the builder balances buffer size and buffer count by
@@ -221,7 +221,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
221221
} else {
222222
self.views_buffer.extend(array.views().iter().map(|v| {
223223
let mut byte_view = ByteView::from(*v);
224-
if byte_view.length > 12 {
224+
if byte_view.length > MAX_INLINE_VIEW_LEN {
225225
// Small views (<=12 bytes) are inlined, so only need to update large views
226226
byte_view.buffer_index += starting_buffer;
227227
};
@@ -289,7 +289,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
289289
pub fn get_value(&self, index: usize) -> &[u8] {
290290
let view = self.views_buffer.as_slice().get(index).unwrap();
291291
let len = *view as u32;
292-
if len <= 12 {
292+
if len <= MAX_INLINE_VIEW_LEN {
293293
// # Safety
294294
// The view is valid from the builder
295295
unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
@@ -315,7 +315,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
315315
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
316316
let v: &[u8] = value.as_ref().as_ref();
317317
let length: u32 = v.len().try_into().unwrap();
318-
if length <= 12 {
318+
if length <= MAX_INLINE_VIEW_LEN {
319319
let mut view_buffer = [0; 16];
320320
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
321321
view_buffer[4..4 + v.len()].copy_from_slice(v);

0 commit comments

Comments
 (0)