1616// under the License.
1717
1818use crate :: array:: print_long_array;
19- use crate :: builder:: GenericBytesViewBuilder ;
19+ use crate :: builder:: GenericByteViewBuilder ;
2020use crate :: iterator:: ArrayIter ;
2121use crate :: types:: bytes:: ByteArrayNativeType ;
22- use crate :: types:: BytesViewType ;
22+ use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
2323use crate :: { Array , ArrayAccessor , ArrayRef } ;
2424use arrow_buffer:: { Buffer , NullBuffer , ScalarBuffer } ;
25- use arrow_data:: { ArrayData , ArrayDataBuilder , BytesView } ;
25+ use arrow_data:: { ArrayData , ArrayDataBuilder , ByteView } ;
2626use arrow_schema:: { ArrowError , DataType } ;
2727use std:: any:: Any ;
2828use std:: fmt:: Debug ;
2929use std:: marker:: PhantomData ;
3030use std:: sync:: Arc ;
3131
32- /// An array of variable length bytes view arrays
33- pub struct GenericBytesViewArray < T : BytesViewType + ?Sized > {
32+ /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33+ ///
34+ /// Different than [`crate::GenericByteArray`] as it stores both an offset and length
35+ /// meaning that take / filter operations can be implemented without copying the underlying data.
36+ ///
37+ /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
38+ pub struct GenericByteViewArray < T : ByteViewType + ?Sized > {
3439 data_type : DataType ,
3540 views : ScalarBuffer < u128 > ,
3641 buffers : Vec < Buffer > ,
3742 phantom : PhantomData < T > ,
3843 nulls : Option < NullBuffer > ,
3944}
4045
41- impl < T : BytesViewType + ?Sized > Clone for GenericBytesViewArray < T > {
46+ impl < T : ByteViewType + ?Sized > Clone for GenericByteViewArray < T > {
4247 fn clone ( & self ) -> Self {
4348 Self {
4449 data_type : T :: DATA_TYPE ,
@@ -50,22 +55,22 @@ impl<T: BytesViewType + ?Sized> Clone for GenericBytesViewArray<T> {
5055 }
5156}
5257
53- impl < T : BytesViewType + ?Sized > GenericBytesViewArray < T > {
54- /// Create a new [`GenericBytesViewArray `] from the provided parts, panicking on failure
58+ impl < T : ByteViewType + ?Sized > GenericByteViewArray < T > {
59+ /// Create a new [`GenericByteViewArray `] from the provided parts, panicking on failure
5560 ///
5661 /// # Panics
5762 ///
58- /// Panics if [`GenericBytesViewArray ::try_new`] returns an error
63+ /// Panics if [`GenericByteViewArray ::try_new`] returns an error
5964 pub fn new ( views : ScalarBuffer < u128 > , buffers : Vec < Buffer > , nulls : Option < NullBuffer > ) -> Self {
6065 Self :: try_new ( views, buffers, nulls) . unwrap ( )
6166 }
6267
63- /// Create a new [`GenericBytesViewArray `] from the provided parts, returning an error on failure
68+ /// Create a new [`GenericByteViewArray `] from the provided parts, returning an error on failure
6469 ///
6570 /// # Errors
6671 ///
6772 /// * `views.len() != nulls.len()`
68- /// * [BytesViewType ::validate] fails
73+ /// * [ByteViewType ::validate] fails
6974 pub fn try_new (
7075 views : ScalarBuffer < u128 > ,
7176 buffers : Vec < Buffer > ,
@@ -93,7 +98,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
9398 } )
9499 }
95100
96- /// Create a new [`GenericBytesViewArray `] from the provided parts, without validation
101+ /// Create a new [`GenericByteViewArray `] from the provided parts, without validation
97102 ///
98103 /// # Safety
99104 ///
@@ -112,7 +117,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
112117 }
113118 }
114119
115- /// Create a new [`GenericBytesViewArray `] of length `len` where all values are null
120+ /// Create a new [`GenericByteViewArray `] of length `len` where all values are null
116121 pub fn new_null ( len : usize ) -> Self {
117122 Self {
118123 data_type : T :: DATA_TYPE ,
@@ -123,14 +128,14 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
123128 }
124129 }
125130
126- /// Creates a [`GenericBytesViewArray `] based on an iterator of values without nulls
131+ /// Creates a [`GenericByteViewArray `] based on an iterator of values without nulls
127132 pub fn from_iter_values < Ptr , I > ( iter : I ) -> Self
128133 where
129134 Ptr : AsRef < T :: Native > ,
130135 I : IntoIterator < Item = Ptr > ,
131136 {
132137 let iter = iter. into_iter ( ) ;
133- let mut builder = GenericBytesViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
138+ let mut builder = GenericByteViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
134139 for v in iter {
135140 builder. append_value ( v) ;
136141 }
@@ -179,7 +184,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
179184 let ptr = self . views . as_ptr ( ) as * const u8 ;
180185 std:: slice:: from_raw_parts ( ptr. add ( idx * 16 + 4 ) , len as usize )
181186 } else {
182- let view = BytesView :: from ( * v) ;
187+ let view = ByteView :: from ( * v) ;
183188 let data = self . buffers . get_unchecked ( view. buffer_index as usize ) ;
184189 let offset = view. offset as usize ;
185190 data. get_unchecked ( offset..offset + len as usize )
@@ -204,7 +209,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
204209 }
205210}
206211
207- impl < T : BytesViewType + ?Sized > Debug for GenericBytesViewArray < T > {
212+ impl < T : ByteViewType + ?Sized > Debug for GenericByteViewArray < T > {
208213 fn fmt ( & self , f : & mut std:: fmt:: Formatter ) -> std:: fmt:: Result {
209214 write ! ( f, "{}ViewArray\n [\n " , T :: PREFIX ) ?;
210215 print_long_array ( self , f, |array, index, f| {
@@ -214,7 +219,7 @@ impl<T: BytesViewType + ?Sized> Debug for GenericBytesViewArray<T> {
214219 }
215220}
216221
217- impl < T : BytesViewType + ?Sized > Array for GenericBytesViewArray < T > {
222+ impl < T : ByteViewType + ?Sized > Array for GenericByteViewArray < T > {
218223 fn as_any ( & self ) -> & dyn Any {
219224 self
220225 }
@@ -265,19 +270,19 @@ impl<T: BytesViewType + ?Sized> Array for GenericBytesViewArray<T> {
265270 }
266271}
267272
268- impl < ' a , T : BytesViewType + ?Sized > ArrayAccessor for & ' a GenericBytesViewArray < T > {
273+ impl < ' a , T : ByteViewType + ?Sized > ArrayAccessor for & ' a GenericByteViewArray < T > {
269274 type Item = & ' a T :: Native ;
270275
271276 fn value ( & self , index : usize ) -> Self :: Item {
272- GenericBytesViewArray :: value ( self , index)
277+ GenericByteViewArray :: value ( self , index)
273278 }
274279
275280 unsafe fn value_unchecked ( & self , index : usize ) -> Self :: Item {
276- GenericBytesViewArray :: value_unchecked ( self , index)
281+ GenericByteViewArray :: value_unchecked ( self , index)
277282 }
278283}
279284
280- impl < ' a , T : BytesViewType + ?Sized > IntoIterator for & ' a GenericBytesViewArray < T > {
285+ impl < ' a , T : ByteViewType + ?Sized > IntoIterator for & ' a GenericByteViewArray < T > {
281286 type Item = Option < & ' a T :: Native > ;
282287 type IntoIter = ArrayIter < Self > ;
283288
@@ -286,7 +291,7 @@ impl<'a, T: BytesViewType + ?Sized> IntoIterator for &'a GenericBytesViewArray<T
286291 }
287292}
288293
289- impl < T : BytesViewType + ?Sized > From < ArrayData > for GenericBytesViewArray < T > {
294+ impl < T : ByteViewType + ?Sized > From < ArrayData > for GenericByteViewArray < T > {
290295 fn from ( value : ArrayData ) -> Self {
291296 let views = value. buffers ( ) [ 0 ] . clone ( ) ;
292297 let views = ScalarBuffer :: new ( views, value. offset ( ) , value. len ( ) ) ;
@@ -301,8 +306,8 @@ impl<T: BytesViewType + ?Sized> From<ArrayData> for GenericBytesViewArray<T> {
301306 }
302307}
303308
304- impl < T : BytesViewType + ?Sized > From < GenericBytesViewArray < T > > for ArrayData {
305- fn from ( mut array : GenericBytesViewArray < T > ) -> Self {
309+ impl < T : ByteViewType + ?Sized > From < GenericByteViewArray < T > > for ArrayData {
310+ fn from ( mut array : GenericByteViewArray < T > ) -> Self {
306311 let len = array. len ( ) ;
307312 array. buffers . insert ( 0 , array. views . into_inner ( ) ) ;
308313 let builder = ArrayDataBuilder :: new ( T :: DATA_TYPE )
@@ -314,30 +319,30 @@ impl<T: BytesViewType + ?Sized> From<GenericBytesViewArray<T>> for ArrayData {
314319 }
315320}
316321
317- impl < Ptr , T : BytesViewType + ?Sized > FromIterator < Option < Ptr > > for GenericBytesViewArray < T >
322+ impl < Ptr , T : ByteViewType + ?Sized > FromIterator < Option < Ptr > > for GenericByteViewArray < T >
318323where
319324 Ptr : AsRef < T :: Native > ,
320325{
321326 fn from_iter < I : IntoIterator < Item = Option < Ptr > > > ( iter : I ) -> Self {
322327 let iter = iter. into_iter ( ) ;
323- let mut builder = GenericBytesViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
328+ let mut builder = GenericByteViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
324329 builder. extend ( iter) ;
325330 builder. finish ( )
326331 }
327332}
328333
329- /// A [`GenericBytesViewArray `] of `[u8]`
330- pub type BinaryViewArray = GenericBytesViewArray < [ u8 ] > ;
334+ /// A [`GenericByteViewArray `] of `[u8]`
335+ pub type BinaryViewArray = GenericByteViewArray < BinaryViewType > ;
331336
332- /// A [`GenericBytesViewArray `] of `str`
337+ /// A [`GenericByteViewArray `] of `str`
333338///
334339/// ```
335340/// use arrow_array::StringViewArray;
336341/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
337342/// assert_eq!(array.value(0), "hello");
338343/// assert_eq!(array.value(3), "large payload over 12 bytes");
339344/// ```
340- pub type StringViewArray = GenericBytesViewArray < str > ;
345+ pub type StringViewArray = GenericByteViewArray < StringViewType > ;
341346
342347impl From < Vec < & str > > for StringViewArray {
343348 fn from ( v : Vec < & str > ) -> Self {
@@ -348,8 +353,9 @@ impl From<Vec<&str>> for StringViewArray {
348353#[ cfg( test) ]
349354mod tests {
350355 use crate :: builder:: StringViewBuilder ;
351- use crate :: types:: BytesViewType ;
352356 use crate :: { Array , BinaryViewArray , StringViewArray } ;
357+ use arrow_buffer:: { Buffer , ScalarBuffer } ;
358+ use arrow_data:: ByteView ;
353359
354360 #[ test]
355361 fn try_new ( ) {
@@ -363,20 +369,22 @@ mod tests {
363369 assert_eq ! ( array. value( 3 ) , "large payload over 12 bytes" ) ;
364370
365371 let array = BinaryViewArray :: from_iter_values ( vec ! [
366- b"hello" . to_bytes ( ) ,
367- b"world" . to_bytes ( ) ,
368- b"lulu" . to_bytes ( ) ,
369- b"large payload over 12 bytes" . to_bytes ( ) ,
372+ b"hello" . as_slice ( ) ,
373+ b"world" . as_slice ( ) ,
374+ b"lulu" . as_slice ( ) ,
375+ b"large payload over 12 bytes" . as_slice ( ) ,
370376 ] ) ;
371377 assert_eq ! ( array. value( 0 ) , b"hello" ) ;
372378 assert_eq ! ( array. value( 3 ) , b"large payload over 12 bytes" ) ;
373379
380+ // test empty array
374381 let array = {
375382 let mut builder = StringViewBuilder :: new ( ) ;
376383 builder. finish ( )
377384 } ;
378385 assert ! ( array. is_empty( ) ) ;
379386
387+ // test builder append
380388 let array = {
381389 let mut builder = StringViewBuilder :: new ( ) ;
382390 builder. append_value ( "hello" ) ;
@@ -387,5 +395,86 @@ mod tests {
387395 assert_eq ! ( array. value( 0 ) , "hello" ) ;
388396 assert ! ( array. is_null( 1 ) ) ;
389397 assert_eq ! ( array. value( 2 ) , "large payload over 12 bytes" ) ;
398+
399+ // test builder's in_progress re-created
400+ let array = {
401+ // make a builder with small block size.
402+ let mut builder = StringViewBuilder :: new ( ) . with_block_size ( 14 ) ;
403+ builder. append_value ( "large payload over 12 bytes" ) ;
404+ builder. append_option ( Some ( "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created" ) ) ;
405+ builder. finish ( )
406+ } ;
407+ assert_eq ! ( array. value( 0 ) , "large payload over 12 bytes" ) ;
408+ assert_eq ! ( array. value( 1 ) , "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created" ) ;
409+ assert_eq ! ( 2 , array. buffers. len( ) ) ;
410+ }
411+
412+ #[ test]
413+ #[ should_panic( expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers" ) ]
414+ fn new_with_invalid_view_data ( ) {
415+ let v = "large payload over 12 bytes" ;
416+ let view = ByteView {
417+ length : 13 ,
418+ prefix : u32:: from_le_bytes ( v. as_bytes ( ) [ 0 ..4 ] . try_into ( ) . unwrap ( ) ) ,
419+ buffer_index : 3 ,
420+ offset : 1 ,
421+ } ;
422+ let views = ScalarBuffer :: from ( vec ! [ view. into( ) ] ) ;
423+ let buffers = vec ! [ Buffer :: from_slice_ref( v) ] ;
424+ StringViewArray :: new ( views, buffers, None ) ;
425+ }
426+
427+ #[ test]
428+ #[ should_panic(
429+ expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
430+ ) ]
431+ fn new_with_invalid_utf8_data ( ) {
432+ let v: Vec < u8 > = vec ! [ 0xf0 , 0x80 , 0x80 , 0x80 ] ;
433+ let view = ByteView {
434+ length : v. len ( ) as u32 ,
435+ prefix : u32:: from_le_bytes ( v[ 0 ..4 ] . try_into ( ) . unwrap ( ) ) ,
436+ buffer_index : 0 ,
437+ offset : 0 ,
438+ } ;
439+ let views = ScalarBuffer :: from ( vec ! [ view. into( ) ] ) ;
440+ let buffers = vec ! [ Buffer :: from_slice_ref( v) ] ;
441+ StringViewArray :: new ( views, buffers, None ) ;
442+ }
443+
444+ #[ test]
445+ #[ should_panic( expected = "View at index 0 contained non-zero padding for string of length 1" ) ]
446+ fn new_with_invalid_zero_padding ( ) {
447+ let mut data = [ 0 ; 12 ] ;
448+ data[ 0 ] = b'H' ;
449+ data[ 11 ] = 1 ; // no zero padding
450+
451+ let mut view_buffer = [ 0 ; 16 ] ;
452+ view_buffer[ 0 ..4 ] . copy_from_slice ( & 1u32 . to_le_bytes ( ) ) ;
453+ view_buffer[ 4 ..] . copy_from_slice ( & data) ;
454+
455+ let view = ByteView :: from ( u128:: from_le_bytes ( view_buffer) ) ;
456+ let views = ScalarBuffer :: from ( vec ! [ view. into( ) ] ) ;
457+ let buffers = vec ! [ ] ;
458+ StringViewArray :: new ( views, buffers, None ) ;
459+ }
460+
461+ #[ test]
462+ #[ should_panic( expected = "Mismatch between embedded prefix and data" ) ]
463+ fn test_mismatch_between_embedded_prefix_and_data ( ) {
464+ let input_str_1 = "Hello, Rustaceans!" ;
465+ let input_str_2 = "Hallo, Rustaceans!" ;
466+ let length = input_str_1. len ( ) as u32 ;
467+ assert ! ( input_str_1. len( ) > 12 ) ;
468+
469+ let mut view_buffer = [ 0 ; 16 ] ;
470+ view_buffer[ 0 ..4 ] . copy_from_slice ( & length. to_le_bytes ( ) ) ;
471+ view_buffer[ 4 ..8 ] . copy_from_slice ( & input_str_1. as_bytes ( ) [ 0 ..4 ] ) ;
472+ view_buffer[ 8 ..12 ] . copy_from_slice ( & 0u32 . to_le_bytes ( ) ) ;
473+ view_buffer[ 12 ..] . copy_from_slice ( & 0u32 . to_le_bytes ( ) ) ;
474+ let view = ByteView :: from ( u128:: from_le_bytes ( view_buffer) ) ;
475+ let views = ScalarBuffer :: from ( vec ! [ view. into( ) ] ) ;
476+ let buffers = vec ! [ Buffer :: from_slice_ref( input_str_2. as_bytes( ) ) ] ;
477+
478+ StringViewArray :: new ( views, buffers, None ) ;
390479 }
391480}
0 commit comments