@@ -27,6 +27,8 @@ use std::sync::Arc;
2727
2828/// InProgressArray for StringViewArray and BinaryViewArray
2929pub ( crate ) struct InProgressByteViewArray < B : ByteViewType > {
30+ /// The source array
31+ source : Option < Source > ,
3032 /// the target batch size (and thus size for views allocation)
3133 batch_size : usize ,
3234 /// The in progress views
@@ -44,6 +46,15 @@ pub(crate) struct InProgressByteViewArray<B: ByteViewType> {
4446 _phantom : PhantomData < B > ,
4547}
4648
49+ struct Source {
50+ /// The array to copy from
51+ array : ArrayRef ,
52+ /// Should the strings from the source array be copied into new buffers (true only when the source's buffers are sparse)?
53+ need_gc : bool ,
54+ /// How many bytes were actually used in the source array's buffers (0 if the source has no data buffers)?
55+ ideal_buffer_size : usize ,
56+ }
57+
4758// manually implement Debug because ByteViewType doesn't implement Debug
4859impl < B : ByteViewType > std:: fmt:: Debug for InProgressByteViewArray < B > {
4960 fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
@@ -63,6 +74,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
6374
6475 Self {
6576 batch_size,
77+ source : None ,
6678 views : Vec :: new ( ) , // allocate in push
6779 nulls : NullBufferBuilder :: new ( batch_size) , // no allocation
6880 current : None ,
@@ -80,15 +92,6 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
8092 self . views . reserve ( self . batch_size ) ;
8193 }
8294
83- /// Update self.nulls with the nulls from the StringViewArray
84- fn push_nulls ( & mut self , s : & GenericByteViewArray < B > ) {
85- if let Some ( nulls) = s. nulls ( ) . as_ref ( ) {
86- self . nulls . append_buffer ( nulls) ;
87- } else {
88- self . nulls . append_n_non_nulls ( s. len ( ) ) ;
89- }
90- }
91-
9295 /// Finishes in progress block, if any
9396 fn finish_current ( & mut self ) {
9497 let Some ( next_buffer) = self . current . take ( ) else {
@@ -263,38 +266,67 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
263266}
264267
265268impl < B : ByteViewType > InProgressArray for InProgressByteViewArray < B > {
266- fn push_array ( & mut self , array : ArrayRef ) {
267- // If creating StringViewArray output, ensure input was valid utf8 too
269+ fn set_source ( & mut self , source : Option < ArrayRef > ) {
270+ self . source = source. map ( |array| {
271+ let s = array. as_byte_view :: < B > ( ) ;
272+
273+ let ( need_gc, ideal_buffer_size) = if s. data_buffers ( ) . is_empty ( ) {
274+ ( false , 0 )
275+ } else {
276+ let ideal_buffer_size = s. total_buffer_bytes_used ( ) ;
277+ let actual_buffer_size = s. get_buffer_memory_size ( ) ;
278+ // copying strings is expensive, so only do it if the array is
279+ // sparse (uses more than 2x the memory it needs)
280+ let need_gc =
281+ ideal_buffer_size != 0 && actual_buffer_size > ( ideal_buffer_size * 2 ) ;
282+ ( need_gc, ideal_buffer_size)
283+ } ;
284+
285+ Source {
286+ array,
287+ need_gc,
288+ ideal_buffer_size,
289+ }
290+ } )
291+ }
292+
293+ fn copy_rows ( & mut self , offset : usize , len : usize ) -> Result < ( ) , ArrowError > {
268294 self . ensure_capacity ( ) ;
269- let s = array. as_byte_view :: < B > ( ) ;
295+ let source = self . source . take ( ) . ok_or_else ( || {
296+ ArrowError :: InvalidArgumentError ( "InProgressByteViewArray: source not set" . to_string ( ) )
297+ } ) ?;
270298
271- // add any nulls, as necessary
272- self . push_nulls ( s ) ;
299+ // If creating StringViewArray output, ensure input was valid utf8 too
300+ let s = source . array . as_byte_view :: < B > ( ) ;
273301
274- // If there are no data buffers in s (all inlined views), can append the
275- // views/nulls and done
276- if s. data_buffers ( ) . is_empty ( ) {
277- self . views . extend_from_slice ( s. views ( ) . as_ref ( ) ) ;
278- return ;
279- }
302+ // add the nulls for the requested rows (offset, len), as necessary
303+ if let Some ( nulls) = s. nulls ( ) . as_ref ( ) {
304+ let nulls = nulls. slice ( offset, len) ;
305+ self . nulls . append_buffer ( & nulls) ;
306+ } else {
307+ self . nulls . append_n_non_nulls ( len) ;
308+ } ;
280309
281- let ideal_buffer_size = s. total_buffer_bytes_used ( ) ;
282- let actual_buffer_size = s. get_buffer_memory_size ( ) ;
283310 let buffers = s. data_buffers ( ) ;
311+ let views = & s. views ( ) . as_ref ( ) [ offset..offset + len] ;
284312
285- // None of the views references the buffers (e.g. sliced)
286- if ideal_buffer_size == 0 {
287- self . views . extend_from_slice ( s. views ( ) . as_ref ( ) ) ;
288- return ;
313+ // If the source has no data buffers, or none of their bytes are used,
314+ // can append the requested views as is and done
315+ if source. ideal_buffer_size == 0 {
316+ self . views . extend_from_slice ( views) ;
317+ self . source = Some ( source) ; // restore the source taken above
318+ return Ok ( ( ) ) ;
289319 }
290320
291321 // Copying the strings into a buffer can be time-consuming so
292322 // only do it if the array is sparse
293- if actual_buffer_size > ( ideal_buffer_size * 2 ) {
294- self . append_views_and_copy_strings ( s . views ( ) , ideal_buffer_size, buffers) ;
323+ if source . need_gc {
324+ self . append_views_and_copy_strings ( views, source . ideal_buffer_size , buffers) ;
295325 } else {
296- self . append_views_and_update_buffer_index ( s . views ( ) , buffers) ;
326+ self . append_views_and_update_buffer_index ( views, buffers) ;
297327 }
328+ self . source = Some ( source) ; // restore the source taken above
329+ Ok ( ( ) )
298330 }
299331
300332 fn finish ( & mut self ) -> Result < ArrayRef , ArrowError > {
0 commit comments