1717
1818use crate :: models:: ColumnInfo ;
1919use arrow:: array:: {
20- Array , StringArray , TimestampMicrosecondArray , TimestampMillisecondArray ,
20+ Array , Int64Array , StringArray , TimestampMicrosecondArray , TimestampMillisecondArray ,
2121 TimestampNanosecondArray , TimestampSecondArray , UnionArray ,
2222} ;
2323use arrow:: datatypes:: { Field , Schema , TimeUnit } ;
@@ -26,7 +26,45 @@ use chrono::DateTime;
2626use datafusion:: arrow:: array:: ArrayRef ;
2727use datafusion:: arrow:: datatypes:: DataType ;
2828use datafusion:: common:: Result as DataFusionResult ;
29+ use std:: fmt:: Display ;
2930use std:: sync:: Arc ;
31+ use std:: { env, fmt} ;
32+
33+ pub struct Config {
34+ pub dbt_serialization_format : SerializationFormat ,
35+ }
36+
37+ impl Default for Config {
38+ fn default ( ) -> Self {
39+ Self {
40+ dbt_serialization_format : SerializationFormat :: new ( ) ,
41+ }
42+ }
43+ }
/// Wire format used when serializing query results.
///
/// `Debug` is derived in addition to the comparison/copy traits so the value
/// can be used in `assert_eq!`, logged with `{:?}`, and embedded in types that
/// derive `Debug` themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum SerializationFormat {
    /// Arrow output; displayed as `"arrow"`.
    Arrow,
    /// JSON output; displayed as `"json"`.
    Json,
}
49+
50+ impl Display for SerializationFormat {
51+ fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
52+ match self {
53+ Self :: Arrow => write ! ( f, "arrow" ) ,
54+ Self :: Json => write ! ( f, "json" ) ,
55+ }
56+ }
57+ }
58+
59+ impl SerializationFormat {
60+ fn new ( ) -> Self {
61+ let var = env:: var ( "DBT_SERIALIZATION_FORMAT" ) . unwrap_or_else ( |_| "json" . to_string ( ) ) ;
62+ match var. to_lowercase ( ) . as_str ( ) {
63+ "arrow" => Self :: Arrow ,
64+ _ => Self :: Json ,
65+ }
66+ }
67+ }
3068
3169#[ must_use]
3270pub fn first_non_empty_type ( union_array : & UnionArray ) -> Option < ( DataType , ArrayRef ) > {
@@ -42,6 +80,7 @@ pub fn first_non_empty_type(union_array: &UnionArray) -> Option<(DataType, Array
4280
4381pub fn convert_record_batches (
4482 records : Vec < RecordBatch > ,
83+ serialization_format : SerializationFormat ,
4584) -> DataFusionResult < ( Vec < RecordBatch > , Vec < ColumnInfo > ) > {
4685 let mut converted_batches = Vec :: new ( ) ;
4786 let column_infos = ColumnInfo :: from_batch ( & records) ;
@@ -71,7 +110,8 @@ pub fn convert_record_batches(
71110 }
72111 }
73112 DataType :: Timestamp ( unit, _) => {
74- let converted_column = convert_timestamp_to_struct ( column, * unit) ;
113+ let converted_column =
114+ convert_timestamp_to_struct ( column, * unit, serialization_format) ;
75115 fields. push (
76116 Field :: new (
77117 field. name ( ) ,
@@ -97,63 +137,80 @@ pub fn convert_record_batches(
97137 Ok ( ( converted_batches. clone ( ) , column_infos) )
98138}
99139
// Downcasts `$array` (anything exposing `as_any()`) to the concrete type
// `$concrete` and yields an iterator over a reference to it.
//
// Panics if the value behind `as_any()` is not a `$concrete`.
macro_rules! downcast_and_iter {
    ($array:expr, $concrete:ty) => {
        ::std::iter::IntoIterator::into_iter(
            $array.as_any().downcast_ref::<$concrete>().unwrap(),
        )
    };
}
149+
100150#[ allow(
101151 clippy:: unwrap_used,
102152 clippy:: as_conversions,
103153 clippy:: cast_possible_truncation
104154) ]
105- fn convert_timestamp_to_struct ( column : & ArrayRef , unit : TimeUnit ) -> ArrayRef {
106- let timestamps: Vec < _ > = match unit {
107- TimeUnit :: Second => column
108- . as_any ( )
109- . downcast_ref :: < TimestampSecondArray > ( )
110- . unwrap ( )
111- . iter ( )
112- . map ( |x| {
113- x. map ( |ts| {
114- let ts = DateTime :: from_timestamp ( ts, 0 ) . unwrap ( ) ;
115- format ! ( "{}" , ts. timestamp( ) )
116- } )
117- } )
118- . collect ( ) ,
119- TimeUnit :: Millisecond => column
120- . as_any ( )
121- . downcast_ref :: < TimestampMillisecondArray > ( )
122- . unwrap ( )
123- . iter ( )
124- . map ( |x| {
125- x. map ( |ts| {
126- let ts = DateTime :: from_timestamp_millis ( ts) . unwrap ( ) ;
127- format ! ( "{}.{}" , ts. timestamp( ) , ts. timestamp_subsec_millis( ) )
128- } )
129- } )
130- . collect ( ) ,
131- TimeUnit :: Microsecond => column
132- . as_any ( )
133- . downcast_ref :: < TimestampMicrosecondArray > ( )
134- . unwrap ( )
135- . iter ( )
136- . map ( |x| {
137- x. map ( |ts| {
138- let ts = DateTime :: from_timestamp_micros ( ts) . unwrap ( ) ;
139- format ! ( "{}.{}" , ts. timestamp( ) , ts. timestamp_subsec_micros( ) )
140- } )
141- } )
142- . collect ( ) ,
143- TimeUnit :: Nanosecond => column
144- . as_any ( )
145- . downcast_ref :: < TimestampNanosecondArray > ( )
146- . unwrap ( )
147- . iter ( )
148- . map ( |x| {
149- x. map ( |ts| {
150- let ts = DateTime :: from_timestamp_nanos ( ts) ;
151- format ! ( "{}.{}" , ts. timestamp( ) , ts. timestamp_subsec_nanos( ) )
152- } )
153- } )
154- . collect ( ) ,
155- } ;
156- Arc :: new ( StringArray :: from ( timestamps) ) as ArrayRef
155+ fn convert_timestamp_to_struct (
156+ column : & ArrayRef ,
157+ unit : TimeUnit ,
158+ ser : SerializationFormat ,
159+ ) -> ArrayRef {
160+ match ser {
161+ SerializationFormat :: Arrow => {
162+ let timestamps: Vec < _ > = match unit {
163+ TimeUnit :: Second => downcast_and_iter ! ( column, TimestampSecondArray ) . collect ( ) ,
164+ TimeUnit :: Millisecond => {
165+ downcast_and_iter ! ( column, TimestampMillisecondArray ) . collect ( )
166+ }
167+ TimeUnit :: Microsecond => {
168+ downcast_and_iter ! ( column, TimestampMicrosecondArray ) . collect ( )
169+ }
170+ TimeUnit :: Nanosecond => {
171+ downcast_and_iter ! ( column, TimestampNanosecondArray ) . collect ( )
172+ }
173+ } ;
174+ Arc :: new ( Int64Array :: from ( timestamps) ) as ArrayRef
175+ }
176+ SerializationFormat :: Json => {
177+ let timestamps: Vec < _ > = match unit {
178+ TimeUnit :: Second => downcast_and_iter ! ( column, TimestampSecondArray )
179+ . map ( |x| {
180+ x. map ( |ts| {
181+ let ts = DateTime :: from_timestamp ( ts, 0 ) . unwrap ( ) ;
182+ format ! ( "{}" , ts. timestamp( ) )
183+ } )
184+ } )
185+ . collect ( ) ,
186+ TimeUnit :: Millisecond => downcast_and_iter ! ( column, TimestampMillisecondArray )
187+ . map ( |x| {
188+ x. map ( |ts| {
189+ let ts = DateTime :: from_timestamp_millis ( ts) . unwrap ( ) ;
190+ format ! ( "{}.{}" , ts. timestamp( ) , ts. timestamp_subsec_millis( ) )
191+ } )
192+ } )
193+ . collect ( ) ,
194+ TimeUnit :: Microsecond => downcast_and_iter ! ( column, TimestampMicrosecondArray )
195+ . map ( |x| {
196+ x. map ( |ts| {
197+ let ts = DateTime :: from_timestamp_micros ( ts) . unwrap ( ) ;
198+ format ! ( "{}.{}" , ts. timestamp( ) , ts. timestamp_subsec_micros( ) )
199+ } )
200+ } )
201+ . collect ( ) ,
202+ TimeUnit :: Nanosecond => downcast_and_iter ! ( column, TimestampNanosecondArray )
203+ . map ( |x| {
204+ x. map ( |ts| {
205+ let ts = DateTime :: from_timestamp_nanos ( ts) ;
206+ format ! ( "{}.{}" , ts. timestamp( ) , ts. timestamp_subsec_nanos( ) )
207+ } )
208+ } )
209+ . collect ( ) ,
210+ } ;
211+ Arc :: new ( StringArray :: from ( timestamps) ) as ArrayRef
212+ }
213+ }
157214}
158215
159216#[ cfg( test) ]
@@ -224,7 +281,8 @@ mod tests {
224281 Arc :: new ( TimestampNanosecondArray :: from ( values) ) as ArrayRef
225282 }
226283 } ;
227- let result = convert_timestamp_to_struct ( & timestamp_array, * unit) ;
284+ let result =
285+ convert_timestamp_to_struct ( & timestamp_array, * unit, SerializationFormat :: Json ) ;
228286 let string_array = result. as_any ( ) . downcast_ref :: < StringArray > ( ) . unwrap ( ) ;
229287 assert_eq ! ( string_array. len( ) , 2 ) ;
230288 assert_eq ! ( string_array. value( 0 ) , * expected) ;
@@ -250,7 +308,8 @@ mod tests {
250308 ] ) ) as ArrayRef ;
251309 let batch = RecordBatch :: try_new ( schema, vec ! [ int_array, timestamp_array] ) . unwrap ( ) ;
252310 let records = vec ! [ batch] ;
253- let ( converted_batches, column_infos) = convert_record_batches ( records) . unwrap ( ) ;
311+ let ( converted_batches, column_infos) =
312+ convert_record_batches ( records. clone ( ) , SerializationFormat :: Json ) . unwrap ( ) ;
254313
255314 let converted_batch = & converted_batches[ 0 ] ;
256315 assert_eq ! ( converted_batches. len( ) , 1 ) ;
@@ -270,5 +329,17 @@ mod tests {
270329 assert_eq ! ( column_infos[ 0 ] . r#type, "fixed" ) ;
271330 assert_eq ! ( column_infos[ 1 ] . name, "timestamp_col" ) ;
272331 assert_eq ! ( column_infos[ 1 ] . r#type, "timestamp_ntz" ) ;
332+
333+ let ( converted_batches, _) =
334+ convert_record_batches ( records, SerializationFormat :: Arrow ) . unwrap ( ) ;
335+ let converted_batch = & converted_batches[ 0 ] ;
336+ let converted_timestamp_array = converted_batch
337+ . column ( 1 )
338+ . as_any ( )
339+ . downcast_ref :: < Int64Array > ( )
340+ . unwrap ( ) ;
341+ assert_eq ! ( converted_timestamp_array. value( 0 ) , 1_627_846_261 ) ;
342+ assert ! ( converted_timestamp_array. is_null( 1 ) ) ;
343+ assert_eq ! ( converted_timestamp_array. value( 2 ) , 1_627_846_262 ) ;
273344 }
274345}
0 commit comments