Skip to content

Commit 8a5be13

Browse files
a10y and alamb authored
Enable casting from Utf8View (#6077)
* Enable casting from Utf8View -> string or temporal types
* save
* implement casting utf8view -> timestamp/interval types, with tests
* fix clippy
* fmt

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 9be0eb5 commit 8a5be13

File tree

2 files changed

+265
-62
lines changed

2 files changed

+265
-62
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 123 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
210210
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
211211
(FixedSizeBinary(_), Binary | LargeBinary) => true,
212212
(
213-
Utf8 | LargeUtf8,
213+
Utf8 | LargeUtf8 | Utf8View,
214214
Binary
215215
| LargeBinary
216216
| Utf8
@@ -228,7 +228,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
228228
| Interval(_),
229229
) => true,
230230
(Utf8 | LargeUtf8, Utf8View) => true,
231-
(Utf8View, Utf8 | LargeUtf8) => true,
232231
(BinaryView, Binary | LargeBinary) => true,
233232
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
234233
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
@@ -1269,6 +1268,56 @@ pub fn cast_with_options(
12691268
"Casting from {from_type:?} to {to_type:?} not supported",
12701269
))),
12711270
},
1271+
(Utf8View, _) => match to_type {
1272+
UInt8 => parse_string_view::<UInt8Type>(array, cast_options),
1273+
UInt16 => parse_string_view::<UInt16Type>(array, cast_options),
1274+
UInt32 => parse_string_view::<UInt32Type>(array, cast_options),
1275+
UInt64 => parse_string_view::<UInt64Type>(array, cast_options),
1276+
Int8 => parse_string_view::<Int8Type>(array, cast_options),
1277+
Int16 => parse_string_view::<Int16Type>(array, cast_options),
1278+
Int32 => parse_string_view::<Int32Type>(array, cast_options),
1279+
Int64 => parse_string_view::<Int64Type>(array, cast_options),
1280+
Float32 => parse_string_view::<Float32Type>(array, cast_options),
1281+
Float64 => parse_string_view::<Float64Type>(array, cast_options),
1282+
Date32 => parse_string_view::<Date32Type>(array, cast_options),
1283+
Date64 => parse_string_view::<Date64Type>(array, cast_options),
1284+
Binary => cast_view_to_byte::<StringViewType, GenericBinaryType<i32>>(array),
1285+
LargeBinary => cast_view_to_byte::<StringViewType, GenericBinaryType<i64>>(array),
1286+
Utf8 => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
1287+
LargeUtf8 => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
1288+
Time32(TimeUnit::Second) => parse_string_view::<Time32SecondType>(array, cast_options),
1289+
Time32(TimeUnit::Millisecond) => {
1290+
parse_string_view::<Time32MillisecondType>(array, cast_options)
1291+
}
1292+
Time64(TimeUnit::Microsecond) => {
1293+
parse_string_view::<Time64MicrosecondType>(array, cast_options)
1294+
}
1295+
Time64(TimeUnit::Nanosecond) => {
1296+
parse_string_view::<Time64NanosecondType>(array, cast_options)
1297+
}
1298+
Timestamp(TimeUnit::Second, to_tz) => {
1299+
cast_view_to_timestamp::<TimestampSecondType>(array, to_tz, cast_options)
1300+
}
1301+
Timestamp(TimeUnit::Millisecond, to_tz) => {
1302+
cast_view_to_timestamp::<TimestampMillisecondType>(array, to_tz, cast_options)
1303+
}
1304+
Timestamp(TimeUnit::Microsecond, to_tz) => {
1305+
cast_view_to_timestamp::<TimestampMicrosecondType>(array, to_tz, cast_options)
1306+
}
1307+
Timestamp(TimeUnit::Nanosecond, to_tz) => {
1308+
cast_view_to_timestamp::<TimestampNanosecondType>(array, to_tz, cast_options)
1309+
}
1310+
Interval(IntervalUnit::YearMonth) => {
1311+
cast_view_to_year_month_interval(array, cast_options)
1312+
}
1313+
Interval(IntervalUnit::DayTime) => cast_view_to_day_time_interval(array, cast_options),
1314+
Interval(IntervalUnit::MonthDayNano) => {
1315+
cast_view_to_month_day_nano_interval(array, cast_options)
1316+
}
1317+
_ => Err(ArrowError::CastError(format!(
1318+
"Casting from {from_type:?} to {to_type:?} not supported",
1319+
))),
1320+
},
12721321
(LargeUtf8, _) => match to_type {
12731322
UInt8 => parse_string::<UInt8Type, i64>(array, cast_options),
12741323
UInt16 => parse_string::<UInt16Type, i64>(array, cast_options),
@@ -1365,8 +1414,6 @@ pub fn cast_with_options(
13651414
"Casting from {from_type:?} to {to_type:?} not supported",
13661415
))),
13671416
},
1368-
(Utf8View, Utf8) => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
1369-
(Utf8View, LargeUtf8) => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
13701417
(BinaryView, Binary) => cast_view_to_byte::<BinaryViewType, GenericBinaryType<i32>>(array),
13711418
(BinaryView, LargeBinary) => {
13721419
cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
@@ -3960,6 +4007,11 @@ mod tests {
39604007

39614008
#[test]
39624009
fn test_cast_string_to_timestamp() {
4010+
let a0 = Arc::new(StringViewArray::from(vec![
4011+
Some("2020-09-08T12:00:00.123456789+00:00"),
4012+
Some("Not a valid date"),
4013+
None,
4014+
])) as ArrayRef;
39634015
let a1 = Arc::new(StringArray::from(vec![
39644016
Some("2020-09-08T12:00:00.123456789+00:00"),
39654017
Some("Not a valid date"),
@@ -3970,7 +4022,7 @@ mod tests {
39704022
Some("Not a valid date"),
39714023
None,
39724024
])) as ArrayRef;
3973-
for array in &[a1, a2] {
4025+
for array in &[a0, a1, a2] {
39744026
for time_unit in &[
39754027
TimeUnit::Second,
39764028
TimeUnit::Millisecond,
@@ -4039,6 +4091,11 @@ mod tests {
40394091

40404092
#[test]
40414093
fn test_cast_string_to_date32() {
4094+
let a0 = Arc::new(StringViewArray::from(vec![
4095+
Some("2018-12-25"),
4096+
Some("Not a valid date"),
4097+
None,
4098+
])) as ArrayRef;
40424099
let a1 = Arc::new(StringArray::from(vec![
40434100
Some("2018-12-25"),
40444101
Some("Not a valid date"),
@@ -4049,7 +4106,7 @@ mod tests {
40494106
Some("Not a valid date"),
40504107
None,
40514108
])) as ArrayRef;
4052-
for array in &[a1, a2] {
4109+
for array in &[a0, a1, a2] {
40534110
let to_type = DataType::Date32;
40544111
let b = cast(array, &to_type).unwrap();
40554112
let c = b.as_primitive::<Date32Type>();
@@ -4071,30 +4128,47 @@ mod tests {
40714128

40724129
#[test]
40734130
fn test_cast_string_format_yyyymmdd_to_date32() {
4074-
let a = Arc::new(StringArray::from(vec![
4131+
let a0 = Arc::new(StringViewArray::from(vec![
4132+
Some("2020-12-25"),
4133+
Some("20201117"),
4134+
])) as ArrayRef;
4135+
let a1 = Arc::new(StringArray::from(vec![
4136+
Some("2020-12-25"),
4137+
Some("20201117"),
4138+
])) as ArrayRef;
4139+
let a2 = Arc::new(LargeStringArray::from(vec![
40754140
Some("2020-12-25"),
40764141
Some("20201117"),
40774142
])) as ArrayRef;
40784143

4079-
let to_type = DataType::Date32;
4080-
let options = CastOptions {
4081-
safe: false,
4082-
format_options: FormatOptions::default(),
4083-
};
4084-
let result = cast_with_options(&a, &to_type, &options).unwrap();
4085-
let c = result.as_primitive::<Date32Type>();
4086-
assert_eq!(
4087-
chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
4088-
c.value_as_date(0)
4089-
);
4090-
assert_eq!(
4091-
chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
4092-
c.value_as_date(1)
4093-
);
4144+
for array in &[a0, a1, a2] {
4145+
let to_type = DataType::Date32;
4146+
let options = CastOptions {
4147+
safe: false,
4148+
format_options: FormatOptions::default(),
4149+
};
4150+
let result = cast_with_options(&array, &to_type, &options).unwrap();
4151+
let c = result.as_primitive::<Date32Type>();
4152+
assert_eq!(
4153+
chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
4154+
c.value_as_date(0)
4155+
);
4156+
assert_eq!(
4157+
chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
4158+
c.value_as_date(1)
4159+
);
4160+
}
40944161
}
40954162

40964163
#[test]
40974164
fn test_cast_string_to_time32second() {
4165+
let a0 = Arc::new(StringViewArray::from(vec![
4166+
Some("08:08:35.091323414"),
4167+
Some("08:08:60.091323414"), // leap second
4168+
Some("08:08:61.091323414"), // not valid
4169+
Some("Not a valid time"),
4170+
None,
4171+
])) as ArrayRef;
40984172
let a1 = Arc::new(StringArray::from(vec![
40994173
Some("08:08:35.091323414"),
41004174
Some("08:08:60.091323414"), // leap second
@@ -4109,7 +4183,7 @@ mod tests {
41094183
Some("Not a valid time"),
41104184
None,
41114185
])) as ArrayRef;
4112-
for array in &[a1, a2] {
4186+
for array in &[a0, a1, a2] {
41134187
let to_type = DataType::Time32(TimeUnit::Second);
41144188
let b = cast(array, &to_type).unwrap();
41154189
let c = b.as_primitive::<Time32SecondType>();
@@ -4130,6 +4204,13 @@ mod tests {
41304204

41314205
#[test]
41324206
fn test_cast_string_to_time32millisecond() {
4207+
let a0 = Arc::new(StringViewArray::from(vec![
4208+
Some("08:08:35.091323414"),
4209+
Some("08:08:60.091323414"), // leap second
4210+
Some("08:08:61.091323414"), // not valid
4211+
Some("Not a valid time"),
4212+
None,
4213+
])) as ArrayRef;
41334214
let a1 = Arc::new(StringArray::from(vec![
41344215
Some("08:08:35.091323414"),
41354216
Some("08:08:60.091323414"), // leap second
@@ -4144,7 +4225,7 @@ mod tests {
41444225
Some("Not a valid time"),
41454226
None,
41464227
])) as ArrayRef;
4147-
for array in &[a1, a2] {
4228+
for array in &[a0, a1, a2] {
41484229
let to_type = DataType::Time32(TimeUnit::Millisecond);
41494230
let b = cast(array, &to_type).unwrap();
41504231
let c = b.as_primitive::<Time32MillisecondType>();
@@ -4165,6 +4246,11 @@ mod tests {
41654246

41664247
#[test]
41674248
fn test_cast_string_to_time64microsecond() {
4249+
let a0 = Arc::new(StringViewArray::from(vec![
4250+
Some("08:08:35.091323414"),
4251+
Some("Not a valid time"),
4252+
None,
4253+
])) as ArrayRef;
41684254
let a1 = Arc::new(StringArray::from(vec![
41694255
Some("08:08:35.091323414"),
41704256
Some("Not a valid time"),
@@ -4175,7 +4261,7 @@ mod tests {
41754261
Some("Not a valid time"),
41764262
None,
41774263
])) as ArrayRef;
4178-
for array in &[a1, a2] {
4264+
for array in &[a0, a1, a2] {
41794265
let to_type = DataType::Time64(TimeUnit::Microsecond);
41804266
let b = cast(array, &to_type).unwrap();
41814267
let c = b.as_primitive::<Time64MicrosecondType>();
@@ -4194,6 +4280,11 @@ mod tests {
41944280

41954281
#[test]
41964282
fn test_cast_string_to_time64nanosecond() {
4283+
let a0 = Arc::new(StringViewArray::from(vec![
4284+
Some("08:08:35.091323414"),
4285+
Some("Not a valid time"),
4286+
None,
4287+
])) as ArrayRef;
41974288
let a1 = Arc::new(StringArray::from(vec![
41984289
Some("08:08:35.091323414"),
41994290
Some("Not a valid time"),
@@ -4204,7 +4295,7 @@ mod tests {
42044295
Some("Not a valid time"),
42054296
None,
42064297
])) as ArrayRef;
4207-
for array in &[a1, a2] {
4298+
for array in &[a0, a1, a2] {
42084299
let to_type = DataType::Time64(TimeUnit::Nanosecond);
42094300
let b = cast(array, &to_type).unwrap();
42104301
let c = b.as_primitive::<Time64NanosecondType>();
@@ -4223,6 +4314,11 @@ mod tests {
42234314

42244315
#[test]
42254316
fn test_cast_string_to_date64() {
4317+
let a0 = Arc::new(StringViewArray::from(vec![
4318+
Some("2020-09-08T12:00:00"),
4319+
Some("Not a valid date"),
4320+
None,
4321+
])) as ArrayRef;
42264322
let a1 = Arc::new(StringArray::from(vec![
42274323
Some("2020-09-08T12:00:00"),
42284324
Some("Not a valid date"),
@@ -4233,7 +4329,7 @@ mod tests {
42334329
Some("Not a valid date"),
42344330
None,
42354331
])) as ArrayRef;
4236-
for array in &[a1, a2] {
4332+
for array in &[a0, a1, a2] {
42374333
let to_type = DataType::Date64;
42384334
let b = cast(array, &to_type).unwrap();
42394335
let c = b.as_primitive::<Date64Type>();

0 commit comments

Comments
 (0)