Skip to content

Commit d9efdbb

Browse files
committed
Arrow: Use case instead of wrapping a map/list
Wrapping the list seems to introduce an odd behavior where `null` values are converted to an empty list `[]`. Resolves #251
1 parent dba1ef8 commit d9efdbb

File tree

3 files changed

+35
-15
lines changed

3 files changed

+35
-15
lines changed

dev/provision.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,23 @@
339339
)
340340

341341
spark.sql("INSERT INTO default.test_table_add_column VALUES ('2', '2')")
342+
343+
spark.sql(
344+
"""
345+
CREATE TABLE default.test_table_empty_list_and_map (
346+
col_list array<int>,
347+
col_map map<int, int>
348+
)
349+
USING iceberg
350+
TBLPROPERTIES (
351+
'format-version'='1'
352+
);
353+
"""
354+
)
355+
356+
spark.sql(
357+
"""
358+
INSERT INTO default.test_table_empty_list_and_map
359+
VALUES (null, null)
360+
"""
361+
)

pyiceberg/io/pyarrow.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,24 +1052,12 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional
10521052
return field_array
10531053

10541054
def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]:
1055-
return (
1056-
pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array))
1057-
if isinstance(list_array, pa.ListArray)
1058-
else None
1059-
)
1055+
return list_array.cast(schema_to_pyarrow(list_type)) if isinstance(list_array, pa.ListArray) else None
10601056

10611057
def map(
10621058
self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array], value_result: Optional[pa.Array]
10631059
) -> Optional[pa.Array]:
1064-
return (
1065-
pa.MapArray.from_arrays(
1066-
map_array.offsets,
1067-
self.cast_if_needed(map_type.key_field, key_result),
1068-
self.cast_if_needed(map_type.value_field, value_result),
1069-
)
1070-
if isinstance(map_array, pa.MapArray)
1071-
else None
1072-
)
1060+
return map_array.cast(schema_to_pyarrow(map_type)) if isinstance(map_array, pa.MapArray) else None
10731061

10741062
def primitive(self, _: PrimitiveType, array: Optional[pa.Array]) -> Optional[pa.Array]:
10751063
return array

tests/test_integration.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def table_test_table_sanitized_character(catalog: Catalog) -> Table:
9393
return catalog.load_table("default.test_table_sanitized_character")
9494

9595

96+
@pytest.fixture()
97+
def table_test_empty_list_and_map(catalog: Catalog) -> Table:
98+
return catalog.load_table("default.test_table_empty_list_and_map")
99+
100+
96101
TABLE_NAME = ("default", "t1")
97102

98103

@@ -417,8 +422,15 @@ def test_upgrade_table_version(table_test_table_version: Table) -> None:
417422

418423

419424
@pytest.mark.integration
420-
def test_reproduce_issue(table_test_table_sanitized_character: Table) -> None:
425+
def test_sanitize_column_names(table_test_table_sanitized_character: Table) -> None:
421426
arrow_table = table_test_table_sanitized_character.scan().to_arrow()
422427
assert len(arrow_table.schema.names), 1
423428
assert len(table_test_table_sanitized_character.schema().fields), 1
424429
assert arrow_table.schema.names[0] == table_test_table_sanitized_character.schema().fields[0].name
430+
431+
432+
@pytest.mark.integration
433+
def test_null_list_and_map(table_test_empty_list_and_map: Table) -> None:
434+
arrow_table = table_test_empty_list_and_map.scan().to_arrow()
435+
assert arrow_table["col_list"].to_pylist() == [None]
436+
assert arrow_table["col_map"].to_pylist() == [None]

0 commit comments

Comments
 (0)