Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions pyiceberg/table/name_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@


class MappedField(IcebergBaseModel):
field_id: int = Field(alias="field-id")
field_id: Optional[int] = Field(alias="field-id", default=None)
names: List[str] = conlist(str)
fields: List[MappedField] = Field(default_factory=list)

Expand All @@ -49,12 +49,12 @@ def convert_null_to_empty_List(cls, v: Any) -> Any:
@model_serializer
def ser_model(self) -> Dict[str, Any]:
"""Set custom serializer to leave out the field when it is empty."""
fields = {"fields": self.fields} if len(self.fields) > 0 else {}
return {
"field-id": self.field_id,
"names": self.names,
**fields,
}
serialized: Dict[str, Any] = {"names": self.names}
if self.field_id is not None:
serialized["field-id"] = self.field_id
if len(self.fields) > 0:
serialized["fields"] = self.fields
return serialized

def __len__(self) -> int:
"""Return the number of fields."""
Expand All @@ -65,7 +65,8 @@ def __str__(self) -> str:
# Otherwise the UTs fail because the order of the set can change
fields_str = ", ".join([str(e) for e in self.fields]) or ""
fields_str = " " + fields_str if fields_str else ""
return "([" + ", ".join(self.names) + "] -> " + (str(self.field_id) or "?") + fields_str + ")"
field_id = "?" if self.field_id is None else (str(self.field_id) or "?")
return "([" + ", ".join(self.names) + "] -> " + field_id + fields_str + ")"


class NameMapping(IcebergRootModel[List[MappedField]]):
Expand Down Expand Up @@ -232,7 +233,9 @@ def mapping(self, nm: NameMapping, field_results: List[MappedField]) -> List[Map

def fields(self, struct: List[MappedField], field_results: List[MappedField]) -> List[MappedField]:
reassignments: Dict[str, int] = {
update.name: update.field_id for f in field_results if (update := self._updates.get(f.field_id))
update.name: update.field_id
for f in field_results
if f.field_id is not None and (update := self._updates.get(f.field_id))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the rationale behind this change? Should we look at all the other places where .field_id is used?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The update_mapping API doesn't currently support changes to mappings without a field ID since updates is typed with Dict[int, NestedField].

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, is this change also due to the type checker?

Copy link
Contributor Author

@barronw barronw Dec 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Also, changing the API of update_mapping probably requires a larger discussion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the explanation! cc @Fokko / @sungwy for another set of eyes

}
return [
updated_field
Expand All @@ -241,6 +244,8 @@ def fields(self, struct: List[MappedField], field_results: List[MappedField]) ->
]

def field(self, field: MappedField, field_result: List[MappedField]) -> MappedField:
if field.field_id is None:
return field
field_names = field.names
if (update := self._updates.get(field.field_id)) is not None and update.name not in field_names:
field_names.append(update.name)
Expand Down Expand Up @@ -333,8 +338,8 @@ def struct(self, struct: StructType, struct_partner: Optional[MappedField], fiel
return StructType(*field_results)

def field(self, field: NestedField, field_partner: Optional[MappedField], field_result: IcebergType) -> IcebergType:
if field_partner is None:
raise ValueError(f"Field missing from NameMapping: {'.'.join(self.current_path)}")
if field_partner is None or field_partner.field_id is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious about this change: why do we need to check for field_partner.field_id?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should just be a type check since NestedField expects a field ID below. The field partner is looked up by field ID before being passed into this method.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, field_id is a required field in NestedField. Without the `field_partner.field_id is None` check, the type checker reports an error.

raise ValueError(f"Field or field ID missing from NameMapping: {'.'.join(self.current_path)}")

return NestedField(
field_id=field_partner.field_id,
Expand Down
30 changes: 29 additions & 1 deletion tests/table/test_name_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,21 @@ def test_json_mapped_field_no_names_deserialization() -> None:
assert MappedField(field_id=1, names=[]) == MappedField.model_validate_json(mapped_field_with_null_fields)


def test_json_mapped_field_no_field_id_deserialization() -> None:
mapped_field = """{
"names": []
}
"""
assert MappedField(field_id=None, names=[]) == MappedField.model_validate_json(mapped_field)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: also test omitting the field_id=None


mapped_field_with_null_fields = """{
"names": [],
"fields": null
}
"""
assert MappedField(names=[]) == MappedField.model_validate_json(mapped_field_with_null_fields)


def test_json_name_mapping_deserialization() -> None:
name_mapping = """
[
Expand Down Expand Up @@ -164,10 +179,23 @@ def test_json_name_mapping_deserialization() -> None:
])


def test_json_mapped_field_no_field_id_serialization() -> None:
    """Check the JSON dump of a name mapping whose fields partly lack a field ID.

    Mapped fields built with ``field_id=None`` (one top-level, one nested) are
    expected to serialize without a "field-id" key, while fields that carry an
    ID keep it.
    """
    nested_without_id = MappedField(field_id=None, names=["element"])
    mapping = NameMapping([
        MappedField(field_id=1, names=["foo"]),
        MappedField(field_id=None, names=["bar"]),
        MappedField(field_id=2, names=["qux"], fields=[nested_without_id]),
    ])
    expected_json = """[{"names":["foo"],"field-id":1},{"names":["bar"]},{"names":["qux"],"field-id":2,"fields":[{"names":["element"]}]}]"""

    actual_json = mapping.model_dump_json()

    assert actual_json == expected_json


def test_json_serialization(table_name_mapping_nested: NameMapping) -> None:
assert (
table_name_mapping_nested.model_dump_json()
== """[{"field-id":1,"names":["foo"]},{"field-id":2,"names":["bar"]},{"field-id":3,"names":["baz"]},{"field-id":4,"names":["qux"],"fields":[{"field-id":5,"names":["element"]}]},{"field-id":6,"names":["quux"],"fields":[{"field-id":7,"names":["key"]},{"field-id":8,"names":["value"],"fields":[{"field-id":9,"names":["key"]},{"field-id":10,"names":["value"]}]}]},{"field-id":11,"names":["location"],"fields":[{"field-id":12,"names":["element"],"fields":[{"field-id":13,"names":["latitude"]},{"field-id":14,"names":["longitude"]}]}]},{"field-id":15,"names":["person"],"fields":[{"field-id":16,"names":["name"]},{"field-id":17,"names":["age"]}]}]"""
== """[{"names":["foo"],"field-id":1},{"names":["bar"],"field-id":2},{"names":["baz"],"field-id":3},{"names":["qux"],"field-id":4,"fields":[{"names":["element"],"field-id":5}]},{"names":["quux"],"field-id":6,"fields":[{"names":["key"],"field-id":7},{"names":["value"],"field-id":8,"fields":[{"names":["key"],"field-id":9},{"names":["value"],"field-id":10}]}]},{"names":["location"],"field-id":11,"fields":[{"names":["element"],"field-id":12,"fields":[{"names":["latitude"],"field-id":13},{"names":["longitude"],"field-id":14}]}]},{"names":["person"],"field-id":15,"fields":[{"names":["name"],"field-id":16},{"names":["age"],"field-id":17}]}]"""
)


Expand Down