-
Notifications
You must be signed in to change notification settings - Fork 397
🐛 Write fields instead of spec object #846
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,13 +31,15 @@ | |
| Type, | ||
| ) | ||
|
|
||
| from pydantic_core import to_json | ||
|
|
||
| from pyiceberg.avro.file import AvroFile, AvroOutputFile | ||
| from pyiceberg.conversions import to_bytes | ||
| from pyiceberg.exceptions import ValidationError | ||
| from pyiceberg.io import FileIO, InputFile, OutputFile | ||
| from pyiceberg.partitioning import PartitionSpec | ||
| from pyiceberg.schema import Schema | ||
| from pyiceberg.typedef import EMPTY_DICT, Record, TableVersion | ||
| from pyiceberg.typedef import Record, TableVersion | ||
| from pyiceberg.types import ( | ||
| BinaryType, | ||
| BooleanType, | ||
|
|
@@ -645,7 +647,6 @@ class ManifestWriter(ABC): | |
| _output_file: OutputFile | ||
| _writer: AvroOutputFile[ManifestEntry] | ||
| _snapshot_id: int | ||
| _meta: Dict[str, str] | ||
| _added_files: int | ||
| _added_rows: int | ||
| _existing_files: int | ||
|
|
@@ -655,15 +656,12 @@ class ManifestWriter(ABC): | |
| _min_data_sequence_number: Optional[int] | ||
| _partitions: List[Record] | ||
|
|
||
| def __init__( | ||
| self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT | ||
| ) -> None: | ||
| def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int) -> None: | ||
| self.closed = False | ||
| self._spec = spec | ||
| self._schema = schema | ||
| self._output_file = output_file | ||
| self._snapshot_id = snapshot_id | ||
| self._meta = meta | ||
|
|
||
| self._added_files = 0 | ||
| self._added_rows = 0 | ||
|
|
@@ -697,6 +695,15 @@ def content(self) -> ManifestContent: ... | |
| @abstractmethod | ||
| def version(self) -> TableVersion: ... | ||
|
|
||
| @property | ||
| def _meta(self) -> Dict[str, str]: | ||
| return { | ||
| "schema": self._schema.model_dump_json(), | ||
| "partition-spec": to_json(self._spec.fields).decode("utf-8"), | ||
| "partition-spec-id": str(self._spec.spec_id), | ||
| "format-version": str(self.version), | ||
| } | ||
|
|
||
| def _with_partition(self, format_version: TableVersion) -> Schema: | ||
| data_file_type = data_file_with_partition( | ||
| format_version=format_version, partition_type=self._spec.partition_type(self._schema) | ||
|
|
@@ -771,12 +778,6 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, | |
| schema, | ||
| output_file, | ||
| snapshot_id, | ||
| { | ||
| "schema": schema.model_dump_json(), | ||
| "partition-spec": spec.model_dump_json(), | ||
| "partition-spec-id": str(spec.spec_id), | ||
| "format-version": "1", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is 👍 since the |
||
| }, | ||
| ) | ||
|
|
||
| def content(self) -> ManifestContent: | ||
|
|
@@ -792,19 +793,7 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: | |
|
|
||
| class ManifestWriterV2(ManifestWriter): | ||
| def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int): | ||
| super().__init__( | ||
| spec, | ||
| schema, | ||
| output_file, | ||
| snapshot_id, | ||
| meta={ | ||
| "schema": schema.model_dump_json(), | ||
| "partition-spec": spec.model_dump_json(), | ||
| "partition-spec-id": str(spec.spec_id), | ||
| "format-version": "2", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is 👍 since the |
||
| "content": "data", | ||
| }, | ||
| ) | ||
| super().__init__(spec, schema, output_file, snapshot_id) | ||
|
|
||
| def content(self) -> ManifestContent: | ||
| return ManifestContent.DATA | ||
|
|
@@ -813,6 +802,13 @@ def content(self) -> ManifestContent: | |
| def version(self) -> TableVersion: | ||
| return 2 | ||
|
|
||
| @property | ||
| def _meta(self) -> Dict[str, str]: | ||
| return { | ||
| **super()._meta, | ||
| "content": "data", | ||
| } | ||
|
|
||
| def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: | ||
| if entry.data_sequence_number is None: | ||
| if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -348,8 +348,8 @@ def test_write_manifest( | |
|
|
||
| expected_metadata = { | ||
| "schema": test_schema.model_dump_json(), | ||
| "partition-spec": test_spec.model_dump_json(), | ||
| "partition-spec-id": str(test_spec.spec_id), | ||
| "partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible to not hardcode this value?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually like that we are hardcoding this value because the issue wasn't caught because we inferred it from test_spec before :)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, makes sense
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer to hardcore the expected value so it is clear what is being returned when you go over the tests. |
||
| "partition-spec-id": str(demo_manifest_file.partition_spec_id), | ||
| "format-version": str(format_version), | ||
| } | ||
| _verify_metadata_with_fastavro( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a reason why we dont want to use the same logic? Like
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately the spec.fields returns a list, which is not a Pydantic object, but a native Python construct. So the method isn't available.