Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 21 additions & 25 deletions pyiceberg/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@
Type,
)

from pydantic_core import to_json

from pyiceberg.avro.file import AvroFile, AvroOutputFile
from pyiceberg.conversions import to_bytes
from pyiceberg.exceptions import ValidationError
from pyiceberg.io import FileIO, InputFile, OutputFile
from pyiceberg.partitioning import PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.typedef import EMPTY_DICT, Record, TableVersion
from pyiceberg.typedef import Record, TableVersion
from pyiceberg.types import (
BinaryType,
BooleanType,
Expand Down Expand Up @@ -645,7 +647,6 @@ class ManifestWriter(ABC):
_output_file: OutputFile
_writer: AvroOutputFile[ManifestEntry]
_snapshot_id: int
_meta: Dict[str, str]
_added_files: int
_added_rows: int
_existing_files: int
Expand All @@ -655,15 +656,12 @@ class ManifestWriter(ABC):
_min_data_sequence_number: Optional[int]
_partitions: List[Record]

def __init__(
self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT
) -> None:
def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int) -> None:
self.closed = False
self._spec = spec
self._schema = schema
self._output_file = output_file
self._snapshot_id = snapshot_id
self._meta = meta

self._added_files = 0
self._added_rows = 0
Expand Down Expand Up @@ -697,6 +695,15 @@ def content(self) -> ManifestContent: ...
@abstractmethod
def version(self) -> TableVersion: ...

@property
def _meta(self) -> Dict[str, str]:
return {
"schema": self._schema.model_dump_json(),
"partition-spec": to_json(self._spec.fields).decode("utf-8"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a reason why we dont want to use the same logic? Like

                "partition-spec": self._spec.model_dump_json(),

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately the spec.fields returns a list, which is not a Pydantic object, but a native Python construct. So the method isn't available.

"partition-spec-id": str(self._spec.spec_id),
"format-version": str(self.version),
}

def _with_partition(self, format_version: TableVersion) -> Schema:
data_file_type = data_file_with_partition(
format_version=format_version, partition_type=self._spec.partition_type(self._schema)
Expand Down Expand Up @@ -771,12 +778,6 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile,
schema,
output_file,
snapshot_id,
{
"schema": schema.model_dump_json(),
"partition-spec": spec.model_dump_json(),
"partition-spec-id": str(spec.spec_id),
"format-version": "1",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is 👍 since the version function is defined

},
)

def content(self) -> ManifestContent:
Expand All @@ -792,19 +793,7 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:

class ManifestWriterV2(ManifestWriter):
def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int):
super().__init__(
spec,
schema,
output_file,
snapshot_id,
meta={
"schema": schema.model_dump_json(),
"partition-spec": spec.model_dump_json(),
"partition-spec-id": str(spec.spec_id),
"format-version": "2",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is 👍 since the version function is defined

"content": "data",
},
)
super().__init__(spec, schema, output_file, snapshot_id)

def content(self) -> ManifestContent:
return ManifestContent.DATA
Expand All @@ -813,6 +802,13 @@ def content(self) -> ManifestContent:
def version(self) -> TableVersion:
return 2

@property
def _meta(self) -> Dict[str, str]:
return {
**super()._meta,
"content": "data",
}

def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
if entry.data_sequence_number is None:
if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id:
Expand Down
4 changes: 2 additions & 2 deletions tests/utils/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,8 @@ def test_write_manifest(

expected_metadata = {
"schema": test_schema.model_dump_json(),
"partition-spec": test_spec.model_dump_json(),
"partition-spec-id": str(test_spec.spec_id),
"partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it possible to not hardcode this value?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually like that we are hardcoding this value because the issue wasn't caught because we inferred it from test_spec before :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, makes sense

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer to hardcore the expected value so it is clear what is being returned when you go over the tests.

"partition-spec-id": str(demo_manifest_file.partition_spec_id),
"format-version": str(format_version),
}
_verify_metadata_with_fastavro(
Expand Down