Skip to content

Commit dba1ef8

Browse files
authored
Correct schema behavior (#247)
* Correct schema behavior: when we alter the schema, we want to use the latest schema by default, except when a specific snapshot that has a schema-id is selected.
* Add a warning if the schema-id is missing from the metadata.
* Catch nonexistent snapshots.
1 parent 452238e commit dba1ef8

File tree

2 files changed

+103
-7
lines changed

2 files changed

+103
-7
lines changed

pyiceberg/table/__init__.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import datetime
2020
import itertools
2121
import uuid
22+
import warnings
2223
from abc import ABC, abstractmethod
2324
from copy import copy
2425
from dataclasses import dataclass
@@ -942,15 +943,23 @@ def snapshot(self) -> Optional[Snapshot]:
942943
return self.table.current_snapshot()
943944

944945
def projection(self) -> Schema:
945-
snapshot_schema = self.table.schema()
946-
if snapshot := self.snapshot():
947-
if snapshot.schema_id is not None:
948-
snapshot_schema = self.table.schemas()[snapshot.schema_id]
946+
current_schema = self.table.schema()
947+
if self.snapshot_id is not None:
948+
snapshot = self.table.snapshot_by_id(self.snapshot_id)
949+
if snapshot is not None:
950+
if snapshot.schema_id is not None:
951+
snapshot_schema = self.table.schemas().get(snapshot.schema_id)
952+
if snapshot_schema is not None:
953+
current_schema = snapshot_schema
954+
else:
955+
warnings.warn(f"Metadata does not contain schema with id: {snapshot.schema_id}")
956+
else:
957+
raise ValueError(f"Snapshot not found: {self.snapshot_id}")
949958

950959
if "*" in self.selected_fields:
951-
return snapshot_schema
960+
return current_schema
952961

953-
return snapshot_schema.select(*self.selected_fields, case_sensitive=self.case_sensitive)
962+
return current_schema.select(*self.selected_fields, case_sensitive=self.case_sensitive)
954963

955964
@abstractmethod
956965
def plan_files(self) -> Iterable[ScanTask]:

tests/table/test_init.py

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,15 @@
2222
import pytest
2323
from sortedcontainers import SortedList
2424

25+
from pyiceberg.catalog.noop import NoopCatalog
2526
from pyiceberg.exceptions import CommitFailedException
2627
from pyiceberg.expressions import (
2728
AlwaysTrue,
2829
And,
2930
EqualTo,
3031
In,
3132
)
32-
from pyiceberg.io import PY_IO_IMPL
33+
from pyiceberg.io import PY_IO_IMPL, load_file_io
3334
from pyiceberg.manifest import (
3435
DataFile,
3536
DataFileContent,
@@ -848,3 +849,89 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None:
848849
match="Requirement failed: default sort order id has changed: expected 1, found 3",
849850
):
850851
AssertDefaultSortOrderId(default_sort_order_id=1).validate(base_metadata)
852+
853+
854+
def test_correct_schema() -> None:
855+
table_metadata = TableMetadataV2(
856+
**{
857+
"format-version": 2,
858+
"table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1",
859+
"location": "s3://bucket/test/location",
860+
"last-sequence-number": 34,
861+
"last-updated-ms": 1602638573590,
862+
"last-column-id": 3,
863+
"current-schema-id": 1,
864+
"schemas": [
865+
{"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]},
866+
{
867+
"type": "struct",
868+
"schema-id": 1,
869+
"identifier-field-ids": [1, 2],
870+
"fields": [
871+
{"id": 1, "name": "x", "required": True, "type": "long"},
872+
{"id": 2, "name": "y", "required": True, "type": "long"},
873+
{"id": 3, "name": "z", "required": True, "type": "long"},
874+
],
875+
},
876+
],
877+
"default-spec-id": 0,
878+
"partition-specs": [
879+
{"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]}
880+
],
881+
"last-partition-id": 1000,
882+
"default-sort-order-id": 0,
883+
"sort-orders": [],
884+
"current-snapshot-id": 123,
885+
"snapshots": [
886+
{
887+
"snapshot-id": 234,
888+
"timestamp-ms": 1515100955770,
889+
"sequence-number": 0,
890+
"summary": {"operation": "append"},
891+
"manifest-list": "s3://a/b/1.avro",
892+
"schema-id": 10,
893+
},
894+
{
895+
"snapshot-id": 123,
896+
"timestamp-ms": 1515100955770,
897+
"sequence-number": 0,
898+
"summary": {"operation": "append"},
899+
"manifest-list": "s3://a/b/1.avro",
900+
"schema-id": 0,
901+
},
902+
],
903+
}
904+
)
905+
906+
t = Table(
907+
identifier=("default", "t1"),
908+
metadata=table_metadata,
909+
metadata_location="s3://../..",
910+
io=load_file_io(),
911+
catalog=NoopCatalog("NoopCatalog"),
912+
)
913+
914+
# Should use the current schema, instead of the one from the snapshot
915+
assert t.scan().projection() == Schema(
916+
NestedField(field_id=1, name='x', field_type=LongType(), required=True),
917+
NestedField(field_id=2, name='y', field_type=LongType(), required=True),
918+
NestedField(field_id=3, name='z', field_type=LongType(), required=True),
919+
schema_id=1,
920+
identifier_field_ids=[1, 2],
921+
)
922+
923+
# When we explicitly filter on the commit, we want to have the schema that's linked to the snapshot
924+
assert t.scan(snapshot_id=123).projection() == Schema(
925+
NestedField(field_id=1, name='x', field_type=LongType(), required=True),
926+
schema_id=0,
927+
identifier_field_ids=[],
928+
)
929+
930+
with pytest.warns(UserWarning, match="Metadata does not contain schema with id: 10"):
931+
t.scan(snapshot_id=234).projection()
932+
933+
# Invalid snapshot
934+
with pytest.raises(ValueError) as exc_info:
935+
_ = t.scan(snapshot_id=-1).projection()
936+
937+
assert "Snapshot not found: -1" in str(exc_info.value)

0 commit comments

Comments
 (0)