1414# KIND, either express or implied. See the License for the
1515# specific language governing permissions and limitations
1616# under the License.
17- from typing import Iterator , Optional
17+ from typing import Iterator , Optional , Set
1818
1919from pyiceberg .exceptions import ValidationException
2020from pyiceberg .expressions import BooleanExpression
2121from pyiceberg .expressions .visitors import ROWS_CANNOT_MATCH , _InclusiveMetricsEvaluator
2222from pyiceberg .manifest import ManifestContent , ManifestEntry , ManifestEntryStatus , ManifestFile
23+ from pyiceberg .schema import Schema
2324from pyiceberg .table import Table
2425from pyiceberg .table .snapshots import Operation , Snapshot , ancestors_between
2526from pyiceberg .typedef import Record
2627
# Snapshot operations taken into account when validating that data files still exist.
# NOTE(review): the call site that consumes this set is not visible in this view — confirm usage.
VALIDATE_DATA_FILES_EXIST_OPERATIONS: Set[Operation] = {
    Operation.OVERWRITE,
    Operation.REPLACE,
    Operation.DELETE,
}

# Snapshot operations handed to _validation_history when looking for newly added data files.
VALIDATE_ADDED_DATA_FILES_OPERATIONS: Set[Operation] = {
    Operation.APPEND,
    Operation.OVERWRITE,
}
2830
2931
3032def _validation_history (
@@ -77,6 +79,47 @@ def _validation_history(
7779 return manifests_files , snapshots
7880
7981
82+ def _filter_manifest_entries (
83+ entry : ManifestEntry ,
84+ snapshot_ids : set [int ],
85+ data_filter : Optional [BooleanExpression ],
86+ partition_set : Optional [dict [int , set [Record ]]],
87+ entry_status : Optional [ManifestEntryStatus ],
88+ schema : Schema ,
89+ ) -> bool :
90+ """Filter manifest entries based on data filter and partition set.
91+
92+ Args:
93+ entry: Manifest entry to filter
94+ snapshot_ids: set of snapshot ids to match data files
95+ data_filter: Optional filter to match data files
96+ partition_set: Optional set of partitions to match data files
97+ entry_status: Optional status to match data files
98+ schema: schema for filtering
99+
100+ Returns:
101+ True if the entry should be included, False otherwise
102+ """
103+ if entry .snapshot_id not in snapshot_ids :
104+ return False
105+
106+ if entry_status is not None and entry .status != entry_status :
107+ return False
108+
109+ if data_filter is not None :
110+ evaluator = _InclusiveMetricsEvaluator (schema , data_filter )
111+ if evaluator .eval (entry .data_file ) is ROWS_CANNOT_MATCH :
112+ return False
113+
114+ if partition_set is not None :
115+ partition = entry .data_file .partition
116+ spec_id = entry .data_file .spec_id
117+ if spec_id not in partition_set or partition not in partition_set [spec_id ]:
118+ return False
119+
120+ return True
121+
122+
80123def _deleted_data_files (
81124 table : Table ,
82125 starting_snapshot : Snapshot ,
@@ -108,27 +151,12 @@ def _deleted_data_files(
108151 ManifestContent .DATA ,
109152 )
110153
111- if data_filter is not None :
112- evaluator = _InclusiveMetricsEvaluator (table .schema (), data_filter ).eval
113-
114154 for manifest in manifests :
115155 for entry in manifest .fetch_manifest_entry (table .io , discard_deleted = False ):
116- if entry .snapshot_id not in snapshot_ids :
117- continue
118-
119- if entry .status != ManifestEntryStatus .DELETED :
120- continue
121-
122- if data_filter is not None and evaluator (entry .data_file ) is ROWS_CANNOT_MATCH :
123- continue
124-
125- if partition_set is not None :
126- spec_id = entry .data_file .spec_id
127- partition = entry .data_file .partition
128- if spec_id not in partition_set or partition not in partition_set [spec_id ]:
129- continue
130-
131- yield entry
156+ if _filter_manifest_entries (
157+ entry , snapshot_ids , data_filter , partition_set , ManifestEntryStatus .DELETED , table .schema ()
158+ ):
159+ yield entry
132160
133161
134162def _validate_deleted_data_files (
@@ -150,3 +178,60 @@ def _validate_deleted_data_files(
150178 if any (conflicting_entries ):
151179 conflicting_snapshots = {entry .snapshot_id for entry in conflicting_entries }
152180 raise ValidationException (f"Deleted data files were found matching the filter for snapshots { conflicting_snapshots } !" )
181+
182+
183+ def _added_data_files (
184+ table : Table ,
185+ starting_snapshot : Snapshot ,
186+ data_filter : Optional [BooleanExpression ],
187+ partition_set : Optional [dict [int , set [Record ]]],
188+ parent_snapshot : Optional [Snapshot ],
189+ ) -> Iterator [ManifestEntry ]:
190+ """Return manifest entries for data files added between the starting snapshot and parent snapshot.
191+
192+ Args:
193+ table: Table to get the history from
194+ starting_snapshot: Starting snapshot to get the history from
195+ data_filter: Optional filter to match data files
196+ partition_set: Optional set of partitions to match data files
197+ parent_snapshot: Parent snapshot to get the history from
198+
199+ Returns:
200+ Iterator of manifest entries for added data files matching the conditions
201+ """
202+ if parent_snapshot is None :
203+ return
204+
205+ manifests , snapshot_ids = _validation_history (
206+ table ,
207+ parent_snapshot ,
208+ starting_snapshot ,
209+ VALIDATE_ADDED_DATA_FILES_OPERATIONS ,
210+ ManifestContent .DATA ,
211+ )
212+
213+ for manifest in manifests :
214+ for entry in manifest .fetch_manifest_entry (table .io ):
215+ if _filter_manifest_entries (entry , snapshot_ids , data_filter , partition_set , None , table .schema ()):
216+ yield entry
217+
218+
def _validate_added_data_files(
    table: Table,
    starting_snapshot: Snapshot,
    data_filter: Optional[BooleanExpression],
    parent_snapshot: Optional[Snapshot],
) -> None:
    """Validate that no files matching a filter have been added to the table since a starting snapshot.

    Args:
        table: Table to validate
        starting_snapshot: Snapshot current at the start of the operation
        data_filter: Expression used to find added data files
        parent_snapshot: Ending snapshot on the branch being validated

    Raises:
        ValidationException: If at least one added data file matches the filter; the
            message lists the ids of every snapshot that contributed a conflicting entry.
    """
    # _added_data_files returns a one-shot iterator. Testing it with any() would consume
    # the first matching entry, so that entry's snapshot id would be missing from the
    # error message (an empty set when there is exactly one conflict). Materialize the
    # entries once so the emptiness check and the report see the same data.
    conflicting_entries = list(_added_data_files(table, starting_snapshot, data_filter, None, parent_snapshot))
    if conflicting_entries:
        conflicting_snapshots = {entry.snapshot_id for entry in conflicting_entries if entry.snapshot_id is not None}
        raise ValidationException(f"Added data files were found matching the filter for snapshots {conflicting_snapshots}!")
0 commit comments