Skip to content

Commit 8d86d94

Browse files
committed
Automatically redirect to articles with same checksum
1 parent fe0b5fa commit 8d86d94

File tree

2 files changed

+57
-0
lines changed

2 files changed

+57
-0
lines changed

src/zimscraperlib/zim/creator.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
- can be used to store a filepath and content read from it (not stored) """
2020

2121
import datetime
22+
import hashlib
2223
import pathlib
24+
import re
2325
import weakref
2426
from typing import Any, Callable, Dict, Optional, Tuple, Union
2527

@@ -104,6 +106,10 @@ def __init__(
104106

105107
self.workaround_nocancel = workaround_nocancel
106108

109+
self.autodedup_filters = []
110+
111+
self.dedup_items = dict()
112+
107113
def start(self):
108114
super().__enter__()
109115

@@ -119,6 +125,9 @@ def update_metadata(self, **kwargs):
119125
for name, value in kwargs.items():
120126
self.add_metadata(name, value)
121127

128+
def add_autodedup_filter(self, filter_regex: str):
    """Register a path pattern whose matching items get auto-deduplicated.

    The expression is compiled once here and stored in
    ``self.autodedup_filters``; ``add_item_for`` later tries each stored
    pattern with ``match()`` against the path of the item being added.

    Args:
        filter_regex: regular expression source, e.g. ``r"^images/.*$"``.
    """
    compiled_filter = re.compile(filter_regex)
    self.autodedup_filters.append(compiled_filter)
130+
122131
def add_item_for(
123132
self,
124133
path: str,
@@ -151,6 +160,31 @@ def add_item_for(
151160
if fpath is None and content is None:
152161
raise ValueError("One of fpath or content is required")
153162

163+
for dedup_filter in self.autodedup_filters:
164+
if dedup_filter.match(path):
165+
if content:
166+
digest = hashlib.sha256(content).digest()
167+
else:
168+
sha256 = hashlib.sha256()
169+
with open(fpath, "rb") as f:
170+
while True:
171+
data = f.read(65536) # lets read stuff in 64kb chunks!
172+
if not data:
173+
break
174+
sha256.update(data)
175+
digest = sha256.digest()
176+
177+
if digest in self.dedup_items:
178+
self.add_redirect(
179+
path=path,
180+
target_path=self.dedup_items[digest],
181+
title=title,
182+
is_front=is_front,
183+
)
184+
return path
185+
self.dedup_items[digest] = path
186+
break
187+
154188
mimetype = mimetype_for(
155189
path=path, content=content, fpath=fpath, mimetype=mimetype
156190
)

tests/zim/test_zim_creator.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,29 @@ def test_noindexlanguage(tmp_path):
125125
assert reader.has_title_index
126126
assert not reader.has_fulltext_index
127127

128+
def test_duplicatefiles(tmp_path, png_image, html_file):
    """Check that auto-dedup replaces repeated content with a redirect.

    The same PNG is added four times. Only paths matching the registered
    filter (``^images/.*$``) take part in deduplication: the first matching
    path is stored as a regular item, the second becomes a redirect to the
    first, and paths outside the filter are always stored as-is.
    """
    fpath = tmp_path / "test.zim"
    with Creator(fpath, "welcome", "") as creator:
        creator.add_autodedup_filter(r"^images/.*$")
        # does not match the filter => added as-is
        creator.add_item_for("other_folder1/yahoo0.png", "Image1", fpath=png_image)
        # first path matching the filter => added as-is
        creator.add_item_for("images/yahoo1.png", "Image1", fpath=png_image)
        # same content, second path matching the filter => becomes a redirect
        creator.add_item_for("images/yahoo2.png", "Image2", fpath=png_image)
        # same content again but not matching the filter => added as-is
        creator.add_item_for("other_folder2/yahoo3.png", "Image1", fpath=png_image)

    reader = Archive(fpath)

    # path -> whether the entry is expected to be a redirect
    expected_redirects = {
        "images/yahoo1.png": False,
        "images/yahoo2.png": True,
        "other_folder1/yahoo0.png": False,
        "other_folder2/yahoo3.png": False,
    }
    for path, expect_redirect in expected_redirects.items():
        # every path must resolve to content, redirect or not
        assert reader.get_item(path)
        assert bool(reader.get_entry_by_path(path).is_redirect) == expect_redirect

    # the redirect must point at the first item stored with that checksum,
    # not merely be "some" redirect
    redirect_entry = reader.get_entry_by_path("images/yahoo2.png")
    assert redirect_entry.get_redirect_entry().path == "images/yahoo1.png"
128151

129152
def test_add_item_for(tmp_path):
130153
fpath = tmp_path / "test.zim"

0 commit comments

Comments
 (0)