Skip to content

Commit 4ac7665

Browse files
committed
Return complete url rewrite information so that 'users' can decide what they want to do
1 parent d6a297b commit 4ac7665

File tree

6 files changed

+439
-113
lines changed

6 files changed

+439
-113
lines changed

src/zimscraperlib/rewriting/css.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def __simple_transform(
5151
[
5252
"url(",
5353
m_object["quote"],
54-
url_rewriter(m_object["url"], base_href),
54+
url_rewriter(m_object["url"], base_href).rewriten_url,
5555
m_object["quote"],
5656
")",
5757
]
@@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
190190
new_url = self.url_rewriter(
191191
url_node.value, # pyright: ignore
192192
self.base_href,
193-
)
193+
).rewriten_url
194194
url_node.value = str(new_url) # pyright: ignore
195195
url_node.representation = ( # pyright: ignore
196196
f'"{serialize_url(str(new_url))}"'
@@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
206206
elif isinstance(node, ast.Declaration):
207207
self._process_list(node.value) # pyright: ignore
208208
elif isinstance(node, ast.URLToken):
209-
new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore
209+
new_url = self.url_rewriter(
210+
node.value, self.base_href
211+
).rewriten_url # pyright: ignore
210212
node.value = new_url
211213
node.representation = f"url({serialize_url(new_url)})"
212214

src/zimscraperlib/rewriting/html.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,9 @@ def rewrite_href_src_attributes(
603603
notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
604604
return (
605605
attr_name,
606-
url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
606+
url_rewriter(
607+
attr_value, base_href=base_href, rewrite_all_url=tag != "a"
608+
).rewriten_url,
607609
)
608610

609611

@@ -618,10 +620,10 @@ def rewrite_srcset_attribute(
618620
if attr_name != "srcset" or not attr_value:
619621
return
620622
value_list = attr_value.split(",")
621-
new_value_list = []
623+
new_value_list: list[str] = []
622624
for value in value_list:
623625
url, *other = value.strip().split(" ", maxsplit=1)
624-
new_url = url_rewriter(url, base_href=base_href)
626+
new_url = url_rewriter(url, base_href=base_href).rewriten_url
625627
new_value = " ".join([new_url, *other])
626628
new_value_list.append(new_value)
627629
return (attr_name, ", ".join(new_value_list))
@@ -711,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
711713
return
712714
return (
713715
attr_name,
714-
f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
716+
f"{match['interval']};"
717+
f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
715718
)

src/zimscraperlib/rewriting/js.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
286286
This takes into account that the result must be a relative URL, i.e. it
287287
cannot be 'vendor.module.js' but must be './vendor.module.js'.
288288
"""
289-
url = self.url_rewriter(url, base_href=self.base_href)
289+
url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
290290
if not (
291291
url.startswith("/") or url.startswith("./") or url.startswith("../")
292292
):

src/zimscraperlib/rewriting/url_rewriting.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
147147
raise ValueError(f"Unexpected password in value: {value} {parts.password}")
148148

149149

150+
class RewriteResult(NamedTuple):
151+
absolute_url: str
152+
rewriten_url: str
153+
zim_path: ZimPath | None
154+
155+
150156
class ArticleUrlRewriter:
151157
"""
152158
Rewrite urls in article.
@@ -176,16 +182,11 @@ def __init__(
176182
missing_zim_paths: list of ZIM paths which are known to already be missing
177183
from the existing_zim_paths ; useful only in complement with this variable ;
178184
new missing entries will be added as URLs are normalized in this function
179-
180-
Results:
181-
items_to_download: populated with the list of rewritten URLs, so that one
182-
might use it to download items after rewriting the document
183185
"""
184186
self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
185187
self.article_url = article_url
186188
self.existing_zim_paths = existing_zim_paths
187189
self.missing_zim_paths = missing_zim_paths
188-
self.items_to_download: dict[ZimPath, HttpUrl] = {}
189190

190191
def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
191192
"""Utility to transform an item URL into a ZimPath"""
@@ -201,7 +202,7 @@ def __call__(
201202
base_href: str | None,
202203
*,
203204
rewrite_all_url: bool = True,
204-
) -> str:
205+
) -> RewriteResult:
205206
"""Rewrite a url contained in a article.
206207
207208
The url is "fully" rewritten to point to a normalized entry path
@@ -210,17 +211,25 @@ def __call__(
210211
try:
211212
item_url = item_url.strip()
212213

214+
item_absolute_url = urljoin(
215+
urljoin(self.article_url.value, base_href), item_url
216+
)
217+
213218
# Make case of standalone fragments more straightforward
214219
if item_url.startswith("#"):
215-
return item_url
220+
return RewriteResult(
221+
absolute_url=item_absolute_url,
222+
rewriten_url=item_url,
223+
zim_path=None,
224+
)
216225

217226
item_scheme = urlsplit(item_url).scheme
218227
if item_scheme and item_scheme not in ("http", "https"):
219-
return item_url
220-
221-
item_absolute_url = urljoin(
222-
urljoin(self.article_url.value, base_href), item_url
223-
)
228+
return RewriteResult(
229+
absolute_url=item_absolute_url,
230+
rewriten_url=item_url,
231+
zim_path=None,
232+
)
224233

225234
item_fragment = urlsplit(item_absolute_url).fragment
226235

@@ -229,9 +238,11 @@ def __call__(
229238
if rewrite_all_url or (
230239
self.existing_zim_paths and item_path in self.existing_zim_paths
231240
):
232-
if item_path not in self.items_to_download:
233-
self.items_to_download[item_path] = HttpUrl(item_absolute_url)
234-
return self.get_document_uri(item_path, item_fragment)
241+
return RewriteResult(
242+
absolute_url=item_absolute_url,
243+
rewriten_url=self.get_document_uri(item_path, item_fragment),
244+
zim_path=item_path,
245+
)
235246
else:
236247
if (
237248
self.missing_zim_paths is not None
@@ -242,7 +253,11 @@ def __call__(
242253
# with duplicate messages
243254
self.missing_zim_paths.add(item_path)
244255
# The url doesn't point to a known entry
245-
return item_absolute_url
256+
return RewriteResult(
257+
absolute_url=item_absolute_url,
258+
rewriten_url=item_absolute_url,
259+
zim_path=item_path,
260+
)
246261

247262
except Exception as exc: # pragma: no cover
248263
item_scheme = (
@@ -275,7 +290,11 @@ def __call__(
275290
f"rewrite_all_url: {rewrite_all_url}",
276291
exc_info=exc,
277292
)
278-
return item_url
293+
return RewriteResult(
294+
absolute_url=item_absolute_url,
295+
rewriten_url=item_url,
296+
zim_path=None,
297+
)
279298

280299
def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
281300
"""Given an ZIM item path and its fragment, get the URI to use in document

tests/rewriting/conftest.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from zimscraperlib.rewriting.url_rewriting import (
88
ArticleUrlRewriter,
99
HttpUrl,
10+
RewriteResult,
1011
ZimPath,
1112
)
1213

@@ -24,8 +25,12 @@ def __call__(
2425
base_href: str | None, # noqa: ARG002
2526
*,
2627
rewrite_all_url: bool = True, # noqa: ARG002
27-
) -> str:
28-
return item_url + self.suffix
28+
) -> RewriteResult:
29+
return RewriteResult(
30+
absolute_url=item_url + self.suffix,
31+
rewriten_url=item_url + self.suffix,
32+
zim_path=None,
33+
)
2934

3035
def get_item_path(
3136
self, item_url: str, base_href: str | None # noqa: ARG002

0 commit comments

Comments
 (0)