From 8f0661d26186f60b703d203a425fc64880f95a21 Mon Sep 17 00:00:00 2001
From: Oleg Iarygin
Date: Tue, 14 Jun 2022 10:18:05 +0300
Subject: [PATCH 1/6] Initial revision of Tools/scripts/checkhtmllinks.py

---
 Misc/ACKS                                     |   1 +
 ...2-06-14-10-15-17.gh-issue-93851.rL64WQ.rst |   4 +
 Tools/scripts/README                          |   1 +
 Tools/scripts/checkhtmllinks.py               | 316 ++++++++++++++++++
 4 files changed, 322 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Tools-Demos/2022-06-14-10-15-17.gh-issue-93851.rL64WQ.rst
 create mode 100644 Tools/scripts/checkhtmllinks.py

diff --git a/Misc/ACKS b/Misc/ACKS
index c3a4f9b9dededd..b873cdc587b0b7 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -806,6 +806,7 @@ Oleg Höfling
 Robert Hölzl
 Stefan Hölzl
 Catalin Iacob
+Oleg Iarygin
 Mihai Ibanescu
 Ali Ikinci
 Aaron Iles
diff --git a/Misc/NEWS.d/next/Tools-Demos/2022-06-14-10-15-17.gh-issue-93851.rL64WQ.rst b/Misc/NEWS.d/next/Tools-Demos/2022-06-14-10-15-17.gh-issue-93851.rL64WQ.rst
new file mode 100644
index 00000000000000..d106485ad646ea
--- /dev/null
+++ b/Misc/NEWS.d/next/Tools-Demos/2022-06-14-10-15-17.gh-issue-93851.rL64WQ.rst
@@ -0,0 +1,4 @@
+Added :source:`Tools/scripts/checkhtmllinks.py` for on-demand searching for
+dead and redirected (potentially soon-to-be-dead; use ``-r`` to allow them)
+links. Note that a full run through the whole rendered documentation takes
+more than an hour.
diff --git a/Tools/scripts/README b/Tools/scripts/README
index c1d66731ba6495..e6fe2c7ab3f92d 100644
--- a/Tools/scripts/README
+++ b/Tools/scripts/README
@@ -7,6 +7,7 @@ abitype.py            Converts a C file to use the PEP 384 type definition API
 analyze_dxp.py        Analyzes the result of sys.getdxp()
 byext.py              Print lines/words/chars stats of files by extension
 byteyears.py          Print product of a file's size and age
+checkhtmllinks.py     Check if specified HTML files have dead or redirected links
 cleanfuture.py        Fix redundant Python __future__ statements
 combinerefs.py        A helper for analyzing PYTHONDUMPREFS output
 copytime.py           Copy one file's atime and mtime to another
diff --git a/Tools/scripts/checkhtmllinks.py b/Tools/scripts/checkhtmllinks.py
new file mode 100644
index 00000000000000..f3beafc9330605
--- /dev/null
+++ b/Tools/scripts/checkhtmllinks.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+
+"""Check if specified HTML files have dead and redirected (to-be-dead) links.
+
+Call this script on HTML files of the rendered documentation.
+
+A full run through the whole rendered documentation takes about thirty minutes.
+
+Copyright © 2022 by Oleg Iarygin
+Licensed to PSF under a Contributor Agreement.
+"""
+
+import sys
+from argparse import ArgumentParser
+from codecs import decode
+from concurrent.futures import ThreadPoolExecutor
+from enum import Enum, auto
+from functools import cache
+from html.parser import HTMLParser
+from http import HTTPStatus
+from io import StringIO, TextIOBase
+from itertools import count
+from pathlib import Path
+from time import sleep
+from urllib.error import HTTPError
+from urllib.parse import urldefrag, urljoin, urlsplit
+from urllib.request import HTTPRedirectHandler, Request, build_opener
+from urllib.response import addinfourl
+
+
+def get_attribute(attribute_name: str, container: list[tuple[str, str]]):
+    """Scan a list of (name, value) tuples collecting requested values."""
+    return (value for name, value in container if name == attribute_name)
+
+
+class LinkAnalyzer(HTMLParser):
+    """Scanner for hyperlink referers and targets.
+
+    A referer is the href of an <a> element. A target is an id or name
+    attribute of any HTML element.
+    """
+    def __init__(self):
+        super().__init__()
+        self.targets = set()
+        self.referers = set()
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'a':
+            self.referers.update(get_attribute('href', attrs))
+            self.targets.update(get_attribute('name', attrs))
+            self.targets.update(get_attribute('id', attrs))
+
+
+class _NoRedirectHandler(HTTPRedirectHandler):
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        raise HTTPError(req.full_url, code, msg, headers, fp)
+
+
+UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
+      + 'like Gecko) Chrome/102.0.5005.63 Safari/537.36')
+
+
+@cache
+def get_link_targets_and_referers(url: str, allow_redirects: bool):
+    """Load a page and extract outgoing hyperlinks and potential entry points.
+
+    To speed scanning up, the result is cached (memoized) so other pages
+    linking to the same URL are processed instantly.
+
+    Returns:
+    - None if the target cannot be accessed.
+    - False if a redirection is encountered and allow_redirects is False.
+    - a tuple of targets (hyperlink landing points) and referers (links).
+    """
+    try:
+        # Sites love to return HTTP 403 Forbidden to unknown user agents
+        # (crawlers, rare browsers, etc.) so we have no choice but to pretend
+        # to be Chrome.
+        request = Request(url, headers={'User-Agent': UA})
+
+        opener = build_opener(
+            HTTPRedirectHandler if allow_redirects else _NoRedirectHandler,
+        )
+        page = opener.open(request)
+
+    except HTTPError as e:
+        if e.code == HTTPStatus.TOO_MANY_REQUESTS:
+            sleep(2)
+            return get_link_targets_and_referers(url, allow_redirects)
+        elif 300 <= e.code <= 399:
+            return False
+        return None
+
+    except Exception:
+        # urllib, HTTP client, whatever can raise any network-related error.
+        return None
+
+    with page:
+        analyzer = LinkAnalyzer()
+        try:
+            page_source = decode(page.read())
+            analyzer.feed(page_source)
+
+        except Exception:
+            # Whatever the document is, it cannot have link anchors.
+            analyzer.targets = set()
+
+    return analyzer.targets, analyzer.referers
+
+
+def get_absolute_url(base_path: Path, url: str):
+    return urljoin(base_path.resolve().as_uri(), url.strip())
+
+
+def analyze_page(base_path: Path, url: str, allow_redirects: bool):
+    url_to_check = get_absolute_url(base_path, url)
+    # Improve cache reuse by dropping URL parts not affecting the content
+    direct_url_to_check, _ = urldefrag(url_to_check)
+    return get_link_targets_and_referers(direct_url_to_check, allow_redirects)
+
+
+class LinkStatus(Enum):
+    SKIPPED = auto()
+    GOOD = auto()
+    BAD_REFERRER = auto()
+    BAD_TARGET = auto()
+    REDIRECTED_TARGET = auto()
+
+
+def skip_unsupported(target: str, target_parts) -> tuple[LinkStatus, str] | None:
+    # For now, skip absolute paths considering them always available.
+    # Otherwise we would need to write a manual URL resolver for file://.
+    if target_parts.path[:1] == '/' and not target_parts.netloc:
+        print(f'  skipped {target} (absolute links are not supported yet)')
+        return LinkStatus.SKIPPED, target
+
+    # Skip mailto links; we cannot verify them anyway
+    if target_parts.scheme == 'mailto':
+        return LinkStatus.SKIPPED, target
+
+    return None
+
+
+def determine_target_status(target_page, target_parts) -> LinkStatus:
+    if target_page is False:
+        return LinkStatus.REDIRECTED_TARGET
+
+    elif target_page is None:
+        return LinkStatus.BAD_TARGET
+
+    elif target_parts.fragment:
+        targets, _ = target_page
+        is_correct_link = target_parts.fragment in targets
+        return LinkStatus.GOOD if is_correct_link else LinkStatus.BAD_TARGET
+
+    return LinkStatus.GOOD
+
+
+def test_file(filename, allow_redirects, limit, print_intermediate_report):
+    links = analyze_page(filename, '', allow_redirects)
+    if not links:
+        print('  not found')
+        return LinkStatus.BAD_REFERRER
+
+    targets, referers = links
+    if limit is not None and len(referers) > limit:
+        print(f'skipped; {len(referers)} links exceed the --limit threshold')
+        return []
+
+    def test_target(map_args):
+        iteration_id, target = map_args
+        print(f'[{iteration_id}/{len(referers)}] link to {target}...')
+        target_parts = urlsplit(target)
+
+        early_exit_status = skip_unsupported(target, target_parts)
+        if early_exit_status:
+            return early_exit_status
+
+        target_page = analyze_page(filename, target, allow_redirects)
+        status = determine_target_status(target_page, target_parts)
+        print_intermediate_report(status, target)
+        return status, target
+
+    with ThreadPoolExecutor() as pool:
+        return pool.map(test_target, zip(count(1), referers))
+
+
+def check_if_error(status: tuple[LinkStatus, str]):
+    return status[0] not in {LinkStatus.SKIPPED, LinkStatus.GOOD}
+
+
+###################################################
+#
+# CUI (Console User Interface)
+#
+###################################################
+
+def print_title(text: str, printer: TextIOBase = sys.stdout) -> None:
+    border = '=' * len(text)
+    print('', border, text, border, '', sep='\n', file=printer)
+
+
+def match_report_message(issue: LinkStatus, target: str, printer: TextIOBase):
+    match issue:
+        case LinkStatus.BAD_REFERRER:
+            print('  file not found', file=printer)
+            return True
+
+        case LinkStatus.BAD_TARGET:
+            fragment = urlsplit(target).fragment
+            entity = f'#{fragment}' if fragment else 'the file'
+            print(
+                f'  broken {target} link; check if {entity} exists',
+                file=printer,
+            )
+            return True
+
+        case LinkStatus.REDIRECTED_TARGET:
+            print(
+                f'  redirected {target} link; it increases loading time',
+                file=printer,
+            )
+            return True
+
+        case _:
+            return False
+
+
+def get_end_report(log: list[tuple[Path, tuple[LinkStatus, str]]]) -> str | None:
+    # print() is a wrapper around a syscall, and syscalls are slow.
+    # So we batch all reports first so the caller does a single call.
+    # Reallocation should not be a problem because the expected number of
+    # reported errors is minuscule.
+    non_empty = False
+    with StringIO() as printer:
+        print_title('Final report on problems', printer=printer)
+        for file, broken_links in log:
+            print(file, file=printer)
+            for issue, target in broken_links:
+                non_empty |= match_report_message(issue, target, printer)
+
+        return printer.getvalue() if non_empty else None
+
+
+intermediate_reports = {
+    LinkStatus.SKIPPED: 'skipped',
+    LinkStatus.BAD_REFERRER: 'failed',
+    LinkStatus.BAD_TARGET: 'failed',
+    LinkStatus.REDIRECTED_TARGET: 'redirected',
+}
+
+
+def print_intermediate_report(status: LinkStatus, target: str) -> None:
+    status_text = intermediate_reports.get(status)
+    if status_text:
+        print(f'  {status_text} {target}')
+
+
+def main(options) -> None:
+    print('collecting filenames to check...', flush=True)
+    all_errors = []
+    try:
+        input_files = list(Path('.').glob(options.path))
+        file_count = len(input_files)
+        for file_id, file_path in zip(count(1), input_files):
+            print_title(f'[{file_id}/{file_count}] {file_path}')
+
+            target_results = test_file(
+                file_path,
+                options.allow_redirects,
+                options.limit,
+                print_intermediate_report,
+            )
+            errors = [
+                status for status in target_results if check_if_error(status)
+            ]
+            if errors:
+                all_errors.append((file_path, errors))
+
+    except KeyboardInterrupt:
+        print('\naborted')
+
+    readable_report = get_end_report(all_errors)
+    if readable_report:
+        sys.exit(readable_report)
+
+
+HELP_PROLOG = 'Check if specified HTML files have dead or redirected links'
+
+
+HELP_EPILOG = """
+Call this script on HTML files of the rendered documentation.
+
+Even though the script is multithreaded and caches the findings for already
+processed pages, a full run through the whole rendered documentation takes more
+than an hour.
+"""
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description=HELP_PROLOG, epilog=HELP_EPILOG)
+    parser.add_argument('path', help='A glob pattern of file paths to scan')
+    parser.add_argument(
+        '-r',
+        '--allow-redirects',
+        action='store_true',
+        help='Do not report HTTP 3xx links as kind-of-broken',
+    )
+    parser.add_argument(
+        '-l',
+        '--limit',
+        type=int,
+        help='Skip files that contain more hyperlinks than specified',
+    )
+    main(parser.parse_args())

From a547a7397316489c189c95936e3927cf197e68f6 Mon Sep 17 00:00:00 2001
From: Oleg Iarygin
Date: Tue, 14 Jun 2022 10:18:05 +0300
Subject: [PATCH 2/6] Clarify runtime estimations

---
 Tools/scripts/checkhtmllinks.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Tools/scripts/checkhtmllinks.py b/Tools/scripts/checkhtmllinks.py
index f3beafc9330605..efadc75828f09f 100644
--- a/Tools/scripts/checkhtmllinks.py
+++ b/Tools/scripts/checkhtmllinks.py
@@ -4,7 +4,7 @@
 Call this script on HTML files of the rendered documentation.
 
-A full run through the whole rendered documentation takes about thirty minutes.
+A full run through the whole rendered documentation takes about an hour.
 
 Copyright © 2022 by Oleg Iarygin
 Licensed to PSF under a Contributor Agreement.
 """
@@ -293,8 +293,8 @@ def main(options) -> None:
 Call this script on HTML files of the rendered documentation.
 
 Even though the script is multithreaded and caches the findings for already
-processed pages, a full run through the whole rendered documentation takes more
-than an hour.
+processed pages, a full run through the whole rendered documentation takes
+about an hour.
""" From 5a9e8d86aaab522114d6e76f15a205ab998695f2 Mon Sep 17 00:00:00 2001 From: Oleg Iarygin Date: Wed, 15 Jun 2022 16:01:15 +0300 Subject: [PATCH 3/6] Hide an unimportant abstraction --- Tools/scripts/checkhtmllinks.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Tools/scripts/checkhtmllinks.py b/Tools/scripts/checkhtmllinks.py index efadc75828f09f..8f5269442f9b84 100644 --- a/Tools/scripts/checkhtmllinks.py +++ b/Tools/scripts/checkhtmllinks.py @@ -28,11 +28,6 @@ from urllib.response import addinfourl -def get_attribute(attribute_name: str, container: list[tuple[str, str]]): - """Scan a list of (name, value) tuples collecting requested values.""" - return (value for name, value in container if name == attribute_name) - - class LinkAnalyzer(HTMLParser): """Scanner for hyperlink referers and targets. @@ -44,11 +39,15 @@ def __init__(self): self.targets = set() self.referers = set() + @staticmethod + def _get_attribute(attribute_name: str, container: list[tuple[str, str]]): + return (value for name, value in container if name == attribute_name) + def handle_starttag(self, tag, attrs): if tag == 'a': - self.referers.update(get_attribute('href', attrs)) - self.targets.update(get_attribute('name', attrs)) - self.targets.update(get_attribute('id', attrs)) + self.referers.update(self._get_attribute('href', attrs)) + self.targets.update(self._get_attribute('name', attrs)) + self.targets.update(self._get_attribute('id', attrs)) class _NoRedirectHandler(HTTPRedirectHandler): From c57e2643078265b01e1bb9d6786071780e7953e7 Mon Sep 17 00:00:00 2001 From: Oleg Iarygin Date: Wed, 15 Jun 2022 16:04:01 +0300 Subject: [PATCH 4/6] Use more relevant abstractions --- Tools/scripts/checkhtmllinks.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Tools/scripts/checkhtmllinks.py b/Tools/scripts/checkhtmllinks.py index 8f5269442f9b84..8493ed086ad4c7 100644 --- a/Tools/scripts/checkhtmllinks.py +++ b/Tools/scripts/checkhtmllinks.py @@ -40,14 +40,14 @@ def __init__(self): self.referers = set() @staticmethod - def _get_attribute(attribute_name: str, container: list[tuple[str, str]]): + def _get_tag_attr(attribute_name: str, container: list[tuple[str, str]]): return (value for name, value in container if name == attribute_name) def handle_starttag(self, tag, attrs): if tag == 'a': - self.referers.update(self._get_attribute('href', attrs)) - self.targets.update(self._get_attribute('name', attrs)) - self.targets.update(self._get_attribute('id', attrs)) + self.referers.update(self._get_tag_attr('href', attrs)) + self.targets.update(self._get_tag_attr('name', attrs)) + self.targets.update(self._get_tag_attr('id', attrs)) class _NoRedirectHandler(HTTPRedirectHandler): @@ -56,6 +56,8 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): raise HTTPError(req.full_url, code, msg, headers, fp) +# Sites love to return HTTP 403 Forbidden to unknown user agents (crawlers, +# rare browsers, etc.) so we have no choise but to pretend to be Chrome. UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ' + 'like Gecko) Chrome/102.0.5005.63 Safari/537.36') @@ -73,11 +75,7 @@ def get_link_targets_and_referers(url: str, allow_redirects: bool): - a tuple of targets (hyperlink lannding points) and referers (links). """ try: - # Sites love to return HTTP 403 Forbidden to unknown user agents - # (crawlers, rare browsers, etc.) so we have no choise but to pretend - # to be Chrome. 
         request = Request(url, headers={'User-Agent': UA})
-
         opener = build_opener(
             HTTPRedirectHandler if allow_redirects else _NoRedirectHandler,
         )
         page = opener.open(request)

From 9c4ab0ff139d6d254a40aff2cebbb12d850371f9 Mon Sep 17 00:00:00 2001
From: Oleg Iarygin
Date: Wed, 15 Jun 2022 16:30:24 +0300
Subject: [PATCH 5/6] Use the same capitalization as in `-h` cmdline key

---
 Tools/scripts/checkhtmllinks.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Tools/scripts/checkhtmllinks.py b/Tools/scripts/checkhtmllinks.py
index 8493ed086ad4c7..3d9fd6ebab8a30 100644
--- a/Tools/scripts/checkhtmllinks.py
+++ b/Tools/scripts/checkhtmllinks.py
@@ -283,31 +283,31 @@ def main(options) -> None:
     sys.exit(readable_report)
 
 
-HELP_PROLOG = 'Check if specified HTML files have dead or redirected links'
+HELP_PROLOG = 'Check if specified HTML files have dead or redirected links.'
 
 
 HELP_EPILOG = """
 Call this script on HTML files of the rendered documentation.
 
-Even though the script is multithreaded and caches the findings for already
-processed pages, a full run through the whole rendered documentation takes
+Even though the script is multithreaded and findings for already processed
+pages are cached, a full run through the whole rendered documentation takes
 about an hour.
 """
 
 
 if __name__ == '__main__':
     parser = ArgumentParser(description=HELP_PROLOG, epilog=HELP_EPILOG)
-    parser.add_argument('path', help='A glob pattern of file paths to scan')
+    parser.add_argument('path', help='a glob pattern of file paths to scan')
     parser.add_argument(
         '-r',
         '--allow-redirects',
         action='store_true',
-        help='Do not report HTTP 3xx links as kind-of-broken',
+        help='do not report HTTP 3xx links as kind-of-broken',
     )
     parser.add_argument(
         '-l',
         '--limit',
         type=int,
-        help='Skip files that contain more hyperlinks than specified',
+        help='skip files that contain more links than specified',
     )
     main(parser.parse_args())

From af53adaec9abd25c3dcb54a38a446cf5fd5a1e07 Mon Sep 17 00:00:00 2001
From: Oleg Iarygin
Date: Thu, 4 Aug 2022 11:46:30 +0300
Subject: [PATCH 6/6] Fix test_file() so it always returns a list

---
 Tools/scripts/checkhtmllinks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/scripts/checkhtmllinks.py b/Tools/scripts/checkhtmllinks.py
index 3d9fd6ebab8a30..88b7cb8e5c1d5c 100644
--- a/Tools/scripts/checkhtmllinks.py
+++ b/Tools/scripts/checkhtmllinks.py
@@ -158,7 +158,7 @@ def test_file(filename, allow_redirects, limit, print_intermediate_report):
     links = analyze_page(filename, '', allow_redirects)
     if not links:
         print('  not found')
-        return LinkStatus.BAD_REFERRER
+        return [(LinkStatus.BAD_REFERRER, None)]
 
     targets, referers = links
    if limit is not None and len(referers) > limit:
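
For context: the core technique the series relies on is an opener whose
redirect handler raises instead of following, so HTTP 3xx responses surface
as HTTPError and can be told apart from both healthy pages and hard failures.
Below is a minimal standalone sketch of that idea using the same urllib calls
as the patch; the probe() helper and the example URL are illustrative
placeholders, not code from the series.

#!/usr/bin/env python3
"""Minimal sketch of the no-redirect probe technique (illustrative only)."""
from urllib.error import HTTPError
from urllib.request import HTTPRedirectHandler, Request, build_opener


class NoRedirectHandler(HTTPRedirectHandler):
    """Turn every redirect into an HTTPError so the caller can classify it."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        raise HTTPError(req.full_url, code, msg, headers, fp)


def probe(url: str) -> str:
    """Classify a link as 'good', 'redirected', or 'dead'."""
    opener = build_opener(NoRedirectHandler)
    try:
        # The response body is discarded; only the status matters here.
        with opener.open(Request(url)):
            return 'good'
    except HTTPError as e:
        # 3xx comes from the handler above; 4xx/5xx mean a dead link.
        return 'redirected' if 300 <= e.code <= 399 else 'dead'
    except OSError:
        # DNS failures, refused connections, etc. also count as dead.
        return 'dead'


if __name__ == '__main__':
    # Placeholder URL; substitute any link you want to classify.
    print(probe('https://example.com/'))

A typical invocation of the full script would be along the lines of
python Tools/scripts/checkhtmllinks.py 'Doc/build/html/**/*.html' -r -l 500
where the glob pattern and the limit are examples, not recommended values.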