diff --git a/README.rst b/README.rst index 4b9163f..7e686ec 100644 --- a/README.rst +++ b/README.rst @@ -33,7 +33,9 @@ Basic usage ----------- #. Install app to somewhere on your Python path (e.g. ``pip install - django-linkcheck``). + django-linkcheck``). If you do not need multilingual support, you can skip + the compilation of the translation files with an environment variable, e.g. + (``LINKCHECK_SKIP_TRANSLATIONS=true pip install django-linkcheck``). #. Add ``'linkcheck'`` to your ``settings.INSTALLED_APPS``. diff --git a/linkcheck/build_meta.py b/linkcheck/build_meta.py index 435f55d..a75ee56 100644 --- a/linkcheck/build_meta.py +++ b/linkcheck/build_meta.py @@ -1,3 +1,4 @@ +import os import subprocess from setuptools import build_meta as default @@ -5,17 +6,29 @@ def compile_translation_files(): - print("Compile translation files") + print("Compiling translation files...") subprocess.run(["django-admin", "compilemessages"], cwd="linkcheck") +def should_compile_translation_files(): + skip_translations = os.environ.get("LINKCHECK_SKIP_TRANSLATIONS") + if skip_translations and skip_translations.lower() in ("1", "true", "yes", "t", "y"): + return False + + return True + + def build_sdist(sdist_directory, config_settings=None): - compile_translation_files() + if should_compile_translation_files(): + compile_translation_files() + return default.build_sdist(sdist_directory, config_settings) def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): - compile_translation_files() + if should_compile_translation_files(): + compile_translation_files() + return default.build_wheel( wheel_directory, config_settings=config_settings, diff --git a/linkcheck/linkcheck_settings.py b/linkcheck/linkcheck_settings.py index 566be34..617aef4 100644 --- a/linkcheck/linkcheck_settings.py +++ b/linkcheck/linkcheck_settings.py @@ -59,3 +59,5 @@ SITE_DOMAINS = getattr(settings, 'LINKCHECK_SITE_DOMAINS', []) DISABLE_LISTENERS = getattr(settings, 'LINKCHECK_DISABLE_LISTENERS', False) TOLERATE_BROKEN_ANCHOR = getattr(settings, 'LINKCHECK_TOLERATE_BROKEN_ANCHOR', True) +PROXIES = getattr(settings, 'LINKCHECK_PROXIES', {}) +TRUST_PROXY_SSL = getattr(settings, 'LINKCHECK_TRUST_PROXY_SSL', False) diff --git a/linkcheck/listeners.py b/linkcheck/listeners.py index f82784f..0642aeb 100644 --- a/linkcheck/listeners.py +++ b/linkcheck/listeners.py @@ -132,7 +132,7 @@ def delete_instance_links(sender, instance, **kwargs): def instance_pre_save(sender, instance, raw=False, **kwargs): - if not instance.pk or raw: + if instance._state.adding or not instance.pk or raw: # Ignore unsaved instances or raw imports return current_url = instance.get_absolute_url() diff --git a/linkcheck/models.py b/linkcheck/models.py index 536fb50..d7c189f 100644 --- a/linkcheck/models.py +++ b/linkcheck/models.py @@ -31,8 +31,10 @@ LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT, MAX_URL_LENGTH, MEDIA_PREFIX, + PROXIES, SITE_DOMAINS, TOLERATE_BROKEN_ANCHOR, + TRUST_PROXY_SSL, ) logger = logging.getLogger(__name__) @@ -386,6 +388,10 @@ def check_external(self, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL): "timeout": LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT, "verify": True, } + if PROXIES: + request_params["verify"] = not TRUST_PROXY_SSL + request_params["proxies"] = PROXIES + try: try: # At first try a HEAD request diff --git a/linkcheck/templates/linkcheck/report.html b/linkcheck/templates/linkcheck/report.html index a65d577..1734170 100644 --- a/linkcheck/templates/linkcheck/report.html +++ b/linkcheck/templates/linkcheck/report.html @@ -171,7 +171,7 @@

{{report_type}} in '{{object.obj {% if link.url.redirect_to %} - R{% translate "Redirects to" %}: {{ link.url.redirect_to }} + {% translate "Redirects to" %}: {{ link.url.redirect_to }} {% endif %} {% endfor %} diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index 8a3f1e1..4d2f95e 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -1,7 +1,8 @@ import os -from datetime import datetime, timedelta +from datetime import timedelta +from http import HTTPStatus from io import StringIO -from unittest.mock import patch +from unittest.mock import Mock, patch import requests_mock import urllib3 @@ -13,6 +14,7 @@ from django.test import LiveServerTestCase, TestCase from django.test.utils import override_settings from django.urls import reverse +from django.utils import timezone from requests.exceptions import ConnectionError from linkcheck.linkcheck_settings import MAX_URL_LENGTH @@ -25,6 +27,7 @@ unregister_listeners, ) from linkcheck.models import Link, Url +from linkcheck.utils import check_links, concurrent_check_links from linkcheck.views import get_jquery_min_js from .sampleapp.models import Author, Book, Journal, Page @@ -672,6 +675,31 @@ def test_external_check_blocked_user_agent_blocked_head(self): self.assertEqual(uv.redirect_to, '') self.assertEqual(uv.type, 'external') + @patch( + 'linkcheck.models.PROXIES', + {'http': 'http://proxy.example.com:8080'}, + ) + @patch('requests.head') + def test_external_proxy_request(self, mock_head): + mock_response = Mock() + mock_response.status_code = 200 + mock_response.reason = 'OK' + mock_response.history = [] + mock_head.return_value = mock_response + request_url = 'http://test.com' + uv = Url(url=request_url) + uv.check_url() + self.assertEqual(uv.status, True) + self.assertEqual(uv.message, '200 OK') + self.assertEqual(uv.type, 'external') + mock_head.assert_called_once() + (call_url,), call_kwargs = mock_head.call_args + self.assertEqual(call_url, request_url) + self.assertEqual( + call_kwargs.get('proxies'), + {'http': 'http://proxy.example.com:8080'}, + ) + def test_external_check_timedout(self): uv = Url(url=f"{self.live_server_url}/timeout/") uv.check_url() @@ -870,7 +898,7 @@ def test_checklinks_command(self): "1 internal URLs and 0 external URLs have been checked.\n" ) - yesterday = datetime.now() - timedelta(days=1) + yesterday = timezone.now() - timedelta(days=1) Url.objects.all().update(last_checked=yesterday) out = StringIO() call_command('checklinks', externalinterval=20, stdout=out) @@ -1203,6 +1231,62 @@ def test_filter_callable(self): ) +class TestCheckLinks(TestCase): + + def _setup_mock_urls(self, mocker): + """ + Set up common mock URLs for link checking tests. + """ + good_url = 'https://example.com/good' + mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK') + Url.objects.create(url=good_url) + + bad_url = 'https://example.com/bad' + mocker.register_uri('HEAD', bad_url, status_code=HTTPStatus.NOT_FOUND, reason='NOT FOUND') + Url.objects.create(url=bad_url) + + exception_url = 'https://example.com/exception' + mocker.register_uri('HEAD', exception_url, exc=ConnectionError("Something went wrong")) + Url.objects.create(url=exception_url) + + recently_checked_url = 'https://example.com/recent' + # Shouldn't be requested + Url.objects.create(url=recently_checked_url, status=None, last_checked=timezone.now() - timedelta(days=1)) + + return (good_url, bad_url, exception_url, recently_checked_url) + + @requests_mock.Mocker() + def test_check_links(self, mocker): + good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker) + + self.assertEqual(check_links(), 3) + self.assertEqual(Url.objects.get(url=good_url).status, True) + self.assertEqual(Url.objects.get(url=bad_url).status, False) + self.assertEqual(Url.objects.get(url=exception_url).status, False) + self.assertEqual(Url.objects.get(url=recently_checked_url).status, None) + + @requests_mock.Mocker() + def test_concurrent_check_links(self, mocker): + self._setup_mock_urls(mocker) + + # Since the tests are running in sqlite, we can't insert data via our threaded code + # there's enough other test coverage that we can use `Url.save` as a proxy + with patch.object(Url, "save") as patched_save: + self.assertEqual(concurrent_check_links(), 3) + self.assertEqual(patched_save.call_count, 3) + + def test_concurrent_check_links_error_handling(self): + Url.objects.create(url='https://example.com/good') + with ( + patch("linkcheck.utils.logger.exception") as patched_logged_exception, + patch.object(Url, "check_external", side_effect=ValueError("oops")), + ): + self.assertEqual(concurrent_check_links(), 0) + self.assertEqual(patched_logged_exception.call_count, 1) + msg, *args = patched_logged_exception.call_args[0] + self.assertEqual(msg % tuple(args), "ValueError while checking https://example.com/good: oops") + + def get_command_output(command, *args, **kwargs): """ Helper function for running a management command and checking its output diff --git a/linkcheck/utils.py b/linkcheck/utils.py index 0b4e6d7..ba00e90 100644 --- a/linkcheck/utils.py +++ b/linkcheck/utils.py @@ -1,4 +1,6 @@ import logging +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import timedelta from django.apps import apps @@ -120,6 +122,70 @@ def check_links(external_recheck_interval=10080, limit=-1, check_internal=True, return check_count +def concurrent_check_links( + external_recheck_interval=10080, + limit=-1, + check_internal=True, + check_external=True, + max_workers=20, +): + """ + Return the number of links effectively checked. + A concurrent version of `check_links` + + Args: + external_recheck_interval: Minutes before rechecking external links + limit: Maximum number of URLs to check (-1 for unlimited) + check_internal: Whether to check internal links + check_external: Whether to check external links + max_workers: Maximum number of concurrent threads + """ + + urls = Url.objects.all() + + # An optimization for when check_internal is False + if not check_internal: + recheck_datetime = timezone.now() - timedelta(minutes=external_recheck_interval) + urls = urls.exclude(last_checked__gt=recheck_datetime) + + url_list = list(urls[:limit] if limit > 0 else urls) + + if not url_list: + return 0 + + # Thread-safe counter + check_count = 0 + count_lock = threading.Lock() + + def check_single_url(url_obj): + """Check a single URL and return 1 if checked, 0 if not""" + try: + status = url_obj.check_url(check_internal=check_internal, check_external=check_external) + return 1 if status is not None else 0 + except Exception as e: + logger.exception( + "%s while checking %s: %s", + type(e).__name__, + url_obj.url, + e + ) + return 0 + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_url = { + executor.submit(check_single_url, url): url + for url in url_list + } + # Process completed futures + for future in as_completed(future_to_url): + result = future.result() + with count_lock: + check_count += result + + return check_count + + def update_urls(urls, content_type, object_id): # Structure of urls param is [(field, link text, url), ... ] @@ -183,7 +249,7 @@ def find_all_links(linklists=None): linklists = linklist_cls().get_linklist() for linklist in linklists: - object_id = linklist["object"].id + object_id = linklist["object"].pk urls = linklist["urls"] + linklist["images"] if urls: new = update_urls(urls, content_type, object_id)