class TestCheckLinks(TestCase):
    """
    Tests for `linkcheck.utils.check_links` and `concurrent_check_links`.

    Uses requests_mock to stub external HEAD requests so no network traffic
    is generated during the run.
    """

    def _setup_mock_urls(self, mocker):
        """
        Set up common mock URLs for link checking tests.

        Registers three mocked external URLs (200, 404, connection error) and
        creates a fourth Url that was checked recently — within the default
        external recheck interval — so the checkers must skip it.
        Returns the four URL strings in that order.
        """
        good_url = 'https://example.com/good'
        mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK')
        Url.objects.create(url=good_url)

        bad_url = 'https://example.com/bad'
        mocker.register_uri('HEAD', bad_url, status_code=HTTPStatus.NOT_FOUND, reason='NOT FOUND')
        Url.objects.create(url=bad_url)

        exception_url = 'https://example.com/exception'
        mocker.register_uri('HEAD', exception_url, exc=ConnectionError("Something went wrong"))
        Url.objects.create(url=exception_url)

        recently_checked_url = 'https://example.com/recent'
        # Shouldn't be requested
        Url.objects.create(url=recently_checked_url, status=None, last_checked=timezone.now() - timedelta(days=1))

        return (good_url, bad_url, exception_url, recently_checked_url)

    @requests_mock.Mocker()
    def test_check_links(self, mocker):
        # Three checkable URLs are checked; the recently-checked one is
        # skipped and keeps its None status.
        good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker)

        self.assertEqual(check_links(), 3)
        self.assertEqual(Url.objects.get(url=good_url).status, True)
        self.assertEqual(Url.objects.get(url=bad_url).status, False)
        self.assertEqual(Url.objects.get(url=exception_url).status, False)
        self.assertEqual(Url.objects.get(url=recently_checked_url).status, None)

    @requests_mock.Mocker()
    def test_concurrent_check_links(self, mocker):
        self._setup_mock_urls(mocker)

        # Since the tests are running in sqlite, we can't insert data via our threaded code
        # there's enough other test coverage that we can use `Url.save` as a proxy
        with patch.object(Url, "save") as patched_save:
            self.assertEqual(concurrent_check_links(), 3)
            self.assertEqual(patched_save.call_count, 3)

    def test_concurrent_check_links_error_handling(self):
        # A URL whose check raises must be logged (with its exception type and
        # message) and counted as not checked, without aborting the run.
        Url.objects.create(url='https://example.com/good')
        with (
            patch("linkcheck.utils.logger.exception") as patched_logged_exception,
            patch.object(Url, "check_external", side_effect=ValueError("oops")),
        ):
            self.assertEqual(concurrent_check_links(), 0)
            self.assertEqual(patched_logged_exception.call_count, 1)
            msg, *args = patched_logged_exception.call_args[0]
            self.assertEqual(msg % tuple(args), "ValueError while checking https://example.com/good: oops")
def concurrent_check_links(
    external_recheck_interval=10080,
    limit=-1,
    check_internal=True,
    check_external=True,
    max_workers=20,
):
    """
    Return the number of links effectively checked.

    A concurrent version of `check_links`. It should be faster than `check_links`, but
    be aware that if you have multiple links to the same domain, you risk triggering
    some attack detection on the target server, hence this concurrent version is best used
    for links from all different domains or internal links.

    Args:
        external_recheck_interval: Minutes before rechecking external links
        limit: Maximum number of URLs to check (-1 for unlimited)
        check_internal: Whether to check internal links
        check_external: Whether to check external links
        max_workers: Maximum number of concurrent threads
    """

    urls = Url.objects.all()

    # An optimization for when check_internal is False
    if not check_internal:
        recheck_datetime = timezone.now() - timedelta(minutes=external_recheck_interval)
        urls = urls.exclude(last_checked__gt=recheck_datetime)

    url_list = list(urls[:limit] if limit > 0 else urls)

    if not url_list:
        return 0

    def check_single_url(url_obj):
        """Check a single URL and return 1 if checked, 0 if not."""
        try:
            status = url_obj.check_url(check_internal=check_internal, check_external=check_external)
        except Exception as e:
            # One failing URL must not abort the whole run; log with traceback.
            logger.exception(
                "%s while checking %s: %s",
                type(e).__name__,
                url_obj.url,
                e
            )
            return 0
        # check_url returns None when the URL was skipped (e.g. rechecked too recently).
        return 1 if status is not None else 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(check_single_url, url) for url in url_list]
        # as_completed is consumed on this (single) thread only, so summing the
        # results needs no lock; the original lock-protected counter was redundant,
        # and the future->url mapping it built was never read.
        return sum(future.result() for future in as_completed(futures))