diff --git a/README.rst b/README.rst
index 4b9163f..7e686ec 100644
--- a/README.rst
+++ b/README.rst
@@ -33,7 +33,9 @@ Basic usage
-----------
#. Install app to somewhere on your Python path (e.g. ``pip install
- django-linkcheck``).
+ django-linkcheck``). If you do not need multilingual support, you can skip
+ the compilation of the translation files with an environment variable, e.g.
+ (``LINKCHECK_SKIP_TRANSLATIONS=true pip install django-linkcheck``).
#. Add ``'linkcheck'`` to your ``settings.INSTALLED_APPS``.
diff --git a/linkcheck/build_meta.py b/linkcheck/build_meta.py
index 435f55d..a75ee56 100644
--- a/linkcheck/build_meta.py
+++ b/linkcheck/build_meta.py
@@ -1,3 +1,4 @@
+import os
import subprocess
from setuptools import build_meta as default
@@ -5,17 +6,29 @@
def compile_translation_files():
- print("Compile translation files")
+ print("Compiling translation files...")
subprocess.run(["django-admin", "compilemessages"], cwd="linkcheck")
+def should_compile_translation_files():
+ skip_translations = os.environ.get("LINKCHECK_SKIP_TRANSLATIONS")
+ if skip_translations and skip_translations.lower() in ("1", "true", "yes", "t", "y"):
+ return False
+
+ return True
+
+
def build_sdist(sdist_directory, config_settings=None):
- compile_translation_files()
+ if should_compile_translation_files():
+ compile_translation_files()
+
return default.build_sdist(sdist_directory, config_settings)
def build_wheel(wheel_directory, config_settings=None, metadata_directory=None):
- compile_translation_files()
+ if should_compile_translation_files():
+ compile_translation_files()
+
return default.build_wheel(
wheel_directory,
config_settings=config_settings,
diff --git a/linkcheck/linkcheck_settings.py b/linkcheck/linkcheck_settings.py
index 566be34..617aef4 100644
--- a/linkcheck/linkcheck_settings.py
+++ b/linkcheck/linkcheck_settings.py
@@ -59,3 +59,5 @@
SITE_DOMAINS = getattr(settings, 'LINKCHECK_SITE_DOMAINS', [])
DISABLE_LISTENERS = getattr(settings, 'LINKCHECK_DISABLE_LISTENERS', False)
TOLERATE_BROKEN_ANCHOR = getattr(settings, 'LINKCHECK_TOLERATE_BROKEN_ANCHOR', True)
+PROXIES = getattr(settings, 'LINKCHECK_PROXIES', {})
+TRUST_PROXY_SSL = getattr(settings, 'LINKCHECK_TRUST_PROXY_SSL', False)
diff --git a/linkcheck/listeners.py b/linkcheck/listeners.py
index f82784f..0642aeb 100644
--- a/linkcheck/listeners.py
+++ b/linkcheck/listeners.py
@@ -132,7 +132,7 @@ def delete_instance_links(sender, instance, **kwargs):
def instance_pre_save(sender, instance, raw=False, **kwargs):
- if not instance.pk or raw:
+ if instance._state.adding or not instance.pk or raw:
# Ignore unsaved instances or raw imports
return
current_url = instance.get_absolute_url()
diff --git a/linkcheck/models.py b/linkcheck/models.py
index 536fb50..d7c189f 100644
--- a/linkcheck/models.py
+++ b/linkcheck/models.py
@@ -31,8 +31,10 @@
LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT,
MAX_URL_LENGTH,
MEDIA_PREFIX,
+ PROXIES,
SITE_DOMAINS,
TOLERATE_BROKEN_ANCHOR,
+ TRUST_PROXY_SSL,
)
logger = logging.getLogger(__name__)
@@ -386,6 +388,10 @@ def check_external(self, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
"timeout": LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT,
"verify": True,
}
+ if PROXIES:
+ request_params["verify"] = not TRUST_PROXY_SSL
+ request_params["proxies"] = PROXIES
+
try:
try:
# At first try a HEAD request
diff --git a/linkcheck/templates/linkcheck/report.html b/linkcheck/templates/linkcheck/report.html
index a65d577..1734170 100644
--- a/linkcheck/templates/linkcheck/report.html
+++ b/linkcheck/templates/linkcheck/report.html
@@ -171,7 +171,7 @@
{{report_type}} in '{{object.obj
{% if link.url.redirect_to %}
- | R{% translate "Redirects to" %}: {{ link.url.redirect_to }} |
+ | {% translate "Redirects to" %}: {{ link.url.redirect_to }} |
{% endif %}
{% endfor %}
diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py
index 8a3f1e1..4d2f95e 100644
--- a/linkcheck/tests/test_linkcheck.py
+++ b/linkcheck/tests/test_linkcheck.py
@@ -1,7 +1,8 @@
import os
-from datetime import datetime, timedelta
+from datetime import timedelta
+from http import HTTPStatus
from io import StringIO
-from unittest.mock import patch
+from unittest.mock import Mock, patch
import requests_mock
import urllib3
@@ -13,6 +14,7 @@
from django.test import LiveServerTestCase, TestCase
from django.test.utils import override_settings
from django.urls import reverse
+from django.utils import timezone
from requests.exceptions import ConnectionError
from linkcheck.linkcheck_settings import MAX_URL_LENGTH
@@ -25,6 +27,7 @@
unregister_listeners,
)
from linkcheck.models import Link, Url
+from linkcheck.utils import check_links, concurrent_check_links
from linkcheck.views import get_jquery_min_js
from .sampleapp.models import Author, Book, Journal, Page
@@ -672,6 +675,31 @@ def test_external_check_blocked_user_agent_blocked_head(self):
self.assertEqual(uv.redirect_to, '')
self.assertEqual(uv.type, 'external')
+ @patch(
+ 'linkcheck.models.PROXIES',
+ {'http': 'http://proxy.example.com:8080'},
+ )
+ @patch('requests.head')
+ def test_external_proxy_request(self, mock_head):
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.reason = 'OK'
+ mock_response.history = []
+ mock_head.return_value = mock_response
+ request_url = 'http://test.com'
+ uv = Url(url=request_url)
+ uv.check_url()
+ self.assertEqual(uv.status, True)
+ self.assertEqual(uv.message, '200 OK')
+ self.assertEqual(uv.type, 'external')
+ mock_head.assert_called_once()
+ (call_url,), call_kwargs = mock_head.call_args
+ self.assertEqual(call_url, request_url)
+ self.assertEqual(
+ call_kwargs.get('proxies'),
+ {'http': 'http://proxy.example.com:8080'},
+ )
+
def test_external_check_timedout(self):
uv = Url(url=f"{self.live_server_url}/timeout/")
uv.check_url()
@@ -870,7 +898,7 @@ def test_checklinks_command(self):
"1 internal URLs and 0 external URLs have been checked.\n"
)
- yesterday = datetime.now() - timedelta(days=1)
+ yesterday = timezone.now() - timedelta(days=1)
Url.objects.all().update(last_checked=yesterday)
out = StringIO()
call_command('checklinks', externalinterval=20, stdout=out)
@@ -1203,6 +1231,62 @@ def test_filter_callable(self):
)
+class TestCheckLinks(TestCase):
+
+ def _setup_mock_urls(self, mocker):
+ """
+ Set up common mock URLs for link checking tests.
+ """
+ good_url = 'https://example.com/good'
+ mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK')
+ Url.objects.create(url=good_url)
+
+ bad_url = 'https://example.com/bad'
+ mocker.register_uri('HEAD', bad_url, status_code=HTTPStatus.NOT_FOUND, reason='NOT FOUND')
+ Url.objects.create(url=bad_url)
+
+ exception_url = 'https://example.com/exception'
+ mocker.register_uri('HEAD', exception_url, exc=ConnectionError("Something went wrong"))
+ Url.objects.create(url=exception_url)
+
+ recently_checked_url = 'https://example.com/recent'
+ # Shouldn't be requested
+ Url.objects.create(url=recently_checked_url, status=None, last_checked=timezone.now() - timedelta(days=1))
+
+ return (good_url, bad_url, exception_url, recently_checked_url)
+
+ @requests_mock.Mocker()
+ def test_check_links(self, mocker):
+ good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker)
+
+ self.assertEqual(check_links(), 3)
+ self.assertEqual(Url.objects.get(url=good_url).status, True)
+ self.assertEqual(Url.objects.get(url=bad_url).status, False)
+ self.assertEqual(Url.objects.get(url=exception_url).status, False)
+ self.assertEqual(Url.objects.get(url=recently_checked_url).status, None)
+
+ @requests_mock.Mocker()
+ def test_concurrent_check_links(self, mocker):
+ self._setup_mock_urls(mocker)
+
+        # Since the tests are running in SQLite, we can't insert data via our threaded code;
+        # there's enough other test coverage that we can use `Url.save` as a proxy.
+ with patch.object(Url, "save") as patched_save:
+ self.assertEqual(concurrent_check_links(), 3)
+ self.assertEqual(patched_save.call_count, 3)
+
+ def test_concurrent_check_links_error_handling(self):
+ Url.objects.create(url='https://example.com/good')
+ with (
+ patch("linkcheck.utils.logger.exception") as patched_logged_exception,
+ patch.object(Url, "check_external", side_effect=ValueError("oops")),
+ ):
+ self.assertEqual(concurrent_check_links(), 0)
+ self.assertEqual(patched_logged_exception.call_count, 1)
+ msg, *args = patched_logged_exception.call_args[0]
+ self.assertEqual(msg % tuple(args), "ValueError while checking https://example.com/good: oops")
+
+
def get_command_output(command, *args, **kwargs):
"""
Helper function for running a management command and checking its output
diff --git a/linkcheck/utils.py b/linkcheck/utils.py
index 0b4e6d7..ba00e90 100644
--- a/linkcheck/utils.py
+++ b/linkcheck/utils.py
@@ -1,4 +1,6 @@
import logging
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import timedelta
from django.apps import apps
@@ -120,6 +122,70 @@ def check_links(external_recheck_interval=10080, limit=-1, check_internal=True,
return check_count
+def concurrent_check_links(
+ external_recheck_interval=10080,
+ limit=-1,
+ check_internal=True,
+ check_external=True,
+ max_workers=20,
+):
+ """
+ Return the number of links effectively checked.
+    A concurrent version of `check_links`.
+
+ Args:
+ external_recheck_interval: Minutes before rechecking external links
+ limit: Maximum number of URLs to check (-1 for unlimited)
+ check_internal: Whether to check internal links
+ check_external: Whether to check external links
+ max_workers: Maximum number of concurrent threads
+ """
+
+ urls = Url.objects.all()
+
+ # An optimization for when check_internal is False
+ if not check_internal:
+ recheck_datetime = timezone.now() - timedelta(minutes=external_recheck_interval)
+ urls = urls.exclude(last_checked__gt=recheck_datetime)
+
+ url_list = list(urls[:limit] if limit > 0 else urls)
+
+ if not url_list:
+ return 0
+
+ # Thread-safe counter
+ check_count = 0
+ count_lock = threading.Lock()
+
+ def check_single_url(url_obj):
+ """Check a single URL and return 1 if checked, 0 if not"""
+ try:
+ status = url_obj.check_url(check_internal=check_internal, check_external=check_external)
+ return 1 if status is not None else 0
+ except Exception as e:
+ logger.exception(
+ "%s while checking %s: %s",
+ type(e).__name__,
+ url_obj.url,
+ e
+ )
+ return 0
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ # Submit all tasks
+ future_to_url = {
+ executor.submit(check_single_url, url): url
+ for url in url_list
+ }
+ # Process completed futures
+ for future in as_completed(future_to_url):
+ result = future.result()
+ with count_lock:
+ check_count += result
+
+ return check_count
+
+
def update_urls(urls, content_type, object_id):
# Structure of urls param is [(field, link text, url), ... ]
@@ -183,7 +249,7 @@ def find_all_links(linklists=None):
linklists = linklist_cls().get_linklist()
for linklist in linklists:
- object_id = linklist["object"].id
+ object_id = linklist["object"].pk
urls = linklist["urls"] + linklist["images"]
if urls:
new = update_urls(urls, content_type, object_id)