Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ Basic usage
-----------

#. Install app to somewhere on your Python path (e.g. ``pip install
django-linkcheck``).
django-linkcheck``). If you do not need multilingual support, you can skip
the compilation of the translation files by setting an environment variable,
e.g. ``LINKCHECK_SKIP_TRANSLATIONS=true pip install django-linkcheck``.

#. Add ``'linkcheck'`` to your ``settings.INSTALLED_APPS``.

Expand Down
19 changes: 16 additions & 3 deletions linkcheck/build_meta.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,34 @@
import os
import subprocess

from setuptools import build_meta as default
from setuptools.build_meta import * # noqa: F401, F403


def compile_translation_files():
print("Compile translation files")
print("Compiling translation files...")
subprocess.run(["django-admin", "compilemessages"], cwd="linkcheck")


def should_compile_translation_files():
    """Return whether the translation files should be compiled during the build.

    Compilation is skipped when the ``LINKCHECK_SKIP_TRANSLATIONS`` environment
    variable is set to a truthy value (``1``, ``true``, ``yes``, ``on``, ``t``
    or ``y``). The comparison is case-insensitive and ignores surrounding
    whitespace. An unset, empty or unrecognised value means "compile".

    Returns:
        ``False`` if the translations should be skipped, ``True`` otherwise.
    """
    skip_translations = os.environ.get("LINKCHECK_SKIP_TRANSLATIONS", "")
    # Values coming from shell scripts or CI configuration frequently carry
    # stray whitespace or a trailing newline, so normalise before comparing.
    normalized = skip_translations.strip().lower()
    return normalized not in ("1", "true", "yes", "on", "t", "y")


def build_sdist(sdist_directory, config_settings=None):
compile_translation_files()
if should_compile_translation_files():
compile_translation_files()

return default.build_sdist(sdist_directory, config_settings)


def build_wheel(wheel_directory, config_settings=None, metadata_directory=None):
compile_translation_files()
if should_compile_translation_files():
compile_translation_files()

return default.build_wheel(
wheel_directory,
config_settings=config_settings,
Expand Down
2 changes: 2 additions & 0 deletions linkcheck/linkcheck_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,5 @@
SITE_DOMAINS = getattr(settings, 'LINKCHECK_SITE_DOMAINS', [])
DISABLE_LISTENERS = getattr(settings, 'LINKCHECK_DISABLE_LISTENERS', False)
TOLERATE_BROKEN_ANCHOR = getattr(settings, 'LINKCHECK_TOLERATE_BROKEN_ANCHOR', True)
PROXIES = getattr(settings, 'LINKCHECK_PROXIES', {})
TRUST_PROXY_SSL = getattr(settings, 'LINKCHECK_TRUST_PROXY_SSL', False)
2 changes: 1 addition & 1 deletion linkcheck/listeners.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def delete_instance_links(sender, instance, **kwargs):


def instance_pre_save(sender, instance, raw=False, **kwargs):
if not instance.pk or raw:
if instance._state.adding or not instance.pk or raw:
# Ignore unsaved instances or raw imports
return
current_url = instance.get_absolute_url()
Expand Down
6 changes: 6 additions & 0 deletions linkcheck/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT,
MAX_URL_LENGTH,
MEDIA_PREFIX,
PROXIES,
SITE_DOMAINS,
TOLERATE_BROKEN_ANCHOR,
TRUST_PROXY_SSL,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -386,6 +388,10 @@ def check_external(self, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
"timeout": LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT,
"verify": True,
}
if PROXIES:
request_params["verify"] = not TRUST_PROXY_SSL
request_params["proxies"] = PROXIES

try:
try:
# At first try a HEAD request
Expand Down
2 changes: 1 addition & 1 deletion linkcheck/templates/linkcheck/report.html
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ <h3 style='display: inline; padding-left: 5px;'>{{report_type}} in '{{object.obj
</td>
</tr>
{% if link.url.redirect_to %}
<tr><td colspan="6">R{% translate "Redirects to" %}: <a href="{{ link.url.redirect_to }}" target="_blank">{{ link.url.redirect_to }}</a></td></tr>
<tr><td colspan="6">{% translate "Redirects to" %}: <a href="{{ link.url.redirect_to }}" target="_blank">{{ link.url.redirect_to }}</a></td></tr>
{% endif %}
{% endfor %}
</table>
Expand Down
90 changes: 87 additions & 3 deletions linkcheck/tests/test_linkcheck.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
from datetime import datetime, timedelta
from datetime import timedelta
from http import HTTPStatus
from io import StringIO
from unittest.mock import patch
from unittest.mock import Mock, patch

import requests_mock
import urllib3
Expand All @@ -13,6 +14,7 @@
from django.test import LiveServerTestCase, TestCase
from django.test.utils import override_settings
from django.urls import reverse
from django.utils import timezone
from requests.exceptions import ConnectionError

from linkcheck.linkcheck_settings import MAX_URL_LENGTH
Expand All @@ -25,6 +27,7 @@
unregister_listeners,
)
from linkcheck.models import Link, Url
from linkcheck.utils import check_links, concurrent_check_links
from linkcheck.views import get_jquery_min_js

from .sampleapp.models import Author, Book, Journal, Page
Expand Down Expand Up @@ -672,6 +675,31 @@ def test_external_check_blocked_user_agent_blocked_head(self):
self.assertEqual(uv.redirect_to, '')
self.assertEqual(uv.type, 'external')

@patch('linkcheck.models.PROXIES', {'http': 'http://proxy.example.com:8080'})
@patch('requests.head')
def test_external_proxy_request(self, mock_head):
    """The configured proxies are passed on to the HEAD request of an external check."""
    expected_proxies = {'http': 'http://proxy.example.com:8080'}
    request_url = 'http://test.com'
    # Minimal successful response: 200 OK without any redirect history
    mock_head.return_value = Mock(status_code=200, reason='OK', history=[])

    uv = Url(url=request_url)
    uv.check_url()

    self.assertEqual(uv.status, True)
    self.assertEqual(uv.message, '200 OK')
    self.assertEqual(uv.type, 'external')

    mock_head.assert_called_once()
    positional, keyword = mock_head.call_args
    # The URL has to be the one and only positional argument
    (requested_url,) = positional
    self.assertEqual(requested_url, request_url)
    self.assertEqual(keyword.get('proxies'), expected_proxies)

def test_external_check_timedout(self):
uv = Url(url=f"{self.live_server_url}/timeout/")
uv.check_url()
Expand Down Expand Up @@ -870,7 +898,7 @@ def test_checklinks_command(self):
"1 internal URLs and 0 external URLs have been checked.\n"
)

yesterday = datetime.now() - timedelta(days=1)
yesterday = timezone.now() - timedelta(days=1)
Url.objects.all().update(last_checked=yesterday)
out = StringIO()
call_command('checklinks', externalinterval=20, stdout=out)
Expand Down Expand Up @@ -1203,6 +1231,62 @@ def test_filter_callable(self):
)


class TestCheckLinks(TestCase):
    """Tests for ``check_links`` and its threaded variant ``concurrent_check_links``."""

    def _setup_mock_urls(self, mocker):
        """
        Register the mocked HEAD responses and create the corresponding ``Url`` objects.

        Returns the tuple ``(good_url, bad_url, exception_url, recently_checked_url)``.
        """
        good_url = 'https://example.com/good'
        bad_url = 'https://example.com/bad'
        exception_url = 'https://example.com/exception'
        recently_checked_url = 'https://example.com/recent'

        mocked_responses = (
            (good_url, {'status_code': HTTPStatus.OK, 'reason': 'OK'}),
            (bad_url, {'status_code': HTTPStatus.NOT_FOUND, 'reason': 'NOT FOUND'}),
            (exception_url, {'exc': ConnectionError("Something went wrong")}),
        )
        for url, response in mocked_responses:
            mocker.register_uri('HEAD', url, **response)
            Url.objects.create(url=url)

        # Last checked one day ago and deliberately left unmocked: shouldn't be requested
        Url.objects.create(
            url=recently_checked_url,
            status=None,
            last_checked=timezone.now() - timedelta(days=1),
        )

        return (good_url, bad_url, exception_url, recently_checked_url)

    @requests_mock.Mocker()
    def test_check_links(self, mocker):
        good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker)

        checked = check_links()
        self.assertEqual(checked, 3)

        expected_statuses = (
            (good_url, True),
            (bad_url, False),
            (exception_url, False),
            (recently_checked_url, None),
        )
        for url, expected_status in expected_statuses:
            self.assertEqual(Url.objects.get(url=url).status, expected_status)

    @requests_mock.Mocker()
    def test_concurrent_check_links(self, mocker):
        self._setup_mock_urls(mocker)

        # The tests run on sqlite, so the threaded code cannot insert data.
        # Other tests cover the stored results, so counting the `Url.save` calls is used as a proxy here.
        with patch.object(Url, "save") as save_mock:
            checked = concurrent_check_links()
            self.assertEqual(checked, 3)
            self.assertEqual(save_mock.call_count, 3)

    def test_concurrent_check_links_error_handling(self):
        Url.objects.create(url='https://example.com/good')
        with (
            patch("linkcheck.utils.logger.exception") as exception_log_mock,
            patch.object(Url, "check_external", side_effect=ValueError("oops")),
        ):
            checked = concurrent_check_links()
            self.assertEqual(checked, 0)
        self.assertEqual(exception_log_mock.call_count, 1)
        # Render the lazily formatted log record to compare the final message
        template, *template_args = exception_log_mock.call_args[0]
        self.assertEqual(
            template % tuple(template_args),
            "ValueError while checking https://example.com/good: oops",
        )


def get_command_output(command, *args, **kwargs):
"""
Helper function for running a management command and checking its output
Expand Down
68 changes: 67 additions & 1 deletion linkcheck/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import timedelta

from django.apps import apps
Expand Down Expand Up @@ -120,6 +122,70 @@ def check_links(external_recheck_interval=10080, limit=-1, check_internal=True,
return check_count


def concurrent_check_links(
    external_recheck_interval=10080,
    limit=-1,
    check_internal=True,
    check_external=True,
    max_workers=20,
):
    """
    Check the stored URLs in a thread pool and return the number of links effectively checked.
    A concurrent version of `check_links`

    Args:
        external_recheck_interval: Minutes before rechecking external links
        limit: Maximum number of URLs to check (-1, or any value <= 0, for unlimited)
        check_internal: Whether to check internal links
        check_external: Whether to check external links
        max_workers: Maximum number of concurrent threads

    Exceptions raised while checking a single URL are logged and that URL is
    counted as not checked; they do not abort the remaining checks.
    """
    queryset = Url.objects.all()

    # An optimization for when check_internal is False
    if not check_internal:
        cutoff = timezone.now() - timedelta(minutes=external_recheck_interval)
        queryset = queryset.exclude(last_checked__gt=cutoff)

    if limit > 0:
        queryset = queryset[:limit]

    # Evaluate the queryset up front, before any worker thread is started
    pending = list(queryset)
    if not pending:
        return 0

    def check_single_url(url_obj):
        """Check a single URL and return 1 if checked, 0 if not"""
        try:
            status = url_obj.check_url(check_internal=check_internal, check_external=check_external)
        except Exception as e:
            logger.exception(
                "%s while checking %s: %s",
                type(e).__name__,
                url_obj.url,
                e
            )
            return 0
        return 0 if status is None else 1

    # NOTE(review): the total is only updated in the loop below, which runs in the
    # calling thread, so the lock does not look strictly required - confirm before removing.
    total = 0
    total_lock = threading.Lock()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Schedule one task per URL
        futures = [pool.submit(check_single_url, url) for url in pending]
        # Add up the results in completion order
        for finished in as_completed(futures):
            outcome = finished.result()
            with total_lock:
                total += outcome

    return total


def update_urls(urls, content_type, object_id):
# Structure of urls param is [(field, link text, url), ... ]

Expand Down Expand Up @@ -183,7 +249,7 @@ def find_all_links(linklists=None):
linklists = linklist_cls().get_linklist()

for linklist in linklists:
object_id = linklist["object"].id
object_id = linklist["object"].pk
urls = linklist["urls"] + linklist["images"]
if urls:
new = update_urls(urls, content_type, object_id)
Expand Down