63 changes: 61 additions & 2 deletions linkcheck/tests/test_linkcheck.py
@@ -1,6 +1,7 @@
 import os
 import sys
-from datetime import datetime, timedelta
+from datetime import timedelta
+from http import HTTPStatus
 from io import StringIO
 from unittest.mock import patch

@@ -14,6 +15,7 @@
 from django.test import LiveServerTestCase, TestCase
 from django.test.utils import override_settings
 from django.urls import reverse
+from django.utils import timezone
 from requests.exceptions import ConnectionError

 from linkcheck.linkcheck_settings import MAX_URL_LENGTH
@@ -26,6 +28,7 @@
     unregister_listeners,
 )
 from linkcheck.models import Link, Url
+from linkcheck.utils import check_links, concurrent_check_links
 from linkcheck.views import get_jquery_min_js

 from .sampleapp.models import Author, Book, Journal, Page
@@ -877,7 +880,7 @@ def test_checklinks_command(self):
             "1 internal URLs and 0 external URLs have been checked.\n"
         )

-        yesterday = datetime.now() - timedelta(days=1)
+        yesterday = timezone.now() - timedelta(days=1)
         Url.objects.all().update(last_checked=yesterday)
         out = StringIO()
         call_command('checklinks', externalinterval=20, stdout=out)
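For context on this change: `timezone.now()` returns a timezone-aware datetime, which is what Django's ORM expects for `DateTimeField` writes and lookups when `USE_TZ` is enabled; a naive `datetime.now()` value triggers a RuntimeWarning. A small sketch of the difference (assuming a configured Django project; nothing here is taken from the PR itself):

    from datetime import datetime, timedelta

    from django.utils import timezone

    naive = datetime.now() - timedelta(days=1)   # no tzinfo; warned about when saved with USE_TZ=True
    aware = timezone.now() - timedelta(days=1)   # timezone-aware (UTC by default); safe for DateTimeField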
@@ -1210,6 +1213,62 @@ def test_filter_callable(self):
         )


+class TestCheckLinks(TestCase):
+
+    def _setup_mock_urls(self, mocker):
+        """
+        Set up common mock URLs for link checking tests.
+        """
+        good_url = 'https://example.com/good'
+        mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK')
+        Url.objects.create(url=good_url)
+
+        bad_url = 'https://example.com/bad'
+        mocker.register_uri('HEAD', bad_url, status_code=HTTPStatus.NOT_FOUND, reason='NOT FOUND')
+        Url.objects.create(url=bad_url)
+
+        exception_url = 'https://example.com/exception'
+        mocker.register_uri('HEAD', exception_url, exc=ConnectionError("Something went wrong"))
+        Url.objects.create(url=exception_url)
+
+        recently_checked_url = 'https://example.com/recent'
+        # Shouldn't be requested
+        Url.objects.create(url=recently_checked_url, status=None, last_checked=timezone.now() - timedelta(days=1))
+
+        return (good_url, bad_url, exception_url, recently_checked_url)
+
+    @requests_mock.Mocker()
+    def test_check_links(self, mocker):
+        good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker)
+
+        self.assertEqual(check_links(), 3)
+        self.assertEqual(Url.objects.get(url=good_url).status, True)
+        self.assertEqual(Url.objects.get(url=bad_url).status, False)
+        self.assertEqual(Url.objects.get(url=exception_url).status, False)
+        self.assertEqual(Url.objects.get(url=recently_checked_url).status, None)
+
+    @requests_mock.Mocker()
+    def test_concurrent_check_links(self, mocker):
+        self._setup_mock_urls(mocker)
+
+        # Since the tests are running on SQLite, we can't insert data via our threaded code;
+        # there's enough other test coverage that we can use `Url.save` as a proxy.
+        with patch.object(Url, "save") as patched_save:
+            self.assertEqual(concurrent_check_links(), 3)
+            self.assertEqual(patched_save.call_count, 3)
+
+    def test_concurrent_check_links_error_handling(self):
+        Url.objects.create(url='https://example.com/good')
+        with (
+            patch("linkcheck.utils.logger.exception") as patched_logged_exception,
+            patch.object(Url, "check_external", side_effect=ValueError("oops")),
+        ):
+            self.assertEqual(concurrent_check_links(), 0)
+            self.assertEqual(patched_logged_exception.call_count, 1)
+            msg, *args = patched_logged_exception.call_args[0]
+            self.assertEqual(msg % tuple(args), "ValueError while checking https://example.com/good: oops")
+
+
 def get_command_output(command, *args, **kwargs):
     """
     Helper function for running a management command and checking its output
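A note on the final assertion above: `logger.exception` is called with a lazy %-style format string plus arguments, so the test rebuilds the rendered message with `msg % tuple(args)`. A minimal standalone sketch of the same pattern, with illustrative names that are not part of this PR:

    import logging
    from unittest.mock import patch

    logger = logging.getLogger("example")

    def risky(url):
        try:
            raise ValueError("oops")
        except Exception as e:
            # Lazy formatting: the message and args are only merged when the record is emitted
            logger.exception("%s while checking %s: %s", type(e).__name__, url, e)

    with patch.object(logger, "exception") as patched:
        risky("https://example.com/good")
        msg, *args = patched.call_args[0]
        assert msg % tuple(args) == "ValueError while checking https://example.com/good: oops"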
70 changes: 70 additions & 0 deletions linkcheck/utils.py
@@ -1,4 +1,6 @@
 import logging
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta

 from django.apps import apps
@@ -120,6 +122,74 @@ def check_links(external_recheck_interval=10080, limit=-1, check_internal=True,
     return check_count


+def concurrent_check_links(
+    external_recheck_interval=10080,
+    limit=-1,
+    check_internal=True,
+    check_external=True,
+    max_workers=20,
+):
+    """
+    Return the number of links effectively checked.
+
+    A concurrent version of `check_links`. It should be faster than `check_links`, but
+    be aware that if you have multiple links to the same domain, you risk triggering
+    some attack detection on the target server, hence this concurrent version is best used
+    for links from all different domains or internal links.
+
+    Args:
+        external_recheck_interval: Minutes before rechecking external links
+        limit: Maximum number of URLs to check (-1 for unlimited)
+        check_internal: Whether to check internal links
+        check_external: Whether to check external links
+        max_workers: Maximum number of concurrent threads
+    """
+
+    urls = Url.objects.all()
+
+    # An optimization for when check_internal is False
+    if not check_internal:
+        recheck_datetime = timezone.now() - timedelta(minutes=external_recheck_interval)
+        urls = urls.exclude(last_checked__gt=recheck_datetime)
+
+    url_list = list(urls[:limit] if limit > 0 else urls)
+
+    if not url_list:
+        return 0
+
+    # Thread-safe counter
+    check_count = 0
+    count_lock = threading.Lock()
+
+    def check_single_url(url_obj):
+        """Check a single URL and return 1 if checked, 0 if not"""
+        try:
+            status = url_obj.check_url(check_internal=check_internal, check_external=check_external)
+            return 1 if status is not None else 0
+        except Exception as e:
+            logger.exception(
+                "%s while checking %s: %s",
+                type(e).__name__,
+                url_obj.url,
+                e
+            )
+            return 0
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        future_to_url = {
+            executor.submit(check_single_url, url): url
+            for url in url_list
+        }
+        # Process completed futures
+        for future in as_completed(future_to_url):
+            result = future.result()
+            with count_lock:
+                check_count += result
+
+    return check_count
+
+
 def update_urls(urls, content_type, object_id):
     # Structure of urls param is [(field, link text, url), ... ]

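The new helper can be called directly, for example from a shell session or a custom management command. A minimal usage sketch based on the signature above; the argument values are illustrative, not prescribed by this PR:

    # Assumes a configured Django project with django-linkcheck installed.
    from linkcheck.utils import concurrent_check_links

    checked = concurrent_check_links(
        external_recheck_interval=10080,  # minutes before an external URL is rechecked
        limit=500,                        # cap URLs per run; -1 means no limit
        check_internal=True,
        check_external=True,
        max_workers=10,                   # keep modest if many URLs point at the same domain
    )
    print(f"{checked} URLs checked")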