diff --git a/djangoproject/scss/_style.scss b/djangoproject/scss/_style.scss index 08a9a834fd..82d5f4131c 100644 --- a/djangoproject/scss/_style.scss +++ b/djangoproject/scss/_style.scss @@ -2650,7 +2650,7 @@ search.filters { position: relative; a { - padding: 10px 20px; + padding: 10px 15px; text-decoration: none; border-bottom: 3px solid transparent; transition: color 0.3s ease, border-bottom 0.3s ease; diff --git a/djangoproject/sitemaps.py b/djangoproject/sitemaps.py new file mode 100644 index 0000000000..3c7becbf3a --- /dev/null +++ b/djangoproject/sitemaps.py @@ -0,0 +1,76 @@ +from dataclasses import dataclass + +from django.contrib import sitemaps +from django_hosts.resolvers import reverse + + +@dataclass +class URLObject: + name: str + host: str = "www" + + +class LocationAbsoluteUrlMixin: + def get_urls(self, site=None, **kwargs): + """ + Prevent the Django sitemap framework from prefixing the domain. + Use the absolute URL returned by location(). + """ + urls = [] + for item in self.items(): + loc = self.location(item) + urls.append( + { + "location": loc, + "lastmod": None, + "changefreq": self.changefreq, + "priority": self.priority, + } + ) + return urls + + +class StaticViewSitemap(LocationAbsoluteUrlMixin, sitemaps.Sitemap): + priority = 0.5 + changefreq = "monthly" + + def items(self): + return [ + # accounts + URLObject("registration_register"), + # aggregator + URLObject("community-index"), + URLObject("community-ecosystem"), + URLObject("local-django-communities"), + # contact + URLObject("contact_foundation"), + # dashboard + URLObject("dashboard-index", host="dashboard"), + URLObject("metric-list", host="dashboard"), + # djangoproject + URLObject("homepage"), + URLObject("overview"), + URLObject("start"), + URLObject("code_of_conduct"), + URLObject("conduct_faq"), + URLObject("conduct_reporting"), + URLObject("conduct_enforcement"), + URLObject("conduct_changes"), + URLObject("diversity"), + URLObject("diversity_changes"), + # foundation + URLObject("foundation_meeting_archive_index"), + # fundraising + URLObject("fundraising:index"), + # members + URLObject("members:individual-members"), + URLObject("members:corporate-members"), + URLObject("members:corporate-members-join"), + URLObject("members:corporate-members-badges"), + URLObject("members:teams"), + # releases + URLObject("download"), + ] + + def location(self, item): + return reverse(item.name, host=item.host) diff --git a/djangoproject/tests.py b/djangoproject/tests.py index 2c67432110..5e8355601b 100644 --- a/djangoproject/tests.py +++ b/djangoproject/tests.py @@ -202,6 +202,7 @@ def test_single_h1_per_page(self): "styleguide/", # Has multiple
<h1>
examples. "admin/", # Admin templates are out of our control. "reset/done/", # Uses an admin template. + "sitemap.xml", ] resolver = get_resolver() urls = self.extract_patterns(resolver.url_patterns) @@ -211,3 +212,9 @@ def test_single_h1_per_page(self): response = self.client.get(url) self.assertEqual(response.status_code, 200) self.assertContains(response, "= 1: self.stdout.write(f"Starting update for {release} at {datetime.now()}...") + release.sync_from_sitemap(force=force) + # checkout_dir is shared for all languages. checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version) parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath( diff --git a/docs/models.py b/docs/models.py index d690c76bfc..9b95e462e2 100644 --- a/docs/models.py +++ b/docs/models.py @@ -5,6 +5,7 @@ from functools import partial, reduce from pathlib import Path +import requests from django.conf import settings from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.search import ( @@ -26,19 +27,19 @@ from django.utils.html import strip_tags from django_hosts.resolvers import reverse -from blog.models import Entry from releases.models import Release from . import utils from .search import ( DEFAULT_TEXT_SEARCH_CONFIG, - SEARCHABLE_VIEWS, START_SEL, STOP_SEL, TSEARCH_CONFIG_LANGUAGES, DocumentationCategory, + fetch_html, get_document_search_vector, ) +from .utils import extract_inner_html def get_search_config(lang): @@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents): the database. Deletes all the release's documents first then reinserts them as needed. """ - self.documents.all().delete() + self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete() # Read excluded paths from robots.docs.txt. robots_path = settings.BASE_DIR.joinpath( @@ -218,65 +219,51 @@ def sync_to_db(self, decoded_documents): metadata=document, config=get_search_config(self.lang), ) - for document in self.documents.all(): + for document in self.documents.exclude( + metadata__parents=DocumentationCategory.WEBSITE + ): document.metadata["breadcrumbs"] = list( Document.objects.breadcrumbs(document).values("title", "path") ) document.save(update_fields=("metadata",)) - self._sync_blog_to_db() - self._sync_views_to_db() + def sync_from_sitemap(self, force=False): + from djangoproject.urls.www import sitemaps - def _sync_blog_to_db(self): - """ - Sync the blog entries into search based on the release documents - support end date. - """ - if self.lang != "en": - return # The blog is only written in English currently + if not self.is_dev: + return - entries = Entry.objects.published().searchable() - Document.objects.bulk_create( - [ - Document( - release=self, - path=entry.get_absolute_url(), - title=entry.headline, - metadata={ - "body": entry.body_html, - "breadcrumbs": [ - { - "path": DocumentationCategory.WEBSITE, - "title": "News", - }, - ], - "parents": DocumentationCategory.WEBSITE, - "slug": entry.slug, - "title": entry.headline, - "toc": "", - }, - config=get_search_config(self.lang), - ) - for entry in entries - ] - ) + if force: + Document.objects.filter( + metadata__parents=DocumentationCategory.WEBSITE + ).delete() - def _sync_views_to_db(self): - """ - Sync the specific views into search based on the release documents - support end date. 
- """ - if self.lang != "en": - return # The searchable views are only written in English currently + doc_urls = set( + Document.objects.filter( + metadata__parents=DocumentationCategory.WEBSITE + ).values_list("path", flat=True) + ) - Document.objects.bulk_create( - [ - Document( + for sitemap in sitemaps.values(): + for url in sitemap().get_urls(): + path = url["location"] + if path in doc_urls: + continue + try: + page_html = fetch_html(path) + except requests.RequestException: + continue + try: + main_html = extract_inner_html(page_html, tag="main") + title = extract_inner_html(page_html, tag="h1") + except ValueError: + continue + Document.objects.create( release=self, - path=searchable_view.www_absolute_url, - title=searchable_view.page_title, + path=path, + title=title, metadata={ - "body": searchable_view.html, + "body": main_html, "breadcrumbs": [ { "path": DocumentationCategory.WEBSITE, @@ -284,15 +271,11 @@ def _sync_views_to_db(self): }, ], "parents": DocumentationCategory.WEBSITE, - "slug": searchable_view.url_name, - "title": searchable_view.page_title, + "title": title, "toc": "", }, config=get_search_config(self.lang), ) - for searchable_view in SEARCHABLE_VIEWS - ] - ) def _clean_document_path(path): @@ -353,6 +336,14 @@ def search(self, query_text, release, document_category=None): config=models.F("config"), ) base_filter = Q(release_id=release.id) + if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev: + dev_release = DocumentRelease.objects.get_by_version_and_lang( + version="dev", lang=settings.DEFAULT_LANGUAGE_CODE + ) + base_filter |= Q( + release_id=dev_release.id, + metadata__parents=DocumentationCategory.WEBSITE, + ) if document_category: base_filter &= Q(metadata__parents__startswith=document_category) base_qs = ( diff --git a/docs/search.py b/docs/search.py index 0b7eaef1d3..d6d1ec9959 100644 --- a/docs/search.py +++ b/docs/search.py @@ -1,11 +1,8 @@ -from dataclasses import dataclass - +import requests from django.contrib.postgres.search import SearchVector from django.db.models import TextChoices from django.db.models.fields.json import KeyTextTransform -from django.template.loader import get_template from django.utils.translation import gettext_lazy as _ -from django_hosts import reverse # Imported from # https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659 @@ -81,25 +78,31 @@ def parse(cls, value, default=None): return None -@dataclass -class SearchableView: - page_title: str - url_name: str - template: str - - @property - def html(self): - return get_template(self.template).render() - - @property - def www_absolute_url(self): - return reverse(self.url_name, host="www") - +def fetch_html(url, timeout=10): + """ + Fetch the HTML of a page if status code is 200. + Simulates a human browser and accepts only text/html. 
+ """ -SEARCHABLE_VIEWS = [ - SearchableView( - page_title="Django's Ecosystem", - url_name="community-ecosystem", - template="aggregator/ecosystem.html", - ), -] + headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0" + ), + "Accept": "text/html", + "Accept-Language": "en-US,en;q=0.9", + } + + response = requests.get(url, headers=headers, timeout=timeout) + + if response.status_code == 200: + content_type = response.headers.get("Content-Type", "") + if "text/html" in content_type: + return response.text + else: + raise requests.RequestException(f"Unexpected Content-Type: {content_type}") + else: + raise requests.RequestException( + f"Failed to fetch {url}, status code: {response.status_code}" + ) diff --git a/docs/sitemaps.py b/docs/sitemaps.py index 6f520aa753..c46ec0933b 100644 --- a/docs/sitemaps.py +++ b/docs/sitemaps.py @@ -1,10 +1,12 @@ from django.contrib.sitemaps import Sitemap +from djangoproject.sitemaps import LocationAbsoluteUrlMixin + from .models import Document from .search import DocumentationCategory -class DocsSitemap(Sitemap): +class DocsSitemap(LocationAbsoluteUrlMixin, Sitemap): def __init__(self, lang): self.lang = lang @@ -16,6 +18,9 @@ def items(self): .select_related("release__release") ) + def location(self, item): + return item.get_absolute_url() + def changefreq(self, obj): return "daily" @@ -33,19 +38,3 @@ def priority(self, obj): return 1 else: return 0.1 - - def _urls(self, page, site, protocol): - # XXX: To workaround bad interaction between contrib.sitemaps and - # django-hosts (scheme/domain would be repeated twice in URLs) - urls = [] - for item in self.paginator.page(page).object_list: - loc = item.get_absolute_url() - priority = self.priority(item) - url_info = { - "item": item, - "location": loc, - "changefreq": self.changefreq(item), - "priority": str(priority if priority is not None else ""), - } - urls.append(url_info) - return urls diff --git a/docs/templates/docs/search_results.html b/docs/templates/docs/search_results.html index 3001213db8..8d42d30e53 100644 --- a/docs/templates/docs/search_results.html +++ b/docs/templates/docs/search_results.html @@ -1,7 +1,8 @@ {% extends "docs/doc.html" %} {% load i18n docs %} -{% block title %}{% translate "Search | Django documentation" %}{% endblock %} +{% block title %}{% translate "Search" %}{% endblock %} +{% block header %}{% endblock %} {% block toc-wrapper %}{% endblock %} {% block breadcrumbs-wrapper %}{% endblock %} diff --git a/docs/tests/test_models.py b/docs/tests/test_models.py index 4724cabeaa..d8ab638b44 100644 --- a/docs/tests/test_models.py +++ b/docs/tests/test_models.py @@ -1,13 +1,16 @@ import datetime from operator import attrgetter +from unittest import mock +import requests_mock from django.conf import settings from django.db import connection from django.test import TestCase from django.utils import timezone from django_hosts import reverse -from blog.models import Entry +from blog.models import ContentFormat, Entry +from djangoproject.sitemaps import StaticViewSitemap from releases.models import Release from ..models import Document, DocumentRelease @@ -184,6 +187,9 @@ def test_get_available_languages_by_version(self): class DocumentManagerTest(TestCase): @classmethod def setUpTestData(cls): + cls.dev_release = DocumentRelease.objects.create( + lang=settings.DEFAULT_LANGUAGE_CODE + ) cls.release = DocumentRelease.objects.create( 
release=Release.objects.create(version="1.2.3"), ) @@ -358,6 +364,20 @@ def setUpTestData(cls): "release": cls.release_fr, "title": "Notes de publication de Django 1.9.4", }, + { + "metadata": { + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + "path": "example", + "release": cls.dev_release, + "title": "Blog post", + }, ] Document.objects.bulk_create(Document(**doc) for doc in documents) @@ -457,28 +477,21 @@ def test_search_title(self): ), ) + def test_website_document_items_included_english(self): + self.assertQuerySetEqual( + Document.objects.search("Main", self.release), + ["Blog post"], + transform=attrgetter("title"), + ) + + def test_website_document_items_excluded_non_english(self): + self.assertEqual(Document.objects.search("Main", self.release_fr).count(), 0) + class UpdateDocTests(TestCase): @classmethod def setUpTestData(cls): - now = timezone.now() - cls.release = DocumentRelease.objects.create( - release=Release.objects.create( - version="1.0.0", - eol_date=now + datetime.timedelta(days=1), - ) - ) - cls.entry = Entry.objects.create( - pub_date=now, - is_active=True, - is_searchable=True, - headline="Searchable post", - slug="a", - body_html="
<p>Searchable Blog Post</p>
", - ) - cls.docs_documents = cls.release.documents.exclude( - metadata__parents=DocumentationCategory.WEBSITE - ) + cls.release = DocumentRelease.objects.create() def test_sync_to_db(self): self.release.sync_to_db( @@ -490,24 +503,8 @@ def test_sync_to_db(self): } ] ) - self.assertQuerySetEqual( - self.release.documents.all(), - [ - "foo/bar", - reverse("community-ecosystem", host="www"), - self.entry.get_absolute_url(), - ], - ordered=False, - transform=attrgetter("path"), - ) - - def test_sync_to_db_skip_non_english(self): - """ - Releases must be English to include the blog and website results in search. - """ - non_english = DocumentRelease.objects.create(lang="es") - non_english.sync_to_db([]) - self.assertQuerySetEqual(non_english.documents.all(), []) + document = self.release.documents.get() + self.assertEqual(document.path, "foo/bar") def test_clean_path(self): self.release.sync_to_db( @@ -519,7 +516,7 @@ def test_clean_path(self): } ] ) - document = self.docs_documents.get() + document = self.release.documents.get() self.assertEqual(document.path, "foo/bar") def test_title_strip_tags(self): @@ -533,7 +530,7 @@ def test_title_strip_tags(self): ] ) self.assertQuerySetEqual( - self.docs_documents.all(), + self.release.documents.all(), ["This is the title"], transform=attrgetter("title"), ) @@ -549,7 +546,7 @@ def test_title_entities(self): ] ) self.assertQuerySetEqual( - self.docs_documents, + self.release.documents.all(), ["Title & title"], transform=attrgetter("title"), ) @@ -562,7 +559,7 @@ def test_empty_documents(self): {"current_page_name": "foo/3"}, ] ) - self.assertQuerySetEqual(self.docs_documents, []) + self.assertQuerySetEqual(self.release.documents.all(), []) def test_excluded_documents(self): """ @@ -592,6 +589,172 @@ def test_excluded_documents(self): document = release.documents.get() self.assertEqual(document.path, "nonexcluded/bar") + def test_sync_to_db_not_delete_website_docs(self): + Document.objects.create( + release=self.release, + path="example_path", + title="Title 1", + metadata={ + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + ) + self.release.sync_to_db([]) + self.assertEqual(Document.objects.filter(release=self.release).count(), 1) + + def test_sync_from_sitemap_skip_non_en_dev_release(self): + release = Release.objects.create(version="5.2") + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + ) + for lang, release_obj in [ + ("fr", None), + ("fr", release), + (settings.DEFAULT_LANGUAGE_CODE, release), + ]: + doc_release = DocumentRelease.objects.create( + lang=lang, + release=release_obj, + ) + with self.subTest(lang=lang, release=release_obj): + doc_release.sync_from_sitemap() + self.assertFalse( + Document.objects.filter(path=blog_entry.get_absolute_url()).exists() + ) + + @staticmethod + def _mock_static_sitemap_urls(mocker): + sitemap = StaticViewSitemap() + for item in sitemap.items(): + url = reverse(item.name, host=item.host) + mocker.get(url, text="", headers={"Content-Type": "text/html"}) + + @requests_mock.mock() + def test_sync_from_sitemap(self, mocker): + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + ) + self._mock_static_sitemap_urls(mocker) + 
mocker.get( + blog_entry.get_absolute_url(), + text="
<main>Main 1</main><h1>Title 1</h1>
", + headers={"Content-Type": "text/html"}, + ) + self.release.sync_from_sitemap() + + document = Document.objects.get( + release=self.release, path=blog_entry.get_absolute_url() + ) + self.assertEqual( + document.title, + "Title 1", + ) + self.assertEqual( + document.metadata, + { + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + ) + + @mock.patch("docs.search.fetch_html") + def test_sync_from_sitemap_only_requests_non_existing(self, mock_fetch_html): + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + ) + Document.objects.create( + release=self.release, + metadata={"parents": DocumentationCategory.WEBSITE}, + path=blog_entry.get_absolute_url(), + ) + self.release.sync_from_sitemap() + + mock_fetch_html.assert_not_called() + + document = Document.objects.get( + release=self.release, path=blog_entry.get_absolute_url() + ) + # Confirm Document has not been updated. + self.assertEqual( + document.metadata, + {"parents": DocumentationCategory.WEBSITE}, + ) + + @requests_mock.mock() + def test_sync_from_sitemap_force(self, mocker): + Document.objects.create( + release=self.release, + metadata={"parents": DocumentationCategory.WEBSITE}, + path="some_path", + ) + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + ) + blog_url = blog_entry.get_absolute_url() + Document.objects.create( + release=self.release, + metadata={"parents": DocumentationCategory.WEBSITE}, + path=blog_url, + ) + self._mock_static_sitemap_urls(mocker) + mocker.get( + blog_url, + text="
<main>Main 1</main><h1>Title 1</h1>
", + headers={"Content-Type": "text/html"}, + ) + self.release.sync_from_sitemap(force=True) + + self.assertEqual(Document.objects.count(), 1) + + document = Document.objects.get(release=self.release, path=blog_url) + # Confirm Document has been updated. + self.assertEqual( + document.path, + blog_url, + ) + self.assertEqual( + document.title, + "Title 1", + ) + self.assertEqual( + document.metadata, + { + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + ) + class DocumentUrlTests(TestCase): @classmethod @@ -625,11 +788,3 @@ def test_document_url(self): ], transform=lambda doc: doc.get_absolute_url(), ) - - def test_document_url_documentation_category_website(self): - self.release._sync_views_to_db() - document_view = self.release.documents.get() - self.assertEqual( - document_view.get_absolute_url(), - "http://www.djangoproject.localhost:8000/community/ecosystem/", - ) diff --git a/docs/tests/test_utils.py b/docs/tests/test_utils.py index 4ab650020a..f676540b08 100644 --- a/docs/tests/test_utils.py +++ b/docs/tests/test_utils.py @@ -3,7 +3,7 @@ from django.test import SimpleTestCase -from ..utils import get_doc_path, sanitize_for_trigram +from ..utils import extract_inner_html, get_doc_path, sanitize_for_trigram class TestUtils(SimpleTestCase): @@ -38,3 +38,39 @@ def test_sanitize_for_trigram(self): ]: with self.subTest(query=query): self.assertEqual(sanitize_for_trigram(query), sanitized_query) + + def test_extract_inner_html(self): + for html, expected_output in [ + ("
<main><p>Hello</p></main>", "<p>Hello</p>"),
+ ( + '<div class="header">Test</div>' + "<main><h1>Title</h1></main>", + "<h1>Title</h1>", + ),
+ ("<main>&amp; &lt; &gt; &copy;</main>", "& < > ©"),
+ ("<main></main>", ""),
+ ("<main>Hello world</main>", "Hello world"),
+ ("<main><p>Hi</p>Text<p>Bye</p></main>", "<p>Hi</p>Text<p>Bye</p>
"), + ]: + with self.subTest(html=html): + self.assertEqual(extract_inner_html(html, tag="main"), expected_output) + + def test_extract_inner_html_multiple_same_tags_raises(self): + with self.assertRaisesMessage( + ValueError, "
<main> occurs more than once in HTML." + ): + extract_inner_html( + "<main>One main</main><main>Two main</main>", tag="main" + ) + + def test_extract_inner_html_multiple_same_tags_nested_raises(self): + with self.assertRaisesMessage( + ValueError, "Nested <main>
tags are not allowed." + ): + extract_inner_html( + "<main>One main<main>Two main</main></main>", tag="main" + ) + + def test_extract_inner_html_tag_not_found_raises(self): + with self.assertRaisesMessage(ValueError, "<main> not found in HTML."): + extract_inner_html("<div><p>Test</p></div>", tag="main")
diff --git a/docs/utils.py index 500104df9c..b031867dcb 100644 --- a/docs/utils.py +++ b/docs/utils.py @@ -1,5 +1,6 @@ import re import unicodedata +from html.parser import HTMLParser from django.conf import settings from django.http import Http404 @@ -92,3 +93,59 @@ def get_module_path(name, full_path): if full_path.endswith(name_suffix): return full_path.removesuffix(name_suffix) return None + + +class SingleTagInnerHTMLExtractor(HTMLParser): + def __init__(self, target_tag): + super().__init__() + self.target_tag = target_tag.lower() + self.capturing = False + self.inner_html = [] + self.tag_count = 0 + + def handle_starttag(self, tag, attrs): + tag = tag.lower() + if tag == self.target_tag: + self.tag_count += 1 + if self.capturing: + # Nested target tag not allowed. + raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.") + self.capturing = True + elif self.capturing: + self.inner_html.append(self.get_starttag_text()) + + def handle_endtag(self, tag): + tag = tag.lower() + if self.capturing: + if tag == self.target_tag: + self.capturing = False + else: + self.inner_html.append(f"</{tag}>") + + def handle_data(self, data): + if self.capturing: + self.inner_html.append(data) + + def handle_entityref(self, name): + if self.capturing: + self.inner_html.append(f"&{name};") + + def handle_charref(self, name): + if self.capturing: + self.inner_html.append(f"&#{name};") + + +def extract_inner_html(html, tag): + """ + Extracts the inner HTML of a tag that appears exactly once. + """ + parser = SingleTagInnerHTMLExtractor(tag) + parser.feed(html) + parser.close() + + if parser.tag_count == 0: + raise ValueError(f"<{tag}> not found in HTML.") + if parser.tag_count > 1: + raise ValueError(f"<{tag}> occurs more than once in HTML.") + + return "".join(parser.inner_html)
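
The new fetch_html() (docs/search.py) and extract_inner_html() (docs/utils.py) helpers compose as follows; this is a minimal sketch, not part of the patch, assuming only that this repository and Django are importable. The index_page() wrapper name is hypothetical and for illustration, mirroring what DocumentRelease.sync_from_sitemap() does for each sitemap URL.

    import requests

    from docs.search import fetch_html
    from docs.utils import extract_inner_html


    def index_page(url):
        # Hypothetical helper: return (title, body_html) for a page, or None if
        # the page cannot be indexed, following the same skip rules as
        # DocumentRelease.sync_from_sitemap().
        try:
            # fetch_html() raises requests.RequestException for non-200
            # responses or non-HTML content types.
            page_html = fetch_html(url)
        except requests.RequestException:
            return None
        try:
            # extract_inner_html() raises ValueError when the tag is missing,
            # duplicated, or nested.
            body_html = extract_inner_html(page_html, tag="main")
            title = extract_inner_html(page_html, tag="h1")
        except ValueError:
            return None
        return title, body_html

sync_from_sitemap() stores each extracted pair as a Document whose metadata["parents"] is DocumentationCategory.WEBSITE on the dev release it runs for; sync_to_db() now excludes those documents from its delete-and-reinsert pass, and DocumentManager.search() folds the English dev release's website documents into searches on stable English releases.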