diff --git a/djangoproject/scss/_style.scss b/djangoproject/scss/_style.scss
index 08a9a834fd..82d5f4131c 100644
--- a/djangoproject/scss/_style.scss
+++ b/djangoproject/scss/_style.scss
@@ -2650,7 +2650,7 @@ search.filters {
position: relative;
a {
- padding: 10px 20px;
+ padding: 10px 15px;
text-decoration: none;
border-bottom: 3px solid transparent;
transition: color 0.3s ease, border-bottom 0.3s ease;
diff --git a/djangoproject/sitemaps.py b/djangoproject/sitemaps.py
new file mode 100644
index 0000000000..3c7becbf3a
--- /dev/null
+++ b/djangoproject/sitemaps.py
@@ -0,0 +1,76 @@
+from dataclasses import dataclass
+
+from django.contrib import sitemaps
+from django_hosts.resolvers import reverse
+
+
+@dataclass
+class URLObject:
+ name: str
+ host: str = "www"
+
+
+class LocationAbsoluteUrlMixin:
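+    # Sitemap classes using this mixin are expected to return fully qualified
+    # URLs from location() (e.g. built with django_hosts' reverse()), so the
+    # scheme/domain prefixing that Sitemap.get_urls() normally performs is
+    # bypassed here.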
+ def get_urls(self, site=None, **kwargs):
+ """
+ Prevent the Django sitemap framework from prefixing the domain.
+ Use the absolute URL returned by location().
+ """
+ urls = []
+ for item in self.items():
+ loc = self.location(item)
+ urls.append(
+ {
+ "location": loc,
+ "lastmod": None,
+ "changefreq": self.changefreq,
+ "priority": self.priority,
+ }
+ )
+ return urls
+
+
+class StaticViewSitemap(LocationAbsoluteUrlMixin, sitemaps.Sitemap):
+ priority = 0.5
+ changefreq = "monthly"
+
+ def items(self):
+ return [
+ # accounts
+ URLObject("registration_register"),
+ # aggregator
+ URLObject("community-index"),
+ URLObject("community-ecosystem"),
+ URLObject("local-django-communities"),
+ # contact
+ URLObject("contact_foundation"),
+ # dashboard
+ URLObject("dashboard-index", host="dashboard"),
+ URLObject("metric-list", host="dashboard"),
+ # djangoproject
+ URLObject("homepage"),
+ URLObject("overview"),
+ URLObject("start"),
+ URLObject("code_of_conduct"),
+ URLObject("conduct_faq"),
+ URLObject("conduct_reporting"),
+ URLObject("conduct_enforcement"),
+ URLObject("conduct_changes"),
+ URLObject("diversity"),
+ URLObject("diversity_changes"),
+ # foundation
+ URLObject("foundation_meeting_archive_index"),
+ # fundraising
+ URLObject("fundraising:index"),
+ # members
+ URLObject("members:individual-members"),
+ URLObject("members:corporate-members"),
+ URLObject("members:corporate-members-join"),
+ URLObject("members:corporate-members-badges"),
+ URLObject("members:teams"),
+ # releases
+ URLObject("download"),
+ ]
+
+ def location(self, item):
+ return reverse(item.name, host=item.host)
diff --git a/djangoproject/tests.py b/djangoproject/tests.py
index 2c67432110..5e8355601b 100644
--- a/djangoproject/tests.py
+++ b/djangoproject/tests.py
@@ -202,6 +202,7 @@ def test_single_h1_per_page(self):
"styleguide/", # Has multiple
examples.
"admin/", # Admin templates are out of our control.
"reset/done/", # Uses an admin template.
+ "sitemap.xml",
]
resolver = get_resolver()
urls = self.extract_patterns(resolver.url_patterns)
@@ -211,3 +212,9 @@ def test_single_h1_per_page(self):
response = self.client.get(url)
self.assertEqual(response.status_code, 200)
                 self.assertContains(response, "<h1", count=1)
diff --git a/docs/management/commands/update_docs.py b/docs/management/commands/update_docs.py
--- a/docs/management/commands/update_docs.py
+++ b/docs/management/commands/update_docs.py
             if verbosity >= 1:
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
+ release.sync_from_sitemap(force=force)
+
# checkout_dir is shared for all languages.
checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version)
parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(
diff --git a/docs/models.py b/docs/models.py
index d690c76bfc..9b95e462e2 100644
--- a/docs/models.py
+++ b/docs/models.py
@@ -5,6 +5,7 @@
from functools import partial, reduce
from pathlib import Path
+import requests
from django.conf import settings
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import (
@@ -26,19 +27,19 @@
from django.utils.html import strip_tags
from django_hosts.resolvers import reverse
-from blog.models import Entry
from releases.models import Release
from . import utils
from .search import (
DEFAULT_TEXT_SEARCH_CONFIG,
- SEARCHABLE_VIEWS,
START_SEL,
STOP_SEL,
TSEARCH_CONFIG_LANGUAGES,
DocumentationCategory,
+ fetch_html,
get_document_search_vector,
)
+from .utils import extract_inner_html
def get_search_config(lang):
@@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents):
the database. Deletes all the release's documents first then
reinserts them as needed.
"""
- self.documents.all().delete()
+ self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
# Read excluded paths from robots.docs.txt.
robots_path = settings.BASE_DIR.joinpath(
@@ -218,65 +219,51 @@ def sync_to_db(self, decoded_documents):
metadata=document,
config=get_search_config(self.lang),
)
- for document in self.documents.all():
+ for document in self.documents.exclude(
+ metadata__parents=DocumentationCategory.WEBSITE
+ ):
document.metadata["breadcrumbs"] = list(
Document.objects.breadcrumbs(document).values("title", "path")
)
document.save(update_fields=("metadata",))
- self._sync_blog_to_db()
- self._sync_views_to_db()
+ def sync_from_sitemap(self, force=False):
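+        """
+        Create website search Documents (parents=WEBSITE) for the pages
+        listed in the www sitemaps. Each page is fetched over HTTP and its
+        <main> contents and <h1> title are extracted. URLs that already have
+        a Document are skipped; with force=True, existing website Documents
+        are deleted and re-fetched. Only runs for the dev release.
+        """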
+ from djangoproject.urls.www import sitemaps
- def _sync_blog_to_db(self):
- """
- Sync the blog entries into search based on the release documents
- support end date.
- """
- if self.lang != "en":
- return # The blog is only written in English currently
+ if not self.is_dev:
+ return
- entries = Entry.objects.published().searchable()
- Document.objects.bulk_create(
- [
- Document(
- release=self,
- path=entry.get_absolute_url(),
- title=entry.headline,
- metadata={
- "body": entry.body_html,
- "breadcrumbs": [
- {
- "path": DocumentationCategory.WEBSITE,
- "title": "News",
- },
- ],
- "parents": DocumentationCategory.WEBSITE,
- "slug": entry.slug,
- "title": entry.headline,
- "toc": "",
- },
- config=get_search_config(self.lang),
- )
- for entry in entries
- ]
- )
+ if force:
+ Document.objects.filter(
+ metadata__parents=DocumentationCategory.WEBSITE
+ ).delete()
- def _sync_views_to_db(self):
- """
- Sync the specific views into search based on the release documents
- support end date.
- """
- if self.lang != "en":
- return # The searchable views are only written in English currently
+ doc_urls = set(
+ Document.objects.filter(
+ metadata__parents=DocumentationCategory.WEBSITE
+ ).values_list("path", flat=True)
+ )
- Document.objects.bulk_create(
- [
- Document(
+ for sitemap in sitemaps.values():
+ for url in sitemap().get_urls():
+ path = url["location"]
+ if path in doc_urls:
+ continue
+ try:
+ page_html = fetch_html(path)
+ except requests.RequestException:
+ continue
+ try:
+ main_html = extract_inner_html(page_html, tag="main")
+ title = extract_inner_html(page_html, tag="h1")
+ except ValueError:
+ continue
+ Document.objects.create(
release=self,
- path=searchable_view.www_absolute_url,
- title=searchable_view.page_title,
+ path=path,
+ title=title,
metadata={
- "body": searchable_view.html,
+ "body": main_html,
"breadcrumbs": [
{
"path": DocumentationCategory.WEBSITE,
@@ -284,15 +271,11 @@ def _sync_views_to_db(self):
},
],
"parents": DocumentationCategory.WEBSITE,
- "slug": searchable_view.url_name,
- "title": searchable_view.page_title,
+ "title": title,
"toc": "",
},
config=get_search_config(self.lang),
)
- for searchable_view in SEARCHABLE_VIEWS
- ]
- )
def _clean_document_path(path):
@@ -353,6 +336,14 @@ def search(self, query_text, release, document_category=None):
config=models.F("config"),
)
base_filter = Q(release_id=release.id)
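+        # Website pages are indexed only under the dev release (see
+        # DocumentRelease.sync_from_sitemap), so include them when searching
+        # a stable English release as well.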
+ if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev:
+ dev_release = DocumentRelease.objects.get_by_version_and_lang(
+ version="dev", lang=settings.DEFAULT_LANGUAGE_CODE
+ )
+ base_filter |= Q(
+ release_id=dev_release.id,
+ metadata__parents=DocumentationCategory.WEBSITE,
+ )
if document_category:
base_filter &= Q(metadata__parents__startswith=document_category)
base_qs = (
diff --git a/docs/search.py b/docs/search.py
index 0b7eaef1d3..d6d1ec9959 100644
--- a/docs/search.py
+++ b/docs/search.py
@@ -1,11 +1,8 @@
-from dataclasses import dataclass
-
+import requests
from django.contrib.postgres.search import SearchVector
from django.db.models import TextChoices
from django.db.models.fields.json import KeyTextTransform
-from django.template.loader import get_template
from django.utils.translation import gettext_lazy as _
-from django_hosts import reverse
# Imported from
# https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659
@@ -81,25 +78,31 @@ def parse(cls, value, default=None):
return None
-@dataclass
-class SearchableView:
- page_title: str
- url_name: str
- template: str
-
- @property
- def html(self):
- return get_template(self.template).render()
-
- @property
- def www_absolute_url(self):
- return reverse(self.url_name, host="www")
-
+def fetch_html(url, timeout=10):
+ """
+    Fetch the HTML of a page if the response status code is 200.
+    Sends browser-like headers and accepts only text/html responses.
+ """
-SEARCHABLE_VIEWS = [
- SearchableView(
- page_title="Django's Ecosystem",
- url_name="community-ecosystem",
- template="aggregator/ecosystem.html",
- ),
-]
+ headers = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
+ ),
+ "Accept": "text/html",
+ "Accept-Language": "en-US,en;q=0.9",
+ }
+
+ response = requests.get(url, headers=headers, timeout=timeout)
+
+ if response.status_code == 200:
+ content_type = response.headers.get("Content-Type", "")
+ if "text/html" in content_type:
+ return response.text
+ else:
+ raise requests.RequestException(f"Unexpected Content-Type: {content_type}")
+ else:
+ raise requests.RequestException(
+ f"Failed to fetch {url}, status code: {response.status_code}"
+ )
diff --git a/docs/sitemaps.py b/docs/sitemaps.py
index 6f520aa753..c46ec0933b 100644
--- a/docs/sitemaps.py
+++ b/docs/sitemaps.py
@@ -1,10 +1,12 @@
from django.contrib.sitemaps import Sitemap
+from djangoproject.sitemaps import LocationAbsoluteUrlMixin
+
from .models import Document
from .search import DocumentationCategory
-class DocsSitemap(Sitemap):
+class DocsSitemap(LocationAbsoluteUrlMixin, Sitemap):
def __init__(self, lang):
self.lang = lang
@@ -16,6 +18,9 @@ def items(self):
.select_related("release__release")
)
+ def location(self, item):
+ return item.get_absolute_url()
+
def changefreq(self, obj):
return "daily"
@@ -33,19 +38,3 @@ def priority(self, obj):
return 1
else:
return 0.1
-
- def _urls(self, page, site, protocol):
- # XXX: To workaround bad interaction between contrib.sitemaps and
- # django-hosts (scheme/domain would be repeated twice in URLs)
- urls = []
- for item in self.paginator.page(page).object_list:
- loc = item.get_absolute_url()
- priority = self.priority(item)
- url_info = {
- "item": item,
- "location": loc,
- "changefreq": self.changefreq(item),
- "priority": str(priority if priority is not None else ""),
- }
- urls.append(url_info)
- return urls
diff --git a/docs/templates/docs/search_results.html b/docs/templates/docs/search_results.html
index 3001213db8..8d42d30e53 100644
--- a/docs/templates/docs/search_results.html
+++ b/docs/templates/docs/search_results.html
@@ -1,7 +1,8 @@
{% extends "docs/doc.html" %}
{% load i18n docs %}
-{% block title %}{% translate "Search | Django documentation" %}{% endblock %}
+{% block title %}{% translate "Search" %}{% endblock %}
+{% block header %}{% endblock %}
{% block toc-wrapper %}{% endblock %}
{% block breadcrumbs-wrapper %}{% endblock %}
diff --git a/docs/tests/test_models.py b/docs/tests/test_models.py
index 4724cabeaa..d8ab638b44 100644
--- a/docs/tests/test_models.py
+++ b/docs/tests/test_models.py
@@ -1,13 +1,16 @@
import datetime
from operator import attrgetter
+from unittest import mock
+import requests_mock
from django.conf import settings
from django.db import connection
from django.test import TestCase
from django.utils import timezone
from django_hosts import reverse
-from blog.models import Entry
+from blog.models import ContentFormat, Entry
+from djangoproject.sitemaps import StaticViewSitemap
from releases.models import Release
from ..models import Document, DocumentRelease
@@ -184,6 +187,9 @@ def test_get_available_languages_by_version(self):
class DocumentManagerTest(TestCase):
@classmethod
def setUpTestData(cls):
+ cls.dev_release = DocumentRelease.objects.create(
+ lang=settings.DEFAULT_LANGUAGE_CODE
+ )
cls.release = DocumentRelease.objects.create(
release=Release.objects.create(version="1.2.3"),
)
@@ -358,6 +364,20 @@ def setUpTestData(cls):
"release": cls.release_fr,
"title": "Notes de publication de Django 1.9.4",
},
+ {
+ "metadata": {
+ "body": "Main 1",
+ "breadcrumbs": [
+ {"path": DocumentationCategory.WEBSITE, "title": "Website"}
+ ],
+ "parents": DocumentationCategory.WEBSITE,
+ "title": "Title 1",
+ "toc": "",
+ },
+ "path": "example",
+ "release": cls.dev_release,
+ "title": "Blog post",
+ },
]
Document.objects.bulk_create(Document(**doc) for doc in documents)
@@ -457,28 +477,21 @@ def test_search_title(self):
),
)
+ def test_website_document_items_included_english(self):
+ self.assertQuerySetEqual(
+ Document.objects.search("Main", self.release),
+ ["Blog post"],
+ transform=attrgetter("title"),
+ )
+
+ def test_website_document_items_excluded_non_english(self):
+ self.assertEqual(Document.objects.search("Main", self.release_fr).count(), 0)
+
class UpdateDocTests(TestCase):
@classmethod
def setUpTestData(cls):
- now = timezone.now()
- cls.release = DocumentRelease.objects.create(
- release=Release.objects.create(
- version="1.0.0",
- eol_date=now + datetime.timedelta(days=1),
- )
- )
- cls.entry = Entry.objects.create(
- pub_date=now,
- is_active=True,
- is_searchable=True,
- headline="Searchable post",
- slug="a",
-            body_html="<p>Searchable Blog Post</p>",
- )
- cls.docs_documents = cls.release.documents.exclude(
- metadata__parents=DocumentationCategory.WEBSITE
- )
+ cls.release = DocumentRelease.objects.create()
def test_sync_to_db(self):
self.release.sync_to_db(
@@ -490,24 +503,8 @@ def test_sync_to_db(self):
}
]
)
- self.assertQuerySetEqual(
- self.release.documents.all(),
- [
- "foo/bar",
- reverse("community-ecosystem", host="www"),
- self.entry.get_absolute_url(),
- ],
- ordered=False,
- transform=attrgetter("path"),
- )
-
- def test_sync_to_db_skip_non_english(self):
- """
- Releases must be English to include the blog and website results in search.
- """
- non_english = DocumentRelease.objects.create(lang="es")
- non_english.sync_to_db([])
- self.assertQuerySetEqual(non_english.documents.all(), [])
+ document = self.release.documents.get()
+ self.assertEqual(document.path, "foo/bar")
def test_clean_path(self):
self.release.sync_to_db(
@@ -519,7 +516,7 @@ def test_clean_path(self):
}
]
)
- document = self.docs_documents.get()
+ document = self.release.documents.get()
self.assertEqual(document.path, "foo/bar")
def test_title_strip_tags(self):
@@ -533,7 +530,7 @@ def test_title_strip_tags(self):
]
)
self.assertQuerySetEqual(
- self.docs_documents.all(),
+ self.release.documents.all(),
["This is the title"],
transform=attrgetter("title"),
)
@@ -549,7 +546,7 @@ def test_title_entities(self):
]
)
self.assertQuerySetEqual(
- self.docs_documents,
+ self.release.documents.all(),
["Title & title"],
transform=attrgetter("title"),
)
@@ -562,7 +559,7 @@ def test_empty_documents(self):
{"current_page_name": "foo/3"},
]
)
- self.assertQuerySetEqual(self.docs_documents, [])
+ self.assertQuerySetEqual(self.release.documents.all(), [])
def test_excluded_documents(self):
"""
@@ -592,6 +589,172 @@ def test_excluded_documents(self):
document = release.documents.get()
self.assertEqual(document.path, "nonexcluded/bar")
+ def test_sync_to_db_not_delete_website_docs(self):
+ Document.objects.create(
+ release=self.release,
+ path="example_path",
+ title="Title 1",
+ metadata={
+ "body": "Main 1",
+ "breadcrumbs": [
+ {"path": DocumentationCategory.WEBSITE, "title": "Website"}
+ ],
+ "parents": DocumentationCategory.WEBSITE,
+ "title": "Title 1",
+ "toc": "",
+ },
+ )
+ self.release.sync_to_db([])
+ self.assertEqual(Document.objects.filter(release=self.release).count(), 1)
+
+ def test_sync_from_sitemap_skip_non_en_dev_release(self):
+ release = Release.objects.create(version="5.2")
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="test",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ for lang, release_obj in [
+ ("fr", None),
+ ("fr", release),
+ (settings.DEFAULT_LANGUAGE_CODE, release),
+ ]:
+ doc_release = DocumentRelease.objects.create(
+ lang=lang,
+ release=release_obj,
+ )
+ with self.subTest(lang=lang, release=release_obj):
+ doc_release.sync_from_sitemap()
+ self.assertFalse(
+ Document.objects.filter(path=blog_entry.get_absolute_url()).exists()
+ )
+
+ @staticmethod
+ def _mock_static_sitemap_urls(mocker):
+ sitemap = StaticViewSitemap()
+ for item in sitemap.items():
+ url = reverse(item.name, host=item.host)
+ mocker.get(url, text="", headers={"Content-Type": "text/html"})
+
+ @requests_mock.mock()
+ def test_sync_from_sitemap(self, mocker):
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="test",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ self._mock_static_sitemap_urls(mocker)
+ mocker.get(
+ blog_entry.get_absolute_url(),
+            text="<main>Main 1</main><h1>Title 1</h1>",
+ headers={"Content-Type": "text/html"},
+ )
+ self.release.sync_from_sitemap()
+
+ document = Document.objects.get(
+ release=self.release, path=blog_entry.get_absolute_url()
+ )
+ self.assertEqual(
+ document.title,
+ "Title 1",
+ )
+ self.assertEqual(
+ document.metadata,
+ {
+ "body": "Main 1",
+ "breadcrumbs": [
+ {"path": DocumentationCategory.WEBSITE, "title": "Website"}
+ ],
+ "parents": DocumentationCategory.WEBSITE,
+ "title": "Title 1",
+ "toc": "",
+ },
+ )
+
+ @mock.patch("docs.search.fetch_html")
+ def test_sync_from_sitemap_only_requests_non_existing(self, mock_fetch_html):
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="test",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ Document.objects.create(
+ release=self.release,
+ metadata={"parents": DocumentationCategory.WEBSITE},
+ path=blog_entry.get_absolute_url(),
+ )
+ self.release.sync_from_sitemap()
+
+ mock_fetch_html.assert_not_called()
+
+ document = Document.objects.get(
+ release=self.release, path=blog_entry.get_absolute_url()
+ )
+ # Confirm Document has not been updated.
+ self.assertEqual(
+ document.metadata,
+ {"parents": DocumentationCategory.WEBSITE},
+ )
+
+ @requests_mock.mock()
+ def test_sync_from_sitemap_force(self, mocker):
+ Document.objects.create(
+ release=self.release,
+ metadata={"parents": DocumentationCategory.WEBSITE},
+ path="some_path",
+ )
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="test",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ blog_url = blog_entry.get_absolute_url()
+ Document.objects.create(
+ release=self.release,
+ metadata={"parents": DocumentationCategory.WEBSITE},
+ path=blog_url,
+ )
+ self._mock_static_sitemap_urls(mocker)
+ mocker.get(
+ blog_url,
+            text="<main>Main 1</main><h1>Title 1</h1>",
+ headers={"Content-Type": "text/html"},
+ )
+ self.release.sync_from_sitemap(force=True)
+
+ self.assertEqual(Document.objects.count(), 1)
+
+ document = Document.objects.get(release=self.release, path=blog_url)
+ # Confirm Document has been updated.
+ self.assertEqual(
+ document.path,
+ blog_url,
+ )
+ self.assertEqual(
+ document.title,
+ "Title 1",
+ )
+ self.assertEqual(
+ document.metadata,
+ {
+ "body": "Main 1",
+ "breadcrumbs": [
+ {"path": DocumentationCategory.WEBSITE, "title": "Website"}
+ ],
+ "parents": DocumentationCategory.WEBSITE,
+ "title": "Title 1",
+ "toc": "",
+ },
+ )
+
class DocumentUrlTests(TestCase):
@classmethod
@@ -625,11 +788,3 @@ def test_document_url(self):
],
transform=lambda doc: doc.get_absolute_url(),
)
-
- def test_document_url_documentation_category_website(self):
- self.release._sync_views_to_db()
- document_view = self.release.documents.get()
- self.assertEqual(
- document_view.get_absolute_url(),
- "http://www.djangoproject.localhost:8000/community/ecosystem/",
- )
diff --git a/docs/tests/test_utils.py b/docs/tests/test_utils.py
index 4ab650020a..f676540b08 100644
--- a/docs/tests/test_utils.py
+++ b/docs/tests/test_utils.py
@@ -3,7 +3,7 @@
from django.test import SimpleTestCase
-from ..utils import get_doc_path, sanitize_for_trigram
+from ..utils import extract_inner_html, get_doc_path, sanitize_for_trigram
class TestUtils(SimpleTestCase):
@@ -38,3 +38,39 @@ def test_sanitize_for_trigram(self):
]:
with self.subTest(query=query):
self.assertEqual(sanitize_for_trigram(query), sanitized_query)
+
+ def test_extract_inner_html(self):
+ for html, expected_output in [
+ ("Hello
", "
Hello
"),
+ (
+ ''
+ "Title
",
+ "Title
",
+ ),
+ ("& < > ©", "& < > ©"),
+ ("", ""),
+ ("Hello world", "Hello world"),
+ ("Hi
TextBye
", "Hi
TextBye
"),
+ ]:
+ with self.subTest(html=html):
+ self.assertEqual(extract_inner_html(html, tag="main"), expected_output)
+
+ def test_extract_inner_html_multiple_same_tags_raises(self):
+ with self.assertRaisesMessage(
+            ValueError, "<main> occurs more than once in HTML."
+ ):
+ extract_inner_html(
+                "<main>One main</main><main>Two main</main>", tag="main"
+ )
+
+ def test_extract_inner_html_multiple_same_tags_nested_raises(self):
+ with self.assertRaisesMessage(
+            ValueError, "Nested <main> tags are not allowed."
+ ):
+ extract_inner_html(
+                "<main>One main<main>Two main</main></main>", tag="main"
+ )
+
+ def test_extract_inner_html_tag_not_found_raises(self):
+        with self.assertRaisesMessage(ValueError, "<main> not found in HTML."):
+            extract_inner_html("<p>Test</p>", tag="main")
diff --git a/docs/utils.py b/docs/utils.py
index 500104df9c..b031867dcb 100644
--- a/docs/utils.py
+++ b/docs/utils.py
@@ -1,5 +1,6 @@
import re
import unicodedata
+from html.parser import HTMLParser
from django.conf import settings
from django.http import Http404
@@ -92,3 +93,59 @@ def get_module_path(name, full_path):
if full_path.endswith(name_suffix):
return full_path.removesuffix(name_suffix)
return None
+
+
+class SingleTagInnerHTMLExtractor(HTMLParser):
+ def __init__(self, target_tag):
+ super().__init__()
+ self.target_tag = target_tag.lower()
+ self.capturing = False
+ self.inner_html = []
+ self.tag_count = 0
+
+ def handle_starttag(self, tag, attrs):
+ tag = tag.lower()
+ if tag == self.target_tag:
+ self.tag_count += 1
+ if self.capturing:
+ # Nested target tag not allowed.
+ raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.")
+ self.capturing = True
+ elif self.capturing:
+ self.inner_html.append(self.get_starttag_text())
+
+ def handle_endtag(self, tag):
+ tag = tag.lower()
+ if self.capturing:
+ if tag == self.target_tag:
+ self.capturing = False
+ else:
+                self.inner_html.append(f"</{tag}>")
+
+ def handle_data(self, data):
+ if self.capturing:
+ self.inner_html.append(data)
+
+ def handle_entityref(self, name):
+ if self.capturing:
+ self.inner_html.append(f"&{name};")
+
+ def handle_charref(self, name):
+ if self.capturing:
+            self.inner_html.append(f"&#{name};")
+
+
+def extract_inner_html(html, tag):
+ """
+ Extracts the inner HTML of a tag that appears exactly once.
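+
+    Example (illustrative):
+    >>> extract_inner_html("<main><p>Hi</p></main>", tag="main")
+    '<p>Hi</p>'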
+ """
+ parser = SingleTagInnerHTMLExtractor(tag)
+ parser.feed(html)
+ parser.close()
+
+ if parser.tag_count == 0:
+ raise ValueError(f"<{tag}> not found in HTML.")
+ if parser.tag_count > 1:
+ raise ValueError(f"<{tag}> occurs more than once in HTML.")
+
+ return "".join(parser.inner_html)