diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index 03aff25ce6117a..c1d056a5447edf 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -15,14 +15,18 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
Create a parser instance able to parse invalid markup.
- If *convert_charrefs* is ``True`` (the default), all character
- references (except the ones in ``script``/``style`` elements) are
+ If *convert_charrefs* is true (the default), all character
+ references (except the ones in elements like ``script`` and ``style``) are
automatically converted to the corresponding Unicode characters.
+ If *scripting* is false (the default), the content of the ``noscript``
+ element is parsed normally; if it's true, it's returned as is without
+ being parsed.
+
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.
+ .. versionchanged:: 3.10.20
+ Added the *scripting* parameter.
+
Example HTML Parser Application
-------------------------------
@@ -159,15 +166,15 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
.. method:: HTMLParser.handle_data(data)
This method is called to process arbitrary data (e.g. text nodes and the
- content of ``<script>...</script>`` and ``<style>...</style>``).
+ content of elements like ``script`` and ``style``).
.. method:: HTMLParser.handle_entityref(name)
This method is called to process a named character reference of the form
``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
- (e.g. ``'gt'``). This method is never called if *convert_charrefs* is
- ``True``.
+ (e.g. ``'gt'``).
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_charref(name)
@@ -175,8 +182,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
This method is called to process decimal and hexadecimal numeric character
references of the form ``&#NNN;`` and ``&#xNNN;``. For example, the decimal
equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
- in this case the method will receive ``'62'`` or ``'x3E'``. This method
- is never called if *convert_charrefs* is ``True``.
+ in this case the method will receive ``'62'`` or ``'x3E'``.
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_comment(data)
@@ -284,8 +291,8 @@ Parsing an element with a few attributes and a title::
Data : Python
End tag : h1
-The content of ``script`` and ``style`` elements is returned as is, without
-further parsing::
+The content of elements like ``script`` and ``style`` is returned as is,
+without further parsing::
>>> parser.feed('')
Start tag: style
@@ -294,10 +301,10 @@ further parsing::
End tag : style
>>> parser.feed('')
+ ... 'alert("hello! ☺");')
Start tag: script
attr: ('type', 'text/javascript')
- Data : alert("hello!");
+ Data : alert("hello! ☺");
End tag : script
Parsing comments::
@@ -317,7 +324,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
:meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``)::
+if *convert_charrefs* is false::
>>> for chunk in ['buff', 'ered ', 'text']:
... parser.feed(chunk)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 8724c22f8ff289..62134d376e1654 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+ # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
- def __init__(self, *, convert_charrefs=True):
+ def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
- If convert_charrefs is True (the default), all character references
+ If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
+
+ If *scripting* is false (the default), the content of the
+ ``noscript`` element is parsed normally; if it's true,
+ it's returned as is without being parsed.
"""
self.convert_charrefs = convert_charrefs
+ self.scripting = scripting
self.reset()
def reset(self):
@@ -153,7 +161,9 @@ def get_starttag_text(self):
def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
self._escapable = escapable
- if escapable and not self.convert_charrefs:
+ if self.cdata_elem == 'plaintext':
+ self.interesting = re.compile(r'\Z')
+ elif escapable and not self.convert_charrefs:
self.interesting = re.compile(r'&|%s(?=[\t\n\r\f />])' % self.cdata_elem,
re.IGNORECASE|re.ASCII)
else:
@@ -441,8 +451,10 @@ def parse_starttag(self, i):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
+ if (tag in self.CDATA_CONTENT_ELEMENTS or
+ (self.scripting and tag == "noscript") or
+ tag == "plaintext"):
+ self.set_cdata_mode(tag, escapable=False)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index a7be7a6e20224a..1c1be3ff476886 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -7,6 +7,18 @@
from test import support
+SAMPLE_RCDATA = (
+ ''
+ ""
+ ''
+ ''
+ ''
+ '\u2603'
+)
+
+SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&☺'
+
+
class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, autocdata=False, **kw):
@@ -292,30 +304,20 @@ def test_get_starttag_text(self):
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
'\n\n',
'',
- 'foo = " script>"',
- 'foo = ""',
- 'foo = ""',
- 'foo = ""',
- 'foo = "ſcript>"',
- 'foo = ""',
])
def test_script_content(self, content):
s = f''
- self._run_check(s, [("starttag", "script", []),
- ("data", content),
- ("endtag", "script")])
+ self._run_check(s, [
+ ("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script"),
+ ])
@support.subTests('content', [
'a::before { content: ""; }',
'a::before { content: "¬-an-entity-ref;"; }',
'a::before { content: ""; }',
'a::before { content: "\u2603"; }',
- 'a::before { content: "< /style>"; }',
- 'a::before { content: " style>"; }',
- 'a::before { content: ""; }',
- 'a::before { content: ""; }',
- 'a::before { content: ""; }',
- 'a::before { content: "ſtyle>"; }',
])
def test_style_content(self, content):
s = f''
@@ -323,47 +325,59 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])
- @support.subTests('content', [
- '',
- "",
- '',
- '',
- '',
- '\u2603',
- '< /title>',
- ' title>',
- '',
- '',
- '',
- '',
+ @support.subTests('tag', ['title', 'textarea'])
+ def test_rcdata_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RCDATA}{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", SAMPLE_RCDATA),
+ ("endtag", tag),
])
- def test_title_content(self, content):
- source = f"{content}"
+ source = f"<{tag}>&{tag}>"
self._run_check(source, [
- ("starttag", "title", []),
- ("data", content),
- ("endtag", "title"),
+ ("starttag", tag, []),
+ ('entityref', 'amp'),
+ ("endtag", tag),
])
- @support.subTests('content', [
- '',
- "",
- '',
- '',
- '',
- '\u2603',
- '< /textarea>',
- ' textarea>',
- '',
- '',
- '',
+ @support.subTests('tag',
+ ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+ def test_rawtext_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RAWTEXT}{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", tag),
+ ])
+
+ def test_noscript_content(self):
+ source = f""
+ # scripting=False -- normal mode
+ self._run_check(source, [
+ ('starttag', 'noscript', []),
+ ('comment', ' not a comment '),
+ ('starttag', 'not', [('a', 'start tag')]),
+ ('unknown decl', 'CDATA[not a cdata'),
+ ('comment', 'not a bogus comment'),
+ ('endtag', 'not'),
+ ('data', '☃'),
+ ('entityref', 'amp'),
+ ('charref', '9786'),
+ ('endtag', 'noscript'),
])
- def test_textarea_content(self, content):
- source = f""
+ # scripting=True -- RAWTEXT mode
+ self._run_check(source, [
+ ("starttag", "noscript", []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", "noscript"),
+ ], collector=EventCollector(scripting=True))
+
+ def test_plaintext_content(self):
+ content = SAMPLE_RAWTEXT + '' # not closing
+ source = f"{content}"
self._run_check(source, [
- ("starttag", "textarea", []),
+ ("starttag", "plaintext", []),
("data", content),
- ("endtag", "textarea"),
])
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
@@ -380,52 +394,65 @@ def test_script_closing_tag(self, endtag):
("endtag", "script")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
- @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
- 'style/', 'style foo=bar', 'style foo=">"'])
- def test_style_closing_tag(self, endtag):
- content = """
- b::before { content: ""; }
- p::before { content: "¬-an-entity-ref;"; }
- a::before { content: ""; }
- a::after { content: ""; }
- """
- s = f'