EP_inspection_tool_proto/lib/python3.12/site-packages/tinyhtml5/parser.py

2645 lines
100 KiB
Python
Raw Normal View History

2026-03-05 11:53:58 +00:00
from . import inputstream
from .constants import (
ReparseError,
Token,
adjust_foreign_attributes,
adjust_mathml_attributes,
adjust_svg_attributes,
ascii_upper_to_lower,
cdata_elements,
heading_elements,
html_integration_point_elements,
mathml_text_integration_point_elements,
namespaces,
rcdata_elements,
space_characters,
special_elements,
)
from .tokenizer import HTMLTokenizer
from .treebuilder import Marker, TreeBuilder
def parse(document, namespace_html_elements=True, **kwargs):
    """Parse an HTML document into a tree.

    :param document:
        The document to parse, given as an HTML string, a filename or a
        file-like object.
    :type document:
        :class:`str`, :class:`bytes`, :class:`pathlib.Path` or
        :term:`file object`
    :param bool namespace_html_elements:
        Whether or not to namespace HTML elements.

    Extra keyword parameters are forwarded to :meth:`HTMLParser.parse`. They
    can be provided to define possible encodings if the document is given as
    :class:`bytes`.

    :param override_encoding: Forced encoding provided by user agent.
    :type override_encoding: str or bytes
    :param transport_encoding: Encoding provided by transport layer.
    :type transport_encoding: str or bytes
    :param same_origin_parent_encoding: Parent document encoding.
    :type same_origin_parent_encoding: str or bytes
    :param likely_encoding: Possible encoding provided by user agent.
    :type likely_encoding: str or bytes
    :param default_encoding: Encoding used as fallback.
    :type default_encoding: str or bytes

    :returns: :class:`xml.etree.ElementTree.Element`.

    Example:

    >>> from tinyhtml5 import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element '{http://www.w3.org/1999/xhtml}html' at ...>

    """
    html_parser = HTMLParser(namespace_html_elements)
    return html_parser.parse(document, **kwargs)
class HTMLParser:
    """HTML parser.

    Generate a tree structure from a stream of (possibly malformed) HTML.

    """

    # Phase name selected by ``reset_insertion_mode`` for each element name.
    _INSERTION_MODES = {
        "select": "in select",
        "td": "in cell",
        "th": "in cell",
        "tr": "in row",
        "tbody": "in table body",
        "thead": "in table body",
        "tfoot": "in table body",
        "caption": "in caption",
        "colgroup": "in column group",
        "table": "in table",
        "head": "in body",
        "body": "in body",
        "frameset": "in frameset",
        "html": "before head",
    }

    def __init__(self, namespace_html_elements=True):
        """Create the tree builder and one instance of each phase class.

        :param bool namespace_html_elements:
            Given as is to :class:`TreeBuilder`.

        """
        self.tree = TreeBuilder(namespace_html_elements)
        self.errors = []
        self.phases = {
            phase_name: phase_class(self, self.tree)
            for phase_name, phase_class in _phases.items()
        }

    def _parse(self, stream, container=None, scripting=False, **kwargs):
        """Tokenize ``stream`` and run the main loop over its tokens.

        ``container`` is the tag name of the fragment container (``None``
        when parsing a whole document). Extra keyword arguments are given to
        the tokenizer.

        """
        self.container = container
        self.scripting = scripting
        self.tokenizer = HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()
        try:
            self.main_loop()
        except ReparseError:
            # Start again from a clean state and run the loop a second time.
            self.reset()
            self.main_loop()

    def reset(self):
        """Put the tree and the parser state back to their starting values."""
        self.tree.reset()
        self.first_start_tag = False
        self.errors = []
        self.compatibility_mode = "no quirks"  # or "quirks" or "limited quirks"

        container = self.container
        if not container:
            self.phase = self.phases["initial"]
        else:
            # Fragment case: the container decides the tokenizer state. When
            # no branch matches, the tokenizer already is in data state.
            if container in cdata_elements:
                self.tokenizer.state = self.tokenizer.rcdata_state
            elif container in rcdata_elements:
                self.tokenizer.state = self.tokenizer.rawtext_state
            elif container == "plaintext":
                self.tokenizer.state = self.tokenizer.plaintext_state
            self.phase = self.phases["before html"]
            self.phase._insert_html_element()
            self.reset_insertion_mode()

        self.last_phase = None
        self.before_rcdata_phase = None
        self.frameset_ok = True

    @property
    def encoding(self):
        """Name of the character encoding that was used to decode the input stream.

        :obj:`None` if that is not determined yet.

        """
        if not hasattr(self, "tokenizer"):
            return None
        return self.tokenizer.stream.encoding[0].name

    def is_html_integration_point(self, element):
        """Tell whether ``element`` is an HTML integration point."""
        full_name = (element.namespace, element.name)
        if full_name != (namespaces["mathml"], "annotation-xml"):
            return full_name in html_integration_point_elements
        # MathML annotation-xml only counts with an HTML-like encoding.
        return (
            "encoding" in element.attributes and
            element.attributes["encoding"].translate(ascii_upper_to_lower) in
            ("text/html", "application/xhtml+xml"))

    def is_mathml_text_integration_point(self, element):
        """Tell whether ``element`` is a MathML text integration point."""
        return (
            (element.namespace, element.name) in
            mathml_text_integration_point_elements)

    def _uses_current_phase(self, token, token_type, node, node_namespace,
                            node_name):
        """Tell whether the current phase (not foreign content) gets the token.

        The checks are done in the same order as a single short-circuit
        ``or`` chain, so that ``node`` is never inspected when the stack of
        open elements is empty.

        NOTE: tag names are read from ``token``, the token emitted by the
        tokenizer, and not from the token currently being reprocessed.

        """
        if len(self.tree.open_elements) == 0:
            return True
        if node_namespace == self.tree.default_namespace:
            return True
        if self.is_mathml_text_integration_point(node):
            if (token_type == Token.START_TAG and
                    token["name"] not in ("mglyph", "malignmark")):
                return True
            if token_type in (Token.CHARACTERS, Token.SPACE_CHARACTERS):
                return True
        if (node_namespace == namespaces["mathml"] and
                node_name == "annotation-xml" and
                token_type == Token.START_TAG and
                token["name"] == "svg"):
            return True
        return (
            self.is_html_integration_point(node) and
            token_type in (
                Token.START_TAG, Token.CHARACTERS, Token.SPACE_CHARACTERS))

    def main_loop(self):
        """Give each token to the relevant phase, then handle end of file.

        A phase method may return a token: that token is then processed
        again, possibly by another phase, until ``None`` is returned.

        """
        for token in self.tokenizer:
            last_token = None
            pending = token
            while pending is not None:
                last_token = pending
                open_elements = self.tree.open_elements
                node = open_elements[-1] if open_elements else None
                node_namespace = node.namespace if node else None
                node_name = node.name if node else None
                token_type = pending["type"]

                if token_type == Token.PARSE_ERROR:
                    self.parse_error(
                        pending["data"], pending.get("datavars", {}))
                    pending = None
                    continue

                if self._uses_current_phase(
                        token, token_type, node, node_namespace, node_name):
                    phase = self.phase
                else:
                    phase = self.phases["in foreign content"]

                if token_type == Token.CHARACTERS:
                    pending = phase.process_characters(pending)
                elif token_type == Token.SPACE_CHARACTERS:
                    pending = phase.process_space_characters(pending)
                elif token_type == Token.START_TAG:
                    pending = phase.process_start_tag(pending)
                elif token_type == Token.END_TAG:
                    pending = phase.process_end_tag(pending)
                elif token_type == Token.COMMENT:
                    pending = phase.process_comment(pending)
                elif token_type == Token.DOCTYPE:
                    pending = phase.process_doctype(pending)

            # A self-closing flag nobody acknowledged is a parse error.
            if (token_type == Token.START_TAG and
                    last_token["selfClosing"] and
                    not last_token["selfClosingAcknowledged"]):
                self.parse_error(
                    "non-void-element-with-trailing-solidus",
                    {"name": last_token["name"]})

        # When the loop finishes it's EOF.
        visited_phases = []
        while True:
            visited_phases.append(self.phase)
            if not self.phase.process_eof():
                break
            # Reprocessing EOF must happen in a phase not visited yet.
            assert self.phase not in visited_phases

    def parse(self, stream, full_tree=False, **kwargs):
        """Parse a HTML document into a well-formed tree.

        If ``full_tree`` is ``True``, return the whole tree.

        """
        self._parse(stream, **kwargs)
        document = self.tree.get_document(full_tree)
        return document

    def parse_fragment(self, stream, container="div", **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment.

        ``container`` is the tag name of the fragment's container.

        """
        self._parse(stream, container=container, **kwargs)
        fragment = self.tree.get_fragment()
        return fragment

    def parse_error(self, errorcode, datavars=None):
        """Store an error as a ``(position, errorcode, datavars)`` tuple."""
        details = {} if datavars is None else datavars
        self.errors.append(
            (self.tokenizer.stream.position(), errorcode, details))

    def adjust_mathml_attributes(self, token):
        """Adjust the attributes of ``token`` with the MathML table."""
        adjust_attributes(token, adjust_mathml_attributes)

    def adjust_svg_attributes(self, token):
        """Adjust the attributes of ``token`` with the SVG table."""
        adjust_attributes(token, adjust_svg_attributes)

    def adjust_foreign_attributes(self, token):
        """Adjust the attributes of ``token`` with the foreign table."""
        adjust_attributes(token, adjust_foreign_attributes)

    def reset_insertion_mode(self):
        """Choose the current phase from the stack of open elements."""
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        is_last = False
        for node in reversed(self.tree.open_elements):
            name = node.name
            new_phase = None
            if node == self.tree.open_elements[0]:
                # Bottom of the stack: the container name is used instead.
                assert self.container
                is_last = True
                name = self.container
            # Check for conditions that should only happen in the fragment case.
            if name in ("select", "colgroup", "head", "html"):
                assert self.container
            if not is_last and node.namespace != self.tree.default_namespace:
                continue
            if name in self._INSERTION_MODES:
                new_phase = self.phases[self._INSERTION_MODES[name]]
                break
            elif is_last:
                new_phase = self.phases["in body"]
                break
        self.phase = new_phase

    def parse_rcdata_rawtext(self, token, content_type):
        """Insert ``token`` and switch to the "text" phase.

        This is the generic RCDATA/RAWTEXT parsing algorithm:
        ``content_type`` must be ``"RAWTEXT"`` or ``"RCDATA"`` and selects the
        tokenizer state. The current phase is saved in ``original_phase``.

        """
        assert content_type in ("RAWTEXT", "RCDATA")
        self.tree.insert_element(token)
        tokenizer = self.tokenizer
        tokenizer.state = (
            tokenizer.rawtext_state if content_type == "RAWTEXT"
            else tokenizer.rcdata_state)
        self.original_phase = self.phase
        self.phase = self.phases["text"]
def dispatch(items):
    """Build a flat ``{name: value}`` dict from ``(names, value)`` pairs.

    ``names`` is either a single string or an iterable of strings; in the
    latter case each string is mapped to the same value. When a name is
    given more than once, the last value wins.

    """
    table = {}
    for keys, value in items:
        if isinstance(keys, str):
            table[keys] = value
        else:
            for key in keys:
                table[key] = value
    return table
class Phase:
    """Base class for helper that implements each phase of processing.

    Subclasses relying on the default tag processing must provide the
    ``start_tag_handler`` and ``end_tag_handler`` mappings, and the
    ``start_tag_other`` and ``end_tag_other`` fallback methods.

    """

    __slots__ = ("parser", "tree", "__start_tag_cache", "__end_tag_cache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # Tag name -> handler function, filled lazily for each instance.
        self.__start_tag_cache = {}
        self.__end_tag_cache = {}

    def process_eof(self):  # pragma: no cover
        """Handle the end of file, must be implemented by subclasses."""
        raise NotImplementedError

    def process_comment(self, token):
        """Insert the comment in the current node."""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current_node = self.tree.open_elements[-1]
        self.tree.insert_comment(token, current_node)

    def process_doctype(self, token):
        """Report the DOCTYPE as unexpected."""
        self.parser.parse_error("unexpected-doctype")

    def process_characters(self, token):
        """Insert the characters as text."""
        text = token["data"]
        self.tree.insert_text(text)

    def process_space_characters(self, token):
        """Insert the space characters as text."""
        text = token["data"]
        self.tree.insert_text(text)

    def process_start_tag(self, token):
        """Call the start tag handler matching the name of ``token``."""
        name = token["name"]
        cache = self.__start_tag_cache
        # In Py3, `in` is quicker when there are few cache hits (typically
        # short inputs).
        if name in cache:
            handler = cache[name]
        else:
            handler = cache[name] = self.start_tag_handler.get(
                name, type(self).start_tag_other)
            # Bound the cache size in case we get loads of unknown tags:
            # drop entries in iteration order until the cache is small enough.
            while len(cache) > len(self.start_tag_handler) * 1.1:
                cache.pop(next(iter(cache)))
        return handler(self, token)

    def start_tag_html(self, token):
        """Add the attributes of an html start tag to the root element.

        Attributes already set on the root element are kept unchanged.

        """
        if not self.parser.first_start_tag and token["name"] == "html":
            self.parser.parse_error("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parse_error().
        for attr, value in token["data"].items():
            root_attributes = self.tree.open_elements[0].attributes
            if attr not in root_attributes:
                root_attributes[attr] = value
        self.parser.first_start_tag = False

    def process_end_tag(self, token):
        """Call the end tag handler matching the name of ``token``."""
        name = token["name"]
        cache = self.__end_tag_cache
        # In Py3, `in` is quicker when there are few cache hits (typically
        # short inputs).
        if name in cache:
            handler = cache[name]
        else:
            handler = cache[name] = self.end_tag_handler.get(
                name, type(self).end_tag_other)
            # Bound the cache size in case we get loads of unknown tags:
            # drop entries in iteration order until the cache is small enough.
            while len(cache) > len(self.end_tag_handler) * 1.1:
                cache.pop(next(iter(cache)))
        return handler(self, token)
class InitialPhase(Phase):
    """Phase handling tokens until the parser moves to "before html".

    A DOCTYPE sets the compatibility mode according to its identifiers; any
    other token except comments and space characters forces quirks mode.

    """

    __slots__ = ()

    # Lowercased public identifier prefixes triggering quirks mode.
    _QUIRKS_PUBLIC_ID_PREFIXES = (
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::"
        "extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::"
        "extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//",
    )

    # Lowercased public identifiers triggering quirks mode on exact match.
    _QUIRKS_PUBLIC_IDS = (
        "-//w3o//dtd w3 html strict 3.0//en//",
        "-/w3c/dtd html 4.0 transitional/en",
        "html",
    )

    # Prefixes whose mode depends on the presence of a system identifier.
    _HTML_4_01_PUBLIC_ID_PREFIXES = (
        "-//w3c//dtd html 4.01 frameset//",
        "-//w3c//dtd html 4.01 transitional//",
    )

    # Lowercased public identifier prefixes triggering limited quirks mode.
    _LIMITED_QUIRKS_PUBLIC_ID_PREFIXES = (
        "-//w3c//dtd xhtml 1.0 frameset//",
        "-//w3c//dtd xhtml 1.0 transitional//",
    )

    # Lowercased system identifier triggering quirks mode.
    _QUIRKS_SYSTEM_ID = (
        "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")

    def process_space_characters(self, token):
        """Ignore space characters."""

    def process_comment(self, token):
        """Insert the comment as a child of the document."""
        self.tree.insert_comment(token, self.tree.document)

    def _compatibility_mode(self, token, correct, public_id, system_id):
        """Return the mode implied by a DOCTYPE, ``None`` to keep the mode.

        ``public_id`` is the lowercased public identifier, or an empty string
        when the DOCTYPE has none.

        """
        html_4_01 = public_id.startswith(self._HTML_4_01_PUBLIC_ID_PREFIXES)
        if (not correct or token["name"] != "html" or
                public_id.startswith(self._QUIRKS_PUBLIC_ID_PREFIXES) or
                public_id in self._QUIRKS_PUBLIC_IDS or
                (html_4_01 and system_id is None) or
                (system_id and
                 system_id.lower() == self._QUIRKS_SYSTEM_ID)):
            return "quirks"
        if (public_id.startswith(self._LIMITED_QUIRKS_PUBLIC_ID_PREFIXES) or
                (html_4_01 and system_id is not None)):
            return "limited quirks"
        return None

    def process_doctype(self, token):
        """Insert the DOCTYPE, set the compatibility mode and change phase."""
        name = token["name"]
        public_id = token["publicId"]
        system_id = token["systemId"]
        correct = token["correct"]

        legacy_compat = (
            system_id is None or system_id == "about:legacy-compat")
        if name != "html" or public_id is not None or not legacy_compat:
            self.parser.parse_error("unknown-doctype")

        if public_id is None:
            public_id = ""

        self.tree.insert_doctype(token)

        if public_id != "":
            # Identifiers are compared ASCII case-insensitively.
            public_id = public_id.translate(ascii_upper_to_lower)

        mode = self._compatibility_mode(token, correct, public_id, system_id)
        if mode is not None:
            self.parser.compatibility_mode = mode

        self.parser.phase = self.parser.phases["before html"]

    def anything_else(self):
        """Switch to quirks mode and to the "before html" phase."""
        parser = self.parser
        parser.compatibility_mode = "quirks"
        parser.phase = parser.phases["before html"]

    def process_characters(self, token):
        """Report the missing DOCTYPE and have the token reprocessed."""
        self.parser.parse_error("expected-doctype-but-got-chars")
        self.anything_else()
        return token

    def process_start_tag(self, token):
        """Report the missing DOCTYPE and have the token reprocessed."""
        self.parser.parse_error(
            "expected-doctype-but-got-start-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def process_end_tag(self, token):
        """Report the missing DOCTYPE and have the token reprocessed."""
        self.parser.parse_error(
            "expected-doctype-but-got-end-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def process_eof(self):
        """Report the missing DOCTYPE and have the EOF reprocessed."""
        self.parser.parse_error("expected-doctype-but-got-eof")
        self.anything_else()
        return True
class BeforeHtmlPhase(Phase):
    """Phase inserting the root element before going to "before head"."""

    __slots__ = ()

    def _insert_html_element(self):
        """Insert an implied html start tag as root and change phase."""
        root_token = implied_tag_token("html", "START_TAG")
        self.tree.insert_root(root_token)
        self.parser.phase = self.parser.phases["before head"]

    def process_eof(self):
        """Insert the root element and have the EOF reprocessed."""
        self._insert_html_element()
        return True

    def process_comment(self, token):
        """Insert the comment as a child of the document."""
        self.tree.insert_comment(token, self.tree.document)

    def process_space_characters(self, token):
        """Ignore space characters."""

    def process_characters(self, token):
        """Insert the root element and have the token reprocessed."""
        self._insert_html_element()
        return token

    def process_start_tag(self, token):
        """Insert the root element and have the token reprocessed."""
        is_html = token["name"] == "html"
        if is_html:
            self.parser.first_start_tag = True
        self._insert_html_element()
        return token

    def process_end_tag(self, token):
        """Reprocess head, body, html and br end tags, report other ones."""
        if token["name"] in ("head", "body", "html", "br"):
            self._insert_html_element()
            return token
        self.parser.parse_error(
            "unexpected-end-tag-before-html", {"name": token["name"]})
        return None
class BeforeHeadPhase(Phase):
    """Phase waiting for the head element, implying it when needed."""

    __slots__ = ()

    def _imply_head(self):
        """Act as if a head start tag had been seen."""
        self.start_tag_head(implied_tag_token("head", "START_TAG"))

    def process_eof(self):
        """Imply the head element and have the EOF reprocessed."""
        self._imply_head()
        return True

    def process_space_characters(self, token):
        """Ignore space characters."""

    def process_characters(self, token):
        """Imply the head element and have the token reprocessed."""
        self._imply_head()
        return token

    def start_tag_html(self, token):
        """Process the html start tag as in the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_head(self, token):
        """Insert the head element, remember it and go to "in head"."""
        tree = self.tree
        tree.insert_element(token)
        tree.head_element = tree.open_elements[-1]
        self.parser.phase = self.parser.phases["in head"]

    def start_tag_other(self, token):
        """Imply the head element and have the token reprocessed."""
        self._imply_head()
        return token

    def end_tag_imply_head(self, token):
        """Imply the head element and have the token reprocessed."""
        self._imply_head()
        return token

    def end_tag_other(self, token):
        """Report the end tag as unexpected and ignore it."""
        self.parser.parse_error(
            "end-tag-after-implied-root", {"name": token["name"]})

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("head", start_tag_head),
    ])

    end_tag_handler = dispatch([
        (("head", "body", "html", "br"), end_tag_imply_head),
    ])
class InHeadPhase(Phase):
    """Phase handling the content of the head element."""

    __slots__ = ()

    # the real thing
    def process_eof(self):
        """Close the head element and have the EOF reprocessed."""
        self.anything_else()
        return True

    def process_characters(self, token):
        """Close the head element and have the token reprocessed."""
        self.anything_else()
        return token

    def start_tag_html(self, token):
        """Process the html start tag as in the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_head(self, token):
        """Report the second head start tag and ignore it."""
        self.parser.parse_error("two-heads-are-not-better-than-one")

    def start_tag_base_link_command(self, token):
        """Insert a void element and acknowledge its self-closing flag."""
        self.tree.insert_element(token)
        # The element has no content: it immediately leaves the stack.
        self.tree.open_elements.pop()
        token["selfClosingAcknowledged"] = True

    def start_tag_meta(self, token):
        """Insert a meta element and use it to change a tentative encoding."""
        self.tree.insert_element(token)
        self.tree.open_elements.pop()
        token["selfClosingAcknowledged"] = True

        stream = self.parser.tokenizer.stream
        if stream.encoding[1] != "tentative":
            return

        attributes = token["data"]
        if "charset" in attributes:
            stream.change_encoding(attributes["charset"])
        elif ("content" in attributes and
                "http-equiv" in attributes and
                attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttributeParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content = attributes["content"].encode("utf-8")
            content_parser = inputstream.ContentAttributeParser(
                inputstream.EncodingBytes(content))
            stream.change_encoding(content_parser.parse())

    def start_tag_title(self, token):
        """Parse the title element as RCDATA."""
        self.parser.parse_rcdata_rawtext(token, "RCDATA")

    def start_tag_noframes_style(self, token):
        """Parse noframes and style elements as RAWTEXT."""
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parse_rcdata_rawtext(token, "RAWTEXT")

    def start_tag_noscript(self, token):
        """Parse noscript as RAWTEXT with scripting, as markup without."""
        parser = self.parser
        if parser.scripting:
            parser.parse_rcdata_rawtext(token, "RAWTEXT")
            return
        self.tree.insert_element(token)
        parser.phase = parser.phases["in head noscript"]

    def start_tag_script(self, token):
        """Insert the script element and go to the "text" phase."""
        self.tree.insert_element(token)
        parser = self.parser
        parser.tokenizer.state = parser.tokenizer.script_data_state
        parser.original_phase = parser.phase
        parser.phase = parser.phases["text"]

    def start_tag_other(self, token):
        """Close the head element and have the token reprocessed."""
        self.anything_else()
        return token

    def end_tag_head(self, token):
        """Pop the head element and go to the "after head" phase."""
        node = self.parser.tree.open_elements.pop()
        assert node.name == "head", "Expected head got %s" % node.name
        self.parser.phase = self.parser.phases["after head"]

    def end_tag_html_body_br(self, token):
        """Close the head element and have the token reprocessed."""
        self.anything_else()
        return token

    def end_tag_other(self, token):
        """Report the end tag as unexpected and ignore it."""
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def anything_else(self):
        """Act as if a head end tag had been seen."""
        self.end_tag_head(implied_tag_token("head"))

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("title", start_tag_title),
        (("noframes", "style"), start_tag_noframes_style),
        ("noscript", start_tag_noscript),
        ("script", start_tag_script),
        (("base", "basefont", "bgsound", "command", "link"),
         start_tag_base_link_command),
        ("meta", start_tag_meta),
        ("head", start_tag_head),
    ])

    end_tag_handler = dispatch([
        ("head", end_tag_head),
        (("br", "html", "body"), end_tag_html_body_br),
    ])
class InHeadNoscriptPhase(Phase):
    """Phase handling the content of a noscript element in the head."""

    __slots__ = ()

    def _unexpected_tag(self, token):
        """Report the tag, close noscript and have the token reprocessed."""
        self.parser.parse_error(
            "unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anything_else()
        return token

    def process_eof(self):
        """Report the EOF, close noscript and have the EOF reprocessed."""
        self.parser.parse_error("eof-in-head-noscript")
        self.anything_else()
        return True

    def process_comment(self, token):
        """Process the comment as in the "in head" phase."""
        in_head = self.parser.phases["in head"]
        return in_head.process_comment(token)

    def process_characters(self, token):
        """Report the text, close noscript and have the token reprocessed."""
        self.parser.parse_error("char-in-head-noscript")
        self.anything_else()
        return token

    def process_space_characters(self, token):
        """Process the space characters as in the "in head" phase."""
        in_head = self.parser.phases["in head"]
        return in_head.process_space_characters(token)

    def start_tag_html(self, token):
        """Process the html start tag as in the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_base_link_command(self, token):
        """Process the start tag as in the "in head" phase."""
        in_head = self.parser.phases["in head"]
        return in_head.process_start_tag(token)

    def start_tag_head_noscript(self, token):
        """Report the start tag as unexpected and ignore it."""
        self.parser.parse_error(
            "unexpected-start-tag", {"name": token["name"]})

    def start_tag_other(self, token):
        """Report the tag, close noscript and have the token reprocessed."""
        return self._unexpected_tag(token)

    def end_tag_noscript(self, token):
        """Pop the noscript element and go back to the "in head" phase."""
        node = self.parser.tree.open_elements.pop()
        assert node.name == "noscript", f"Expected noscript got {node.name}"
        self.parser.phase = self.parser.phases["in head"]

    def end_tag_br(self, token):
        """Report the tag, close noscript and have the token reprocessed."""
        return self._unexpected_tag(token)

    def end_tag_other(self, token):
        """Report the end tag as unexpected and ignore it."""
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def anything_else(self):
        """Act as if a noscript end tag had been seen."""
        # Caller must raise parse error first!
        self.end_tag_noscript(implied_tag_token("noscript"))

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"),
         start_tag_base_link_command),
        (("head", "noscript"), start_tag_head_noscript),
    ])

    end_tag_handler = dispatch([
        ("noscript", end_tag_noscript),
        ("br", end_tag_br),
    ])
class AfterHeadPhase(Phase):
    """Phase handling tokens between the head element and the body."""

    __slots__ = ()

    def process_eof(self):
        """Imply the body element and have the EOF reprocessed."""
        self.anything_else()
        return True

    def process_characters(self, token):
        """Imply the body element and have the token reprocessed."""
        self.anything_else()
        return token

    def start_tag_html(self, token):
        """Process the html start tag as in the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_body(self, token):
        """Insert the body element and go to the "in body" phase."""
        parser = self.parser
        parser.frameset_ok = False
        self.tree.insert_element(token)
        parser.phase = parser.phases["in body"]

    def start_tag_frameset(self, token):
        """Insert the frameset element and go to the "in frameset" phase."""
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in frameset"]

    def start_tag_from_head(self, token):
        """Process a misplaced head-only tag as in the "in head" phase.

        The head element is pushed back on the stack of open elements while
        the token is processed, then removed again.

        """
        self.parser.parse_error(
            "unexpected-start-tag-out-of-my-head", {"name": token["name"]})
        open_elements = self.tree.open_elements
        open_elements.append(self.tree.head_element)
        self.parser.phases["in head"].process_start_tag(token)
        # Iterate over a copy: the stack is changed by the removal.
        for node in self.tree.open_elements[::-1]:
            if node.name == "head":
                self.tree.open_elements.remove(node)
                break

    def start_tag_head(self, token):
        """Report the start tag as unexpected and ignore it."""
        self.parser.parse_error(
            "unexpected-start-tag", {"name": token["name"]})

    def start_tag_other(self, token):
        """Imply the body element and have the token reprocessed."""
        self.anything_else()
        return token

    def end_tag_html_body_br(self, token):
        """Imply the body element and have the token reprocessed."""
        self.anything_else()
        return token

    def end_tag_other(self, token):
        """Report the end tag as unexpected and ignore it."""
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def anything_else(self):
        """Insert an implied body element and go to the "in body" phase."""
        body_token = implied_tag_token("body", "START_TAG")
        self.tree.insert_element(body_token)
        parser = self.parser
        parser.phase = parser.phases["in body"]
        parser.frameset_ok = True

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("body", start_tag_body),
        ("frameset", start_tag_frameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"), start_tag_from_head),
        ("head", start_tag_head),
    ])

    end_tag_handler = dispatch([
        (("body", "html", "br"), end_tag_html_body_br),
    ])
class InBodyPhase(Phase):
# https://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# The really-really-really-very crazy mode.
__slots__ = ("process_space_characters",)
def __init__(self, *args, **kwargs):
    """Initialise the phase and select the default space handler.

    ``process_space_characters`` is stored on the instance so that it can
    be replaced later by another handler.

    """
    super().__init__(*args, **kwargs)
    # Start with the handler that never drops a leading newline.
    default_handler = self.process_space_characters_non_pre
    self.process_space_characters = default_handler
def is_matching_formatting_element(self, node1, node2):
    """Tell whether two nodes share name, namespace and attributes."""
    if not node1.name == node2.name:
        return False
    if not node1.namespace == node2.namespace:
        return False
    return node1.attributes == node2.attributes
def add_formatting_element(self, token):
    """Insert an element and add it to the active formatting elements.

    When three matching elements already follow the last marker, the
    earliest of them is removed from the list first.

    """
    self.tree.insert_element(token)
    new_element = self.tree.open_elements[-1]
    formatting_elements = self.tree.active_formatting_elements

    duplicates = []
    for candidate in reversed(formatting_elements):
        if candidate is Marker:
            break
        if self.is_matching_formatting_element(candidate, new_element):
            duplicates.append(candidate)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        # The list was walked backwards: the last match is the earliest one.
        formatting_elements.remove(duplicates[-1])
    formatting_elements.append(new_element)
# The real deal.
def process_eof(self):
    """Report an error if an element that must be closed is still open."""
    may_stay_open = frozenset((
        "dd", "dt", "li", "p",
        "tbody", "td", "tfoot", "th", "thead", "tr",
        "body", "html"))
    unclosed = any(
        node.name not in may_stay_open
        for node in reversed(self.tree.open_elements))
    if unclosed:
        # The error is reported once, whatever the number of elements.
        self.parser.parse_error("expected-closing-tag-but-got-eof")
    # Stop parsing.
def process_space_characters_drop_newline(self, token):
    """Insert space characters, dropping a leading newline when required.

    The newline is dropped at the start of an empty pre, listing or
    textarea element. The default handler is restored for next tokens.

    """
    # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
    # want to drop leading newlines.
    text = token["data"]
    self.process_space_characters = self.process_space_characters_non_pre
    if text.startswith("\n"):
        current_node = self.tree.open_elements[-1]
        if (current_node.name in ("pre", "listing", "textarea") and
                not current_node.has_content()):
            text = text[1:]
    if not text:
        return
    self.tree.reconstruct_active_formatting_elements()
    self.tree.insert_text(text)
def process_characters(self, token):
    """Insert text, clearing ``frameset_ok`` when it is not only spaces."""
    text = token["data"]
    if text == "\u0000":
        # The tokenizer should always emit null on its own.
        return
    self.tree.reconstruct_active_formatting_elements()
    self.tree.insert_text(text)
    if not self.parser.frameset_ok:
        return
    # This must be bad for performance
    for char in text:
        if char not in space_characters:
            self.parser.frameset_ok = False
            break
def process_space_characters_non_pre(self, token):
    """Insert space characters as text, keeping all of them."""
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    tree.insert_text(token["data"])
def start_tag_process_in_head(self, token):
    """Process the start tag as in the "in head" phase."""
    in_head = self.parser.phases["in head"]
    return in_head.process_start_tag(token)
def start_tag_body(self, token):
    """Report a stray body start tag and merge its attributes.

    Attributes already set on the existing body element are kept. Nothing
    is merged when the second open element is not a body element.

    """
    self.parser.parse_error("unexpected-start-tag", {"name": "body"})
    open_elements = self.tree.open_elements
    if len(open_elements) == 1 or open_elements[1].name != "body":
        assert self.parser.container
        return
    self.parser.frameset_ok = False
    body = open_elements[1]
    for attr, value in token["data"].items():
        if attr not in body.attributes:
            body.attributes[attr] = value
def start_tag_frameset(self, token):
    """Report a frameset start tag and replace the body when allowed.

    When ``frameset_ok`` is set, the body element is removed from its
    parent, the stack is emptied down to the html element, the frameset
    element is inserted and the parser goes to the "in frameset" phase.

    """
    self.parser.parse_error("unexpected-start-tag", {"name": "frameset"})
    open_elements = self.tree.open_elements
    if len(open_elements) == 1 or open_elements[1].name != "body":
        assert self.parser.container
        return
    if not self.parser.frameset_ok:
        return
    body = open_elements[1]
    if body.parent:
        body.parent.remove_child(body)
    while open_elements[-1].name != "html":
        open_elements.pop()
    self.tree.insert_element(token)
    self.parser.phase = self.parser.phases["in frameset"]
def start_tag_close_p(self, token):
    """Insert the element, closing a p element in button scope first."""
    p_in_scope = self.tree.element_in_scope("p", variant="button")
    if p_in_scope:
        self.end_tag_p(implied_tag_token("p"))
    self.tree.insert_element(token)
def start_tag_pre_listing(self, token):
    """Insert a pre or listing element.

    A p element in button scope is closed first. The next space characters
    token is handled by the handler dropping a leading newline.

    """
    p_in_scope = self.tree.element_in_scope("p", variant="button")
    if p_in_scope:
        self.end_tag_p(implied_tag_token("p"))
    self.tree.insert_element(token)
    self.parser.frameset_ok = False
    drop_newline = self.process_space_characters_drop_newline
    self.process_space_characters = drop_newline
def start_tag_form(self, token):
    """Insert a form element, unless a form element is already set.

    A p element in button scope is closed first, and the inserted element
    is stored as the form element of the tree.

    """
    tree = self.tree
    if tree.form_element:
        self.parser.parse_error("unexpected-start-tag", {"name": "form"})
        return
    if tree.element_in_scope("p", variant="button"):
        self.end_tag_p(implied_tag_token("p"))
    tree.insert_element(token)
    tree.form_element = tree.open_elements[-1]
def start_tag_list_item(self, token):
    """Insert a li, dt or dd element, closing an open sibling first.

    The stack of open elements is walked from the current node. The walk
    stops at the first element that the new one closes, or at the first
    special element other than address, div and p.

    """
    self.parser.frameset_ok = False
    definition_names = ["dt", "dd"]
    closed_names = {
        "li": ["li"],
        "dt": definition_names,
        "dd": definition_names,
    }[token["name"]]
    for node in reversed(self.tree.open_elements):
        if node.name in closed_names:
            end_token = implied_tag_token(node.name)
            self.parser.phase.process_end_tag(end_token)
            break
        is_special = node.name_tuple in special_elements
        if is_special and node.name not in ("address", "div", "p"):
            break
    if self.tree.element_in_scope("p", variant="button"):
        self.parser.phase.process_end_tag(implied_tag_token("p"))
    self.tree.insert_element(token)
def start_tag_plaintext(self, token):
    """Insert a plaintext element and set the plaintext tokenizer state."""
    p_in_scope = self.tree.element_in_scope("p", variant="button")
    if p_in_scope:
        self.end_tag_p(implied_tag_token("p"))
    self.tree.insert_element(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintext_state
def start_tag_heading(self, token):
    """Insert a heading element.

    A p element in button scope is closed first. When the current node is
    a heading element, an error is reported and that node is popped.

    """
    p_in_scope = self.tree.element_in_scope("p", variant="button")
    if p_in_scope:
        self.end_tag_p(implied_tag_token("p"))
    current_name = self.tree.open_elements[-1].name
    if current_name in heading_elements:
        self.parser.parse_error(
            "unexpected-start-tag", {"name": token["name"]})
        self.tree.open_elements.pop()
    self.tree.insert_element(token)
def start_tag_a(self, token):
    """Insert an a element, closing an a element that is still active.

    When an a element is found in the active formatting elements, an
    error is reported, an implied end tag is processed, and the element
    is removed from both lists if it is still there.

    """
    tree = self.tree
    previous_anchor = tree.element_in_active_formatting_elements("a")
    if previous_anchor:
        self.parser.parse_error(
            "unexpected-start-tag-implies-end-tag",
            {"startName": "a", "endName": "a"})
        self.end_tag_formatting(implied_tag_token("a"))
        for elements in (
                tree.open_elements, tree.active_formatting_elements):
            if previous_anchor in elements:
                elements.remove(previous_anchor)
    tree.reconstruct_active_formatting_elements()
    self.add_formatting_element(token)
def start_tag_formatting(self, token):
    """Insert a formatting element and add it to the active ones."""
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    self.add_formatting_element(token)
def start_tag_nobr(self, token):
    """Insert a nobr element, closing a nobr element in scope first."""
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    nobr_in_scope = tree.element_in_scope("nobr")
    if nobr_in_scope:
        self.parser.parse_error(
            "unexpected-start-tag-implies-end-tag",
            {"startName": "nobr", "endName": "nobr"})
        self.process_end_tag(implied_tag_token("nobr"))
        # XXX Need tests that trigger the following
        tree.reconstruct_active_formatting_elements()
    self.add_formatting_element(token)
def start_tag_button(self, token):
    """Insert a button element, or close a button element in scope.

    When a button element is in scope, an error is reported, an implied
    end tag is processed and the token is returned to be reprocessed.

    """
    if self.tree.element_in_scope("button"):
        self.parser.parse_error(
            "unexpected-start-tag-implies-end-tag",
            {"startName": "button", "endName": "button"})
        self.process_end_tag(implied_tag_token("button"))
        return token
    self.tree.reconstruct_active_formatting_elements()
    self.tree.insert_element(token)
    self.parser.frameset_ok = False
    return None
def start_tag_applet_marquee_object(self, token):
    """Insert the element and add a marker to the active formatting list."""
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    tree.insert_element(token)
    tree.active_formatting_elements.append(Marker)
    self.parser.frameset_ok = False
def start_tag_xmp(self, token):
    """Parse an xmp element as RAWTEXT, closing a p element first."""
    tree = self.tree
    if tree.element_in_scope("p", variant="button"):
        self.end_tag_p(implied_tag_token("p"))
    tree.reconstruct_active_formatting_elements()
    parser = self.parser
    parser.frameset_ok = False
    parser.parse_rcdata_rawtext(token, "RAWTEXT")
def start_tag_table(self, token):
    """Insert a table element and go to the "in table" phase.

    Except in quirks mode, a p element in button scope is closed first.

    """
    parser = self.parser
    if (parser.compatibility_mode != "quirks" and
            self.tree.element_in_scope("p", variant="button")):
        self.process_end_tag(implied_tag_token("p"))
    self.tree.insert_element(token)
    parser.frameset_ok = False
    parser.phase = parser.phases["in table"]
def start_tag_void_formatting(self, token):
    """Insert a void element and acknowledge its self-closing flag."""
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    tree.insert_element(token)
    # The element has no content: it immediately leaves the stack.
    tree.open_elements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.frameset_ok = False
def start_tag_input(self, token):
    """Insert an input element as a void element.

    ``frameset_ok`` gets its previous value back for hidden inputs.

    """
    previous_frameset_ok = self.parser.frameset_ok
    self.start_tag_void_formatting(token)
    attributes = token["data"]
    if ("type" in attributes and
            attributes["type"].translate(ascii_upper_to_lower) == "hidden"):
        # input type=hidden doesn't change frameset_ok
        self.parser.frameset_ok = previous_frameset_ok
def start_tag_param_source(self, token):
    """Insert a void element and acknowledge its self-closing flag."""
    tree = self.tree
    tree.insert_element(token)
    # The element has no content: it immediately leaves the stack.
    tree.open_elements.pop()
    token["selfClosingAcknowledged"] = True
def start_tag_hr(self, token):
    """Insert an hr element, closing a p element in button scope first."""
    tree = self.tree
    if tree.element_in_scope("p", variant="button"):
        self.end_tag_p(implied_tag_token("p"))
    tree.insert_element(token)
    # The element has no content: it immediately leaves the stack.
    tree.open_elements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.frameset_ok = False
def start_tag_image(self, token):
    """Report an ``image`` start tag and reprocess it as ``img``.

    The replacement token carries over the attributes and the
    self-closing flag of the original one.
    """
    # No really...
    self.parser.parse_error(
        "unexpected-start-tag-treated-as",
        {"originalName": "image", "newName": "img"})
    replacement = implied_tag_token(
        "img", "START_TAG", attributes=token["data"],
        self_closing=token["selfClosing"])
    self.process_start_tag(replacement)
def start_tag_isindex(self, token):
    """Expand the deprecated ``isindex`` tag into an equivalent form.

    A "deprecated-tag" parse error is always reported. When a form
    element is already recorded on the tree the token is otherwise
    ignored. Else the tag is replaced by implied tokens for::

        <form action=...><hr><label>prompt<input name=isindex ...></label><hr></form>

    :param dict token: The ``isindex`` start tag token. Its ``action``
        attribute goes to the implied ``form``, its ``prompt`` attribute
        becomes the label text, and every other attribute is copied to
        the implied ``input``.

    """
    self.parser.parse_error("deprecated-tag", {"name": "isindex"})
    # Compare against None explicitly, as the other form handlers in this
    # file do: the pointer is either an element or None, and an element
    # object that happens to evaluate as falsy must still count as set.
    if self.tree.form_element is not None:
        return

    source_attributes = token["data"]
    form_attrs = {}
    if "action" in source_attributes:
        form_attrs["action"] = source_attributes["action"]
    self.process_start_tag(
        implied_tag_token("form", "START_TAG", attributes=form_attrs))
    self.process_start_tag(implied_tag_token("hr", "START_TAG"))
    self.process_start_tag(implied_tag_token("label", "START_TAG"))

    # XXX Localization ...
    if "prompt" in source_attributes:
        prompt = source_attributes["prompt"]
    else:
        prompt = "This is a searchable index. Enter search keywords: "
    self.process_characters({"type": Token.CHARACTERS, "data": prompt})

    # The input gets a copy of the attributes, minus those consumed above.
    attributes = source_attributes.copy()
    attributes.pop("action", None)
    attributes.pop("prompt", None)
    attributes["name"] = "isindex"
    self.process_start_tag(implied_tag_token(
        "input", "START_TAG", attributes=attributes,
        self_closing=token["selfClosing"]))

    self.process_end_tag(implied_tag_token("label"))
    self.process_start_tag(implied_tag_token("hr", "START_TAG"))
    self.process_end_tag(implied_tag_token("form"))
def start_tag_textarea(self, token):
    """Insert a ``textarea`` element and switch the tokenizer to RCDATA.

    Space-character handling is swapped for the variant that drops a
    newline, and ``frameset_ok`` is cleared.
    """
    self.tree.insert_element(token)
    parser = self.parser
    tokenizer = parser.tokenizer
    tokenizer.state = tokenizer.rcdata_state
    self.process_space_characters = self.process_space_characters_drop_newline
    parser.frameset_ok = False
def start_tag_iframe(self, token):
    """Clear ``frameset_ok`` and treat the ``iframe`` content as raw text."""
    parser = self.parser
    parser.frameset_ok = False
    self.start_tag_rawtext(token)
def start_tag_noscript(self, token):
    """Handle ``noscript`` as raw text when scripting is on.

    With scripting off the tag is handled like any other start tag.
    """
    handler = (
        self.start_tag_rawtext if self.parser.scripting
        else self.start_tag_other)
    handler(token)
def start_tag_rawtext(self, token):
    """Delegate the element to the parser's RAWTEXT handling."""
    parser = self.parser
    parser.parse_rcdata_rawtext(token, "RAWTEXT")
def start_tag_opt(self, token):
    """Insert an ``option`` or ``optgroup`` element.

    If the current node is an ``option``, an implied end tag for it is
    sent to the parser's current phase first.
    """
    current_node = self.tree.open_elements[-1]
    if current_node.name == "option":
        self.parser.phase.process_end_tag(implied_tag_token("option"))
    self.tree.reconstruct_active_formatting_elements()
    # The insertion goes through the tree referenced by the parser.
    self.parser.tree.insert_element(token)
def start_tag_select(self, token):
    """Insert a ``select`` element and pick the matching select phase.

    The "in select in table" phase is used when the parser is currently
    in one of the table-related phases, "in select" otherwise.
    """
    self.tree.reconstruct_active_formatting_elements()
    self.tree.insert_element(token)
    parser = self.parser
    parser.frameset_ok = False
    phases = parser.phases
    table_phases = tuple(phases[name] for name in (
        "in table", "in caption", "in column group",
        "in table body", "in row", "in cell"))
    if parser.phase in table_phases:
        next_phase = "in select in table"
    else:
        next_phase = "in select"
    parser.phase = phases[next_phase]
def start_tag_rp_rt(self, token):
    """Insert an ``rp`` or ``rt`` element.

    When a ``ruby`` element is in scope, implied end tags are generated
    and a parse error is reported if ``ruby`` is not the current node
    afterwards.
    """
    tree = self.tree
    if tree.element_in_scope("ruby"):
        tree.generate_implied_end_tags()
        current_name = tree.open_elements[-1].name
        if current_name != "ruby":
            self.parser.parse_error("rp-or-rt-tag-not-in-ruby-scope")
    tree.insert_element(token)
def start_tag_math(self, token):
    """Insert a ``math`` element in the MathML namespace.

    The parser adjusts MathML and foreign attributes on the token before
    insertion. A self-closing token is popped immediately and its flag
    acknowledged.
    """
    self.tree.reconstruct_active_formatting_elements()
    parser = self.parser
    parser.adjust_mathml_attributes(token)
    parser.adjust_foreign_attributes(token)
    token["namespace"] = namespaces["mathml"]
    self.tree.insert_element(token)
    # Need to get the parse error right for the case where the token has a
    # namespace not equal to the xmlns attribute.
    if not token["selfClosing"]:
        return
    self.tree.open_elements.pop()
    token["selfClosingAcknowledged"] = True
def start_tag_svg(self, token):
    """Insert an ``svg`` element in the SVG namespace.

    The parser adjusts SVG and foreign attributes on the token before
    insertion. A self-closing token is popped immediately and its flag
    acknowledged.
    """
    self.tree.reconstruct_active_formatting_elements()
    parser = self.parser
    parser.adjust_svg_attributes(token)
    parser.adjust_foreign_attributes(token)
    token["namespace"] = namespaces["svg"]
    self.tree.insert_element(token)
    # Need to get the parse error right for the case where the token has a
    # namespace not equal to the xmlns attribute.
    if not token["selfClosing"]:
        return
    self.tree.open_elements.pop()
    token["selfClosingAcknowledged"] = True
def start_tag_misplaced(self, token):
    """Ignore a start tag that only makes sense inside another element.

    The tag is dropped and an "unexpected-start-tag-ignored" parse error
    carrying the tag name is reported. In this phase's dispatch table the
    handler is registered for "caption", "col", "colgroup", "frame",
    "head", "tbody", "td", "tfoot", "th", "thead" and "tr".
    """
    tag_name = token["name"]
    self.parser.parse_error("unexpected-start-tag-ignored", {"name": tag_name})
def start_tag_other(self, token):
    """Insert any other element after reconstructing formatting elements."""
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    tree.insert_element(token)
def end_tag_p(self, token):
    """Close the nearest ``p`` element in button scope.

    When no ``p`` is in button scope, an implied ``p`` start tag is
    processed, a parse error is reported and the end tag is handled again.
    """
    tree = self.tree
    if tree.element_in_scope("p", variant="button"):
        tree.generate_implied_end_tags("p")
        stack = tree.open_elements
        if stack[-1].name != "p":
            self.parser.parse_error("unexpected-end-tag", {"name": "p"})
        # Pop up to and including the "p" element.
        while stack.pop().name != "p":
            pass
        return
    self.start_tag_close_p(implied_tag_token("p", "START_TAG"))
    self.parser.parse_error("unexpected-end-tag", {"name": "p"})
    self.end_tag_p(implied_tag_token("p"))
def end_tag_body(self, token):
    """Handle ``</body>`` by switching to the "after body" phase.

    The tag is ignored with a parse error when no ``body`` is in scope.
    If ``body`` is not the current node, the first open element (from the
    third stack entry on) that may not be left open is reported.
    """
    tree, parser = self.tree, self.parser
    if not tree.element_in_scope("body"):
        parser.parse_error("unexpected-end-tag", {"name": "body"})
        return
    if tree.open_elements[-1].name != "body":
        may_stay_open = frozenset((
            "dd", "dt", "li", "optgroup", "option", "p", "rp", "rt",
            "tbody", "td", "tfoot", "th", "thead", "tr", "body", "html"))
        offender = next(
            (node for node in tree.open_elements[2:]
             if node.name not in may_stay_open),
            None)
        if offender is not None:
            # Not sure this is the correct name for the parse error.
            parser.parse_error(
                "expected-one-end-tag-but-got-another",
                {"gotName": "body", "expectedName": offender.name})
    parser.phase = parser.phases["after body"]
def end_tag_html(self, token):
    """Handle ``</html>`` as an implied ``</body>`` followed by reprocessing.

    Returns the token when a ``body`` element is in scope, ``None`` when
    the tag is ignored.
    """
    # We repeat the test for the body end tag token being ignored here.
    if not self.tree.element_in_scope("body"):
        return None
    self.end_tag_body(implied_tag_token("body"))
    return token
def end_tag_block(self, token):
    """Close a block-level element named by the end tag.

    A parse error is reported when the current node does not carry the
    tag's name. Elements are only popped when the element was in scope.
    """
    tag_name = token["name"]
    # Put us back in the right whitespace handling mode.
    if tag_name == "pre":
        self.process_space_characters = self.process_space_characters_non_pre
    tree = self.tree
    in_scope = tree.element_in_scope(tag_name)
    if in_scope:
        tree.generate_implied_end_tags()
    if tree.open_elements[-1].name != tag_name:
        self.parser.parse_error("end-tag-too-early", {"name": tag_name})
    if in_scope:
        # Pop up to and including the matching element.
        while tree.open_elements.pop().name != tag_name:
            pass
def end_tag_form(self, token):
    """Handle ``</form>`` using the form element recorded on the tree.

    The recorded form element is always reset to ``None``. The element is
    removed from the stack of open elements when it is in scope; a parse
    error is reported otherwise, or when it is not the current node.
    """
    tree = self.tree
    form, tree.form_element = tree.form_element, None
    if form is None or not tree.element_in_scope(form):
        self.parser.parse_error("unexpected-end-tag", {"name": "form"})
        return
    tree.generate_implied_end_tags()
    if tree.open_elements[-1] != form:
        self.parser.parse_error("end-tag-too-early-ignored", {"name": "form"})
    tree.open_elements.remove(form)
def end_tag_list_item(self, token):
    """Close a ``li``, ``dd`` or ``dt`` element.

    ``li`` is looked up in list scope, the other names in the default
    scope. A parse error is reported when the element is not in scope or
    is not the current node once implied end tags are generated.
    """
    tag_name = token["name"]
    variant = "list" if tag_name == "li" else None
    tree = self.tree
    if not tree.element_in_scope(tag_name, variant=variant):
        self.parser.parse_error("unexpected-end-tag", {"name": tag_name})
        return
    tree.generate_implied_end_tags(exclude=tag_name)
    if tree.open_elements[-1].name != tag_name:
        self.parser.parse_error("end-tag-too-early", {"name": tag_name})
    # Pop up to and including the matching element.
    while tree.open_elements.pop().name != tag_name:
        pass
def end_tag_heading(self, token):
    """Close the open heading element, whatever its level.

    Implied end tags are generated when any heading element is in scope,
    a parse error is reported when the current node does not carry the
    tag's name, and elements are popped up to and including the first
    heading element found.
    """
    tree = self.tree
    if any(tree.element_in_scope(name) for name in heading_elements):
        tree.generate_implied_end_tags()
    if tree.open_elements[-1].name != token["name"]:
        self.parser.parse_error("end-tag-too-early", {"name": token["name"]})
    if any(tree.element_in_scope(name) for name in heading_elements):
        # Any heading level closes the open heading.
        while tree.open_elements.pop().name not in heading_elements:
            pass
def end_tag_formatting(self, token):
    """The much-feared adoption agency algorithm.

    Handle the end tag of a formatting element that may be misnested with
    block-level ("special") elements. The outer loop runs at most 8 times
    and the inner loop at most 3 times per outer iteration.

    Each outer iteration looks up the formatting element matching the
    token name and then either:

    - falls back to ``end_tag_other`` when there is no such element (or it
      is on the stack of open elements but not in scope);
    - drops the element from the active formatting elements when it is no
      longer on the stack of open elements;
    - pops the stack up to the formatting element when no special element
      follows it on the stack;
    - otherwise re-parents the nodes between the formatting element and the
      "furthest block", cloning formatting elements as needed.

    :param dict token: The end tag token; only ``token["name"]`` is read.

    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.
    # Step 1.
    outer_loop_counter = 0
    # Step 2.
    while outer_loop_counter < 8:
        # Step 3.
        outer_loop_counter += 1
        # Step 4.
        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        #   marker in the list, if any, or the start of the list
        #   otherwise, and
        # - has the same tag name as the token.
        formatting_element = self.tree.element_in_active_formatting_elements(
            token["name"])
        if (not formatting_element or (
                formatting_element in self.tree.open_elements and
                not self.tree.element_in_scope(formatting_element.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            # The condition also covers an element that is on the stack
            # but not in scope.
            self.end_tag_other(token)
            return
        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formatting_element not in self.tree.open_elements:
            self.parser.parse_error("adoption-agency-1.2", {"name": token["name"]})
            self.tree.active_formatting_elements.remove(formatting_element)
            return
        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): the first branch above already handles "on the
        # stack but not in scope", so this branch looks unreachable as
        # long as element_in_scope gives the same answer twice - confirm.
        elif not self.tree.element_in_scope(formatting_element.name):
            self.parser.parse_error("adoption-agency-4.4", {"name": token["name"]})
            return
        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formatting_element != self.tree.open_elements[-1]:
                self.parser.parse_error(
                    "adoption-agency-1.3", {"name": token["name"]})
        # Step 5.
        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afe_index = self.tree.open_elements.index(formatting_element)
        furthest_block = None
        # The slice starts at the formatting element itself.
        for element in self.tree.open_elements[afe_index:]:
            if element.name_tuple in special_elements:
                furthest_block = element
                break
        # Step 6.
        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthest_block is None:
            element = self.tree.open_elements.pop()
            while element != formatting_element:
                element = self.tree.open_elements.pop()
            self.tree.active_formatting_elements.remove(element)
            return
        # Step 7.
        # The common ancestor is the stack entry just before the
        # formatting element.
        common_ancestor = self.tree.open_elements[afe_index - 1]
        # Step 8.
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7.
        bookmark = self.tree.active_formatting_elements.index(formatting_element)
        # Step 9.
        last_node = node = furthest_block
        inner_loop_counter = 0
        index = self.tree.open_elements.index(node)
        while inner_loop_counter < 3:
            inner_loop_counter += 1
            # Node is element before node in open elements.
            index -= 1
            node = self.tree.open_elements[index]
            if node not in self.tree.active_formatting_elements:
                # Not a formatting element: drop it from the stack and
                # look at the entry before it.
                self.tree.open_elements.remove(node)
                continue
            # Step 9.6.
            if node == formatting_element:
                break
            # Step 9.7.
            if last_node == furthest_block:
                bookmark = self.tree.active_formatting_elements.index(node) + 1
            # Step 9.8.
            clone = node.clone()
            # Replace node with clone
            self.tree.active_formatting_elements[
                self.tree.active_formatting_elements.index(node)] = clone
            self.tree.open_elements[self.tree.open_elements.index(node)] = clone
            node = clone
            # Step 9.9.
            # Remove lastNode from its parents, if any
            if last_node.parent:
                last_node.parent.remove_child(last_node)
            node.append_child(last_node)
            # Step 9.10.
            last_node = node
        # Step 10.
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if last_node.parent:
            last_node.parent.remove_child(last_node)
        if common_ancestor.name in frozenset((
                "table", "tbody", "tfoot", "thead", "tr")):
            parent, insert_before = self.tree.get_table_misnested_node_position()
            parent.insert_before(last_node, insert_before)
        else:
            common_ancestor.append_child(last_node)
        # Step 11
        clone = formatting_element.clone()
        # Step 12
        furthest_block.reparent_children(clone)
        # Step 13
        furthest_block.append_child(clone)
        # Step 14
        # The clone takes the bookmarked position in the list of active
        # formatting elements.
        self.tree.active_formatting_elements.remove(formatting_element)
        self.tree.active_formatting_elements.insert(bookmark, clone)
        # Step 15
        # On the stack of open elements the clone goes right after the
        # furthest block.
        self.tree.open_elements.remove(formatting_element)
        self.tree.open_elements.insert(
            self.tree.open_elements.index(furthest_block) + 1, clone)
def end_tag_applet_marquee_object(self, token):
    """Close an ``applet``, ``marquee`` or ``object`` element.

    When the element is in scope, elements are popped up to and including
    it and the active formatting elements are cleared. A parse error is
    reported when the current node does not carry the tag's name.
    """
    tag_name = token["name"]
    tree = self.tree
    if tree.element_in_scope(tag_name):
        tree.generate_implied_end_tags()
    if tree.open_elements[-1].name != tag_name:
        self.parser.parse_error("end-tag-too-early", {"name": tag_name})
    # The scope is checked again after the implied end tags were generated.
    if tree.element_in_scope(tag_name):
        while tree.open_elements.pop().name != tag_name:
            pass
        tree.clear_active_formatting_elements()
def end_tag_br(self, token):
    """Treat ``</br>`` as a ``br`` start tag, reporting a parse error.

    The implied ``br`` element is inserted and popped off the stack of
    open elements right away.
    """
    self.parser.parse_error(
        "unexpected-end-tag-treated-as",
        {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstruct_active_formatting_elements()
    tree.insert_element(implied_tag_token("br", "START_TAG"))
    tree.open_elements.pop()
def end_tag_other(self, token):
    """Handle any end tag without a dedicated handler.

    The stack of open elements is walked from the current node down. The
    first element carrying the tag's name is closed together with
    everything above it. Meeting a special element first stops the walk
    with a parse error and the tag is ignored.
    """
    tag_name = token["name"]
    # Walk a reversed copy, the stack itself is modified below.
    for node in self.tree.open_elements[::-1]:
        if node.name == tag_name:
            self.tree.generate_implied_end_tags(exclude=tag_name)
            if self.tree.open_elements[-1].name != tag_name:
                self.parser.parse_error(
                    "unexpected-end-tag", {"name": tag_name})
            # Pop up to and including the matching node.
            while self.tree.open_elements.pop() != node:
                pass
            return
        if node.name_tuple in special_elements:
            self.parser.parse_error(
                "unexpected-end-tag", {"name": tag_name})
            return
# Start tag routing for this phase: each entry pairs a tag name, or a group
# of tag names, with the method handling it.
# NOTE(review): names missing from the table presumably end up in
# start_tag_other - confirm against dispatch().
start_tag_handler = dispatch([
    ("html", Phase.start_tag_html),
    (("base", "basefont", "bgsound", "command", "link", "meta",
      "script", "style", "title"), start_tag_process_in_head),
    ("body", start_tag_body),
    ("frameset", start_tag_frameset),
    (("address", "article", "aside", "blockquote", "center", "details",
      "dir", "div", "dl", "fieldset", "figcaption", "figure",
      "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
      "section", "summary", "ul"), start_tag_close_p),
    (heading_elements, start_tag_heading),
    (("pre", "listing"), start_tag_pre_listing),
    ("form", start_tag_form),
    (("li", "dd", "dt"), start_tag_list_item),
    ("plaintext", start_tag_plaintext),
    ("a", start_tag_a),
    (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
      "strong", "tt", "u"), start_tag_formatting),
    ("nobr", start_tag_nobr),
    ("button", start_tag_button),
    (("applet", "marquee", "object"), start_tag_applet_marquee_object),
    ("xmp", start_tag_xmp),
    ("table", start_tag_table),
    (("area", "br", "embed", "img", "keygen", "wbr"), start_tag_void_formatting),
    (("param", "source", "track"), start_tag_param_source),
    ("input", start_tag_input),
    ("hr", start_tag_hr),
    ("image", start_tag_image),
    ("isindex", start_tag_isindex),
    ("textarea", start_tag_textarea),
    ("iframe", start_tag_iframe),
    ("noscript", start_tag_noscript),
    (("noembed", "noframes"), start_tag_rawtext),
    ("select", start_tag_select),
    (("rp", "rt"), start_tag_rp_rt),
    (("option", "optgroup"), start_tag_opt),
    # ("math") and ("svg") are plain strings, the parentheses do not build
    # tuples.
    (("math"), start_tag_math),
    (("svg"), start_tag_svg),
    (("caption", "col", "colgroup", "frame", "head",
      "tbody", "td", "tfoot", "th", "thead", "tr"), start_tag_misplaced)
])
# End tag routing for this phase: each entry pairs a tag name, or a group
# of tag names, with the method handling it.
# NOTE(review): names missing from the table presumably end up in
# end_tag_other - confirm against dispatch().
end_tag_handler = dispatch([
    ("body", end_tag_body),
    ("html", end_tag_html),
    (("address", "article", "aside", "blockquote", "button", "center",
      "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
      "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
      "section", "summary", "ul"), end_tag_block),
    ("form", end_tag_form),
    ("p", end_tag_p),
    (("dd", "dt", "li"), end_tag_list_item),
    (heading_elements, end_tag_heading),
    # Formatting elements go through the adoption agency algorithm.
    (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
      "strike", "strong", "tt", "u"), end_tag_formatting),
    (("applet", "marquee", "object"), end_tag_applet_marquee_object),
    ("br", end_tag_br),
])
class TextPhase(Phase):
    """Phase collecting the text content of a single element.

    Characters are inserted as text. An end tag (or the end of the input)
    pops the current element and hands control back to the phase stored
    as ``original_phase`` on the parser.
    """

    __slots__ = ()

    def process_characters(self, token):
        """Insert the token's character data into the tree."""
        text = token["data"]
        self.tree.insert_text(text)

    def process_eof(self):
        """Report the unclosed element, pop it and restore the saved phase.

        Always returns ``True``.
        """
        parser = self.parser
        stack = self.tree.open_elements
        parser.parse_error(
            "expected-named-closing-tag-but-got-eof",
            {"name": stack[-1].name})
        stack.pop()
        parser.phase = parser.original_phase
        return True

    def start_tag_other(self, token):
        """Fail: no start tag is expected in this phase."""
        assert False, (  # pragma: no cover
            f"Tried to process start tag {token['name']} in RCDATA/RAWTEXT mode")

    def end_tag_script(self, token):
        """Pop the ``script`` element and restore the saved phase."""
        script = self.tree.open_elements.pop()
        assert script.name == "script"
        parser = self.parser
        parser.phase = parser.original_phase
        # The rest of this method is all stuff that only happens if
        # document.write works.

    def end_tag_other(self, token):
        """Pop the current element and restore the saved phase."""
        self.tree.open_elements.pop()
        parser = self.parser
        parser.phase = parser.original_phase

    start_tag_handler = dispatch([])
    end_tag_handler = dispatch([("script", end_tag_script)])
class InTablePhase(Phase):
    """Phase handling tokens found directly inside a ``table`` element.

    Table structure tags switch to the matching table phase. Characters
    are buffered by the "in table text" phase. Anything else is processed
    with the "in body" rules while ``tree.insert_from_table`` is set.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    __slots__ = tuple()

    def _clear_stack_to_table_context(self):
        """Pop open elements until the current node is ``table`` or ``html``."""
        # "Clear the stack back to a table context".
        while self.tree.open_elements[-1].name not in ("table", "html"):
            # self.parser.parse_error("unexpected-implied-end-tag-in-table",
            #     {"name": self.tree.open_elements[-1].name})
            self.tree.open_elements.pop()
        # When the current node is <html> it's a fragment case.

    def process_eof(self):
        """Report an error unless the current node is ``html``."""
        if self.tree.open_elements[-1].name != "html":
            self.parser.parse_error("eof-in-table")
        else:
            assert self.parser.container
        # Stop parsing.

    def process_space_characters(self, token):
        """Switch to the "in table text" phase and let it handle the token."""
        original_phase = self.parser.phase
        self.parser.phase = self.parser.phases["in table text"]
        # The text phase needs to know where to return to.
        self.parser.phase.original_phase = original_phase
        self.parser.phase.process_space_characters(token)

    def process_characters(self, token):
        """Switch to the "in table text" phase and let it handle the token."""
        original_phase = self.parser.phase
        self.parser.phase = self.parser.phases["in table text"]
        # The text phase needs to know where to return to.
        self.parser.phase.original_phase = original_phase
        self.parser.phase.process_characters(token)

    def insert_text(self, token):
        """Process characters with the "in body" rules, inserting from table."""
        # If we get here there must be at least one non-whitespace character.
        # Do the table magic!
        self.tree.insert_from_table = True
        self.parser.phases["in body"].process_characters(token)
        self.tree.insert_from_table = False

    def start_tag_caption(self, token):
        """Insert a ``caption`` element and switch to the "in caption" phase."""
        self._clear_stack_to_table_context()
        self.tree.active_formatting_elements.append(Marker)
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in caption"]

    def start_tag_colgroup(self, token):
        """Insert a ``colgroup`` and switch to the "in column group" phase."""
        self._clear_stack_to_table_context()
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in column group"]

    def start_tag_col(self, token):
        """Imply a ``colgroup`` start tag, then return the token to reprocess."""
        self.start_tag_colgroup(implied_tag_token("colgroup", "START_TAG"))
        return token

    def start_tag_rowgroup(self, token):
        """Insert a row group element and switch to the "in table body" phase."""
        self._clear_stack_to_table_context()
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in table body"]

    def start_tag_imply_tbody(self, token):
        """Imply a ``tbody`` start tag, then return the token to reprocess."""
        self.start_tag_rowgroup(implied_tag_token("tbody", "START_TAG"))
        return token

    def start_tag_table(self, token):
        """Treat a nested ``table`` start tag as an implied ``</table>``.

        Returns the token for reprocessing unless the parser has a
        container (fragment case).
        """
        self.parser.parse_error(
            "unexpected-start-tag-implies-end-tag",
            {"startName": "table", "endName": "table"})
        self.parser.phase.process_end_tag(implied_tag_token("table"))
        if not self.parser.container:
            return token

    def start_tag_style_script(self, token):
        """Process ``style`` and ``script`` with the "in head" rules."""
        return self.parser.phases["in head"].process_start_tag(token)

    def start_tag_input(self, token):
        """Insert a hidden ``input`` in place, handle any other as misplaced.

        A hidden input is reported as a parse error, inserted and popped
        right away. Any other input goes through ``start_tag_other``.
        """
        if ("type" in token["data"] and
                token["data"]["type"].translate(ascii_upper_to_lower) == "hidden"):
            self.parser.parse_error("unexpected-hidden-input-in-table")
            self.tree.insert_element(token)
            # XXX associate with form.
            self.tree.open_elements.pop()
            # Acknowledge the self-closing flag, as the HTML standard asks
            # for in this case and as the other void-element handlers of
            # this file do (e.g. InColumnGroupPhase.start_tag_col).
            token["selfClosingAcknowledged"] = True
        else:
            self.start_tag_other(token)

    def start_tag_form(self, token):
        """Record a ``form`` found in a table without leaving it open.

        The element is only inserted when no form element is recorded on
        the tree yet. It becomes the recorded form element and is popped
        right away.
        """
        self.parser.parse_error("unexpected-form-in-table")
        if self.tree.form_element is None:
            self.tree.insert_element(token)
            self.tree.form_element = self.tree.open_elements[-1]
            self.tree.open_elements.pop()

    def start_tag_other(self, token):
        """Process a misplaced start tag with the "in body" rules."""
        self.parser.parse_error(
            "unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insert_from_table = True
        self.parser.phases["in body"].process_start_tag(token)
        self.tree.insert_from_table = False

    def end_tag_table(self, token):
        """Close the ``table`` element and reset the insertion mode."""
        if self.tree.element_in_scope("table", variant="table"):
            self.tree.generate_implied_end_tags()
            if self.tree.open_elements[-1].name != "table":
                self.parser.parse_error("end-tag-too-early-named", {
                    "gotName": "table",
                    "expectedName": self.tree.open_elements[-1].name})
            # Pop up to and including the table element.
            while self.tree.open_elements[-1].name != "table":
                self.tree.open_elements.pop()
            self.tree.open_elements.pop()
            self.parser.reset_insertion_mode()
        else:
            # Fragment case.
            assert self.parser.container
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def end_tag_ignore(self, token):
        """Ignore the end tag and report a parse error."""
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def end_tag_other(self, token):
        """Process a misplaced end tag with the "in body" rules."""
        self.parser.parse_error(
            "unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insert_from_table = True
        self.parser.phases["in body"].process_end_tag(token)
        self.tree.insert_from_table = False

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        ("caption", start_tag_caption),
        ("colgroup", start_tag_colgroup),
        ("col", start_tag_col),
        (("tbody", "tfoot", "thead"), start_tag_rowgroup),
        (("td", "th", "tr"), start_tag_imply_tbody),
        ("table", start_tag_table),
        (("style", "script"), start_tag_style_script),
        ("input", start_tag_input),
        ("form", start_tag_form)
    ])
    end_tag_handler = dispatch([
        ("table", end_tag_table),
        (("body", "caption", "col", "colgroup", "html", "tbody", "td",
          "tfoot", "th", "thead", "tr"), end_tag_ignore)
    ])
class InTableTextPhase(Phase):
    """Phase buffering character tokens met inside a table.

    Tokens are collected in ``character_tokens`` until a token of another
    kind arrives. The buffer is then flushed, the phase stored in
    ``original_phase`` is restored on the parser and the new token is
    returned for reprocessing.
    """

    __slots__ = ("original_phase", "character_tokens")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.original_phase = None
        self.character_tokens = []

    def flush_characters(self):
        """Insert the buffered characters and empty the buffer.

        Text containing anything besides space characters goes through the
        "in table" phase's ``insert_text``. Space-only text is inserted
        directly.
        """
        pending = "".join(item["data"] for item in self.character_tokens)
        has_non_space = any(
            character not in space_characters for character in pending)
        if has_non_space:
            token = {"type": Token.CHARACTERS, "data": pending}
            self.parser.phases["in table"].insert_text(token)
        elif pending:
            self.tree.insert_text(pending)
        self.character_tokens = []

    def _flush_and_leave(self, result):
        """Flush the buffer, restore the original phase, return ``result``."""
        self.flush_characters()
        self.parser.phase = self.original_phase
        return result

    def process_comment(self, token):
        """Leave the phase and return the comment token."""
        return self._flush_and_leave(token)

    def process_eof(self):
        """Leave the phase and return ``True``."""
        return self._flush_and_leave(True)

    def process_characters(self, token):
        """Buffer the token unless its data is a lone NULL character."""
        if token["data"] != "\u0000":
            self.character_tokens.append(token)

    def process_space_characters(self, token):
        """Buffer the space character token."""
        # Pretty sure we should never reach here.
        self.character_tokens.append(token)
        # assert False

    def process_start_tag(self, token):
        """Leave the phase and return the start tag token."""
        return self._flush_and_leave(token)

    def process_end_tag(self, token):
        """Leave the phase and return the end tag token."""
        return self._flush_and_leave(token)
class InCaptionPhase(Phase):
    """Phase handling tokens found inside a table ``caption``.

    Most tokens are processed with the "in body" rules. Table structure
    tags first close the caption through an implied end tag.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

    __slots__ = ()

    def ignore_end_tag_caption(self):
        """Tell whether no ``caption`` element is in table scope."""
        caption_open = self.tree.element_in_scope("caption", variant="table")
        return not caption_open

    def _imply_caption_end(self, token):
        """Send an implied ``</caption>`` to the current phase.

        Returns ``token`` for reprocessing, or ``None`` when the implied
        end tag was to be ignored.
        """
        # XXX Have to duplicate logic here to find out if the tag is ignored.
        ignored = self.ignore_end_tag_caption()
        self.parser.phase.process_end_tag(implied_tag_token("caption"))
        return None if ignored else token

    def process_eof(self):
        """Handle the end of the input with the "in body" rules."""
        self.parser.phases["in body"].process_eof()

    def process_characters(self, token):
        """Handle characters with the "in body" rules."""
        body_phase = self.parser.phases["in body"]
        return body_phase.process_characters(token)

    def start_tag_table_element(self, token):
        """Close the caption before a table structure start tag."""
        self.parser.parse_error("unexpected-table-start-tag-in-caption")
        return self._imply_caption_end(token)

    def start_tag_other(self, token):
        """Handle any other start tag with the "in body" rules."""
        body_phase = self.parser.phases["in body"]
        return body_phase.process_start_tag(token)

    def end_tag_caption(self, token):
        """Close the ``caption`` element and go back to the "in table" phase."""
        if self.ignore_end_tag_caption():
            # Fragment case.
            assert self.parser.container
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})
            return
        # AT this code is quite similar to end_tag_table in "InTable".
        tree = self.tree
        tree.generate_implied_end_tags()
        if tree.open_elements[-1].name != "caption":
            self.parser.parse_error("expected-one-end-tag-but-got-another", {
                "gotName": "caption",
                "expectedName": tree.open_elements[-1].name})
        # Pop up to and including the caption element.
        while tree.open_elements[-1].name != "caption":
            tree.open_elements.pop()
        tree.open_elements.pop()
        tree.clear_active_formatting_elements()
        self.parser.phase = self.parser.phases["in table"]

    def end_tag_table(self, token):
        """Close the caption before handling ``</table>``."""
        self.parser.parse_error("unexpected-table-end-tag-in-caption")
        return self._imply_caption_end(token)

    def end_tag_ignore(self, token):
        """Ignore the end tag and report a parse error."""
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def end_tag_other(self, token):
        """Handle any other end tag with the "in body" rules."""
        body_phase = self.parser.phases["in body"]
        return body_phase.process_end_tag(token)

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), start_tag_table_element)
    ])
    end_tag_handler = dispatch([
        ("caption", end_tag_caption),
        ("table", end_tag_table),
        (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
          "thead", "tr"), end_tag_ignore)
    ])
class InColumnGroupPhase(Phase):
    """Phase handling tokens found inside a ``colgroup`` element.

    Only ``col`` elements are inserted here. Any other token closes the
    column group through an implied end tag and is then reprocessed.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column

    __slots__ = ()

    def ignore_end_tag_colgroup(self):
        """Tell whether the current node is the ``html`` element."""
        current_node = self.tree.open_elements[-1]
        return current_node.name == "html"

    def _imply_colgroup_end(self, result):
        """Handle an implied ``</colgroup>``.

        Returns ``result``, or ``None`` when the implied end tag was to be
        ignored.
        """
        ignored = self.ignore_end_tag_colgroup()
        self.end_tag_colgroup(implied_tag_token("colgroup"))
        return None if ignored else result

    def process_eof(self):
        """Close the column group, unless the current node is ``html``."""
        if self.tree.open_elements[-1].name == "html":
            assert self.parser.container
            return None
        return self._imply_colgroup_end(True)

    def process_characters(self, token):
        """Close the column group and return the token for reprocessing."""
        return self._imply_colgroup_end(token)

    def start_tag_col(self, token):
        """Insert a ``col`` element and pop it right away."""
        tree = self.tree
        tree.insert_element(token)
        tree.open_elements.pop()
        token["selfClosingAcknowledged"] = True

    def start_tag_other(self, token):
        """Close the column group and return the token for reprocessing."""
        return self._imply_colgroup_end(token)

    def end_tag_colgroup(self, token):
        """Pop the ``colgroup`` and go back to the "in table" phase."""
        if not self.ignore_end_tag_colgroup():
            self.tree.open_elements.pop()
            self.parser.phase = self.parser.phases["in table"]
            return
        # Fragment case.
        assert self.parser.container
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def end_tag_col(self, token):
        """Report ``</col>`` as a parse error."""
        self.parser.parse_error("no-end-tag", {"name": "col"})

    def end_tag_other(self, token):
        """Close the column group and return the token for reprocessing."""
        return self._imply_colgroup_end(token)

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        ("col", start_tag_col)
    ])
    end_tag_handler = dispatch([
        ("colgroup", end_tag_colgroup),
        ("col", end_tag_col)
    ])
class InTableBodyPhase(Phase):
    """Phase handling tokens inside ``tbody``, ``thead`` or ``tfoot``.

    Rows switch to the "in row" phase. Tags not handled here are delegated
    to the "in table" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

    __slots__ = ()

    def _clear_stack_to_table_body_context(self):
        """Pop open elements until a row group or ``html`` is current."""
        stack = self.tree.open_elements
        while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
            # self.parser.parse_error("unexpected-implied-end-tag-in-table",
            #     {"name": self.tree.open_elements[-1].name})
            stack.pop()
        if stack[-1].name == "html":
            assert self.parser.container

    def _rowgroup_in_table_scope(self):
        """Tell whether a row group element is in table scope."""
        return any(
            self.tree.element_in_scope(name, variant="table")
            for name in ("tbody", "thead", "tfoot"))

    def _close_rowgroup_and_reprocess(self, token):
        """Close the current row group and return ``token``."""
        self._clear_stack_to_table_body_context()
        current_name = self.tree.open_elements[-1].name
        self.end_tag_table_rowgroup(implied_tag_token(current_name))
        return token

    def process_eof(self):
        """Handle the end of the input with the "in table" rules."""
        self.parser.phases["in table"].process_eof()

    def process_space_characters(self, token):
        """Handle space characters with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_space_characters(token)

    def process_characters(self, token):
        """Handle characters with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_characters(token)

    def start_tag_tr(self, token):
        """Insert a ``tr`` element and switch to the "in row" phase."""
        self._clear_stack_to_table_body_context()
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in row"]

    def start_tag_table_cell(self, token):
        """Imply a ``tr`` before a cell, then return the token to reprocess."""
        self.parser.parse_error(
            "unexpected-cell-in-table-body", {"name": token["name"]})
        self.start_tag_tr(implied_tag_token("tr", "START_TAG"))
        return token

    def start_tag_table_other(self, token):
        """Close the current row group before another table section starts."""
        # XXX AT Any ideas on how to share this with end_tag_table?
        if self._rowgroup_in_table_scope():
            return self._close_rowgroup_and_reprocess(token)
        # Fragment case.
        assert self.parser.container
        self.parser.parse_error(
            "unexpected-start-tag-out-of-table", {"name": token["name"]})
        return None

    def start_tag_other(self, token):
        """Handle any other start tag with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_start_tag(token)

    def end_tag_table_rowgroup(self, token):
        """Close the named row group and go back to the "in table" phase."""
        if not self.tree.element_in_scope(token["name"], variant="table"):
            self.parser.parse_error(
                "unexpected-end-tag-in-table-body", {"name": token["name"]})
            return
        self._clear_stack_to_table_body_context()
        self.tree.open_elements.pop()
        self.parser.phase = self.parser.phases["in table"]

    def end_tag_table(self, token):
        """Close the current row group before handling ``</table>``."""
        if self._rowgroup_in_table_scope():
            return self._close_rowgroup_and_reprocess(token)
        # Fragment case.
        assert self.parser.container
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})
        return None

    def end_tag_ignore(self, token):
        """Ignore the end tag and report a parse error."""
        self.parser.parse_error(
            "unexpected-end-tag-in-table-body", {"name": token["name"]})

    def end_tag_other(self, token):
        """Handle any other end tag with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_end_tag(token)

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        ("tr", start_tag_tr),
        (("td", "th"), start_tag_table_cell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
         start_tag_table_other)
    ])
    end_tag_handler = dispatch([
        (("tbody", "tfoot", "thead"), end_tag_table_rowgroup),
        ("table", end_tag_table),
        (("body", "caption", "col", "colgroup", "html", "td", "th",
          "tr"), end_tag_ignore)
    ])
class InRowPhase(Phase):
    """Phase handling tokens found inside a ``tr`` element.

    Cells switch to the "in cell" phase. Tags not handled here are
    delegated to the "in table" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row

    __slots__ = ()

    def _clear_stack_to_table_row_context(self):
        """Pop open elements until ``tr`` or ``html`` is the current node.

        A parse error is reported for each popped element.
        """
        stack = self.tree.open_elements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parse_error(
                "unexpected-implied-end-tag-in-table-row",
                {"name": stack[-1].name})
            stack.pop()

    def ignore_end_tag_tr(self):
        """Tell whether no ``tr`` element is in table scope."""
        row_open = self.tree.element_in_scope("tr", variant="table")
        return not row_open

    def _imply_tr_end(self, token):
        """Handle an implied ``</tr>``.

        Returns ``token`` for reprocessing, or ``None`` when the implied
        end tag was to be ignored.
        """
        ignored = self.ignore_end_tag_tr()
        self.end_tag_tr(implied_tag_token("tr"))
        # XXX how are we sure it's always ignored in the fragment case?
        return None if ignored else token

    def process_eof(self):
        """Handle the end of the input with the "in table" rules."""
        self.parser.phases["in table"].process_eof()

    def process_space_characters(self, token):
        """Handle space characters with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_space_characters(token)

    def process_characters(self, token):
        """Handle characters with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_characters(token)

    def start_tag_table_cell(self, token):
        """Insert a cell element and switch to the "in cell" phase."""
        self._clear_stack_to_table_row_context()
        self.tree.insert_element(token)
        self.parser.phase = self.parser.phases["in cell"]
        self.tree.active_formatting_elements.append(Marker)

    def start_tag_table_other(self, token):
        """Close the row before another table section or row starts."""
        return self._imply_tr_end(token)

    def start_tag_other(self, token):
        """Handle any other start tag with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_start_tag(token)

    def end_tag_tr(self, token):
        """Close the ``tr`` element and go back to the "in table body" phase."""
        if self.ignore_end_tag_tr():
            # Fragment case.
            assert self.parser.container
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})
            return
        self._clear_stack_to_table_row_context()
        self.tree.open_elements.pop()
        self.parser.phase = self.parser.phases["in table body"]

    def end_tag_table(self, token):
        """Close the row before handling ``</table>``."""
        # Reprocess the current tag if the tr end tag was not ignored.
        return self._imply_tr_end(token)

    def end_tag_table_rowgroup(self, token):
        """Close the row before handling a row group end tag.

        The tag is only reprocessed when the named row group is in table
        scope. A parse error is reported otherwise.
        """
        if not self.tree.element_in_scope(token["name"], variant="table"):
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})
            return None
        self.end_tag_tr(implied_tag_token("tr"))
        return token

    def end_tag_ignore(self, token):
        """Ignore the end tag and report a parse error."""
        self.parser.parse_error(
            "unexpected-end-tag-in-table-row", {"name": token["name"]})

    def end_tag_other(self, token):
        """Handle any other end tag with the "in table" rules."""
        table_phase = self.parser.phases["in table"]
        return table_phase.process_end_tag(token)

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        (("td", "th"), start_tag_table_cell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
          "tr"), start_tag_table_other)
    ])
    end_tag_handler = dispatch([
        ("tr", end_tag_tr),
        ("table", end_tag_table),
        (("tbody", "tfoot", "thead"), end_tag_table_rowgroup),
        (("body", "caption", "col", "colgroup", "html", "td", "th"), end_tag_ignore)
    ])
class InCellPhase(Phase):
    """Handle tokens while the parser is in the "in cell" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = ()

    def _close_cell(self):
        """Close the ``td`` in table scope, or else the ``th`` in table scope."""
        for cell_name in ("td", "th"):
            if self.tree.element_in_scope(cell_name, variant="table"):
                self.end_tag_table_cell(implied_tag_token(cell_name))
                break

    def process_eof(self):
        """Delegate end-of-file handling to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        in_body.process_eof()

    def process_characters(self, token):
        """Delegate character handling to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_characters(token)

    def start_tag_table_other(self, token):
        """Close the open cell and return the start tag.

        When no cell is in table scope, a parse error is reported and nothing
        is returned.
        """
        cell_in_scope = any(
            self.tree.element_in_scope(cell_name, variant="table")
            for cell_name in ("td", "th"))
        if not cell_in_scope:
            # Fragment case.
            assert self.parser.container
            self.parser.parse_error(
                "unexpected-start-tag-out-of-table-cell", {"name": token["name"]})
            return None
        self._close_cell()
        return token

    def start_tag_other(self, token):
        """Delegate any other start tag to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def end_tag_table_cell(self, token):
        """Close the named cell and switch to the "in row" phase."""
        cell_name = token["name"]
        if not self.tree.element_in_scope(cell_name, variant="table"):
            self.parser.parse_error("unexpected-end-tag", {"name": cell_name})
            return
        self.tree.generate_implied_end_tags(cell_name)
        if self.tree.open_elements[-1].name == cell_name:
            self.tree.open_elements.pop()
        else:
            self.parser.parse_error("unexpected-cell-end-tag", {"name": cell_name})
            # Pop everything up to and including the cell element.
            while self.tree.open_elements.pop().name != cell_name:
                pass
        self.tree.clear_active_formatting_elements()
        self.parser.phase = self.parser.phases["in row"]

    def end_tag_ignore(self, token):
        """Report a parse error and otherwise ignore the end tag."""
        self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

    def end_tag_imply(self, token):
        """Close the open cell and return the end tag if it is in table scope."""
        if not self.tree.element_in_scope(token["name"], variant="table"):
            # Sometimes fragment case.
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})
            return None
        self._close_cell()
        return token

    def end_tag_other(self, token):
        """Delegate any other end tag to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_end_tag(token)

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        (
            ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            start_tag_table_other,
        ),
    ])
    end_tag_handler = dispatch([
        (("td", "th"), end_tag_table_cell),
        (("body", "caption", "col", "colgroup", "html"), end_tag_ignore),
        (("table", "tbody", "tfoot", "thead", "tr"), end_tag_imply),
    ])
class InSelectPhase(Phase):
    """Handle tokens while the parser is in the "in select" phase."""
    __slots__ = ()
    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    def _pop_current_node_if(self, name):
        """Pop the current node when it is called ``name``.

        Returns ``True`` if a node was popped, ``False`` otherwise.
        """
        stack = self.tree.open_elements
        if stack[-1].name != name:
            return False
        stack.pop()
        return True

    def process_eof(self):
        """Report a parse error unless the current node is ``html``."""
        if self.tree.open_elements[-1].name == "html":
            assert self.parser.container
        else:
            self.parser.parse_error("eof-in-select")

    def process_characters(self, token):
        """Insert the characters as text, dropping a lone U+0000."""
        if token["data"] != "\u0000":
            self.tree.insert_text(token["data"])

    def start_tag_option(self, token):
        """Insert an ``option`` element, closing an open one first."""
        # We need to imply </option> if <option> is the current node.
        self._pop_current_node_if("option")
        self.tree.insert_element(token)

    def start_tag_optgroup(self, token):
        """Insert an ``optgroup``, closing an open option/optgroup first."""
        self._pop_current_node_if("option")
        self._pop_current_node_if("optgroup")
        self.tree.insert_element(token)

    def start_tag_select(self, token):
        """Treat a nested ``<select>`` as a ``</select>`` after a parse error."""
        self.parser.parse_error("unexpected-select-in-select")
        self.end_tag_select(implied_tag_token("select"))

    def start_tag_input(self, token):
        """Close the select, if in select scope, and return the start tag."""
        self.parser.parse_error("unexpected-input-in-select")
        if not self.tree.element_in_scope("select", variant="select"):
            assert self.parser.container
            return None
        self.end_tag_select(implied_tag_token("select"))
        return token

    def start_tag_script(self, token):
        """Delegate ``<script>`` to the "in head" phase."""
        in_head = self.parser.phases["in head"]
        return in_head.process_start_tag(token)

    def start_tag_other(self, token):
        """Report a parse error and otherwise ignore the start tag."""
        self.parser.parse_error(
            "unexpected-start-tag-in-select", {"name": token["name"]})

    def end_tag_option(self, token):
        """Pop the current node if it is an ``option``, else report an error."""
        if not self._pop_current_node_if("option"):
            self.parser.parse_error(
                "unexpected-end-tag-in-select", {"name": "option"})

    def end_tag_optgroup(self, token):
        """Close the current ``optgroup``, together with an ``option`` inside it."""
        stack = self.tree.open_elements
        # </optgroup> implicitly closes <option>.
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>, but nothing else.
        if not self._pop_current_node_if("optgroup"):
            self.parser.parse_error(
                "unexpected-end-tag-in-select", {"name": "optgroup"})

    def end_tag_select(self, token):
        """Pop up to and including ``select``, then reset the insertion mode."""
        if not self.tree.element_in_scope("select", variant="select"):
            # Fragment case.
            assert self.parser.container
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})
            return
        while self.tree.open_elements.pop().name != "select":
            pass
        self.parser.reset_insertion_mode()

    def end_tag_other(self, token):
        """Report a parse error and otherwise ignore the end tag."""
        self.parser.parse_error(
            "unexpected-end-tag-in-select", {"name": token["name"]})

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        ("option", start_tag_option),
        ("optgroup", start_tag_optgroup),
        ("select", start_tag_select),
        (("input", "keygen", "textarea"), start_tag_input),
        ("script", start_tag_script),
    ])
    end_tag_handler = dispatch([
        ("option", end_tag_option),
        ("optgroup", end_tag_optgroup),
        ("select", end_tag_select),
    ])
class InSelectInTablePhase(Phase):
    """Handle tokens while the parser is in the "in select in table" phase.

    Everything except table-related start and end tags is delegated to the
    "in select" phase.
    """
    __slots__ = ()

    def process_eof(self):
        """Delegate end-of-file handling to the "in select" phase."""
        in_select = self.parser.phases["in select"]
        in_select.process_eof()

    def process_characters(self, token):
        """Delegate character handling to the "in select" phase."""
        in_select = self.parser.phases["in select"]
        return in_select.process_characters(token)

    def start_tag_table(self, token):
        """Report an error, act as if ``</select>`` was seen, return the tag."""
        error_data = {"name": token["name"]}
        self.parser.parse_error(
            "unexpected-table-element-start-tag-in-select-in-table", error_data)
        self.end_tag_other(implied_tag_token("select"))
        return token

    def start_tag_other(self, token):
        """Delegate any other start tag to the "in select" phase."""
        in_select = self.parser.phases["in select"]
        return in_select.process_start_tag(token)

    def end_tag_table(self, token):
        """Report an error; close the select if the element is in table scope.

        The end tag is returned only when the select was closed.
        """
        error_data = {"name": token["name"]}
        self.parser.parse_error(
            "unexpected-table-element-end-tag-in-select-in-table", error_data)
        if not self.tree.element_in_scope(token["name"], variant="table"):
            return None
        self.end_tag_other(implied_tag_token("select"))
        return token

    def end_tag_other(self, token):
        """Delegate any other end tag to the "in select" phase."""
        in_select = self.parser.phases["in select"]
        return in_select.process_end_tag(token)

    start_tag_handler = dispatch([
        (
            ("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
            start_tag_table,
        ),
    ])
    end_tag_handler = dispatch([
        (
            ("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
            end_tag_table,
        ),
    ])
class InForeignContentPhase(Phase):
    """Handle tokens while the current node is in a foreign namespace."""
    __slots__ = tuple()

    # HTML start tags that break out of foreign content.
    breakout_elements = frozenset([
        "b", "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl",
        "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "i",
        "img", "li", "listing", "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s",
        "small", "span", "strong", "strike", "sub", "sup", "table", "tt", "u", "ul",
        "var"])

    # Lowercased SVG tag name -> mixed-case name. Built once for the class
    # instead of on every call to adjust_svg_tag_names().
    svg_tag_replacements = {
        "altglyph": "altGlyph",
        "altglyphdef": "altGlyphDef",
        "altglyphitem": "altGlyphItem",
        "animatecolor": "animateColor",
        "animatemotion": "animateMotion",
        "animatetransform": "animateTransform",
        "clippath": "clipPath",
        "feblend": "feBlend",
        "fecolormatrix": "feColorMatrix",
        "fecomponenttransfer": "feComponentTransfer",
        "fecomposite": "feComposite",
        "feconvolvematrix": "feConvolveMatrix",
        "fediffuselighting": "feDiffuseLighting",
        "fedisplacementmap": "feDisplacementMap",
        "fedistantlight": "feDistantLight",
        "feflood": "feFlood",
        "fefunca": "feFuncA",
        "fefuncb": "feFuncB",
        "fefuncg": "feFuncG",
        "fefuncr": "feFuncR",
        "fegaussianblur": "feGaussianBlur",
        "feimage": "feImage",
        "femerge": "feMerge",
        "femergenode": "feMergeNode",
        "femorphology": "feMorphology",
        "feoffset": "feOffset",
        "fepointlight": "fePointLight",
        "fespecularlighting": "feSpecularLighting",
        "fespotlight": "feSpotLight",
        "fetile": "feTile",
        "feturbulence": "feTurbulence",
        "foreignobject": "foreignObject",
        "glyphref": "glyphRef",
        "lineargradient": "linearGradient",
        "radialgradient": "radialGradient",
        "textpath": "textPath",
    }

    def adjust_svg_tag_names(self, token):
        """Replace a lowercased SVG tag name in ``token`` by its mixed-case form.

        ``token["name"]`` is updated in place; names without an entry in
        ``svg_tag_replacements`` are left untouched.
        """
        replacement = self.svg_tag_replacements.get(token["name"])
        if replacement is not None:
            token["name"] = replacement

    def process_characters(self, token):
        """Insert characters, replacing a lone U+0000 by U+FFFD.

        Any non-space character also clears the parser's ``frameset_ok`` flag.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.frameset_ok and
                any(char not in space_characters for char in token["data"])):
            self.parser.frameset_ok = False
        Phase.process_characters(self, token)

    def process_start_tag(self, token):
        """Insert a foreign element, or break out of foreign content.

        Returns ``token`` when an HTML breakout element was found (after
        popping the foreign elements), ``None`` otherwise.
        """
        # Captured before anything is popped or inserted.
        current_node = self.tree.open_elements[-1]
        is_breakout = token["name"] in self.breakout_elements or (
            token["name"] == "font" and
            not {"color", "face", "size"}.isdisjoint(token["data"]))

        if is_breakout:
            self.parser.parse_error(
                "unexpected-html-element-in-foreign-content", {"name": token["name"]})
            # Pop until an HTML element or an integration point is current.
            while (self.tree.open_elements[-1].namespace !=
                    self.tree.default_namespace and
                    not self.parser.is_html_integration_point(
                        self.tree.open_elements[-1]) and
                    not self.parser.is_mathml_text_integration_point(
                        self.tree.open_elements[-1])):
                self.tree.open_elements.pop()
            return token

        if current_node.namespace == namespaces["mathml"]:
            self.parser.adjust_mathml_attributes(token)
        elif current_node.namespace == namespaces["svg"]:
            self.adjust_svg_tag_names(token)
            self.parser.adjust_svg_attributes(token)
        self.parser.adjust_foreign_attributes(token)
        # The new element lives in the namespace of the current node.
        token["namespace"] = current_node.namespace
        self.tree.insert_element(token)
        if token["selfClosing"]:
            self.tree.open_elements.pop()
            token["selfClosingAcknowledged"] = True
        return None

    def process_end_tag(self, token):
        """Close the matching foreign element, or defer to the current phase.

        The stack of open elements is walked from the current node downwards,
        comparing ASCII-lowercased names. Returns whatever the current phase's
        ``process_end_tag`` returns when an element in the default namespace
        is reached first, ``None`` otherwise.
        """
        node_index = len(self.tree.open_elements) - 1
        node = self.tree.open_elements[-1]
        if node.name.translate(ascii_upper_to_lower) != token["name"]:
            self.parser.parse_error("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(ascii_upper_to_lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["in table text"]:
                    self.parser.phase.flush_characters()
                    self.parser.phase = self.parser.phase.original_phase
                # Pop up to and including the matched node.
                while self.tree.open_elements.pop() != node:
                    assert self.tree.open_elements
                new_token = None
                break
            node_index -= 1

            node = self.tree.open_elements[node_index]
            if node.namespace != self.tree.default_namespace:
                continue
            else:
                new_token = self.parser.phase.process_end_tag(token)
                break
        return new_token
class AfterBodyPhase(Phase):
    """Handle tokens while the parser is in the "after body" phase."""
    __slots__ = ()

    def _switch_to_in_body(self, token):
        """Set the parser phase to "in body" and return ``token``."""
        self.parser.phase = self.parser.phases["in body"]
        return token

    def process_eof(self):
        """Do nothing: parsing stops here."""

    def process_comment(self, token):
        """Append the comment to the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        html_element = self.tree.open_elements[0]
        self.tree.insert_comment(token, html_element)

    def process_characters(self, token):
        """Report a parse error and switch back to the "in body" phase."""
        self.parser.parse_error("unexpected-char-after-body")
        return self._switch_to_in_body(token)

    def start_tag_html(self, token):
        """Delegate ``<html>`` to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_other(self, token):
        """Report a parse error and switch back to the "in body" phase."""
        self.parser.parse_error(
            "unexpected-start-tag-after-body", {"name": token["name"]})
        return self._switch_to_in_body(token)

    def end_tag_html(self, name):
        """Switch to "after after body", except when parsing a fragment."""
        if not self.parser.container:
            self.parser.phase = self.parser.phases["after after body"]
        else:
            self.parser.parse_error("unexpected-end-tag-after-body-innerhtml")

    def end_tag_other(self, token):
        """Report a parse error and switch back to the "in body" phase."""
        self.parser.parse_error(
            "unexpected-end-tag-after-body", {"name": token["name"]})
        return self._switch_to_in_body(token)

    start_tag_handler = dispatch([("html", start_tag_html)])
    end_tag_handler = dispatch([("html", end_tag_html)])
class InFramesetPhase(Phase):
    """Handle tokens while the parser is in the "in frameset" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = ()

    def process_eof(self):
        """Report a parse error unless the current node is ``html``."""
        if self.tree.open_elements[-1].name == "html":
            assert self.parser.container
        else:
            self.parser.parse_error("eof-in-frameset")

    def process_characters(self, token):
        """Report a parse error and otherwise ignore the characters."""
        self.parser.parse_error("unexpected-char-in-frameset")

    def start_tag_frameset(self, token):
        """Insert a nested ``frameset`` element."""
        self.tree.insert_element(token)

    def start_tag_frame(self, token):
        """Insert a ``frame`` element and immediately pop it again."""
        self.tree.insert_element(token)
        self.tree.open_elements.pop()

    def start_tag_noframes(self, token):
        """Delegate ``<noframes>`` to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_other(self, token):
        """Report a parse error and otherwise ignore the start tag."""
        self.parser.parse_error(
            "unexpected-start-tag-in-frameset", {"name": token["name"]})

    def end_tag_frameset(self, token):
        """Pop the current ``frameset`` and possibly leave this phase."""
        if self.tree.open_elements[-1].name != "html":
            self.tree.open_elements.pop()
        else:
            # Fragment case.
            self.parser.parse_error("unexpected-frameset-in-frameset-innerhtml")
        # If we're not in fragment mode and the current node is not a
        # "frameset" element (anymore) then switch.
        left_framesets = self.tree.open_elements[-1].name != "frameset"
        if not self.parser.container and left_framesets:
            self.parser.phase = self.parser.phases["after frameset"]

    def end_tag_other(self, token):
        """Report a parse error and otherwise ignore the end tag."""
        self.parser.parse_error(
            "unexpected-end-tag-in-frameset", {"name": token["name"]})

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        ("frameset", start_tag_frameset),
        ("frame", start_tag_frame),
        ("noframes", start_tag_noframes),
    ])
    end_tag_handler = dispatch([("frameset", end_tag_frameset)])
class AfterFramesetPhase(Phase):
    """Handle tokens while the parser is in the "after frameset" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = ()

    def process_eof(self):
        """Do nothing: parsing stops here."""

    def process_characters(self, token):
        """Report a parse error and otherwise ignore the characters."""
        self.parser.parse_error("unexpected-char-after-frameset")

    def start_tag_noframes(self, token):
        """Delegate ``<noframes>`` to the "in head" phase."""
        in_head = self.parser.phases["in head"]
        return in_head.process_start_tag(token)

    def start_tag_other(self, token):
        """Report a parse error and otherwise ignore the start tag."""
        error_data = {"name": token["name"]}
        self.parser.parse_error("unexpected-start-tag-after-frameset", error_data)

    def end_tag_html(self, token):
        """Switch to the "after after frameset" phase."""
        self.parser.phase = self.parser.phases["after after frameset"]

    def end_tag_other(self, token):
        """Report a parse error and otherwise ignore the end tag."""
        error_data = {"name": token["name"]}
        self.parser.parse_error("unexpected-end-tag-after-frameset", error_data)

    start_tag_handler = dispatch([
        ("html", Phase.start_tag_html),
        ("noframes", start_tag_noframes),
    ])
    end_tag_handler = dispatch([("html", end_tag_html)])
class AfterAfterBodyPhase(Phase):
    """Handle tokens while the parser is in the "after after body" phase."""
    __slots__ = ()

    def _switch_to_in_body(self, token):
        """Set the parser phase to "in body" and return ``token``."""
        self.parser.phase = self.parser.phases["in body"]
        return token

    def process_eof(self):
        """Do nothing at end of file."""

    def process_comment(self, token):
        """Append the comment to the document itself."""
        document = self.tree.document
        self.tree.insert_comment(token, document)

    def process_space_characters(self, token):
        """Delegate whitespace handling to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_space_characters(token)

    def process_characters(self, token):
        """Report a parse error and switch back to the "in body" phase."""
        self.parser.parse_error("expected-eof-but-got-char")
        return self._switch_to_in_body(token)

    def start_tag_html(self, token):
        """Delegate ``<html>`` to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_other(self, token):
        """Report a parse error and switch back to the "in body" phase."""
        self.parser.parse_error(
            "expected-eof-but-got-start-tag", {"name": token["name"]})
        return self._switch_to_in_body(token)

    def process_end_tag(self, token):
        """Report a parse error and switch back to the "in body" phase."""
        self.parser.parse_error(
            "expected-eof-but-got-end-tag", {"name": token["name"]})
        return self._switch_to_in_body(token)

    start_tag_handler = dispatch([("html", start_tag_html)])
class AfterAfterFramesetPhase(Phase):
    """Handle tokens while the parser is in the "after after frameset" phase."""
    __slots__ = ()

    def process_eof(self):
        """Do nothing at end of file."""

    def process_comment(self, token):
        """Append the comment to the document itself."""
        document = self.tree.document
        self.tree.insert_comment(token, document)

    def process_space_characters(self, token):
        """Delegate whitespace handling to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_space_characters(token)

    def process_characters(self, token):
        """Report a parse error and otherwise ignore the characters."""
        self.parser.parse_error("expected-eof-but-got-char")

    def start_tag_html(self, token):
        """Delegate ``<html>`` to the "in body" phase."""
        in_body = self.parser.phases["in body"]
        return in_body.process_start_tag(token)

    def start_tag_noframes(self, token):
        """Delegate ``<noframes>`` to the "in head" phase."""
        in_head = self.parser.phases["in head"]
        return in_head.process_start_tag(token)

    def start_tag_other(self, token):
        """Report a parse error and otherwise ignore the start tag."""
        error_data = {"name": token["name"]}
        self.parser.parse_error("expected-eof-but-got-start-tag", error_data)

    def process_end_tag(self, token):
        """Report a parse error and otherwise ignore the end tag."""
        error_data = {"name": token["name"]}
        self.parser.parse_error("expected-eof-but-got-end-tag", error_data)

    start_tag_handler = dispatch([
        ("html", start_tag_html),
        ("noframes", start_tag_noframes),
    ])
# Registry mapping each phase name to the class that implements it. The names
# are the keys the phase classes use when they look up ``parser.phases[...]``.
# NOTE(review): presumably the parser instantiates these classes to build
# ``parser.phases`` -- confirm in HTMLParser.
_phases = {
    "initial": InitialPhase,
    "before html": BeforeHtmlPhase,
    "before head": BeforeHeadPhase,
    "in head": InHeadPhase,
    "in head noscript": InHeadNoscriptPhase,
    "after head": AfterHeadPhase,
    "in body": InBodyPhase,
    "text": TextPhase,
    "in table": InTablePhase,
    "in table text": InTableTextPhase,
    "in caption": InCaptionPhase,
    "in column group": InColumnGroupPhase,
    "in table body": InTableBodyPhase,
    "in row": InRowPhase,
    "in cell": InCellPhase,
    "in select": InSelectPhase,
    "in select in table": InSelectInTablePhase,
    "in foreign content": InForeignContentPhase,
    "after body": AfterBodyPhase,
    "in frameset": InFramesetPhase,
    "after frameset": AfterFramesetPhase,
    "after after body": AfterAfterBodyPhase,
    "after after frameset": AfterAfterFramesetPhase,
}
def adjust_attributes(token, replacements):
    """Rename attributes of ``token`` according to ``replacements``.

    ``token['data']`` is replaced by a new mapping of the same type in which
    every key found in ``replacements`` is swapped for its replacement; the
    order of the attributes is kept. When no attribute name needs replacing,
    the token is left untouched.
    """
    attributes = token['data']
    if not (attributes.keys() & replacements.keys()):
        return
    renamed = (
        (replacements.get(name, name), value)
        for name, value in attributes.items())
    token['data'] = type(attributes)(renamed)
def implied_tag_token(name, type="END_TAG", attributes=None, self_closing=False):
    """Build a tag token dict for a tag the parser implies itself.

    :param name: Tag name stored under ``"name"``.
    :param type: Name of the ``Token`` member stored under ``"type"``.
    :param attributes: Mapping stored under ``"data"``; a new empty dict is
        used when it is ``None``.
    :param self_closing: Value stored under ``"selfClosing"``.
    :returns: The token as a :class:`dict`.
    """
    token = {"type": Token[type], "name": name}
    token["data"] = attributes if attributes is not None else {}
    token["selfClosing"] = self_closing
    return token