123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409 |
- from __future__ import absolute_import, division, unicode_literals
- from pip._vendor.six import text_type
- import re
- from codecs import register_error, xmlcharrefreplace_errors
- from .constants import voidElements, booleanAttributes, spaceCharacters
- from .constants import rcdataElements, entities, xmlEntities
- from . import treewalkers, _utils
- from xml.sax.saxutils import escape
- _quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
- _quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
- _quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
- "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
- "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
- "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
- "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
- "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
- "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
- "\u3000]")
- _encode_entity_map = {}
- _is_ucs4 = len("\U0010FFFF") == 1
- for k, v in list(entities.items()):
- # skip multi-character entities
- if ((_is_ucs4 and len(v) > 1) or
- (not _is_ucs4 and len(v) > 2)):
- continue
- if v != "&":
- if len(v) == 2:
- v = _utils.surrogatePairToCodepoint(v)
- else:
- v = ord(v)
- if v not in _encode_entity_map or k.islower():
- # prefer < over < and similarly for &, >, etc.
- _encode_entity_map[v] = k
- def htmlentityreplace_errors(exc):
- if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
- res = []
- codepoints = []
- skip = False
- for i, c in enumerate(exc.object[exc.start:exc.end]):
- if skip:
- skip = False
- continue
- index = i + exc.start
- if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
- codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
- skip = True
- else:
- codepoint = ord(c)
- codepoints.append(codepoint)
- for cp in codepoints:
- e = _encode_entity_map.get(cp)
- if e:
- res.append("&")
- res.append(e)
- if not e.endswith(";"):
- res.append(";")
- else:
- res.append("&#x%s;" % (hex(cp)[2:]))
- return ("".join(res), exc.end)
- else:
- return xmlcharrefreplace_errors(exc)
- register_error("htmlentityreplace", htmlentityreplace_errors)
- def serialize(input, tree="etree", encoding=None, **serializer_opts):
- """Serializes the input token stream using the specified treewalker
- :arg input: the token stream to serialize
- :arg tree: the treewalker to use
- :arg encoding: the encoding to use
- :arg serializer_opts: any options to pass to the
- :py:class:`html5lib.serializer.HTMLSerializer` that gets created
- :returns: the tree serialized as a string
- Example:
- >>> from html5lib.html5parser import parse
- >>> from html5lib.serializer import serialize
- >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
- >>> serialize(token_stream, omit_optional_tags=False)
- '<html><head></head><body><p>Hi!</p></body></html>'
- """
- # XXX: Should we cache this?
- walker = treewalkers.getTreeWalker(tree)
- s = HTMLSerializer(**serializer_opts)
- return s.render(walker(input), encoding)
- class HTMLSerializer(object):
- # attribute quoting options
- quote_attr_values = "legacy" # be secure by default
- quote_char = '"'
- use_best_quote_char = True
- # tag syntax options
- omit_optional_tags = True
- minimize_boolean_attributes = True
- use_trailing_solidus = False
- space_before_trailing_solidus = True
- # escaping options
- escape_lt_in_attrs = False
- escape_rcdata = False
- resolve_entities = True
- # miscellaneous options
- alphabetical_attributes = False
- inject_meta_charset = True
- strip_whitespace = False
- sanitize = False
- options = ("quote_attr_values", "quote_char", "use_best_quote_char",
- "omit_optional_tags", "minimize_boolean_attributes",
- "use_trailing_solidus", "space_before_trailing_solidus",
- "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
- "alphabetical_attributes", "inject_meta_charset",
- "strip_whitespace", "sanitize")
- def __init__(self, **kwargs):
- """Initialize HTMLSerializer
- :arg inject_meta_charset: Whether or not to inject the meta charset.
- Defaults to ``True``.
- :arg quote_attr_values: Whether to quote attribute values that don't
- require quoting per legacy browser behavior (``"legacy"``), when
- required by the standard (``"spec"``), or always (``"always"``).
- Defaults to ``"legacy"``.
- :arg quote_char: Use given quote character for attribute quoting.
- Defaults to ``"`` which will use double quotes unless attribute
- value contains a double quote, in which case single quotes are
- used.
- :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
- values.
- Defaults to ``False``.
- :arg escape_rcdata: Whether to escape characters that need to be
- escaped within normal elements within rcdata elements such as
- style.
- Defaults to ``False``.
- :arg resolve_entities: Whether to resolve named character entities that
- appear in the source tree. The XML predefined entities < >
- & " ' are unaffected by this setting.
- Defaults to ``True``.
- :arg strip_whitespace: Whether to remove semantically meaningless
- whitespace. (This compresses all whitespace to a single space
- except within ``pre``.)
- Defaults to ``False``.
- :arg minimize_boolean_attributes: Shortens boolean attributes to give
- just the attribute value, for example::
- <input disabled="disabled">
- becomes::
- <input disabled>
- Defaults to ``True``.
- :arg use_trailing_solidus: Includes a close-tag slash at the end of the
- start tag of void elements (empty elements whose end tag is
- forbidden). E.g. ``<hr/>``.
- Defaults to ``False``.
- :arg space_before_trailing_solidus: Places a space immediately before
- the closing slash in a tag using a trailing solidus. E.g.
- ``<hr />``. Requires ``use_trailing_solidus=True``.
- Defaults to ``True``.
- :arg sanitize: Strip all unsafe or unknown constructs from output.
- See :py:class:`html5lib.filters.sanitizer.Filter`.
- Defaults to ``False``.
- :arg omit_optional_tags: Omit start/end tags that are optional.
- Defaults to ``True``.
- :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
- Defaults to ``False``.
- """
- unexpected_args = frozenset(kwargs) - frozenset(self.options)
- if len(unexpected_args) > 0:
- raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
- if 'quote_char' in kwargs:
- self.use_best_quote_char = False
- for attr in self.options:
- setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
- self.errors = []
- self.strict = False
- def encode(self, string):
- assert(isinstance(string, text_type))
- if self.encoding:
- return string.encode(self.encoding, "htmlentityreplace")
- else:
- return string
- def encodeStrict(self, string):
- assert(isinstance(string, text_type))
- if self.encoding:
- return string.encode(self.encoding, "strict")
- else:
- return string
- def serialize(self, treewalker, encoding=None):
- # pylint:disable=too-many-nested-blocks
- self.encoding = encoding
- in_cdata = False
- self.errors = []
- if encoding and self.inject_meta_charset:
- from .filters.inject_meta_charset import Filter
- treewalker = Filter(treewalker, encoding)
- # Alphabetical attributes is here under the assumption that none of
- # the later filters add or change order of attributes; it needs to be
- # before the sanitizer so escaped elements come out correctly
- if self.alphabetical_attributes:
- from .filters.alphabeticalattributes import Filter
- treewalker = Filter(treewalker)
- # WhitespaceFilter should be used before OptionalTagFilter
- # for maximum efficiently of this latter filter
- if self.strip_whitespace:
- from .filters.whitespace import Filter
- treewalker = Filter(treewalker)
- if self.sanitize:
- from .filters.sanitizer import Filter
- treewalker = Filter(treewalker)
- if self.omit_optional_tags:
- from .filters.optionaltags import Filter
- treewalker = Filter(treewalker)
- for token in treewalker:
- type = token["type"]
- if type == "Doctype":
- doctype = "<!DOCTYPE %s" % token["name"]
- if token["publicId"]:
- doctype += ' PUBLIC "%s"' % token["publicId"]
- elif token["systemId"]:
- doctype += " SYSTEM"
- if token["systemId"]:
- if token["systemId"].find('"') >= 0:
- if token["systemId"].find("'") >= 0:
- self.serializeError("System identifer contains both single and double quote characters")
- quote_char = "'"
- else:
- quote_char = '"'
- doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
- doctype += ">"
- yield self.encodeStrict(doctype)
- elif type in ("Characters", "SpaceCharacters"):
- if type == "SpaceCharacters" or in_cdata:
- if in_cdata and token["data"].find("</") >= 0:
- self.serializeError("Unexpected </ in CDATA")
- yield self.encode(token["data"])
- else:
- yield self.encode(escape(token["data"]))
- elif type in ("StartTag", "EmptyTag"):
- name = token["name"]
- yield self.encodeStrict("<%s" % name)
- if name in rcdataElements and not self.escape_rcdata:
- in_cdata = True
- elif in_cdata:
- self.serializeError("Unexpected child element of a CDATA element")
- for (_, attr_name), attr_value in token["data"].items():
- # TODO: Add namespace support here
- k = attr_name
- v = attr_value
- yield self.encodeStrict(' ')
- yield self.encodeStrict(k)
- if not self.minimize_boolean_attributes or \
- (k not in booleanAttributes.get(name, tuple()) and
- k not in booleanAttributes.get("", tuple())):
- yield self.encodeStrict("=")
- if self.quote_attr_values == "always" or len(v) == 0:
- quote_attr = True
- elif self.quote_attr_values == "spec":
- quote_attr = _quoteAttributeSpec.search(v) is not None
- elif self.quote_attr_values == "legacy":
- quote_attr = _quoteAttributeLegacy.search(v) is not None
- else:
- raise ValueError("quote_attr_values must be one of: "
- "'always', 'spec', or 'legacy'")
- v = v.replace("&", "&")
- if self.escape_lt_in_attrs:
- v = v.replace("<", "<")
- if quote_attr:
- quote_char = self.quote_char
- if self.use_best_quote_char:
- if "'" in v and '"' not in v:
- quote_char = '"'
- elif '"' in v and "'" not in v:
- quote_char = "'"
- if quote_char == "'":
- v = v.replace("'", "'")
- else:
- v = v.replace('"', """)
- yield self.encodeStrict(quote_char)
- yield self.encode(v)
- yield self.encodeStrict(quote_char)
- else:
- yield self.encode(v)
- if name in voidElements and self.use_trailing_solidus:
- if self.space_before_trailing_solidus:
- yield self.encodeStrict(" /")
- else:
- yield self.encodeStrict("/")
- yield self.encode(">")
- elif type == "EndTag":
- name = token["name"]
- if name in rcdataElements:
- in_cdata = False
- elif in_cdata:
- self.serializeError("Unexpected child element of a CDATA element")
- yield self.encodeStrict("</%s>" % name)
- elif type == "Comment":
- data = token["data"]
- if data.find("--") >= 0:
- self.serializeError("Comment contains --")
- yield self.encodeStrict("<!--%s-->" % token["data"])
- elif type == "Entity":
- name = token["name"]
- key = name + ";"
- if key not in entities:
- self.serializeError("Entity %s not recognized" % name)
- if self.resolve_entities and key not in xmlEntities:
- data = entities[key]
- else:
- data = "&%s;" % name
- yield self.encodeStrict(data)
- else:
- self.serializeError(token["data"])
- def render(self, treewalker, encoding=None):
- """Serializes the stream from the treewalker into a string
- :arg treewalker: the treewalker to serialize
- :arg encoding: the string encoding to use
- :returns: the serialized tree
- Example:
- >>> from html5lib import parse, getTreeWalker
- >>> from html5lib.serializer import HTMLSerializer
- >>> token_stream = parse('<html><body>Hi!</body></html>')
- >>> walker = getTreeWalker('etree')
- >>> serializer = HTMLSerializer(omit_optional_tags=False)
- >>> serializer.render(walker(token_stream))
- '<html><head></head><body>Hi!</body></html>'
- """
- if encoding:
- return b"".join(list(self.serialize(treewalker, encoding)))
- else:
- return "".join(list(self.serialize(treewalker)))
- def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
- # XXX The idea is to make data mandatory.
- self.errors.append(data)
- if self.strict:
- raise SerializeError
- class SerializeError(Exception):
- """Error in serialized tree"""
- pass
|