1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791 |
- from __future__ import absolute_import, division, unicode_literals
- from pip._vendor.six import with_metaclass, viewkeys
- import types
- from collections import OrderedDict
- from . import _inputstream
- from . import _tokenizer
- from . import treebuilders
- from .treebuilders.base import Marker
- from . import _utils
- from .constants import (
- spaceCharacters, asciiUpper2Lower,
- specialElements, headingElements, cdataElements, rcdataElements,
- tokenTypes, tagTokenTypes,
- namespaces,
- htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
- adjustForeignAttributes as adjustForeignAttributesMap,
- adjustMathMLAttributes, adjustSVGAttributes,
- E,
- _ReparseException
- )
- def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
- """Parse an HTML document as a string or file-like object into a tree
- :arg doc: the document to parse as a string or file-like object
- :arg treebuilder: the treebuilder to use when parsing
- :arg namespaceHTMLElements: whether or not to namespace HTML elements
- :returns: parsed tree
- Example:
- >>> from html5lib.html5parser import parse
- >>> parse('<html><body><p>This is a doc</p></body></html>')
- <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
- """
- tb = treebuilders.getTreeBuilder(treebuilder)
- p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parse(doc, **kwargs)
- def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
- """Parse an HTML fragment as a string or file-like object into a tree
- :arg doc: the fragment to parse as a string or file-like object
- :arg container: the container context to parse the fragment in
- :arg treebuilder: the treebuilder to use when parsing
- :arg namespaceHTMLElements: whether or not to namespace HTML elements
- :returns: parsed tree
- Example:
- >>> from html5lib.html5libparser import parseFragment
- >>> parseFragment('<b>this is a fragment</b>')
- <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
- """
- tb = treebuilders.getTreeBuilder(treebuilder)
- p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parseFragment(doc, container=container, **kwargs)
- def method_decorator_metaclass(function):
- class Decorated(type):
- def __new__(meta, classname, bases, classDict):
- for attributeName, attribute in classDict.items():
- if isinstance(attribute, types.FunctionType):
- attribute = function(attribute)
- classDict[attributeName] = attribute
- return type.__new__(meta, classname, bases, classDict)
- return Decorated
- class HTMLParser(object):
- """HTML parser
- Generates a tree structure from a stream of (possibly malformed) HTML.
- """
- def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
- """
- :arg tree: a treebuilder class controlling the type of tree that will be
- returned. Built in treebuilders can be accessed through
- html5lib.treebuilders.getTreeBuilder(treeType)
- :arg strict: raise an exception when a parse error is encountered
- :arg namespaceHTMLElements: whether or not to namespace HTML elements
- :arg debug: whether or not to enable debug mode which logs things
- Example:
- >>> from html5lib.html5parser import HTMLParser
- >>> parser = HTMLParser() # generates parser with etree builder
- >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
- """
- # Raise an exception on the first error encountered
- self.strict = strict
- if tree is None:
- tree = treebuilders.getTreeBuilder("etree")
- self.tree = tree(namespaceHTMLElements)
- self.errors = []
- self.phases = dict([(name, cls(self, self.tree)) for name, cls in
- getPhases(debug).items()])
- def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
- self.innerHTMLMode = innerHTML
- self.container = container
- self.scripting = scripting
- self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
- self.reset()
- try:
- self.mainLoop()
- except _ReparseException:
- self.reset()
- self.mainLoop()
- def reset(self):
- self.tree.reset()
- self.firstStartTag = False
- self.errors = []
- self.log = [] # only used with debug mode
- # "quirks" / "limited quirks" / "no quirks"
- self.compatMode = "no quirks"
- if self.innerHTMLMode:
- self.innerHTML = self.container.lower()
- if self.innerHTML in cdataElements:
- self.tokenizer.state = self.tokenizer.rcdataState
- elif self.innerHTML in rcdataElements:
- self.tokenizer.state = self.tokenizer.rawtextState
- elif self.innerHTML == 'plaintext':
- self.tokenizer.state = self.tokenizer.plaintextState
- else:
- # state already is data state
- # self.tokenizer.state = self.tokenizer.dataState
- pass
- self.phase = self.phases["beforeHtml"]
- self.phase.insertHtmlElement()
- self.resetInsertionMode()
- else:
- self.innerHTML = False # pylint:disable=redefined-variable-type
- self.phase = self.phases["initial"]
- self.lastPhase = None
- self.beforeRCDataPhase = None
- self.framesetOK = True
- @property
- def documentEncoding(self):
- """Name of the character encoding that was used to decode the input stream, or
- :obj:`None` if that is not determined yet
- """
- if not hasattr(self, 'tokenizer'):
- return None
- return self.tokenizer.stream.charEncoding[0].name
- def isHTMLIntegrationPoint(self, element):
- if (element.name == "annotation-xml" and
- element.namespace == namespaces["mathml"]):
- return ("encoding" in element.attributes and
- element.attributes["encoding"].translate(
- asciiUpper2Lower) in
- ("text/html", "application/xhtml+xml"))
- else:
- return (element.namespace, element.name) in htmlIntegrationPointElements
- def isMathMLTextIntegrationPoint(self, element):
- return (element.namespace, element.name) in mathmlTextIntegrationPointElements
- def mainLoop(self):
- CharactersToken = tokenTypes["Characters"]
- SpaceCharactersToken = tokenTypes["SpaceCharacters"]
- StartTagToken = tokenTypes["StartTag"]
- EndTagToken = tokenTypes["EndTag"]
- CommentToken = tokenTypes["Comment"]
- DoctypeToken = tokenTypes["Doctype"]
- ParseErrorToken = tokenTypes["ParseError"]
- for token in self.normalizedTokens():
- prev_token = None
- new_token = token
- while new_token is not None:
- prev_token = new_token
- currentNode = self.tree.openElements[-1] if self.tree.openElements else None
- currentNodeNamespace = currentNode.namespace if currentNode else None
- currentNodeName = currentNode.name if currentNode else None
- type = new_token["type"]
- if type == ParseErrorToken:
- self.parseError(new_token["data"], new_token.get("datavars", {}))
- new_token = None
- else:
- if (len(self.tree.openElements) == 0 or
- currentNodeNamespace == self.tree.defaultNamespace or
- (self.isMathMLTextIntegrationPoint(currentNode) and
- ((type == StartTagToken and
- token["name"] not in frozenset(["mglyph", "malignmark"])) or
- type in (CharactersToken, SpaceCharactersToken))) or
- (currentNodeNamespace == namespaces["mathml"] and
- currentNodeName == "annotation-xml" and
- type == StartTagToken and
- token["name"] == "svg") or
- (self.isHTMLIntegrationPoint(currentNode) and
- type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
- phase = self.phase
- else:
- phase = self.phases["inForeignContent"]
- if type == CharactersToken:
- new_token = phase.processCharacters(new_token)
- elif type == SpaceCharactersToken:
- new_token = phase.processSpaceCharacters(new_token)
- elif type == StartTagToken:
- new_token = phase.processStartTag(new_token)
- elif type == EndTagToken:
- new_token = phase.processEndTag(new_token)
- elif type == CommentToken:
- new_token = phase.processComment(new_token)
- elif type == DoctypeToken:
- new_token = phase.processDoctype(new_token)
- if (type == StartTagToken and prev_token["selfClosing"] and
- not prev_token["selfClosingAcknowledged"]):
- self.parseError("non-void-element-with-trailing-solidus",
- {"name": prev_token["name"]})
- # When the loop finishes it's EOF
- reprocess = True
- phases = []
- while reprocess:
- phases.append(self.phase)
- reprocess = self.phase.processEOF()
- if reprocess:
- assert self.phase not in phases
- def normalizedTokens(self):
- for token in self.tokenizer:
- yield self.normalizeToken(token)
- def parse(self, stream, *args, **kwargs):
- """Parse a HTML document into a well-formed tree
- :arg stream: a file-like object or string containing the HTML to be parsed
- The optional encoding parameter must be a string that indicates
- the encoding. If specified, that encoding will be used,
- regardless of any BOM or later declaration (such as in a meta
- element).
- :arg scripting: treat noscript elements as if JavaScript was turned on
- :returns: parsed tree
- Example:
- >>> from html5lib.html5parser import HTMLParser
- >>> parser = HTMLParser()
- >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
- <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
- """
- self._parse(stream, False, None, *args, **kwargs)
- return self.tree.getDocument()
- def parseFragment(self, stream, *args, **kwargs):
- """Parse a HTML fragment into a well-formed tree fragment
- :arg container: name of the element we're setting the innerHTML
- property if set to None, default to 'div'
- :arg stream: a file-like object or string containing the HTML to be parsed
- The optional encoding parameter must be a string that indicates
- the encoding. If specified, that encoding will be used,
- regardless of any BOM or later declaration (such as in a meta
- element)
- :arg scripting: treat noscript elements as if JavaScript was turned on
- :returns: parsed tree
- Example:
- >>> from html5lib.html5libparser import HTMLParser
- >>> parser = HTMLParser()
- >>> parser.parseFragment('<b>this is a fragment</b>')
- <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
- """
- self._parse(stream, True, *args, **kwargs)
- return self.tree.getFragment()
- def parseError(self, errorcode="XXX-undefined-error", datavars=None):
- # XXX The idea is to make errorcode mandatory.
- if datavars is None:
- datavars = {}
- self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
- if self.strict:
- raise ParseError(E[errorcode] % datavars)
- def normalizeToken(self, token):
- # HTML5 specific normalizations to the token stream
- if token["type"] == tokenTypes["StartTag"]:
- raw = token["data"]
- token["data"] = OrderedDict(raw)
- if len(raw) > len(token["data"]):
- # we had some duplicated attribute, fix so first wins
- token["data"].update(raw[::-1])
- return token
- def adjustMathMLAttributes(self, token):
- adjust_attributes(token, adjustMathMLAttributes)
- def adjustSVGAttributes(self, token):
- adjust_attributes(token, adjustSVGAttributes)
- def adjustForeignAttributes(self, token):
- adjust_attributes(token, adjustForeignAttributesMap)
- def reparseTokenNormal(self, token):
- # pylint:disable=unused-argument
- self.parser.phase()
- def resetInsertionMode(self):
- # The name of this method is mostly historical. (It's also used in the
- # specification.)
- last = False
- newModes = {
- "select": "inSelect",
- "td": "inCell",
- "th": "inCell",
- "tr": "inRow",
- "tbody": "inTableBody",
- "thead": "inTableBody",
- "tfoot": "inTableBody",
- "caption": "inCaption",
- "colgroup": "inColumnGroup",
- "table": "inTable",
- "head": "inBody",
- "body": "inBody",
- "frameset": "inFrameset",
- "html": "beforeHead"
- }
- for node in self.tree.openElements[::-1]:
- nodeName = node.name
- new_phase = None
- if node == self.tree.openElements[0]:
- assert self.innerHTML
- last = True
- nodeName = self.innerHTML
- # Check for conditions that should only happen in the innerHTML
- # case
- if nodeName in ("select", "colgroup", "head", "html"):
- assert self.innerHTML
- if not last and node.namespace != self.tree.defaultNamespace:
- continue
- if nodeName in newModes:
- new_phase = self.phases[newModes[nodeName]]
- break
- elif last:
- new_phase = self.phases["inBody"]
- break
- self.phase = new_phase
- def parseRCDataRawtext(self, token, contentType):
- # Generic RCDATA/RAWTEXT Parsing algorithm
- assert contentType in ("RAWTEXT", "RCDATA")
- self.tree.insertElement(token)
- if contentType == "RAWTEXT":
- self.tokenizer.state = self.tokenizer.rawtextState
- else:
- self.tokenizer.state = self.tokenizer.rcdataState
- self.originalPhase = self.phase
- self.phase = self.phases["text"]
- @_utils.memoize
- def getPhases(debug):
- def log(function):
- """Logger that records which phase processes each token"""
- type_names = dict((value, key) for key, value in
- tokenTypes.items())
- def wrapped(self, *args, **kwargs):
- if function.__name__.startswith("process") and len(args) > 0:
- token = args[0]
- try:
- info = {"type": type_names[token['type']]}
- except:
- raise
- if token['type'] in tagTokenTypes:
- info["name"] = token['name']
- self.parser.log.append((self.parser.tokenizer.state.__name__,
- self.parser.phase.__class__.__name__,
- self.__class__.__name__,
- function.__name__,
- info))
- return function(self, *args, **kwargs)
- else:
- return function(self, *args, **kwargs)
- return wrapped
- def getMetaclass(use_metaclass, metaclass_func):
- if use_metaclass:
- return method_decorator_metaclass(metaclass_func)
- else:
- return type
- # pylint:disable=unused-argument
- class Phase(with_metaclass(getMetaclass(debug, log))):
- """Base class for helper object that implements each phase of processing
- """
- def __init__(self, parser, tree):
- self.parser = parser
- self.tree = tree
- def processEOF(self):
- raise NotImplementedError
- def processComment(self, token):
- # For most phases the following is correct. Where it's not it will be
- # overridden.
- self.tree.insertComment(token, self.tree.openElements[-1])
- def processDoctype(self, token):
- self.parser.parseError("unexpected-doctype")
- def processCharacters(self, token):
- self.tree.insertText(token["data"])
- def processSpaceCharacters(self, token):
- self.tree.insertText(token["data"])
- def processStartTag(self, token):
- return self.startTagHandler[token["name"]](token)
- def startTagHtml(self, token):
- if not self.parser.firstStartTag and token["name"] == "html":
- self.parser.parseError("non-html-root")
- # XXX Need a check here to see if the first start tag token emitted is
- # this token... If it's not, invoke self.parser.parseError().
- for attr, value in token["data"].items():
- if attr not in self.tree.openElements[0].attributes:
- self.tree.openElements[0].attributes[attr] = value
- self.parser.firstStartTag = False
- def processEndTag(self, token):
- return self.endTagHandler[token["name"]](token)
- class InitialPhase(Phase):
- def processSpaceCharacters(self, token):
- pass
- def processComment(self, token):
- self.tree.insertComment(token, self.tree.document)
- def processDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
- correct = token["correct"]
- if (name != "html" or publicId is not None or
- systemId is not None and systemId != "about:legacy-compat"):
- self.parser.parseError("unknown-doctype")
- if publicId is None:
- publicId = ""
- self.tree.insertDoctype(token)
- if publicId != "":
- publicId = publicId.translate(asciiUpper2Lower)
- if (not correct or token["name"] != "html" or
- publicId.startswith(
- ("+//silmaril//dtd html pro v0r11 19970101//",
- "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
- "-//as//dtd html 3.0 aswedit + extensions//",
- "-//ietf//dtd html 2.0 level 1//",
- "-//ietf//dtd html 2.0 level 2//",
- "-//ietf//dtd html 2.0 strict level 1//",
- "-//ietf//dtd html 2.0 strict level 2//",
- "-//ietf//dtd html 2.0 strict//",
- "-//ietf//dtd html 2.0//",
- "-//ietf//dtd html 2.1e//",
- "-//ietf//dtd html 3.0//",
- "-//ietf//dtd html 3.2 final//",
- "-//ietf//dtd html 3.2//",
- "-//ietf//dtd html 3//",
- "-//ietf//dtd html level 0//",
- "-//ietf//dtd html level 1//",
- "-//ietf//dtd html level 2//",
- "-//ietf//dtd html level 3//",
- "-//ietf//dtd html strict level 0//",
- "-//ietf//dtd html strict level 1//",
- "-//ietf//dtd html strict level 2//",
- "-//ietf//dtd html strict level 3//",
- "-//ietf//dtd html strict//",
- "-//ietf//dtd html//",
- "-//metrius//dtd metrius presentational//",
- "-//microsoft//dtd internet explorer 2.0 html strict//",
- "-//microsoft//dtd internet explorer 2.0 html//",
- "-//microsoft//dtd internet explorer 2.0 tables//",
- "-//microsoft//dtd internet explorer 3.0 html strict//",
- "-//microsoft//dtd internet explorer 3.0 html//",
- "-//microsoft//dtd internet explorer 3.0 tables//",
- "-//netscape comm. corp.//dtd html//",
- "-//netscape comm. corp.//dtd strict html//",
- "-//o'reilly and associates//dtd html 2.0//",
- "-//o'reilly and associates//dtd html extended 1.0//",
- "-//o'reilly and associates//dtd html extended relaxed 1.0//",
- "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
- "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
- "-//spyglass//dtd html 2.0 extended//",
- "-//sq//dtd html 2.0 hotmetal + extensions//",
- "-//sun microsystems corp.//dtd hotjava html//",
- "-//sun microsystems corp.//dtd hotjava strict html//",
- "-//w3c//dtd html 3 1995-03-24//",
- "-//w3c//dtd html 3.2 draft//",
- "-//w3c//dtd html 3.2 final//",
- "-//w3c//dtd html 3.2//",
- "-//w3c//dtd html 3.2s draft//",
- "-//w3c//dtd html 4.0 frameset//",
- "-//w3c//dtd html 4.0 transitional//",
- "-//w3c//dtd html experimental 19960712//",
- "-//w3c//dtd html experimental 970421//",
- "-//w3c//dtd w3 html//",
- "-//w3o//dtd w3 html 3.0//",
- "-//webtechs//dtd mozilla html 2.0//",
- "-//webtechs//dtd mozilla html//")) or
- publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
- "-/w3c/dtd html 4.0 transitional/en",
- "html") or
- publicId.startswith(
- ("-//w3c//dtd html 4.01 frameset//",
- "-//w3c//dtd html 4.01 transitional//")) and
- systemId is None or
- systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
- self.parser.compatMode = "quirks"
- elif (publicId.startswith(
- ("-//w3c//dtd xhtml 1.0 frameset//",
- "-//w3c//dtd xhtml 1.0 transitional//")) or
- publicId.startswith(
- ("-//w3c//dtd html 4.01 frameset//",
- "-//w3c//dtd html 4.01 transitional//")) and
- systemId is not None):
- self.parser.compatMode = "limited quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
- def anythingElse(self):
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
- def processCharacters(self, token):
- self.parser.parseError("expected-doctype-but-got-chars")
- self.anythingElse()
- return token
- def processStartTag(self, token):
- self.parser.parseError("expected-doctype-but-got-start-tag",
- {"name": token["name"]})
- self.anythingElse()
- return token
- def processEndTag(self, token):
- self.parser.parseError("expected-doctype-but-got-end-tag",
- {"name": token["name"]})
- self.anythingElse()
- return token
- def processEOF(self):
- self.parser.parseError("expected-doctype-but-got-eof")
- self.anythingElse()
- return True
- class BeforeHtmlPhase(Phase):
- # helper methods
- def insertHtmlElement(self):
- self.tree.insertRoot(impliedTagToken("html", "StartTag"))
- self.parser.phase = self.parser.phases["beforeHead"]
- # other
- def processEOF(self):
- self.insertHtmlElement()
- return True
- def processComment(self, token):
- self.tree.insertComment(token, self.tree.document)
- def processSpaceCharacters(self, token):
- pass
- def processCharacters(self, token):
- self.insertHtmlElement()
- return token
- def processStartTag(self, token):
- if token["name"] == "html":
- self.parser.firstStartTag = True
- self.insertHtmlElement()
- return token
- def processEndTag(self, token):
- if token["name"] not in ("head", "body", "html", "br"):
- self.parser.parseError("unexpected-end-tag-before-html",
- {"name": token["name"]})
- else:
- self.insertHtmlElement()
- return token
- class BeforeHeadPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("head", self.startTagHead)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- (("head", "body", "html", "br"), self.endTagImplyHead)
- ])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return True
- def processSpaceCharacters(self, token):
- pass
- def processCharacters(self, token):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return token
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagHead(self, token):
- self.tree.insertElement(token)
- self.tree.headPointer = self.tree.openElements[-1]
- self.parser.phase = self.parser.phases["inHead"]
- def startTagOther(self, token):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return token
- def endTagImplyHead(self, token):
- self.startTagHead(impliedTagToken("head", "StartTag"))
- return token
- def endTagOther(self, token):
- self.parser.parseError("end-tag-after-implied-root",
- {"name": token["name"]})
- class InHeadPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("title", self.startTagTitle),
- (("noframes", "style"), self.startTagNoFramesStyle),
- ("noscript", self.startTagNoscript),
- ("script", self.startTagScript),
- (("base", "basefont", "bgsound", "command", "link"),
- self.startTagBaseLinkCommand),
- ("meta", self.startTagMeta),
- ("head", self.startTagHead)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("head", self.endTagHead),
- (("br", "html", "body"), self.endTagHtmlBodyBr)
- ])
- self.endTagHandler.default = self.endTagOther
- # the real thing
- def processEOF(self):
- self.anythingElse()
- return True
- def processCharacters(self, token):
- self.anythingElse()
- return token
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagHead(self, token):
- self.parser.parseError("two-heads-are-not-better-than-one")
- def startTagBaseLinkCommand(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- def startTagMeta(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- attributes = token["data"]
- if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
- if "charset" in attributes:
- self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
- elif ("content" in attributes and
- "http-equiv" in attributes and
- attributes["http-equiv"].lower() == "content-type"):
- # Encoding it as UTF-8 here is a hack, as really we should pass
- # the abstract Unicode string, and just use the
- # ContentAttrParser on that, but using UTF-8 allows all chars
- # to be encoded and as a ASCII-superset works.
- data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
- parser = _inputstream.ContentAttrParser(data)
- codec = parser.parse()
- self.parser.tokenizer.stream.changeEncoding(codec)
- def startTagTitle(self, token):
- self.parser.parseRCDataRawtext(token, "RCDATA")
- def startTagNoFramesStyle(self, token):
- # Need to decide whether to implement the scripting-disabled case
- self.parser.parseRCDataRawtext(token, "RAWTEXT")
- def startTagNoscript(self, token):
- if self.parser.scripting:
- self.parser.parseRCDataRawtext(token, "RAWTEXT")
- else:
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inHeadNoscript"]
- def startTagScript(self, token):
- self.tree.insertElement(token)
- self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
- self.parser.originalPhase = self.parser.phase
- self.parser.phase = self.parser.phases["text"]
- def startTagOther(self, token):
- self.anythingElse()
- return token
- def endTagHead(self, token):
- node = self.parser.tree.openElements.pop()
- assert node.name == "head", "Expected head got %s" % node.name
- self.parser.phase = self.parser.phases["afterHead"]
- def endTagHtmlBodyBr(self, token):
- self.anythingElse()
- return token
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def anythingElse(self):
- self.endTagHead(impliedTagToken("head"))
- class InHeadNoscriptPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
- (("head", "noscript"), self.startTagHeadNoscript),
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("noscript", self.endTagNoscript),
- ("br", self.endTagBr),
- ])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- self.parser.parseError("eof-in-head-noscript")
- self.anythingElse()
- return True
- def processComment(self, token):
- return self.parser.phases["inHead"].processComment(token)
- def processCharacters(self, token):
- self.parser.parseError("char-in-head-noscript")
- self.anythingElse()
- return token
- def processSpaceCharacters(self, token):
- return self.parser.phases["inHead"].processSpaceCharacters(token)
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagBaseLinkCommand(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
- def startTagHeadNoscript(self, token):
- self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
- def startTagOther(self, token):
- self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
- self.anythingElse()
- return token
- def endTagNoscript(self, token):
- node = self.parser.tree.openElements.pop()
- assert node.name == "noscript", "Expected noscript got %s" % node.name
- self.parser.phase = self.parser.phases["inHead"]
- def endTagBr(self, token):
- self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
- self.anythingElse()
- return token
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def anythingElse(self):
- # Caller must raise parse error first!
- self.endTagNoscript(impliedTagToken("noscript"))
- class AfterHeadPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("body", self.startTagBody),
- ("frameset", self.startTagFrameset),
- (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
- "style", "title"),
- self.startTagFromHead),
- ("head", self.startTagHead)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
- self.endTagHtmlBodyBr)])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- self.anythingElse()
- return True
- def processCharacters(self, token):
- self.anythingElse()
- return token
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagBody(self, token):
- self.parser.framesetOK = False
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inBody"]
- def startTagFrameset(self, token):
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inFrameset"]
- def startTagFromHead(self, token):
- self.parser.parseError("unexpected-start-tag-out-of-my-head",
- {"name": token["name"]})
- self.tree.openElements.append(self.tree.headPointer)
- self.parser.phases["inHead"].processStartTag(token)
- for node in self.tree.openElements[::-1]:
- if node.name == "head":
- self.tree.openElements.remove(node)
- break
- def startTagHead(self, token):
- self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
- def startTagOther(self, token):
- self.anythingElse()
- return token
- def endTagHtmlBodyBr(self, token):
- self.anythingElse()
- return token
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def anythingElse(self):
- self.tree.insertElement(impliedTagToken("body", "StartTag"))
- self.parser.phase = self.parser.phases["inBody"]
- self.parser.framesetOK = True
- class InBodyPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
- # the really-really-really-very crazy mode
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- # Set this to the default handler
- self.processSpaceCharacters = self.processSpaceCharactersNonPre
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("base", "basefont", "bgsound", "command", "link", "meta",
- "script", "style", "title"),
- self.startTagProcessInHead),
- ("body", self.startTagBody),
- ("frameset", self.startTagFrameset),
- (("address", "article", "aside", "blockquote", "center", "details",
- "dir", "div", "dl", "fieldset", "figcaption", "figure",
- "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
- "section", "summary", "ul"),
- self.startTagCloseP),
- (headingElements, self.startTagHeading),
- (("pre", "listing"), self.startTagPreListing),
- ("form", self.startTagForm),
- (("li", "dd", "dt"), self.startTagListItem),
- ("plaintext", self.startTagPlaintext),
- ("a", self.startTagA),
- (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
- "strong", "tt", "u"), self.startTagFormatting),
- ("nobr", self.startTagNobr),
- ("button", self.startTagButton),
- (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
- ("xmp", self.startTagXmp),
- ("table", self.startTagTable),
- (("area", "br", "embed", "img", "keygen", "wbr"),
- self.startTagVoidFormatting),
- (("param", "source", "track"), self.startTagParamSource),
- ("input", self.startTagInput),
- ("hr", self.startTagHr),
- ("image", self.startTagImage),
- ("isindex", self.startTagIsIndex),
- ("textarea", self.startTagTextarea),
- ("iframe", self.startTagIFrame),
- ("noscript", self.startTagNoscript),
- (("noembed", "noframes"), self.startTagRawtext),
- ("select", self.startTagSelect),
- (("rp", "rt"), self.startTagRpRt),
- (("option", "optgroup"), self.startTagOpt),
- (("math"), self.startTagMath),
- (("svg"), self.startTagSvg),
- (("caption", "col", "colgroup", "frame", "head",
- "tbody", "td", "tfoot", "th", "thead",
- "tr"), self.startTagMisplaced)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("body", self.endTagBody),
- ("html", self.endTagHtml),
- (("address", "article", "aside", "blockquote", "button", "center",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
- "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
- "section", "summary", "ul"), self.endTagBlock),
- ("form", self.endTagForm),
- ("p", self.endTagP),
- (("dd", "dt", "li"), self.endTagListItem),
- (headingElements, self.endTagHeading),
- (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
- "strike", "strong", "tt", "u"), self.endTagFormatting),
- (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
- ("br", self.endTagBr),
- ])
- self.endTagHandler.default = self.endTagOther
- def isMatchingFormattingElement(self, node1, node2):
- return (node1.name == node2.name and
- node1.namespace == node2.namespace and
- node1.attributes == node2.attributes)
- # helper
- def addFormattingElement(self, token):
- self.tree.insertElement(token)
- element = self.tree.openElements[-1]
- matchingElements = []
- for node in self.tree.activeFormattingElements[::-1]:
- if node is Marker:
- break
- elif self.isMatchingFormattingElement(node, element):
- matchingElements.append(node)
- assert len(matchingElements) <= 3
- if len(matchingElements) == 3:
- self.tree.activeFormattingElements.remove(matchingElements[-1])
- self.tree.activeFormattingElements.append(element)
- # the real deal
- def processEOF(self):
- allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
- "tfoot", "th", "thead", "tr", "body",
- "html"))
- for node in self.tree.openElements[::-1]:
- if node.name not in allowed_elements:
- self.parser.parseError("expected-closing-tag-but-got-eof")
- break
- # Stop parsing
- def processSpaceCharactersDropNewline(self, token):
- # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
- # want to drop leading newlines
- data = token["data"]
- self.processSpaceCharacters = self.processSpaceCharactersNonPre
- if (data.startswith("\n") and
- self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
- not self.tree.openElements[-1].hasContent()):
- data = data[1:]
- if data:
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertText(data)
- def processCharacters(self, token):
- if token["data"] == "\u0000":
- # The tokenizer should always emit null on its own
- return
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertText(token["data"])
- # This must be bad for performance
- if (self.parser.framesetOK and
- any([char not in spaceCharacters
- for char in token["data"]])):
- self.parser.framesetOK = False
- def processSpaceCharactersNonPre(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertText(token["data"])
- def startTagProcessInHead(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
- def startTagBody(self, token):
- self.parser.parseError("unexpected-start-tag", {"name": "body"})
- if (len(self.tree.openElements) == 1 or
- self.tree.openElements[1].name != "body"):
- assert self.parser.innerHTML
- else:
- self.parser.framesetOK = False
- for attr, value in token["data"].items():
- if attr not in self.tree.openElements[1].attributes:
- self.tree.openElements[1].attributes[attr] = value
- def startTagFrameset(self, token):
- self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
- if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
- assert self.parser.innerHTML
- elif not self.parser.framesetOK:
- pass
- else:
- if self.tree.openElements[1].parent:
- self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
- while self.tree.openElements[-1].name != "html":
- self.tree.openElements.pop()
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inFrameset"]
- def startTagCloseP(self, token):
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- self.tree.insertElement(token)
- def startTagPreListing(self, token):
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- self.tree.insertElement(token)
- self.parser.framesetOK = False
- self.processSpaceCharacters = self.processSpaceCharactersDropNewline
- def startTagForm(self, token):
- if self.tree.formPointer:
- self.parser.parseError("unexpected-start-tag", {"name": "form"})
- else:
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- self.tree.insertElement(token)
- self.tree.formPointer = self.tree.openElements[-1]
- def startTagListItem(self, token):
- self.parser.framesetOK = False
- stopNamesMap = {"li": ["li"],
- "dt": ["dt", "dd"],
- "dd": ["dt", "dd"]}
- stopNames = stopNamesMap[token["name"]]
- for node in reversed(self.tree.openElements):
- if node.name in stopNames:
- self.parser.phase.processEndTag(
- impliedTagToken(node.name, "EndTag"))
- break
- if (node.nameTuple in specialElements and
- node.name not in ("address", "div", "p")):
- break
- if self.tree.elementInScope("p", variant="button"):
- self.parser.phase.processEndTag(
- impliedTagToken("p", "EndTag"))
- self.tree.insertElement(token)
- def startTagPlaintext(self, token):
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- self.tree.insertElement(token)
- self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
- def startTagHeading(self, token):
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- if self.tree.openElements[-1].name in headingElements:
- self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
- self.tree.openElements.pop()
- self.tree.insertElement(token)
- def startTagA(self, token):
- afeAElement = self.tree.elementInActiveFormattingElements("a")
- if afeAElement:
- self.parser.parseError("unexpected-start-tag-implies-end-tag",
- {"startName": "a", "endName": "a"})
- self.endTagFormatting(impliedTagToken("a"))
- if afeAElement in self.tree.openElements:
- self.tree.openElements.remove(afeAElement)
- if afeAElement in self.tree.activeFormattingElements:
- self.tree.activeFormattingElements.remove(afeAElement)
- self.tree.reconstructActiveFormattingElements()
- self.addFormattingElement(token)
- def startTagFormatting(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.addFormattingElement(token)
- def startTagNobr(self, token):
- self.tree.reconstructActiveFormattingElements()
- if self.tree.elementInScope("nobr"):
- self.parser.parseError("unexpected-start-tag-implies-end-tag",
- {"startName": "nobr", "endName": "nobr"})
- self.processEndTag(impliedTagToken("nobr"))
- # XXX Need tests that trigger the following
- self.tree.reconstructActiveFormattingElements()
- self.addFormattingElement(token)
- def startTagButton(self, token):
- if self.tree.elementInScope("button"):
- self.parser.parseError("unexpected-start-tag-implies-end-tag",
- {"startName": "button", "endName": "button"})
- self.processEndTag(impliedTagToken("button"))
- return token
- else:
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertElement(token)
- self.parser.framesetOK = False
- def startTagAppletMarqueeObject(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertElement(token)
- self.tree.activeFormattingElements.append(Marker)
- self.parser.framesetOK = False
- def startTagXmp(self, token):
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- self.tree.reconstructActiveFormattingElements()
- self.parser.framesetOK = False
- self.parser.parseRCDataRawtext(token, "RAWTEXT")
- def startTagTable(self, token):
- if self.parser.compatMode != "quirks":
- if self.tree.elementInScope("p", variant="button"):
- self.processEndTag(impliedTagToken("p"))
- self.tree.insertElement(token)
- self.parser.framesetOK = False
- self.parser.phase = self.parser.phases["inTable"]
- def startTagVoidFormatting(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- self.parser.framesetOK = False
- def startTagInput(self, token):
- framesetOK = self.parser.framesetOK
- self.startTagVoidFormatting(token)
- if ("type" in token["data"] and
- token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
- # input type=hidden doesn't change framesetOK
- self.parser.framesetOK = framesetOK
- def startTagParamSource(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- def startTagHr(self, token):
- if self.tree.elementInScope("p", variant="button"):
- self.endTagP(impliedTagToken("p"))
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- self.parser.framesetOK = False
- def startTagImage(self, token):
- # No really...
- self.parser.parseError("unexpected-start-tag-treated-as",
- {"originalName": "image", "newName": "img"})
- self.processStartTag(impliedTagToken("img", "StartTag",
- attributes=token["data"],
- selfClosing=token["selfClosing"]))
- def startTagIsIndex(self, token):
- self.parser.parseError("deprecated-tag", {"name": "isindex"})
- if self.tree.formPointer:
- return
- form_attrs = {}
- if "action" in token["data"]:
- form_attrs["action"] = token["data"]["action"]
- self.processStartTag(impliedTagToken("form", "StartTag",
- attributes=form_attrs))
- self.processStartTag(impliedTagToken("hr", "StartTag"))
- self.processStartTag(impliedTagToken("label", "StartTag"))
- # XXX Localization ...
- if "prompt" in token["data"]:
- prompt = token["data"]["prompt"]
- else:
- prompt = "This is a searchable index. Enter search keywords: "
- self.processCharacters(
- {"type": tokenTypes["Characters"], "data": prompt})
- attributes = token["data"].copy()
- if "action" in attributes:
- del attributes["action"]
- if "prompt" in attributes:
- del attributes["prompt"]
- attributes["name"] = "isindex"
- self.processStartTag(impliedTagToken("input", "StartTag",
- attributes=attributes,
- selfClosing=token["selfClosing"]))
- self.processEndTag(impliedTagToken("label"))
- self.processStartTag(impliedTagToken("hr", "StartTag"))
- self.processEndTag(impliedTagToken("form"))
- def startTagTextarea(self, token):
- self.tree.insertElement(token)
- self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
- self.processSpaceCharacters = self.processSpaceCharactersDropNewline
- self.parser.framesetOK = False
- def startTagIFrame(self, token):
- self.parser.framesetOK = False
- self.startTagRawtext(token)
- def startTagNoscript(self, token):
- if self.parser.scripting:
- self.startTagRawtext(token)
- else:
- self.startTagOther(token)
- def startTagRawtext(self, token):
- """iframe, noembed noframes, noscript(if scripting enabled)"""
- self.parser.parseRCDataRawtext(token, "RAWTEXT")
- def startTagOpt(self, token):
- if self.tree.openElements[-1].name == "option":
- self.parser.phase.processEndTag(impliedTagToken("option"))
- self.tree.reconstructActiveFormattingElements()
- self.parser.tree.insertElement(token)
- def startTagSelect(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertElement(token)
- self.parser.framesetOK = False
- if self.parser.phase in (self.parser.phases["inTable"],
- self.parser.phases["inCaption"],
- self.parser.phases["inColumnGroup"],
- self.parser.phases["inTableBody"],
- self.parser.phases["inRow"],
- self.parser.phases["inCell"]):
- self.parser.phase = self.parser.phases["inSelectInTable"]
- else:
- self.parser.phase = self.parser.phases["inSelect"]
- def startTagRpRt(self, token):
- if self.tree.elementInScope("ruby"):
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1].name != "ruby":
- self.parser.parseError()
- self.tree.insertElement(token)
- def startTagMath(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.parser.adjustMathMLAttributes(token)
- self.parser.adjustForeignAttributes(token)
- token["namespace"] = namespaces["mathml"]
- self.tree.insertElement(token)
- # Need to get the parse error right for the case where the token
- # has a namespace not equal to the xmlns attribute
- if token["selfClosing"]:
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- def startTagSvg(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.parser.adjustSVGAttributes(token)
- self.parser.adjustForeignAttributes(token)
- token["namespace"] = namespaces["svg"]
- self.tree.insertElement(token)
- # Need to get the parse error right for the case where the token
- # has a namespace not equal to the xmlns attribute
- if token["selfClosing"]:
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- def startTagMisplaced(self, token):
- """ Elements that should be children of other elements that have a
- different insertion mode; here they are ignored
- "caption", "col", "colgroup", "frame", "frameset", "head",
- "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
- "tr", "noscript"
- """
- self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
- def startTagOther(self, token):
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertElement(token)
- def endTagP(self, token):
- if not self.tree.elementInScope("p", variant="button"):
- self.startTagCloseP(impliedTagToken("p", "StartTag"))
- self.parser.parseError("unexpected-end-tag", {"name": "p"})
- self.endTagP(impliedTagToken("p", "EndTag"))
- else:
- self.tree.generateImpliedEndTags("p")
- if self.tree.openElements[-1].name != "p":
- self.parser.parseError("unexpected-end-tag", {"name": "p"})
- node = self.tree.openElements.pop()
- while node.name != "p":
- node = self.tree.openElements.pop()
- def endTagBody(self, token):
- if not self.tree.elementInScope("body"):
- self.parser.parseError()
- return
- elif self.tree.openElements[-1].name != "body":
- for node in self.tree.openElements[2:]:
- if node.name not in frozenset(("dd", "dt", "li", "optgroup",
- "option", "p", "rp", "rt",
- "tbody", "td", "tfoot",
- "th", "thead", "tr", "body",
- "html")):
- # Not sure this is the correct name for the parse error
- self.parser.parseError(
- "expected-one-end-tag-but-got-another",
- {"gotName": "body", "expectedName": node.name})
- break
- self.parser.phase = self.parser.phases["afterBody"]
- def endTagHtml(self, token):
- # We repeat the test for the body end tag token being ignored here
- if self.tree.elementInScope("body"):
- self.endTagBody(impliedTagToken("body"))
- return token
- def endTagBlock(self, token):
- # Put us back in the right whitespace handling mode
- if token["name"] == "pre":
- self.processSpaceCharacters = self.processSpaceCharactersNonPre
- inScope = self.tree.elementInScope(token["name"])
- if inScope:
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("end-tag-too-early", {"name": token["name"]})
- if inScope:
- node = self.tree.openElements.pop()
- while node.name != token["name"]:
- node = self.tree.openElements.pop()
- def endTagForm(self, token):
- node = self.tree.formPointer
- self.tree.formPointer = None
- if node is None or not self.tree.elementInScope(node):
- self.parser.parseError("unexpected-end-tag",
- {"name": "form"})
- else:
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1] != node:
- self.parser.parseError("end-tag-too-early-ignored",
- {"name": "form"})
- self.tree.openElements.remove(node)
- def endTagListItem(self, token):
- if token["name"] == "li":
- variant = "list"
- else:
- variant = None
- if not self.tree.elementInScope(token["name"], variant=variant):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- else:
- self.tree.generateImpliedEndTags(exclude=token["name"])
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError(
- "end-tag-too-early",
- {"name": token["name"]})
- node = self.tree.openElements.pop()
- while node.name != token["name"]:
- node = self.tree.openElements.pop()
- def endTagHeading(self, token):
- for item in headingElements:
- if self.tree.elementInScope(item):
- self.tree.generateImpliedEndTags()
- break
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("end-tag-too-early", {"name": token["name"]})
- for item in headingElements:
- if self.tree.elementInScope(item):
- item = self.tree.openElements.pop()
- while item.name not in headingElements:
- item = self.tree.openElements.pop()
- break
- def endTagFormatting(self, token):
- """The much-feared adoption agency algorithm"""
- # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
- # XXX Better parseError messages appreciated.
- # Step 1
- outerLoopCounter = 0
- # Step 2
- while outerLoopCounter < 8:
- # Step 3
- outerLoopCounter += 1
- # Step 4:
- # Let the formatting element be the last element in
- # the list of active formatting elements that:
- # - is between the end of the list and the last scope
- # marker in the list, if any, or the start of the list
- # otherwise, and
- # - has the same tag name as the token.
- formattingElement = self.tree.elementInActiveFormattingElements(
- token["name"])
- if (not formattingElement or
- (formattingElement in self.tree.openElements and
- not self.tree.elementInScope(formattingElement.name))):
- # If there is no such node, then abort these steps
- # and instead act as described in the "any other
- # end tag" entry below.
- self.endTagOther(token)
- return
- # Otherwise, if there is such a node, but that node is
- # not in the stack of open elements, then this is a
- # parse error; remove the element from the list, and
- # abort these steps.
- elif formattingElement not in self.tree.openElements:
- self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
- self.tree.activeFormattingElements.remove(formattingElement)
- return
- # Otherwise, if there is such a node, and that node is
- # also in the stack of open elements, but the element
- # is not in scope, then this is a parse error; ignore
- # the token, and abort these steps.
- elif not self.tree.elementInScope(formattingElement.name):
- self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
- return
- # Otherwise, there is a formatting element and that
- # element is in the stack and is in scope. If the
- # element is not the current node, this is a parse
- # error. In any case, proceed with the algorithm as
- # written in the following steps.
- else:
- if formattingElement != self.tree.openElements[-1]:
- self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
- # Step 5:
- # Let the furthest block be the topmost node in the
- # stack of open elements that is lower in the stack
- # than the formatting element, and is an element in
- # the special category. There might not be one.
- afeIndex = self.tree.openElements.index(formattingElement)
- furthestBlock = None
- for element in self.tree.openElements[afeIndex:]:
- if element.nameTuple in specialElements:
- furthestBlock = element
- break
- # Step 6:
- # If there is no furthest block, then the UA must
- # first pop all the nodes from the bottom of the stack
- # of open elements, from the current node up to and
- # including the formatting element, then remove the
- # formatting element from the list of active
- # formatting elements, and finally abort these steps.
- if furthestBlock is None:
- element = self.tree.openElements.pop()
- while element != formattingElement:
- element = self.tree.openElements.pop()
- self.tree.activeFormattingElements.remove(element)
- return
- # Step 7
- commonAncestor = self.tree.openElements[afeIndex - 1]
- # Step 8:
- # The bookmark is supposed to help us identify where to reinsert
- # nodes in step 15. We have to ensure that we reinsert nodes after
- # the node before the active formatting element. Note the bookmark
- # can move in step 9.7
- bookmark = self.tree.activeFormattingElements.index(formattingElement)
- # Step 9
- lastNode = node = furthestBlock
- innerLoopCounter = 0
- index = self.tree.openElements.index(node)
- while innerLoopCounter < 3:
- innerLoopCounter += 1
- # Node is element before node in open elements
- index -= 1
- node = self.tree.openElements[index]
- if node not in self.tree.activeFormattingElements:
- self.tree.openElements.remove(node)
- continue
- # Step 9.6
- if node == formattingElement:
- break
- # Step 9.7
- if lastNode == furthestBlock:
- bookmark = self.tree.activeFormattingElements.index(node) + 1
- # Step 9.8
- clone = node.cloneNode()
- # Replace node with clone
- self.tree.activeFormattingElements[
- self.tree.activeFormattingElements.index(node)] = clone
- self.tree.openElements[
- self.tree.openElements.index(node)] = clone
- node = clone
- # Step 9.9
- # Remove lastNode from its parents, if any
- if lastNode.parent:
- lastNode.parent.removeChild(lastNode)
- node.appendChild(lastNode)
- # Step 9.10
- lastNode = node
- # Step 10
- # Foster parent lastNode if commonAncestor is a
- # table, tbody, tfoot, thead, or tr we need to foster
- # parent the lastNode
- if lastNode.parent:
- lastNode.parent.removeChild(lastNode)
- if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
- parent, insertBefore = self.tree.getTableMisnestedNodePosition()
- parent.insertBefore(lastNode, insertBefore)
- else:
- commonAncestor.appendChild(lastNode)
- # Step 11
- clone = formattingElement.cloneNode()
- # Step 12
- furthestBlock.reparentChildren(clone)
- # Step 13
- furthestBlock.appendChild(clone)
- # Step 14
- self.tree.activeFormattingElements.remove(formattingElement)
- self.tree.activeFormattingElements.insert(bookmark, clone)
- # Step 15
- self.tree.openElements.remove(formattingElement)
- self.tree.openElements.insert(
- self.tree.openElements.index(furthestBlock) + 1, clone)
- def endTagAppletMarqueeObject(self, token):
- if self.tree.elementInScope(token["name"]):
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("end-tag-too-early", {"name": token["name"]})
- if self.tree.elementInScope(token["name"]):
- element = self.tree.openElements.pop()
- while element.name != token["name"]:
- element = self.tree.openElements.pop()
- self.tree.clearActiveFormattingElements()
- def endTagBr(self, token):
- self.parser.parseError("unexpected-end-tag-treated-as",
- {"originalName": "br", "newName": "br element"})
- self.tree.reconstructActiveFormattingElements()
- self.tree.insertElement(impliedTagToken("br", "StartTag"))
- self.tree.openElements.pop()
- def endTagOther(self, token):
- for node in self.tree.openElements[::-1]:
- if node.name == token["name"]:
- self.tree.generateImpliedEndTags(exclude=token["name"])
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- while self.tree.openElements.pop() != node:
- pass
- break
- else:
- if node.nameTuple in specialElements:
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- break
- class TextPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("script", self.endTagScript)])
- self.endTagHandler.default = self.endTagOther
- def processCharacters(self, token):
- self.tree.insertText(token["data"])
- def processEOF(self):
- self.parser.parseError("expected-named-closing-tag-but-got-eof",
- {"name": self.tree.openElements[-1].name})
- self.tree.openElements.pop()
- self.parser.phase = self.parser.originalPhase
- return True
- def startTagOther(self, token):
- assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
- def endTagScript(self, token):
- node = self.tree.openElements.pop()
- assert node.name == "script"
- self.parser.phase = self.parser.originalPhase
- # The rest of this method is all stuff that only happens if
- # document.write works
- def endTagOther(self, token):
- self.tree.openElements.pop()
- self.parser.phase = self.parser.originalPhase
- class InTablePhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-table
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("caption", self.startTagCaption),
- ("colgroup", self.startTagColgroup),
- ("col", self.startTagCol),
- (("tbody", "tfoot", "thead"), self.startTagRowGroup),
- (("td", "th", "tr"), self.startTagImplyTbody),
- ("table", self.startTagTable),
- (("style", "script"), self.startTagStyleScript),
- ("input", self.startTagInput),
- ("form", self.startTagForm)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("table", self.endTagTable),
- (("body", "caption", "col", "colgroup", "html", "tbody", "td",
- "tfoot", "th", "thead", "tr"), self.endTagIgnore)
- ])
- self.endTagHandler.default = self.endTagOther
- # helper methods
- def clearStackToTableContext(self):
- # "clear the stack back to a table context"
- while self.tree.openElements[-1].name not in ("table", "html"):
- # self.parser.parseError("unexpected-implied-end-tag-in-table",
- # {"name": self.tree.openElements[-1].name})
- self.tree.openElements.pop()
- # When the current node is <html> it's an innerHTML case
- # processing methods
- def processEOF(self):
- if self.tree.openElements[-1].name != "html":
- self.parser.parseError("eof-in-table")
- else:
- assert self.parser.innerHTML
- # Stop parsing
- def processSpaceCharacters(self, token):
- originalPhase = self.parser.phase
- self.parser.phase = self.parser.phases["inTableText"]
- self.parser.phase.originalPhase = originalPhase
- self.parser.phase.processSpaceCharacters(token)
- def processCharacters(self, token):
- originalPhase = self.parser.phase
- self.parser.phase = self.parser.phases["inTableText"]
- self.parser.phase.originalPhase = originalPhase
- self.parser.phase.processCharacters(token)
- def insertText(self, token):
- # If we get here there must be at least one non-whitespace character
- # Do the table magic!
- self.tree.insertFromTable = True
- self.parser.phases["inBody"].processCharacters(token)
- self.tree.insertFromTable = False
- def startTagCaption(self, token):
- self.clearStackToTableContext()
- self.tree.activeFormattingElements.append(Marker)
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inCaption"]
- def startTagColgroup(self, token):
- self.clearStackToTableContext()
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inColumnGroup"]
- def startTagCol(self, token):
- self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
- return token
- def startTagRowGroup(self, token):
- self.clearStackToTableContext()
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inTableBody"]
- def startTagImplyTbody(self, token):
- self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
- return token
- def startTagTable(self, token):
- self.parser.parseError("unexpected-start-tag-implies-end-tag",
- {"startName": "table", "endName": "table"})
- self.parser.phase.processEndTag(impliedTagToken("table"))
- if not self.parser.innerHTML:
- return token
- def startTagStyleScript(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
- def startTagInput(self, token):
- if ("type" in token["data"] and
- token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
- self.parser.parseError("unexpected-hidden-input-in-table")
- self.tree.insertElement(token)
- # XXX associate with form
- self.tree.openElements.pop()
- else:
- self.startTagOther(token)
- def startTagForm(self, token):
- self.parser.parseError("unexpected-form-in-table")
- if self.tree.formPointer is None:
- self.tree.insertElement(token)
- self.tree.formPointer = self.tree.openElements[-1]
- self.tree.openElements.pop()
- def startTagOther(self, token):
- self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
- # Do the table magic!
- self.tree.insertFromTable = True
- self.parser.phases["inBody"].processStartTag(token)
- self.tree.insertFromTable = False
- def endTagTable(self, token):
- if self.tree.elementInScope("table", variant="table"):
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1].name != "table":
- self.parser.parseError("end-tag-too-early-named",
- {"gotName": "table",
- "expectedName": self.tree.openElements[-1].name})
- while self.tree.openElements[-1].name != "table":
- self.tree.openElements.pop()
- self.tree.openElements.pop()
- self.parser.resetInsertionMode()
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def endTagIgnore(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
- # Do the table magic!
- self.tree.insertFromTable = True
- self.parser.phases["inBody"].processEndTag(token)
- self.tree.insertFromTable = False
- class InTableTextPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.originalPhase = None
- self.characterTokens = []
- def flushCharacters(self):
- data = "".join([item["data"] for item in self.characterTokens])
- if any([item not in spaceCharacters for item in data]):
- token = {"type": tokenTypes["Characters"], "data": data}
- self.parser.phases["inTable"].insertText(token)
- elif data:
- self.tree.insertText(data)
- self.characterTokens = []
- def processComment(self, token):
- self.flushCharacters()
- self.parser.phase = self.originalPhase
- return token
- def processEOF(self):
- self.flushCharacters()
- self.parser.phase = self.originalPhase
- return True
- def processCharacters(self, token):
- if token["data"] == "\u0000":
- return
- self.characterTokens.append(token)
- def processSpaceCharacters(self, token):
- # pretty sure we should never reach here
- self.characterTokens.append(token)
- # assert False
- def processStartTag(self, token):
- self.flushCharacters()
- self.parser.phase = self.originalPhase
- return token
- def processEndTag(self, token):
- self.flushCharacters()
- self.parser.phase = self.originalPhase
- return token
- class InCaptionPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
- "thead", "tr"), self.startTagTableElement)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("caption", self.endTagCaption),
- ("table", self.endTagTable),
- (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
- "thead", "tr"), self.endTagIgnore)
- ])
- self.endTagHandler.default = self.endTagOther
- def ignoreEndTagCaption(self):
- return not self.tree.elementInScope("caption", variant="table")
- def processEOF(self):
- self.parser.phases["inBody"].processEOF()
- def processCharacters(self, token):
- return self.parser.phases["inBody"].processCharacters(token)
- def startTagTableElement(self, token):
- self.parser.parseError()
- # XXX Have to duplicate logic here to find out if the tag is ignored
- ignoreEndTag = self.ignoreEndTagCaption()
- self.parser.phase.processEndTag(impliedTagToken("caption"))
- if not ignoreEndTag:
- return token
- def startTagOther(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def endTagCaption(self, token):
- if not self.ignoreEndTagCaption():
- # AT this code is quite similar to endTagTable in "InTable"
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1].name != "caption":
- self.parser.parseError("expected-one-end-tag-but-got-another",
- {"gotName": "caption",
- "expectedName": self.tree.openElements[-1].name})
- while self.tree.openElements[-1].name != "caption":
- self.tree.openElements.pop()
- self.tree.openElements.pop()
- self.tree.clearActiveFormattingElements()
- self.parser.phase = self.parser.phases["inTable"]
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def endTagTable(self, token):
- self.parser.parseError()
- ignoreEndTag = self.ignoreEndTagCaption()
- self.parser.phase.processEndTag(impliedTagToken("caption"))
- if not ignoreEndTag:
- return token
- def endTagIgnore(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def endTagOther(self, token):
- return self.parser.phases["inBody"].processEndTag(token)
- class InColumnGroupPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-column
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("col", self.startTagCol)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("colgroup", self.endTagColgroup),
- ("col", self.endTagCol)
- ])
- self.endTagHandler.default = self.endTagOther
- def ignoreEndTagColgroup(self):
- return self.tree.openElements[-1].name == "html"
- def processEOF(self):
- if self.tree.openElements[-1].name == "html":
- assert self.parser.innerHTML
- return
- else:
- ignoreEndTag = self.ignoreEndTagColgroup()
- self.endTagColgroup(impliedTagToken("colgroup"))
- if not ignoreEndTag:
- return True
- def processCharacters(self, token):
- ignoreEndTag = self.ignoreEndTagColgroup()
- self.endTagColgroup(impliedTagToken("colgroup"))
- if not ignoreEndTag:
- return token
- def startTagCol(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- def startTagOther(self, token):
- ignoreEndTag = self.ignoreEndTagColgroup()
- self.endTagColgroup(impliedTagToken("colgroup"))
- if not ignoreEndTag:
- return token
- def endTagColgroup(self, token):
- if self.ignoreEndTagColgroup():
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- else:
- self.tree.openElements.pop()
- self.parser.phase = self.parser.phases["inTable"]
- def endTagCol(self, token):
- self.parser.parseError("no-end-tag", {"name": "col"})
- def endTagOther(self, token):
- ignoreEndTag = self.ignoreEndTagColgroup()
- self.endTagColgroup(impliedTagToken("colgroup"))
- if not ignoreEndTag:
- return token
- class InTableBodyPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("tr", self.startTagTr),
- (("td", "th"), self.startTagTableCell),
- (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
- self.startTagTableOther)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
- ("table", self.endTagTable),
- (("body", "caption", "col", "colgroup", "html", "td", "th",
- "tr"), self.endTagIgnore)
- ])
- self.endTagHandler.default = self.endTagOther
- # helper methods
- def clearStackToTableBodyContext(self):
- while self.tree.openElements[-1].name not in ("tbody", "tfoot",
- "thead", "html"):
- # self.parser.parseError("unexpected-implied-end-tag-in-table",
- # {"name": self.tree.openElements[-1].name})
- self.tree.openElements.pop()
- if self.tree.openElements[-1].name == "html":
- assert self.parser.innerHTML
- # the rest
- def processEOF(self):
- self.parser.phases["inTable"].processEOF()
- def processSpaceCharacters(self, token):
- return self.parser.phases["inTable"].processSpaceCharacters(token)
- def processCharacters(self, token):
- return self.parser.phases["inTable"].processCharacters(token)
- def startTagTr(self, token):
- self.clearStackToTableBodyContext()
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inRow"]
- def startTagTableCell(self, token):
- self.parser.parseError("unexpected-cell-in-table-body",
- {"name": token["name"]})
- self.startTagTr(impliedTagToken("tr", "StartTag"))
- return token
- def startTagTableOther(self, token):
- # XXX AT Any ideas on how to share this with endTagTable?
- if (self.tree.elementInScope("tbody", variant="table") or
- self.tree.elementInScope("thead", variant="table") or
- self.tree.elementInScope("tfoot", variant="table")):
- self.clearStackToTableBodyContext()
- self.endTagTableRowGroup(
- impliedTagToken(self.tree.openElements[-1].name))
- return token
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def startTagOther(self, token):
- return self.parser.phases["inTable"].processStartTag(token)
- def endTagTableRowGroup(self, token):
- if self.tree.elementInScope(token["name"], variant="table"):
- self.clearStackToTableBodyContext()
- self.tree.openElements.pop()
- self.parser.phase = self.parser.phases["inTable"]
- else:
- self.parser.parseError("unexpected-end-tag-in-table-body",
- {"name": token["name"]})
- def endTagTable(self, token):
- if (self.tree.elementInScope("tbody", variant="table") or
- self.tree.elementInScope("thead", variant="table") or
- self.tree.elementInScope("tfoot", variant="table")):
- self.clearStackToTableBodyContext()
- self.endTagTableRowGroup(
- impliedTagToken(self.tree.openElements[-1].name))
- return token
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def endTagIgnore(self, token):
- self.parser.parseError("unexpected-end-tag-in-table-body",
- {"name": token["name"]})
- def endTagOther(self, token):
- return self.parser.phases["inTable"].processEndTag(token)
- class InRowPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-row
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("td", "th"), self.startTagTableCell),
- (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
- "tr"), self.startTagTableOther)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("tr", self.endTagTr),
- ("table", self.endTagTable),
- (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
- (("body", "caption", "col", "colgroup", "html", "td", "th"),
- self.endTagIgnore)
- ])
- self.endTagHandler.default = self.endTagOther
- # helper methods (XXX unify this with other table helper methods)
- def clearStackToTableRowContext(self):
- while self.tree.openElements[-1].name not in ("tr", "html"):
- self.parser.parseError("unexpected-implied-end-tag-in-table-row",
- {"name": self.tree.openElements[-1].name})
- self.tree.openElements.pop()
- def ignoreEndTagTr(self):
- return not self.tree.elementInScope("tr", variant="table")
- # the rest
- def processEOF(self):
- self.parser.phases["inTable"].processEOF()
- def processSpaceCharacters(self, token):
- return self.parser.phases["inTable"].processSpaceCharacters(token)
- def processCharacters(self, token):
- return self.parser.phases["inTable"].processCharacters(token)
- def startTagTableCell(self, token):
- self.clearStackToTableRowContext()
- self.tree.insertElement(token)
- self.parser.phase = self.parser.phases["inCell"]
- self.tree.activeFormattingElements.append(Marker)
- def startTagTableOther(self, token):
- ignoreEndTag = self.ignoreEndTagTr()
- self.endTagTr(impliedTagToken("tr"))
- # XXX how are we sure it's always ignored in the innerHTML case?
- if not ignoreEndTag:
- return token
- def startTagOther(self, token):
- return self.parser.phases["inTable"].processStartTag(token)
- def endTagTr(self, token):
- if not self.ignoreEndTagTr():
- self.clearStackToTableRowContext()
- self.tree.openElements.pop()
- self.parser.phase = self.parser.phases["inTableBody"]
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def endTagTable(self, token):
- ignoreEndTag = self.ignoreEndTagTr()
- self.endTagTr(impliedTagToken("tr"))
- # Reprocess the current tag if the tr end tag was not ignored
- # XXX how are we sure it's always ignored in the innerHTML case?
- if not ignoreEndTag:
- return token
- def endTagTableRowGroup(self, token):
- if self.tree.elementInScope(token["name"], variant="table"):
- self.endTagTr(impliedTagToken("tr"))
- return token
- else:
- self.parser.parseError()
- def endTagIgnore(self, token):
- self.parser.parseError("unexpected-end-tag-in-table-row",
- {"name": token["name"]})
- def endTagOther(self, token):
- return self.parser.phases["inTable"].processEndTag(token)
- class InCellPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
- "thead", "tr"), self.startTagTableOther)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- (("td", "th"), self.endTagTableCell),
- (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
- (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
- ])
- self.endTagHandler.default = self.endTagOther
- # helper
- def closeCell(self):
- if self.tree.elementInScope("td", variant="table"):
- self.endTagTableCell(impliedTagToken("td"))
- elif self.tree.elementInScope("th", variant="table"):
- self.endTagTableCell(impliedTagToken("th"))
- # the rest
- def processEOF(self):
- self.parser.phases["inBody"].processEOF()
- def processCharacters(self, token):
- return self.parser.phases["inBody"].processCharacters(token)
- def startTagTableOther(self, token):
- if (self.tree.elementInScope("td", variant="table") or
- self.tree.elementInScope("th", variant="table")):
- self.closeCell()
- return token
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def startTagOther(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def endTagTableCell(self, token):
- if self.tree.elementInScope(token["name"], variant="table"):
- self.tree.generateImpliedEndTags(token["name"])
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("unexpected-cell-end-tag",
- {"name": token["name"]})
- while True:
- node = self.tree.openElements.pop()
- if node.name == token["name"]:
- break
- else:
- self.tree.openElements.pop()
- self.tree.clearActiveFormattingElements()
- self.parser.phase = self.parser.phases["inRow"]
- else:
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def endTagIgnore(self, token):
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- def endTagImply(self, token):
- if self.tree.elementInScope(token["name"], variant="table"):
- self.closeCell()
- return token
- else:
- # sometimes innerHTML case
- self.parser.parseError()
- def endTagOther(self, token):
- return self.parser.phases["inBody"].processEndTag(token)
- class InSelectPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("option", self.startTagOption),
- ("optgroup", self.startTagOptgroup),
- ("select", self.startTagSelect),
- (("input", "keygen", "textarea"), self.startTagInput),
- ("script", self.startTagScript)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("option", self.endTagOption),
- ("optgroup", self.endTagOptgroup),
- ("select", self.endTagSelect)
- ])
- self.endTagHandler.default = self.endTagOther
- # http://www.whatwg.org/specs/web-apps/current-work/#in-select
- def processEOF(self):
- if self.tree.openElements[-1].name != "html":
- self.parser.parseError("eof-in-select")
- else:
- assert self.parser.innerHTML
- def processCharacters(self, token):
- if token["data"] == "\u0000":
- return
- self.tree.insertText(token["data"])
- def startTagOption(self, token):
- # We need to imply </option> if <option> is the current node.
- if self.tree.openElements[-1].name == "option":
- self.tree.openElements.pop()
- self.tree.insertElement(token)
- def startTagOptgroup(self, token):
- if self.tree.openElements[-1].name == "option":
- self.tree.openElements.pop()
- if self.tree.openElements[-1].name == "optgroup":
- self.tree.openElements.pop()
- self.tree.insertElement(token)
- def startTagSelect(self, token):
- self.parser.parseError("unexpected-select-in-select")
- self.endTagSelect(impliedTagToken("select"))
- def startTagInput(self, token):
- self.parser.parseError("unexpected-input-in-select")
- if self.tree.elementInScope("select", variant="select"):
- self.endTagSelect(impliedTagToken("select"))
- return token
- else:
- assert self.parser.innerHTML
- def startTagScript(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
- def startTagOther(self, token):
- self.parser.parseError("unexpected-start-tag-in-select",
- {"name": token["name"]})
- def endTagOption(self, token):
- if self.tree.openElements[-1].name == "option":
- self.tree.openElements.pop()
- else:
- self.parser.parseError("unexpected-end-tag-in-select",
- {"name": "option"})
- def endTagOptgroup(self, token):
- # </optgroup> implicitly closes <option>
- if (self.tree.openElements[-1].name == "option" and
- self.tree.openElements[-2].name == "optgroup"):
- self.tree.openElements.pop()
- # It also closes </optgroup>
- if self.tree.openElements[-1].name == "optgroup":
- self.tree.openElements.pop()
- # But nothing else
- else:
- self.parser.parseError("unexpected-end-tag-in-select",
- {"name": "optgroup"})
- def endTagSelect(self, token):
- if self.tree.elementInScope("select", variant="select"):
- node = self.tree.openElements.pop()
- while node.name != "select":
- node = self.tree.openElements.pop()
- self.parser.resetInsertionMode()
- else:
- # innerHTML case
- assert self.parser.innerHTML
- self.parser.parseError()
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag-in-select",
- {"name": token["name"]})
- class InSelectInTablePhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
- self.startTagTable)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
- self.endTagTable)
- ])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- self.parser.phases["inSelect"].processEOF()
- def processCharacters(self, token):
- return self.parser.phases["inSelect"].processCharacters(token)
- def startTagTable(self, token):
- self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
- self.endTagOther(impliedTagToken("select"))
- return token
- def startTagOther(self, token):
- return self.parser.phases["inSelect"].processStartTag(token)
- def endTagTable(self, token):
- self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
- if self.tree.elementInScope(token["name"], variant="table"):
- self.endTagOther(impliedTagToken("select"))
- return token
- def endTagOther(self, token):
- return self.parser.phases["inSelect"].processEndTag(token)
- class InForeignContentPhase(Phase):
- breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
- "center", "code", "dd", "div", "dl", "dt",
- "em", "embed", "h1", "h2", "h3",
- "h4", "h5", "h6", "head", "hr", "i", "img",
- "li", "listing", "menu", "meta", "nobr",
- "ol", "p", "pre", "ruby", "s", "small",
- "span", "strong", "strike", "sub", "sup",
- "table", "tt", "u", "ul", "var"])
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- def adjustSVGTagNames(self, token):
- replacements = {"altglyph": "altGlyph",
- "altglyphdef": "altGlyphDef",
- "altglyphitem": "altGlyphItem",
- "animatecolor": "animateColor",
- "animatemotion": "animateMotion",
- "animatetransform": "animateTransform",
- "clippath": "clipPath",
- "feblend": "feBlend",
- "fecolormatrix": "feColorMatrix",
- "fecomponenttransfer": "feComponentTransfer",
- "fecomposite": "feComposite",
- "feconvolvematrix": "feConvolveMatrix",
- "fediffuselighting": "feDiffuseLighting",
- "fedisplacementmap": "feDisplacementMap",
- "fedistantlight": "feDistantLight",
- "feflood": "feFlood",
- "fefunca": "feFuncA",
- "fefuncb": "feFuncB",
- "fefuncg": "feFuncG",
- "fefuncr": "feFuncR",
- "fegaussianblur": "feGaussianBlur",
- "feimage": "feImage",
- "femerge": "feMerge",
- "femergenode": "feMergeNode",
- "femorphology": "feMorphology",
- "feoffset": "feOffset",
- "fepointlight": "fePointLight",
- "fespecularlighting": "feSpecularLighting",
- "fespotlight": "feSpotLight",
- "fetile": "feTile",
- "feturbulence": "feTurbulence",
- "foreignobject": "foreignObject",
- "glyphref": "glyphRef",
- "lineargradient": "linearGradient",
- "radialgradient": "radialGradient",
- "textpath": "textPath"}
- if token["name"] in replacements:
- token["name"] = replacements[token["name"]]
- def processCharacters(self, token):
- if token["data"] == "\u0000":
- token["data"] = "\uFFFD"
- elif (self.parser.framesetOK and
- any(char not in spaceCharacters for char in token["data"])):
- self.parser.framesetOK = False
- Phase.processCharacters(self, token)
- def processStartTag(self, token):
- currentNode = self.tree.openElements[-1]
- if (token["name"] in self.breakoutElements or
- (token["name"] == "font" and
- set(token["data"].keys()) & set(["color", "face", "size"]))):
- self.parser.parseError("unexpected-html-element-in-foreign-content",
- {"name": token["name"]})
- while (self.tree.openElements[-1].namespace !=
- self.tree.defaultNamespace and
- not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
- not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
- self.tree.openElements.pop()
- return token
- else:
- if currentNode.namespace == namespaces["mathml"]:
- self.parser.adjustMathMLAttributes(token)
- elif currentNode.namespace == namespaces["svg"]:
- self.adjustSVGTagNames(token)
- self.parser.adjustSVGAttributes(token)
- self.parser.adjustForeignAttributes(token)
- token["namespace"] = currentNode.namespace
- self.tree.insertElement(token)
- if token["selfClosing"]:
- self.tree.openElements.pop()
- token["selfClosingAcknowledged"] = True
- def processEndTag(self, token):
- nodeIndex = len(self.tree.openElements) - 1
- node = self.tree.openElements[-1]
- if node.name.translate(asciiUpper2Lower) != token["name"]:
- self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
- while True:
- if node.name.translate(asciiUpper2Lower) == token["name"]:
- # XXX this isn't in the spec but it seems necessary
- if self.parser.phase == self.parser.phases["inTableText"]:
- self.parser.phase.flushCharacters()
- self.parser.phase = self.parser.phase.originalPhase
- while self.tree.openElements.pop() != node:
- assert self.tree.openElements
- new_token = None
- break
- nodeIndex -= 1
- node = self.tree.openElements[nodeIndex]
- if node.namespace != self.tree.defaultNamespace:
- continue
- else:
- new_token = self.parser.phase.processEndTag(token)
- break
- return new_token
- class AfterBodyPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- # Stop parsing
- pass
- def processComment(self, token):
- # This is needed because data is to be appended to the <html> element
- # here and not to whatever is currently open.
- self.tree.insertComment(token, self.tree.openElements[0])
- def processCharacters(self, token):
- self.parser.parseError("unexpected-char-after-body")
- self.parser.phase = self.parser.phases["inBody"]
- return token
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagOther(self, token):
- self.parser.parseError("unexpected-start-tag-after-body",
- {"name": token["name"]})
- self.parser.phase = self.parser.phases["inBody"]
- return token
- def endTagHtml(self, name):
- if self.parser.innerHTML:
- self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
- else:
- self.parser.phase = self.parser.phases["afterAfterBody"]
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag-after-body",
- {"name": token["name"]})
- self.parser.phase = self.parser.phases["inBody"]
- return token
- class InFramesetPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("frameset", self.startTagFrameset),
- ("frame", self.startTagFrame),
- ("noframes", self.startTagNoframes)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("frameset", self.endTagFrameset)
- ])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- if self.tree.openElements[-1].name != "html":
- self.parser.parseError("eof-in-frameset")
- else:
- assert self.parser.innerHTML
- def processCharacters(self, token):
- self.parser.parseError("unexpected-char-in-frameset")
- def startTagFrameset(self, token):
- self.tree.insertElement(token)
- def startTagFrame(self, token):
- self.tree.insertElement(token)
- self.tree.openElements.pop()
- def startTagNoframes(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagOther(self, token):
- self.parser.parseError("unexpected-start-tag-in-frameset",
- {"name": token["name"]})
- def endTagFrameset(self, token):
- if self.tree.openElements[-1].name == "html":
- # innerHTML case
- self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
- else:
- self.tree.openElements.pop()
- if (not self.parser.innerHTML and
- self.tree.openElements[-1].name != "frameset"):
- # If we're not in innerHTML mode and the current node is not a
- # "frameset" element (anymore) then switch.
- self.parser.phase = self.parser.phases["afterFrameset"]
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag-in-frameset",
- {"name": token["name"]})
- class AfterFramesetPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#after3
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("noframes", self.startTagNoframes)
- ])
- self.startTagHandler.default = self.startTagOther
- self.endTagHandler = _utils.MethodDispatcher([
- ("html", self.endTagHtml)
- ])
- self.endTagHandler.default = self.endTagOther
- def processEOF(self):
- # Stop parsing
- pass
- def processCharacters(self, token):
- self.parser.parseError("unexpected-char-after-frameset")
- def startTagNoframes(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
- def startTagOther(self, token):
- self.parser.parseError("unexpected-start-tag-after-frameset",
- {"name": token["name"]})
- def endTagHtml(self, token):
- self.parser.phase = self.parser.phases["afterAfterFrameset"]
- def endTagOther(self, token):
- self.parser.parseError("unexpected-end-tag-after-frameset",
- {"name": token["name"]})
- class AfterAfterBodyPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml)
- ])
- self.startTagHandler.default = self.startTagOther
- def processEOF(self):
- pass
- def processComment(self, token):
- self.tree.insertComment(token, self.tree.document)
- def processSpaceCharacters(self, token):
- return self.parser.phases["inBody"].processSpaceCharacters(token)
- def processCharacters(self, token):
- self.parser.parseError("expected-eof-but-got-char")
- self.parser.phase = self.parser.phases["inBody"]
- return token
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagOther(self, token):
- self.parser.parseError("expected-eof-but-got-start-tag",
- {"name": token["name"]})
- self.parser.phase = self.parser.phases["inBody"]
- return token
- def processEndTag(self, token):
- self.parser.parseError("expected-eof-but-got-end-tag",
- {"name": token["name"]})
- self.parser.phase = self.parser.phases["inBody"]
- return token
- class AfterAfterFramesetPhase(Phase):
- def __init__(self, parser, tree):
- Phase.__init__(self, parser, tree)
- self.startTagHandler = _utils.MethodDispatcher([
- ("html", self.startTagHtml),
- ("noframes", self.startTagNoFrames)
- ])
- self.startTagHandler.default = self.startTagOther
- def processEOF(self):
- pass
- def processComment(self, token):
- self.tree.insertComment(token, self.tree.document)
- def processSpaceCharacters(self, token):
- return self.parser.phases["inBody"].processSpaceCharacters(token)
- def processCharacters(self, token):
- self.parser.parseError("expected-eof-but-got-char")
- def startTagHtml(self, token):
- return self.parser.phases["inBody"].processStartTag(token)
- def startTagNoFrames(self, token):
- return self.parser.phases["inHead"].processStartTag(token)
- def startTagOther(self, token):
- self.parser.parseError("expected-eof-but-got-start-tag",
- {"name": token["name"]})
- def processEndTag(self, token):
- self.parser.parseError("expected-eof-but-got-end-tag",
- {"name": token["name"]})
- # pylint:enable=unused-argument
- return {
- "initial": InitialPhase,
- "beforeHtml": BeforeHtmlPhase,
- "beforeHead": BeforeHeadPhase,
- "inHead": InHeadPhase,
- "inHeadNoscript": InHeadNoscriptPhase,
- "afterHead": AfterHeadPhase,
- "inBody": InBodyPhase,
- "text": TextPhase,
- "inTable": InTablePhase,
- "inTableText": InTableTextPhase,
- "inCaption": InCaptionPhase,
- "inColumnGroup": InColumnGroupPhase,
- "inTableBody": InTableBodyPhase,
- "inRow": InRowPhase,
- "inCell": InCellPhase,
- "inSelect": InSelectPhase,
- "inSelectInTable": InSelectInTablePhase,
- "inForeignContent": InForeignContentPhase,
- "afterBody": AfterBodyPhase,
- "inFrameset": InFramesetPhase,
- "afterFrameset": AfterFramesetPhase,
- "afterAfterBody": AfterAfterBodyPhase,
- "afterAfterFrameset": AfterAfterFramesetPhase,
- # XXX after after frameset
- }
- def adjust_attributes(token, replacements):
- needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
- if needs_adjustment:
- token['data'] = OrderedDict((replacements.get(k, k), v)
- for k, v in token['data'].items())
- def impliedTagToken(name, type="EndTag", attributes=None,
- selfClosing=False):
- if attributes is None:
- attributes = {}
- return {"type": tokenTypes[type], "name": name, "data": attributes,
- "selfClosing": selfClosing}
- class ParseError(Exception):
- """Error in parsed document"""
- pass
|