html5parser.py 116 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import with_metaclass, viewkeys
  3. import types
  4. from collections import OrderedDict
  5. from . import _inputstream
  6. from . import _tokenizer
  7. from . import treebuilders
  8. from .treebuilders.base import Marker
  9. from . import _utils
  10. from .constants import (
  11. spaceCharacters, asciiUpper2Lower,
  12. specialElements, headingElements, cdataElements, rcdataElements,
  13. tokenTypes, tagTokenTypes,
  14. namespaces,
  15. htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
  16. adjustForeignAttributes as adjustForeignAttributesMap,
  17. adjustMathMLAttributes, adjustSVGAttributes,
  18. E,
  19. _ReparseException
  20. )
  21. def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
  22. """Parse an HTML document as a string or file-like object into a tree
  23. :arg doc: the document to parse as a string or file-like object
  24. :arg treebuilder: the treebuilder to use when parsing
  25. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  26. :returns: parsed tree
  27. Example:
  28. >>> from html5lib.html5parser import parse
  29. >>> parse('<html><body><p>This is a doc</p></body></html>')
  30. <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
  31. """
  32. tb = treebuilders.getTreeBuilder(treebuilder)
  33. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  34. return p.parse(doc, **kwargs)
  35. def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
  36. """Parse an HTML fragment as a string or file-like object into a tree
  37. :arg doc: the fragment to parse as a string or file-like object
  38. :arg container: the container context to parse the fragment in
  39. :arg treebuilder: the treebuilder to use when parsing
  40. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  41. :returns: parsed tree
  42. Example:
  43. >>> from html5lib.html5libparser import parseFragment
  44. >>> parseFragment('<b>this is a fragment</b>')
  45. <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
  46. """
  47. tb = treebuilders.getTreeBuilder(treebuilder)
  48. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  49. return p.parseFragment(doc, container=container, **kwargs)
  50. def method_decorator_metaclass(function):
  51. class Decorated(type):
  52. def __new__(meta, classname, bases, classDict):
  53. for attributeName, attribute in classDict.items():
  54. if isinstance(attribute, types.FunctionType):
  55. attribute = function(attribute)
  56. classDict[attributeName] = attribute
  57. return type.__new__(meta, classname, bases, classDict)
  58. return Decorated
  59. class HTMLParser(object):
  60. """HTML parser
  61. Generates a tree structure from a stream of (possibly malformed) HTML.
  62. """
  63. def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
  64. """
  65. :arg tree: a treebuilder class controlling the type of tree that will be
  66. returned. Built in treebuilders can be accessed through
  67. html5lib.treebuilders.getTreeBuilder(treeType)
  68. :arg strict: raise an exception when a parse error is encountered
  69. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  70. :arg debug: whether or not to enable debug mode which logs things
  71. Example:
  72. >>> from html5lib.html5parser import HTMLParser
  73. >>> parser = HTMLParser() # generates parser with etree builder
  74. >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
  75. """
  76. # Raise an exception on the first error encountered
  77. self.strict = strict
  78. if tree is None:
  79. tree = treebuilders.getTreeBuilder("etree")
  80. self.tree = tree(namespaceHTMLElements)
  81. self.errors = []
  82. self.phases = dict([(name, cls(self, self.tree)) for name, cls in
  83. getPhases(debug).items()])
  84. def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
  85. self.innerHTMLMode = innerHTML
  86. self.container = container
  87. self.scripting = scripting
  88. self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
  89. self.reset()
  90. try:
  91. self.mainLoop()
  92. except _ReparseException:
  93. self.reset()
  94. self.mainLoop()
  95. def reset(self):
  96. self.tree.reset()
  97. self.firstStartTag = False
  98. self.errors = []
  99. self.log = [] # only used with debug mode
  100. # "quirks" / "limited quirks" / "no quirks"
  101. self.compatMode = "no quirks"
  102. if self.innerHTMLMode:
  103. self.innerHTML = self.container.lower()
  104. if self.innerHTML in cdataElements:
  105. self.tokenizer.state = self.tokenizer.rcdataState
  106. elif self.innerHTML in rcdataElements:
  107. self.tokenizer.state = self.tokenizer.rawtextState
  108. elif self.innerHTML == 'plaintext':
  109. self.tokenizer.state = self.tokenizer.plaintextState
  110. else:
  111. # state already is data state
  112. # self.tokenizer.state = self.tokenizer.dataState
  113. pass
  114. self.phase = self.phases["beforeHtml"]
  115. self.phase.insertHtmlElement()
  116. self.resetInsertionMode()
  117. else:
  118. self.innerHTML = False # pylint:disable=redefined-variable-type
  119. self.phase = self.phases["initial"]
  120. self.lastPhase = None
  121. self.beforeRCDataPhase = None
  122. self.framesetOK = True
  123. @property
  124. def documentEncoding(self):
  125. """Name of the character encoding that was used to decode the input stream, or
  126. :obj:`None` if that is not determined yet
  127. """
  128. if not hasattr(self, 'tokenizer'):
  129. return None
  130. return self.tokenizer.stream.charEncoding[0].name
  131. def isHTMLIntegrationPoint(self, element):
  132. if (element.name == "annotation-xml" and
  133. element.namespace == namespaces["mathml"]):
  134. return ("encoding" in element.attributes and
  135. element.attributes["encoding"].translate(
  136. asciiUpper2Lower) in
  137. ("text/html", "application/xhtml+xml"))
  138. else:
  139. return (element.namespace, element.name) in htmlIntegrationPointElements
  140. def isMathMLTextIntegrationPoint(self, element):
  141. return (element.namespace, element.name) in mathmlTextIntegrationPointElements
  142. def mainLoop(self):
  143. CharactersToken = tokenTypes["Characters"]
  144. SpaceCharactersToken = tokenTypes["SpaceCharacters"]
  145. StartTagToken = tokenTypes["StartTag"]
  146. EndTagToken = tokenTypes["EndTag"]
  147. CommentToken = tokenTypes["Comment"]
  148. DoctypeToken = tokenTypes["Doctype"]
  149. ParseErrorToken = tokenTypes["ParseError"]
  150. for token in self.normalizedTokens():
  151. prev_token = None
  152. new_token = token
  153. while new_token is not None:
  154. prev_token = new_token
  155. currentNode = self.tree.openElements[-1] if self.tree.openElements else None
  156. currentNodeNamespace = currentNode.namespace if currentNode else None
  157. currentNodeName = currentNode.name if currentNode else None
  158. type = new_token["type"]
  159. if type == ParseErrorToken:
  160. self.parseError(new_token["data"], new_token.get("datavars", {}))
  161. new_token = None
  162. else:
  163. if (len(self.tree.openElements) == 0 or
  164. currentNodeNamespace == self.tree.defaultNamespace or
  165. (self.isMathMLTextIntegrationPoint(currentNode) and
  166. ((type == StartTagToken and
  167. token["name"] not in frozenset(["mglyph", "malignmark"])) or
  168. type in (CharactersToken, SpaceCharactersToken))) or
  169. (currentNodeNamespace == namespaces["mathml"] and
  170. currentNodeName == "annotation-xml" and
  171. type == StartTagToken and
  172. token["name"] == "svg") or
  173. (self.isHTMLIntegrationPoint(currentNode) and
  174. type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
  175. phase = self.phase
  176. else:
  177. phase = self.phases["inForeignContent"]
  178. if type == CharactersToken:
  179. new_token = phase.processCharacters(new_token)
  180. elif type == SpaceCharactersToken:
  181. new_token = phase.processSpaceCharacters(new_token)
  182. elif type == StartTagToken:
  183. new_token = phase.processStartTag(new_token)
  184. elif type == EndTagToken:
  185. new_token = phase.processEndTag(new_token)
  186. elif type == CommentToken:
  187. new_token = phase.processComment(new_token)
  188. elif type == DoctypeToken:
  189. new_token = phase.processDoctype(new_token)
  190. if (type == StartTagToken and prev_token["selfClosing"] and
  191. not prev_token["selfClosingAcknowledged"]):
  192. self.parseError("non-void-element-with-trailing-solidus",
  193. {"name": prev_token["name"]})
  194. # When the loop finishes it's EOF
  195. reprocess = True
  196. phases = []
  197. while reprocess:
  198. phases.append(self.phase)
  199. reprocess = self.phase.processEOF()
  200. if reprocess:
  201. assert self.phase not in phases
  202. def normalizedTokens(self):
  203. for token in self.tokenizer:
  204. yield self.normalizeToken(token)
  205. def parse(self, stream, *args, **kwargs):
  206. """Parse a HTML document into a well-formed tree
  207. :arg stream: a file-like object or string containing the HTML to be parsed
  208. The optional encoding parameter must be a string that indicates
  209. the encoding. If specified, that encoding will be used,
  210. regardless of any BOM or later declaration (such as in a meta
  211. element).
  212. :arg scripting: treat noscript elements as if JavaScript was turned on
  213. :returns: parsed tree
  214. Example:
  215. >>> from html5lib.html5parser import HTMLParser
  216. >>> parser = HTMLParser()
  217. >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
  218. <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
  219. """
  220. self._parse(stream, False, None, *args, **kwargs)
  221. return self.tree.getDocument()
  222. def parseFragment(self, stream, *args, **kwargs):
  223. """Parse a HTML fragment into a well-formed tree fragment
  224. :arg container: name of the element we're setting the innerHTML
  225. property if set to None, default to 'div'
  226. :arg stream: a file-like object or string containing the HTML to be parsed
  227. The optional encoding parameter must be a string that indicates
  228. the encoding. If specified, that encoding will be used,
  229. regardless of any BOM or later declaration (such as in a meta
  230. element)
  231. :arg scripting: treat noscript elements as if JavaScript was turned on
  232. :returns: parsed tree
  233. Example:
  234. >>> from html5lib.html5libparser import HTMLParser
  235. >>> parser = HTMLParser()
  236. >>> parser.parseFragment('<b>this is a fragment</b>')
  237. <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
  238. """
  239. self._parse(stream, True, *args, **kwargs)
  240. return self.tree.getFragment()
  241. def parseError(self, errorcode="XXX-undefined-error", datavars=None):
  242. # XXX The idea is to make errorcode mandatory.
  243. if datavars is None:
  244. datavars = {}
  245. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
  246. if self.strict:
  247. raise ParseError(E[errorcode] % datavars)
  248. def normalizeToken(self, token):
  249. # HTML5 specific normalizations to the token stream
  250. if token["type"] == tokenTypes["StartTag"]:
  251. raw = token["data"]
  252. token["data"] = OrderedDict(raw)
  253. if len(raw) > len(token["data"]):
  254. # we had some duplicated attribute, fix so first wins
  255. token["data"].update(raw[::-1])
  256. return token
  257. def adjustMathMLAttributes(self, token):
  258. adjust_attributes(token, adjustMathMLAttributes)
  259. def adjustSVGAttributes(self, token):
  260. adjust_attributes(token, adjustSVGAttributes)
  261. def adjustForeignAttributes(self, token):
  262. adjust_attributes(token, adjustForeignAttributesMap)
  263. def reparseTokenNormal(self, token):
  264. # pylint:disable=unused-argument
  265. self.parser.phase()
  266. def resetInsertionMode(self):
  267. # The name of this method is mostly historical. (It's also used in the
  268. # specification.)
  269. last = False
  270. newModes = {
  271. "select": "inSelect",
  272. "td": "inCell",
  273. "th": "inCell",
  274. "tr": "inRow",
  275. "tbody": "inTableBody",
  276. "thead": "inTableBody",
  277. "tfoot": "inTableBody",
  278. "caption": "inCaption",
  279. "colgroup": "inColumnGroup",
  280. "table": "inTable",
  281. "head": "inBody",
  282. "body": "inBody",
  283. "frameset": "inFrameset",
  284. "html": "beforeHead"
  285. }
  286. for node in self.tree.openElements[::-1]:
  287. nodeName = node.name
  288. new_phase = None
  289. if node == self.tree.openElements[0]:
  290. assert self.innerHTML
  291. last = True
  292. nodeName = self.innerHTML
  293. # Check for conditions that should only happen in the innerHTML
  294. # case
  295. if nodeName in ("select", "colgroup", "head", "html"):
  296. assert self.innerHTML
  297. if not last and node.namespace != self.tree.defaultNamespace:
  298. continue
  299. if nodeName in newModes:
  300. new_phase = self.phases[newModes[nodeName]]
  301. break
  302. elif last:
  303. new_phase = self.phases["inBody"]
  304. break
  305. self.phase = new_phase
  306. def parseRCDataRawtext(self, token, contentType):
  307. # Generic RCDATA/RAWTEXT Parsing algorithm
  308. assert contentType in ("RAWTEXT", "RCDATA")
  309. self.tree.insertElement(token)
  310. if contentType == "RAWTEXT":
  311. self.tokenizer.state = self.tokenizer.rawtextState
  312. else:
  313. self.tokenizer.state = self.tokenizer.rcdataState
  314. self.originalPhase = self.phase
  315. self.phase = self.phases["text"]
  316. @_utils.memoize
  317. def getPhases(debug):
  318. def log(function):
  319. """Logger that records which phase processes each token"""
  320. type_names = dict((value, key) for key, value in
  321. tokenTypes.items())
  322. def wrapped(self, *args, **kwargs):
  323. if function.__name__.startswith("process") and len(args) > 0:
  324. token = args[0]
  325. try:
  326. info = {"type": type_names[token['type']]}
  327. except:
  328. raise
  329. if token['type'] in tagTokenTypes:
  330. info["name"] = token['name']
  331. self.parser.log.append((self.parser.tokenizer.state.__name__,
  332. self.parser.phase.__class__.__name__,
  333. self.__class__.__name__,
  334. function.__name__,
  335. info))
  336. return function(self, *args, **kwargs)
  337. else:
  338. return function(self, *args, **kwargs)
  339. return wrapped
  340. def getMetaclass(use_metaclass, metaclass_func):
  341. if use_metaclass:
  342. return method_decorator_metaclass(metaclass_func)
  343. else:
  344. return type
  345. # pylint:disable=unused-argument
  346. class Phase(with_metaclass(getMetaclass(debug, log))):
  347. """Base class for helper object that implements each phase of processing
  348. """
  349. def __init__(self, parser, tree):
  350. self.parser = parser
  351. self.tree = tree
  352. def processEOF(self):
  353. raise NotImplementedError
  354. def processComment(self, token):
  355. # For most phases the following is correct. Where it's not it will be
  356. # overridden.
  357. self.tree.insertComment(token, self.tree.openElements[-1])
  358. def processDoctype(self, token):
  359. self.parser.parseError("unexpected-doctype")
  360. def processCharacters(self, token):
  361. self.tree.insertText(token["data"])
  362. def processSpaceCharacters(self, token):
  363. self.tree.insertText(token["data"])
  364. def processStartTag(self, token):
  365. return self.startTagHandler[token["name"]](token)
  366. def startTagHtml(self, token):
  367. if not self.parser.firstStartTag and token["name"] == "html":
  368. self.parser.parseError("non-html-root")
  369. # XXX Need a check here to see if the first start tag token emitted is
  370. # this token... If it's not, invoke self.parser.parseError().
  371. for attr, value in token["data"].items():
  372. if attr not in self.tree.openElements[0].attributes:
  373. self.tree.openElements[0].attributes[attr] = value
  374. self.parser.firstStartTag = False
  375. def processEndTag(self, token):
  376. return self.endTagHandler[token["name"]](token)
  377. class InitialPhase(Phase):
  378. def processSpaceCharacters(self, token):
  379. pass
  380. def processComment(self, token):
  381. self.tree.insertComment(token, self.tree.document)
  382. def processDoctype(self, token):
  383. name = token["name"]
  384. publicId = token["publicId"]
  385. systemId = token["systemId"]
  386. correct = token["correct"]
  387. if (name != "html" or publicId is not None or
  388. systemId is not None and systemId != "about:legacy-compat"):
  389. self.parser.parseError("unknown-doctype")
  390. if publicId is None:
  391. publicId = ""
  392. self.tree.insertDoctype(token)
  393. if publicId != "":
  394. publicId = publicId.translate(asciiUpper2Lower)
  395. if (not correct or token["name"] != "html" or
  396. publicId.startswith(
  397. ("+//silmaril//dtd html pro v0r11 19970101//",
  398. "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
  399. "-//as//dtd html 3.0 aswedit + extensions//",
  400. "-//ietf//dtd html 2.0 level 1//",
  401. "-//ietf//dtd html 2.0 level 2//",
  402. "-//ietf//dtd html 2.0 strict level 1//",
  403. "-//ietf//dtd html 2.0 strict level 2//",
  404. "-//ietf//dtd html 2.0 strict//",
  405. "-//ietf//dtd html 2.0//",
  406. "-//ietf//dtd html 2.1e//",
  407. "-//ietf//dtd html 3.0//",
  408. "-//ietf//dtd html 3.2 final//",
  409. "-//ietf//dtd html 3.2//",
  410. "-//ietf//dtd html 3//",
  411. "-//ietf//dtd html level 0//",
  412. "-//ietf//dtd html level 1//",
  413. "-//ietf//dtd html level 2//",
  414. "-//ietf//dtd html level 3//",
  415. "-//ietf//dtd html strict level 0//",
  416. "-//ietf//dtd html strict level 1//",
  417. "-//ietf//dtd html strict level 2//",
  418. "-//ietf//dtd html strict level 3//",
  419. "-//ietf//dtd html strict//",
  420. "-//ietf//dtd html//",
  421. "-//metrius//dtd metrius presentational//",
  422. "-//microsoft//dtd internet explorer 2.0 html strict//",
  423. "-//microsoft//dtd internet explorer 2.0 html//",
  424. "-//microsoft//dtd internet explorer 2.0 tables//",
  425. "-//microsoft//dtd internet explorer 3.0 html strict//",
  426. "-//microsoft//dtd internet explorer 3.0 html//",
  427. "-//microsoft//dtd internet explorer 3.0 tables//",
  428. "-//netscape comm. corp.//dtd html//",
  429. "-//netscape comm. corp.//dtd strict html//",
  430. "-//o'reilly and associates//dtd html 2.0//",
  431. "-//o'reilly and associates//dtd html extended 1.0//",
  432. "-//o'reilly and associates//dtd html extended relaxed 1.0//",
  433. "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
  434. "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
  435. "-//spyglass//dtd html 2.0 extended//",
  436. "-//sq//dtd html 2.0 hotmetal + extensions//",
  437. "-//sun microsystems corp.//dtd hotjava html//",
  438. "-//sun microsystems corp.//dtd hotjava strict html//",
  439. "-//w3c//dtd html 3 1995-03-24//",
  440. "-//w3c//dtd html 3.2 draft//",
  441. "-//w3c//dtd html 3.2 final//",
  442. "-//w3c//dtd html 3.2//",
  443. "-//w3c//dtd html 3.2s draft//",
  444. "-//w3c//dtd html 4.0 frameset//",
  445. "-//w3c//dtd html 4.0 transitional//",
  446. "-//w3c//dtd html experimental 19960712//",
  447. "-//w3c//dtd html experimental 970421//",
  448. "-//w3c//dtd w3 html//",
  449. "-//w3o//dtd w3 html 3.0//",
  450. "-//webtechs//dtd mozilla html 2.0//",
  451. "-//webtechs//dtd mozilla html//")) or
  452. publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
  453. "-/w3c/dtd html 4.0 transitional/en",
  454. "html") or
  455. publicId.startswith(
  456. ("-//w3c//dtd html 4.01 frameset//",
  457. "-//w3c//dtd html 4.01 transitional//")) and
  458. systemId is None or
  459. systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
  460. self.parser.compatMode = "quirks"
  461. elif (publicId.startswith(
  462. ("-//w3c//dtd xhtml 1.0 frameset//",
  463. "-//w3c//dtd xhtml 1.0 transitional//")) or
  464. publicId.startswith(
  465. ("-//w3c//dtd html 4.01 frameset//",
  466. "-//w3c//dtd html 4.01 transitional//")) and
  467. systemId is not None):
  468. self.parser.compatMode = "limited quirks"
  469. self.parser.phase = self.parser.phases["beforeHtml"]
  470. def anythingElse(self):
  471. self.parser.compatMode = "quirks"
  472. self.parser.phase = self.parser.phases["beforeHtml"]
  473. def processCharacters(self, token):
  474. self.parser.parseError("expected-doctype-but-got-chars")
  475. self.anythingElse()
  476. return token
  477. def processStartTag(self, token):
  478. self.parser.parseError("expected-doctype-but-got-start-tag",
  479. {"name": token["name"]})
  480. self.anythingElse()
  481. return token
  482. def processEndTag(self, token):
  483. self.parser.parseError("expected-doctype-but-got-end-tag",
  484. {"name": token["name"]})
  485. self.anythingElse()
  486. return token
  487. def processEOF(self):
  488. self.parser.parseError("expected-doctype-but-got-eof")
  489. self.anythingElse()
  490. return True
  491. class BeforeHtmlPhase(Phase):
  492. # helper methods
  493. def insertHtmlElement(self):
  494. self.tree.insertRoot(impliedTagToken("html", "StartTag"))
  495. self.parser.phase = self.parser.phases["beforeHead"]
  496. # other
  497. def processEOF(self):
  498. self.insertHtmlElement()
  499. return True
  500. def processComment(self, token):
  501. self.tree.insertComment(token, self.tree.document)
  502. def processSpaceCharacters(self, token):
  503. pass
  504. def processCharacters(self, token):
  505. self.insertHtmlElement()
  506. return token
  507. def processStartTag(self, token):
  508. if token["name"] == "html":
  509. self.parser.firstStartTag = True
  510. self.insertHtmlElement()
  511. return token
  512. def processEndTag(self, token):
  513. if token["name"] not in ("head", "body", "html", "br"):
  514. self.parser.parseError("unexpected-end-tag-before-html",
  515. {"name": token["name"]})
  516. else:
  517. self.insertHtmlElement()
  518. return token
  519. class BeforeHeadPhase(Phase):
  520. def __init__(self, parser, tree):
  521. Phase.__init__(self, parser, tree)
  522. self.startTagHandler = _utils.MethodDispatcher([
  523. ("html", self.startTagHtml),
  524. ("head", self.startTagHead)
  525. ])
  526. self.startTagHandler.default = self.startTagOther
  527. self.endTagHandler = _utils.MethodDispatcher([
  528. (("head", "body", "html", "br"), self.endTagImplyHead)
  529. ])
  530. self.endTagHandler.default = self.endTagOther
  531. def processEOF(self):
  532. self.startTagHead(impliedTagToken("head", "StartTag"))
  533. return True
  534. def processSpaceCharacters(self, token):
  535. pass
  536. def processCharacters(self, token):
  537. self.startTagHead(impliedTagToken("head", "StartTag"))
  538. return token
  539. def startTagHtml(self, token):
  540. return self.parser.phases["inBody"].processStartTag(token)
  541. def startTagHead(self, token):
  542. self.tree.insertElement(token)
  543. self.tree.headPointer = self.tree.openElements[-1]
  544. self.parser.phase = self.parser.phases["inHead"]
  545. def startTagOther(self, token):
  546. self.startTagHead(impliedTagToken("head", "StartTag"))
  547. return token
  548. def endTagImplyHead(self, token):
  549. self.startTagHead(impliedTagToken("head", "StartTag"))
  550. return token
  551. def endTagOther(self, token):
  552. self.parser.parseError("end-tag-after-implied-root",
  553. {"name": token["name"]})
  554. class InHeadPhase(Phase):
  555. def __init__(self, parser, tree):
  556. Phase.__init__(self, parser, tree)
  557. self.startTagHandler = _utils.MethodDispatcher([
  558. ("html", self.startTagHtml),
  559. ("title", self.startTagTitle),
  560. (("noframes", "style"), self.startTagNoFramesStyle),
  561. ("noscript", self.startTagNoscript),
  562. ("script", self.startTagScript),
  563. (("base", "basefont", "bgsound", "command", "link"),
  564. self.startTagBaseLinkCommand),
  565. ("meta", self.startTagMeta),
  566. ("head", self.startTagHead)
  567. ])
  568. self.startTagHandler.default = self.startTagOther
  569. self.endTagHandler = _utils.MethodDispatcher([
  570. ("head", self.endTagHead),
  571. (("br", "html", "body"), self.endTagHtmlBodyBr)
  572. ])
  573. self.endTagHandler.default = self.endTagOther
  574. # the real thing
  575. def processEOF(self):
  576. self.anythingElse()
  577. return True
  578. def processCharacters(self, token):
  579. self.anythingElse()
  580. return token
  581. def startTagHtml(self, token):
  582. return self.parser.phases["inBody"].processStartTag(token)
  583. def startTagHead(self, token):
  584. self.parser.parseError("two-heads-are-not-better-than-one")
  585. def startTagBaseLinkCommand(self, token):
  586. self.tree.insertElement(token)
  587. self.tree.openElements.pop()
  588. token["selfClosingAcknowledged"] = True
  589. def startTagMeta(self, token):
  590. self.tree.insertElement(token)
  591. self.tree.openElements.pop()
  592. token["selfClosingAcknowledged"] = True
  593. attributes = token["data"]
  594. if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
  595. if "charset" in attributes:
  596. self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
  597. elif ("content" in attributes and
  598. "http-equiv" in attributes and
  599. attributes["http-equiv"].lower() == "content-type"):
  600. # Encoding it as UTF-8 here is a hack, as really we should pass
  601. # the abstract Unicode string, and just use the
  602. # ContentAttrParser on that, but using UTF-8 allows all chars
  603. # to be encoded and as a ASCII-superset works.
  604. data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
  605. parser = _inputstream.ContentAttrParser(data)
  606. codec = parser.parse()
  607. self.parser.tokenizer.stream.changeEncoding(codec)
  608. def startTagTitle(self, token):
  609. self.parser.parseRCDataRawtext(token, "RCDATA")
  610. def startTagNoFramesStyle(self, token):
  611. # Need to decide whether to implement the scripting-disabled case
  612. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  613. def startTagNoscript(self, token):
  614. if self.parser.scripting:
  615. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  616. else:
  617. self.tree.insertElement(token)
  618. self.parser.phase = self.parser.phases["inHeadNoscript"]
  619. def startTagScript(self, token):
  620. self.tree.insertElement(token)
  621. self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
  622. self.parser.originalPhase = self.parser.phase
  623. self.parser.phase = self.parser.phases["text"]
  624. def startTagOther(self, token):
  625. self.anythingElse()
  626. return token
  627. def endTagHead(self, token):
  628. node = self.parser.tree.openElements.pop()
  629. assert node.name == "head", "Expected head got %s" % node.name
  630. self.parser.phase = self.parser.phases["afterHead"]
  631. def endTagHtmlBodyBr(self, token):
  632. self.anythingElse()
  633. return token
  634. def endTagOther(self, token):
  635. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  636. def anythingElse(self):
  637. self.endTagHead(impliedTagToken("head"))
  638. class InHeadNoscriptPhase(Phase):
  639. def __init__(self, parser, tree):
  640. Phase.__init__(self, parser, tree)
  641. self.startTagHandler = _utils.MethodDispatcher([
  642. ("html", self.startTagHtml),
  643. (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
  644. (("head", "noscript"), self.startTagHeadNoscript),
  645. ])
  646. self.startTagHandler.default = self.startTagOther
  647. self.endTagHandler = _utils.MethodDispatcher([
  648. ("noscript", self.endTagNoscript),
  649. ("br", self.endTagBr),
  650. ])
  651. self.endTagHandler.default = self.endTagOther
  652. def processEOF(self):
  653. self.parser.parseError("eof-in-head-noscript")
  654. self.anythingElse()
  655. return True
  656. def processComment(self, token):
  657. return self.parser.phases["inHead"].processComment(token)
  658. def processCharacters(self, token):
  659. self.parser.parseError("char-in-head-noscript")
  660. self.anythingElse()
  661. return token
  662. def processSpaceCharacters(self, token):
  663. return self.parser.phases["inHead"].processSpaceCharacters(token)
  664. def startTagHtml(self, token):
  665. return self.parser.phases["inBody"].processStartTag(token)
  666. def startTagBaseLinkCommand(self, token):
  667. return self.parser.phases["inHead"].processStartTag(token)
  668. def startTagHeadNoscript(self, token):
  669. self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
  670. def startTagOther(self, token):
  671. self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
  672. self.anythingElse()
  673. return token
  674. def endTagNoscript(self, token):
  675. node = self.parser.tree.openElements.pop()
  676. assert node.name == "noscript", "Expected noscript got %s" % node.name
  677. self.parser.phase = self.parser.phases["inHead"]
  678. def endTagBr(self, token):
  679. self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
  680. self.anythingElse()
  681. return token
  682. def endTagOther(self, token):
  683. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  684. def anythingElse(self):
  685. # Caller must raise parse error first!
  686. self.endTagNoscript(impliedTagToken("noscript"))
  687. class AfterHeadPhase(Phase):
  688. def __init__(self, parser, tree):
  689. Phase.__init__(self, parser, tree)
  690. self.startTagHandler = _utils.MethodDispatcher([
  691. ("html", self.startTagHtml),
  692. ("body", self.startTagBody),
  693. ("frameset", self.startTagFrameset),
  694. (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
  695. "style", "title"),
  696. self.startTagFromHead),
  697. ("head", self.startTagHead)
  698. ])
  699. self.startTagHandler.default = self.startTagOther
  700. self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
  701. self.endTagHtmlBodyBr)])
  702. self.endTagHandler.default = self.endTagOther
  703. def processEOF(self):
  704. self.anythingElse()
  705. return True
  706. def processCharacters(self, token):
  707. self.anythingElse()
  708. return token
  709. def startTagHtml(self, token):
  710. return self.parser.phases["inBody"].processStartTag(token)
  711. def startTagBody(self, token):
  712. self.parser.framesetOK = False
  713. self.tree.insertElement(token)
  714. self.parser.phase = self.parser.phases["inBody"]
  715. def startTagFrameset(self, token):
  716. self.tree.insertElement(token)
  717. self.parser.phase = self.parser.phases["inFrameset"]
  718. def startTagFromHead(self, token):
  719. self.parser.parseError("unexpected-start-tag-out-of-my-head",
  720. {"name": token["name"]})
  721. self.tree.openElements.append(self.tree.headPointer)
  722. self.parser.phases["inHead"].processStartTag(token)
  723. for node in self.tree.openElements[::-1]:
  724. if node.name == "head":
  725. self.tree.openElements.remove(node)
  726. break
  727. def startTagHead(self, token):
  728. self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
  729. def startTagOther(self, token):
  730. self.anythingElse()
  731. return token
  732. def endTagHtmlBodyBr(self, token):
  733. self.anythingElse()
  734. return token
  735. def endTagOther(self, token):
  736. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  737. def anythingElse(self):
  738. self.tree.insertElement(impliedTagToken("body", "StartTag"))
  739. self.parser.phase = self.parser.phases["inBody"]
  740. self.parser.framesetOK = True
  741. class InBodyPhase(Phase):
  742. # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
  743. # the really-really-really-very crazy mode
  744. def __init__(self, parser, tree):
  745. Phase.__init__(self, parser, tree)
  746. # Set this to the default handler
  747. self.processSpaceCharacters = self.processSpaceCharactersNonPre
  748. self.startTagHandler = _utils.MethodDispatcher([
  749. ("html", self.startTagHtml),
  750. (("base", "basefont", "bgsound", "command", "link", "meta",
  751. "script", "style", "title"),
  752. self.startTagProcessInHead),
  753. ("body", self.startTagBody),
  754. ("frameset", self.startTagFrameset),
  755. (("address", "article", "aside", "blockquote", "center", "details",
  756. "dir", "div", "dl", "fieldset", "figcaption", "figure",
  757. "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
  758. "section", "summary", "ul"),
  759. self.startTagCloseP),
  760. (headingElements, self.startTagHeading),
  761. (("pre", "listing"), self.startTagPreListing),
  762. ("form", self.startTagForm),
  763. (("li", "dd", "dt"), self.startTagListItem),
  764. ("plaintext", self.startTagPlaintext),
  765. ("a", self.startTagA),
  766. (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
  767. "strong", "tt", "u"), self.startTagFormatting),
  768. ("nobr", self.startTagNobr),
  769. ("button", self.startTagButton),
  770. (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
  771. ("xmp", self.startTagXmp),
  772. ("table", self.startTagTable),
  773. (("area", "br", "embed", "img", "keygen", "wbr"),
  774. self.startTagVoidFormatting),
  775. (("param", "source", "track"), self.startTagParamSource),
  776. ("input", self.startTagInput),
  777. ("hr", self.startTagHr),
  778. ("image", self.startTagImage),
  779. ("isindex", self.startTagIsIndex),
  780. ("textarea", self.startTagTextarea),
  781. ("iframe", self.startTagIFrame),
  782. ("noscript", self.startTagNoscript),
  783. (("noembed", "noframes"), self.startTagRawtext),
  784. ("select", self.startTagSelect),
  785. (("rp", "rt"), self.startTagRpRt),
  786. (("option", "optgroup"), self.startTagOpt),
  787. (("math"), self.startTagMath),
  788. (("svg"), self.startTagSvg),
  789. (("caption", "col", "colgroup", "frame", "head",
  790. "tbody", "td", "tfoot", "th", "thead",
  791. "tr"), self.startTagMisplaced)
  792. ])
  793. self.startTagHandler.default = self.startTagOther
  794. self.endTagHandler = _utils.MethodDispatcher([
  795. ("body", self.endTagBody),
  796. ("html", self.endTagHtml),
  797. (("address", "article", "aside", "blockquote", "button", "center",
  798. "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
  799. "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
  800. "section", "summary", "ul"), self.endTagBlock),
  801. ("form", self.endTagForm),
  802. ("p", self.endTagP),
  803. (("dd", "dt", "li"), self.endTagListItem),
  804. (headingElements, self.endTagHeading),
  805. (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
  806. "strike", "strong", "tt", "u"), self.endTagFormatting),
  807. (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
  808. ("br", self.endTagBr),
  809. ])
  810. self.endTagHandler.default = self.endTagOther
  811. def isMatchingFormattingElement(self, node1, node2):
  812. return (node1.name == node2.name and
  813. node1.namespace == node2.namespace and
  814. node1.attributes == node2.attributes)
  815. # helper
  816. def addFormattingElement(self, token):
  817. self.tree.insertElement(token)
  818. element = self.tree.openElements[-1]
  819. matchingElements = []
  820. for node in self.tree.activeFormattingElements[::-1]:
  821. if node is Marker:
  822. break
  823. elif self.isMatchingFormattingElement(node, element):
  824. matchingElements.append(node)
  825. assert len(matchingElements) <= 3
  826. if len(matchingElements) == 3:
  827. self.tree.activeFormattingElements.remove(matchingElements[-1])
  828. self.tree.activeFormattingElements.append(element)
  829. # the real deal
  830. def processEOF(self):
  831. allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
  832. "tfoot", "th", "thead", "tr", "body",
  833. "html"))
  834. for node in self.tree.openElements[::-1]:
  835. if node.name not in allowed_elements:
  836. self.parser.parseError("expected-closing-tag-but-got-eof")
  837. break
  838. # Stop parsing
  839. def processSpaceCharactersDropNewline(self, token):
  840. # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
  841. # want to drop leading newlines
  842. data = token["data"]
  843. self.processSpaceCharacters = self.processSpaceCharactersNonPre
  844. if (data.startswith("\n") and
  845. self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
  846. not self.tree.openElements[-1].hasContent()):
  847. data = data[1:]
  848. if data:
  849. self.tree.reconstructActiveFormattingElements()
  850. self.tree.insertText(data)
  851. def processCharacters(self, token):
  852. if token["data"] == "\u0000":
  853. # The tokenizer should always emit null on its own
  854. return
  855. self.tree.reconstructActiveFormattingElements()
  856. self.tree.insertText(token["data"])
  857. # This must be bad for performance
  858. if (self.parser.framesetOK and
  859. any([char not in spaceCharacters
  860. for char in token["data"]])):
  861. self.parser.framesetOK = False
  862. def processSpaceCharactersNonPre(self, token):
  863. self.tree.reconstructActiveFormattingElements()
  864. self.tree.insertText(token["data"])
  865. def startTagProcessInHead(self, token):
  866. return self.parser.phases["inHead"].processStartTag(token)
  867. def startTagBody(self, token):
  868. self.parser.parseError("unexpected-start-tag", {"name": "body"})
  869. if (len(self.tree.openElements) == 1 or
  870. self.tree.openElements[1].name != "body"):
  871. assert self.parser.innerHTML
  872. else:
  873. self.parser.framesetOK = False
  874. for attr, value in token["data"].items():
  875. if attr not in self.tree.openElements[1].attributes:
  876. self.tree.openElements[1].attributes[attr] = value
  877. def startTagFrameset(self, token):
  878. self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
  879. if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
  880. assert self.parser.innerHTML
  881. elif not self.parser.framesetOK:
  882. pass
  883. else:
  884. if self.tree.openElements[1].parent:
  885. self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
  886. while self.tree.openElements[-1].name != "html":
  887. self.tree.openElements.pop()
  888. self.tree.insertElement(token)
  889. self.parser.phase = self.parser.phases["inFrameset"]
  890. def startTagCloseP(self, token):
  891. if self.tree.elementInScope("p", variant="button"):
  892. self.endTagP(impliedTagToken("p"))
  893. self.tree.insertElement(token)
  894. def startTagPreListing(self, token):
  895. if self.tree.elementInScope("p", variant="button"):
  896. self.endTagP(impliedTagToken("p"))
  897. self.tree.insertElement(token)
  898. self.parser.framesetOK = False
  899. self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  900. def startTagForm(self, token):
  901. if self.tree.formPointer:
  902. self.parser.parseError("unexpected-start-tag", {"name": "form"})
  903. else:
  904. if self.tree.elementInScope("p", variant="button"):
  905. self.endTagP(impliedTagToken("p"))
  906. self.tree.insertElement(token)
  907. self.tree.formPointer = self.tree.openElements[-1]
  908. def startTagListItem(self, token):
  909. self.parser.framesetOK = False
  910. stopNamesMap = {"li": ["li"],
  911. "dt": ["dt", "dd"],
  912. "dd": ["dt", "dd"]}
  913. stopNames = stopNamesMap[token["name"]]
  914. for node in reversed(self.tree.openElements):
  915. if node.name in stopNames:
  916. self.parser.phase.processEndTag(
  917. impliedTagToken(node.name, "EndTag"))
  918. break
  919. if (node.nameTuple in specialElements and
  920. node.name not in ("address", "div", "p")):
  921. break
  922. if self.tree.elementInScope("p", variant="button"):
  923. self.parser.phase.processEndTag(
  924. impliedTagToken("p", "EndTag"))
  925. self.tree.insertElement(token)
  926. def startTagPlaintext(self, token):
  927. if self.tree.elementInScope("p", variant="button"):
  928. self.endTagP(impliedTagToken("p"))
  929. self.tree.insertElement(token)
  930. self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
  931. def startTagHeading(self, token):
  932. if self.tree.elementInScope("p", variant="button"):
  933. self.endTagP(impliedTagToken("p"))
  934. if self.tree.openElements[-1].name in headingElements:
  935. self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
  936. self.tree.openElements.pop()
  937. self.tree.insertElement(token)
  938. def startTagA(self, token):
  939. afeAElement = self.tree.elementInActiveFormattingElements("a")
  940. if afeAElement:
  941. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  942. {"startName": "a", "endName": "a"})
  943. self.endTagFormatting(impliedTagToken("a"))
  944. if afeAElement in self.tree.openElements:
  945. self.tree.openElements.remove(afeAElement)
  946. if afeAElement in self.tree.activeFormattingElements:
  947. self.tree.activeFormattingElements.remove(afeAElement)
  948. self.tree.reconstructActiveFormattingElements()
  949. self.addFormattingElement(token)
  950. def startTagFormatting(self, token):
  951. self.tree.reconstructActiveFormattingElements()
  952. self.addFormattingElement(token)
  953. def startTagNobr(self, token):
  954. self.tree.reconstructActiveFormattingElements()
  955. if self.tree.elementInScope("nobr"):
  956. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  957. {"startName": "nobr", "endName": "nobr"})
  958. self.processEndTag(impliedTagToken("nobr"))
  959. # XXX Need tests that trigger the following
  960. self.tree.reconstructActiveFormattingElements()
  961. self.addFormattingElement(token)
  962. def startTagButton(self, token):
  963. if self.tree.elementInScope("button"):
  964. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  965. {"startName": "button", "endName": "button"})
  966. self.processEndTag(impliedTagToken("button"))
  967. return token
  968. else:
  969. self.tree.reconstructActiveFormattingElements()
  970. self.tree.insertElement(token)
  971. self.parser.framesetOK = False
  972. def startTagAppletMarqueeObject(self, token):
  973. self.tree.reconstructActiveFormattingElements()
  974. self.tree.insertElement(token)
  975. self.tree.activeFormattingElements.append(Marker)
  976. self.parser.framesetOK = False
  977. def startTagXmp(self, token):
  978. if self.tree.elementInScope("p", variant="button"):
  979. self.endTagP(impliedTagToken("p"))
  980. self.tree.reconstructActiveFormattingElements()
  981. self.parser.framesetOK = False
  982. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  983. def startTagTable(self, token):
  984. if self.parser.compatMode != "quirks":
  985. if self.tree.elementInScope("p", variant="button"):
  986. self.processEndTag(impliedTagToken("p"))
  987. self.tree.insertElement(token)
  988. self.parser.framesetOK = False
  989. self.parser.phase = self.parser.phases["inTable"]
  990. def startTagVoidFormatting(self, token):
  991. self.tree.reconstructActiveFormattingElements()
  992. self.tree.insertElement(token)
  993. self.tree.openElements.pop()
  994. token["selfClosingAcknowledged"] = True
  995. self.parser.framesetOK = False
  996. def startTagInput(self, token):
  997. framesetOK = self.parser.framesetOK
  998. self.startTagVoidFormatting(token)
  999. if ("type" in token["data"] and
  1000. token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
  1001. # input type=hidden doesn't change framesetOK
  1002. self.parser.framesetOK = framesetOK
  1003. def startTagParamSource(self, token):
  1004. self.tree.insertElement(token)
  1005. self.tree.openElements.pop()
  1006. token["selfClosingAcknowledged"] = True
  1007. def startTagHr(self, token):
  1008. if self.tree.elementInScope("p", variant="button"):
  1009. self.endTagP(impliedTagToken("p"))
  1010. self.tree.insertElement(token)
  1011. self.tree.openElements.pop()
  1012. token["selfClosingAcknowledged"] = True
  1013. self.parser.framesetOK = False
  1014. def startTagImage(self, token):
  1015. # No really...
  1016. self.parser.parseError("unexpected-start-tag-treated-as",
  1017. {"originalName": "image", "newName": "img"})
  1018. self.processStartTag(impliedTagToken("img", "StartTag",
  1019. attributes=token["data"],
  1020. selfClosing=token["selfClosing"]))
  1021. def startTagIsIndex(self, token):
  1022. self.parser.parseError("deprecated-tag", {"name": "isindex"})
  1023. if self.tree.formPointer:
  1024. return
  1025. form_attrs = {}
  1026. if "action" in token["data"]:
  1027. form_attrs["action"] = token["data"]["action"]
  1028. self.processStartTag(impliedTagToken("form", "StartTag",
  1029. attributes=form_attrs))
  1030. self.processStartTag(impliedTagToken("hr", "StartTag"))
  1031. self.processStartTag(impliedTagToken("label", "StartTag"))
  1032. # XXX Localization ...
  1033. if "prompt" in token["data"]:
  1034. prompt = token["data"]["prompt"]
  1035. else:
  1036. prompt = "This is a searchable index. Enter search keywords: "
  1037. self.processCharacters(
  1038. {"type": tokenTypes["Characters"], "data": prompt})
  1039. attributes = token["data"].copy()
  1040. if "action" in attributes:
  1041. del attributes["action"]
  1042. if "prompt" in attributes:
  1043. del attributes["prompt"]
  1044. attributes["name"] = "isindex"
  1045. self.processStartTag(impliedTagToken("input", "StartTag",
  1046. attributes=attributes,
  1047. selfClosing=token["selfClosing"]))
  1048. self.processEndTag(impliedTagToken("label"))
  1049. self.processStartTag(impliedTagToken("hr", "StartTag"))
  1050. self.processEndTag(impliedTagToken("form"))
  1051. def startTagTextarea(self, token):
  1052. self.tree.insertElement(token)
  1053. self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
  1054. self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  1055. self.parser.framesetOK = False
  1056. def startTagIFrame(self, token):
  1057. self.parser.framesetOK = False
  1058. self.startTagRawtext(token)
  1059. def startTagNoscript(self, token):
  1060. if self.parser.scripting:
  1061. self.startTagRawtext(token)
  1062. else:
  1063. self.startTagOther(token)
  1064. def startTagRawtext(self, token):
  1065. """iframe, noembed noframes, noscript(if scripting enabled)"""
  1066. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  1067. def startTagOpt(self, token):
  1068. if self.tree.openElements[-1].name == "option":
  1069. self.parser.phase.processEndTag(impliedTagToken("option"))
  1070. self.tree.reconstructActiveFormattingElements()
  1071. self.parser.tree.insertElement(token)
  1072. def startTagSelect(self, token):
  1073. self.tree.reconstructActiveFormattingElements()
  1074. self.tree.insertElement(token)
  1075. self.parser.framesetOK = False
  1076. if self.parser.phase in (self.parser.phases["inTable"],
  1077. self.parser.phases["inCaption"],
  1078. self.parser.phases["inColumnGroup"],
  1079. self.parser.phases["inTableBody"],
  1080. self.parser.phases["inRow"],
  1081. self.parser.phases["inCell"]):
  1082. self.parser.phase = self.parser.phases["inSelectInTable"]
  1083. else:
  1084. self.parser.phase = self.parser.phases["inSelect"]
  1085. def startTagRpRt(self, token):
  1086. if self.tree.elementInScope("ruby"):
  1087. self.tree.generateImpliedEndTags()
  1088. if self.tree.openElements[-1].name != "ruby":
  1089. self.parser.parseError()
  1090. self.tree.insertElement(token)
  1091. def startTagMath(self, token):
  1092. self.tree.reconstructActiveFormattingElements()
  1093. self.parser.adjustMathMLAttributes(token)
  1094. self.parser.adjustForeignAttributes(token)
  1095. token["namespace"] = namespaces["mathml"]
  1096. self.tree.insertElement(token)
  1097. # Need to get the parse error right for the case where the token
  1098. # has a namespace not equal to the xmlns attribute
  1099. if token["selfClosing"]:
  1100. self.tree.openElements.pop()
  1101. token["selfClosingAcknowledged"] = True
  1102. def startTagSvg(self, token):
  1103. self.tree.reconstructActiveFormattingElements()
  1104. self.parser.adjustSVGAttributes(token)
  1105. self.parser.adjustForeignAttributes(token)
  1106. token["namespace"] = namespaces["svg"]
  1107. self.tree.insertElement(token)
  1108. # Need to get the parse error right for the case where the token
  1109. # has a namespace not equal to the xmlns attribute
  1110. if token["selfClosing"]:
  1111. self.tree.openElements.pop()
  1112. token["selfClosingAcknowledged"] = True
  1113. def startTagMisplaced(self, token):
  1114. """ Elements that should be children of other elements that have a
  1115. different insertion mode; here they are ignored
  1116. "caption", "col", "colgroup", "frame", "frameset", "head",
  1117. "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
  1118. "tr", "noscript"
  1119. """
  1120. self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
  1121. def startTagOther(self, token):
  1122. self.tree.reconstructActiveFormattingElements()
  1123. self.tree.insertElement(token)
  1124. def endTagP(self, token):
  1125. if not self.tree.elementInScope("p", variant="button"):
  1126. self.startTagCloseP(impliedTagToken("p", "StartTag"))
  1127. self.parser.parseError("unexpected-end-tag", {"name": "p"})
  1128. self.endTagP(impliedTagToken("p", "EndTag"))
  1129. else:
  1130. self.tree.generateImpliedEndTags("p")
  1131. if self.tree.openElements[-1].name != "p":
  1132. self.parser.parseError("unexpected-end-tag", {"name": "p"})
  1133. node = self.tree.openElements.pop()
  1134. while node.name != "p":
  1135. node = self.tree.openElements.pop()
  1136. def endTagBody(self, token):
  1137. if not self.tree.elementInScope("body"):
  1138. self.parser.parseError()
  1139. return
  1140. elif self.tree.openElements[-1].name != "body":
  1141. for node in self.tree.openElements[2:]:
  1142. if node.name not in frozenset(("dd", "dt", "li", "optgroup",
  1143. "option", "p", "rp", "rt",
  1144. "tbody", "td", "tfoot",
  1145. "th", "thead", "tr", "body",
  1146. "html")):
  1147. # Not sure this is the correct name for the parse error
  1148. self.parser.parseError(
  1149. "expected-one-end-tag-but-got-another",
  1150. {"gotName": "body", "expectedName": node.name})
  1151. break
  1152. self.parser.phase = self.parser.phases["afterBody"]
  1153. def endTagHtml(self, token):
  1154. # We repeat the test for the body end tag token being ignored here
  1155. if self.tree.elementInScope("body"):
  1156. self.endTagBody(impliedTagToken("body"))
  1157. return token
  1158. def endTagBlock(self, token):
  1159. # Put us back in the right whitespace handling mode
  1160. if token["name"] == "pre":
  1161. self.processSpaceCharacters = self.processSpaceCharactersNonPre
  1162. inScope = self.tree.elementInScope(token["name"])
  1163. if inScope:
  1164. self.tree.generateImpliedEndTags()
  1165. if self.tree.openElements[-1].name != token["name"]:
  1166. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1167. if inScope:
  1168. node = self.tree.openElements.pop()
  1169. while node.name != token["name"]:
  1170. node = self.tree.openElements.pop()
  1171. def endTagForm(self, token):
  1172. node = self.tree.formPointer
  1173. self.tree.formPointer = None
  1174. if node is None or not self.tree.elementInScope(node):
  1175. self.parser.parseError("unexpected-end-tag",
  1176. {"name": "form"})
  1177. else:
  1178. self.tree.generateImpliedEndTags()
  1179. if self.tree.openElements[-1] != node:
  1180. self.parser.parseError("end-tag-too-early-ignored",
  1181. {"name": "form"})
  1182. self.tree.openElements.remove(node)
  1183. def endTagListItem(self, token):
  1184. if token["name"] == "li":
  1185. variant = "list"
  1186. else:
  1187. variant = None
  1188. if not self.tree.elementInScope(token["name"], variant=variant):
  1189. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1190. else:
  1191. self.tree.generateImpliedEndTags(exclude=token["name"])
  1192. if self.tree.openElements[-1].name != token["name"]:
  1193. self.parser.parseError(
  1194. "end-tag-too-early",
  1195. {"name": token["name"]})
  1196. node = self.tree.openElements.pop()
  1197. while node.name != token["name"]:
  1198. node = self.tree.openElements.pop()
  1199. def endTagHeading(self, token):
  1200. for item in headingElements:
  1201. if self.tree.elementInScope(item):
  1202. self.tree.generateImpliedEndTags()
  1203. break
  1204. if self.tree.openElements[-1].name != token["name"]:
  1205. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1206. for item in headingElements:
  1207. if self.tree.elementInScope(item):
  1208. item = self.tree.openElements.pop()
  1209. while item.name not in headingElements:
  1210. item = self.tree.openElements.pop()
  1211. break
  def endTagFormatting(self, token):
      """The much-feared adoption agency algorithm.

      Handles the end tag of a formatting element by re-parenting
      misnested content.  The outer loop runs at most 8 times and the
      inner loop at most 3 times per outer iteration.

      :arg token: the end tag token; ``token["name"]`` is the tag name
      """
      # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
      # XXX Better parseError messages appreciated.

      # Step 1
      outerLoopCounter = 0

      # Step 2
      while outerLoopCounter < 8:

          # Step 3
          outerLoopCounter += 1

          # Step 4:

          # Let the formatting element be the last element in
          # the list of active formatting elements that:
          # - is between the end of the list and the last scope
          # marker in the list, if any, or the start of the list
          # otherwise, and
          # - has the same tag name as the token.
          formattingElement = self.tree.elementInActiveFormattingElements(
              token["name"])
          if (not formattingElement or
                  (formattingElement in self.tree.openElements and
                   not self.tree.elementInScope(formattingElement.name))):
              # If there is no such node, then abort these steps
              # and instead act as described in the "any other
              # end tag" entry below.
              self.endTagOther(token)
              return

          # Otherwise, if there is such a node, but that node is
          # not in the stack of open elements, then this is a
          # parse error; remove the element from the list, and
          # abort these steps.
          elif formattingElement not in self.tree.openElements:
              self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
              self.tree.activeFormattingElements.remove(formattingElement)
              return

          # Otherwise, if there is such a node, and that node is
          # also in the stack of open elements, but the element
          # is not in scope, then this is a parse error; ignore
          # the token, and abort these steps.
          elif not self.tree.elementInScope(formattingElement.name):
              self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
              return

          # Otherwise, there is a formatting element and that
          # element is in the stack and is in scope. If the
          # element is not the current node, this is a parse
          # error. In any case, proceed with the algorithm as
          # written in the following steps.
          else:
              if formattingElement != self.tree.openElements[-1]:
                  self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

          # Step 5:

          # Let the furthest block be the topmost node in the
          # stack of open elements that is lower in the stack
          # than the formatting element, and is an element in
          # the special category. There might not be one.
          afeIndex = self.tree.openElements.index(formattingElement)
          furthestBlock = None
          for element in self.tree.openElements[afeIndex:]:
              if element.nameTuple in specialElements:
                  furthestBlock = element
                  break

          # Step 6:

          # If there is no furthest block, then the UA must
          # first pop all the nodes from the bottom of the stack
          # of open elements, from the current node up to and
          # including the formatting element, then remove the
          # formatting element from the list of active
          # formatting elements, and finally abort these steps.
          if furthestBlock is None:
              element = self.tree.openElements.pop()
              while element != formattingElement:
                  element = self.tree.openElements.pop()
              self.tree.activeFormattingElements.remove(element)
              return

          # Step 7
          commonAncestor = self.tree.openElements[afeIndex - 1]

          # Step 8:
          # The bookmark is supposed to help us identify where to reinsert
          # nodes in step 15. We have to ensure that we reinsert nodes after
          # the node before the active formatting element. Note the bookmark
          # can move in step 9.7
          bookmark = self.tree.activeFormattingElements.index(formattingElement)

          # Step 9
          lastNode = node = furthestBlock
          innerLoopCounter = 0

          index = self.tree.openElements.index(node)
          while innerLoopCounter < 3:
              innerLoopCounter += 1
              # Node is element before node in open elements
              index -= 1
              node = self.tree.openElements[index]
              if node not in self.tree.activeFormattingElements:
                  self.tree.openElements.remove(node)
                  continue
              # Step 9.6
              if node == formattingElement:
                  break
              # Step 9.7
              if lastNode == furthestBlock:
                  bookmark = self.tree.activeFormattingElements.index(node) + 1
              # Step 9.8
              clone = node.cloneNode()
              # Replace node with clone
              self.tree.activeFormattingElements[
                  self.tree.activeFormattingElements.index(node)] = clone
              self.tree.openElements[
                  self.tree.openElements.index(node)] = clone
              node = clone
              # Step 9.9
              # Remove lastNode from its parents, if any
              if lastNode.parent:
                  lastNode.parent.removeChild(lastNode)
              node.appendChild(lastNode)
              # Step 9.10
              lastNode = node

          # Step 10
          # Foster parent lastNode if commonAncestor is a
          # table, tbody, tfoot, thead, or tr we need to foster
          # parent the lastNode
          if lastNode.parent:
              lastNode.parent.removeChild(lastNode)

          if commonAncestor.name in ("table", "tbody", "tfoot", "thead", "tr"):
              parent, insertBefore = self.tree.getTableMisnestedNodePosition()
              parent.insertBefore(lastNode, insertBefore)
          else:
              commonAncestor.appendChild(lastNode)

          # Step 11
          clone = formattingElement.cloneNode()

          # Step 12
          furthestBlock.reparentChildren(clone)

          # Step 13
          furthestBlock.appendChild(clone)

          # Step 14
          self.tree.activeFormattingElements.remove(formattingElement)
          self.tree.activeFormattingElements.insert(bookmark, clone)

          # Step 15
          self.tree.openElements.remove(formattingElement)
          self.tree.openElements.insert(
              self.tree.openElements.index(furthestBlock) + 1, clone)
  def endTagAppletMarqueeObject(self, token):
      """Handle the end tag of an element that carries a formatting marker.

      When the element is in scope, implied end tags are generated, the
      stack of open elements is popped up to and including the element,
      and the active formatting elements are cleared via
      ``tree.clearActiveFormattingElements()``.  A parse error is
      reported whenever the current node does not match the token.

      :arg token: the end tag token; ``token["name"]`` is the tag name
      """
      name = token["name"]
      if self.tree.elementInScope(name):
          self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1].name != name:
          self.parser.parseError("end-tag-too-early", {"name": name})

      if self.tree.elementInScope(name):
          element = self.tree.openElements.pop()
          while element.name != name:
              element = self.tree.openElements.pop()
          self.tree.clearActiveFormattingElements()
  def endTagBr(self, token):
      """Handle ``</br>``: report it and insert a ``br`` element instead.

      The active formatting elements are reconstructed, an implied ``br``
      start tag is inserted, and the new element is immediately popped
      from the stack of open elements.

      :arg token: the end tag token (not inspected here)
      """
      self.parser.parseError("unexpected-end-tag-treated-as",
                             {"originalName": "br", "newName": "br element"})
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertElement(impliedTagToken("br", "StartTag"))
      self.tree.openElements.pop()
  def endTagOther(self, token):
      """Handle any end tag that has no dedicated handler.

      The stack of open elements is walked from the current node
      downwards.  The first node with the token's name is closed (after
      generating implied end tags, and with a parse error if it is not
      the current node).  If a node in ``specialElements`` is met first,
      the tag is reported as unexpected and ignored.

      :arg token: the end tag token; ``token["name"]`` is the tag name
      """
      for node in self.tree.openElements[::-1]:
          if node.name == token["name"]:
              self.tree.generateImpliedEndTags(exclude=token["name"])
              if self.tree.openElements[-1].name != token["name"]:
                  self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
              # Pop up to and including the matched node.
              while self.tree.openElements.pop() != node:
                  pass
              break
          else:
              if node.nameTuple in specialElements:
                  self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
                  break
  class TextPhase(Phase):
      """Phase active while the text content of an element is consumed.

      Characters are inserted as text.  Any end tag pops the current node
      and restores ``parser.originalPhase``; start tags are not expected
      (the assertion message refers to RCDATA/RAWTEXT mode).
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)
          # No start tag has a dedicated handler in this phase.
          self.startTagHandler = _utils.MethodDispatcher([])
          self.startTagHandler.default = self.startTagOther
          self.endTagHandler = _utils.MethodDispatcher([
              ("script", self.endTagScript)])
          self.endTagHandler.default = self.endTagOther

      def processCharacters(self, token):
          """Insert the token's character data into the tree."""
          self.tree.insertText(token["data"])

      def processEOF(self):
          """Report the unclosed element, pop it and restore the original phase.

          Returns True.  NOTE(review): presumably this asks the caller to
          process EOF again in the restored phase -- confirm in the main loop.
          """
          self.parser.parseError("expected-named-closing-tag-but-got-eof",
                                 {"name": self.tree.openElements[-1].name})
          self.tree.openElements.pop()
          self.parser.phase = self.parser.originalPhase
          return True

      def startTagOther(self, token):
          """Fail: a start tag is never expected in this phase."""
          assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

      def endTagScript(self, token):
          """Pop the ``script`` element and restore the original phase."""
          node = self.tree.openElements.pop()
          assert node.name == "script"
          self.parser.phase = self.parser.originalPhase
          # The rest of this method is all stuff that only happens if
          # document.write works

      def endTagOther(self, token):
          """Pop the current node and restore the original phase."""
          self.tree.openElements.pop()
          self.parser.phase = self.parser.originalPhase
  class InTablePhase(Phase):
      """Phase for tokens seen directly inside a ``table`` element.

      Table-structure tags switch to the matching table sub-phase.  Other
      content is forwarded to the "inBody" phase with
      ``tree.insertFromTable`` set for the duration of the call.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-table

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)
          self.startTagHandler = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("caption", self.startTagCaption),
              ("colgroup", self.startTagColgroup),
              ("col", self.startTagCol),
              (("tbody", "tfoot", "thead"), self.startTagRowGroup),
              (("td", "th", "tr"), self.startTagImplyTbody),
              ("table", self.startTagTable),
              (("style", "script"), self.startTagStyleScript),
              ("input", self.startTagInput),
              ("form", self.startTagForm)
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher([
              ("table", self.endTagTable),
              (("body", "caption", "col", "colgroup", "html", "tbody", "td",
                "tfoot", "th", "thead", "tr"), self.endTagIgnore)
          ])
          self.endTagHandler.default = self.endTagOther

      # helper methods
      def clearStackToTableContext(self):
          """Pop open elements until the current node is ``table`` or ``html``."""
          # "clear the stack back to a table context"
          while self.tree.openElements[-1].name not in ("table", "html"):
              # self.parser.parseError("unexpected-implied-end-tag-in-table",
              #  {"name":  self.tree.openElements[-1].name})
              self.tree.openElements.pop()
          # When the current node is <html> it's an innerHTML case

      # processing methods
      def processEOF(self):
          """Report EOF inside a table unless the current node is ``html``."""
          if self.tree.openElements[-1].name != "html":
              self.parser.parseError("eof-in-table")
          else:
              assert self.parser.innerHTML
          # Stop parsing

      def processSpaceCharacters(self, token):
          """Switch to the "inTableText" phase and let it handle the token."""
          originalPhase = self.parser.phase
          self.parser.phase = self.parser.phases["inTableText"]
          self.parser.phase.originalPhase = originalPhase
          self.parser.phase.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Switch to the "inTableText" phase and let it handle the token."""
          originalPhase = self.parser.phase
          self.parser.phase = self.parser.phases["inTableText"]
          self.parser.phase.originalPhase = originalPhase
          self.parser.phase.processCharacters(token)

      def insertText(self, token):
          """Insert character data via "inBody" with ``insertFromTable`` set."""
          # If we get here there must be at least one non-whitespace character
          # Do the table magic!
          self.tree.insertFromTable = True
          try:
              self.parser.phases["inBody"].processCharacters(token)
          finally:
              # Never leave the flag set, even if the in-body phase raises.
              self.tree.insertFromTable = False

      def startTagCaption(self, token):
          """Insert a ``caption`` element and switch to "inCaption"."""
          self.clearStackToTableContext()
          self.tree.activeFormattingElements.append(Marker)
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inCaption"]

      def startTagColgroup(self, token):
          """Insert a ``colgroup`` element and switch to "inColumnGroup"."""
          self.clearStackToTableContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inColumnGroup"]

      def startTagCol(self, token):
          """Imply a ``colgroup`` start tag, then return the ``col`` token."""
          self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
          return token

      def startTagRowGroup(self, token):
          """Insert a row-group element and switch to "inTableBody"."""
          self.clearStackToTableContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inTableBody"]

      def startTagImplyTbody(self, token):
          """Imply a ``tbody`` start tag, then return the original token."""
          self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
          return token

      def startTagTable(self, token):
          """Treat a nested ``<table>`` as an implied ``</table>``.

          The token is returned unless the parser is in innerHTML mode.
          """
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "table", "endName": "table"})
          self.parser.phase.processEndTag(impliedTagToken("table"))
          if not self.parser.innerHTML:
              return token

      def startTagStyleScript(self, token):
          """Delegate ``style``/``script`` start tags to the "inHead" phase."""
          return self.parser.phases["inHead"].processStartTag(token)

      def startTagInput(self, token):
          """Insert a hidden ``input`` in place; treat any other as foreign."""
          if ("type" in token["data"] and
                  token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
              self.parser.parseError("unexpected-hidden-input-in-table")
              self.tree.insertElement(token)
              # XXX associate with form
              self.tree.openElements.pop()
          else:
              self.startTagOther(token)

      def startTagForm(self, token):
          """Insert a ``form`` (if none is active) and pop it immediately."""
          self.parser.parseError("unexpected-form-in-table")
          if self.tree.formPointer is None:
              self.tree.insertElement(token)
              self.tree.formPointer = self.tree.openElements[-1]
              self.tree.openElements.pop()

      def startTagOther(self, token):
          """Process the start tag via "inBody" with ``insertFromTable`` set."""
          self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
          # Do the table magic!
          self.tree.insertFromTable = True
          try:
              self.parser.phases["inBody"].processStartTag(token)
          finally:
              # Never leave the flag set, even if the in-body phase raises.
              self.tree.insertFromTable = False

      def endTagTable(self, token):
          """Close the table in table scope and reset the insertion mode."""
          if self.tree.elementInScope("table", variant="table"):
              self.tree.generateImpliedEndTags()
              if self.tree.openElements[-1].name != "table":
                  self.parser.parseError("end-tag-too-early-named",
                                         {"gotName": "table",
                                          "expectedName": self.tree.openElements[-1].name})
              while self.tree.openElements[-1].name != "table":
                  self.tree.openElements.pop()
              self.tree.openElements.pop()
              self.parser.resetInsertionMode()
          else:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()

      def endTagIgnore(self, token):
          """Report and ignore an end tag that is not allowed here."""
          self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagOther(self, token):
          """Process the end tag via "inBody" with ``insertFromTable`` set."""
          self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
          # Do the table magic!
          self.tree.insertFromTable = True
          try:
              self.parser.phases["inBody"].processEndTag(token)
          finally:
              # Never leave the flag set, even if the in-body phase raises.
              self.tree.insertFromTable = False
  class InTableTextPhase(Phase):
      """Phase that buffers character tokens met inside a table.

      Tokens are collected in ``characterTokens``.  On the next
      non-character token the buffer is flushed, the phase stored in
      ``originalPhase`` is restored, and the token is returned.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)
          # Phase to go back to; set by whoever switches to this phase.
          self.originalPhase = None
          # Character tokens buffered since the last flush.
          self.characterTokens = []

      def flushCharacters(self):
          """Insert the buffered text and empty the buffer.

          Text containing any non-space character goes through the
          "inTable" phase's ``insertText``; text consisting only of space
          characters is inserted directly into the tree.
          """
          data = "".join([item["data"] for item in self.characterTokens])
          if any(item not in spaceCharacters for item in data):
              token = {"type": tokenTypes["Characters"], "data": data}
              self.parser.phases["inTable"].insertText(token)
          elif data:
              self.tree.insertText(data)
          self.characterTokens = []

      def _flushAndRestorePhase(self):
          """Flush buffered characters and switch back to ``originalPhase``."""
          self.flushCharacters()
          self.parser.phase = self.originalPhase

      def processComment(self, token):
          """Flush, restore the original phase and return the token."""
          self._flushAndRestorePhase()
          return token

      def processEOF(self):
          """Flush, restore the original phase and return True."""
          self._flushAndRestorePhase()
          return True

      def processCharacters(self, token):
          """Buffer the token, dropping a lone NUL character."""
          if token["data"] == "\u0000":
              return
          self.characterTokens.append(token)

      def processSpaceCharacters(self, token):
          """Buffer a space-character token."""
          # pretty sure we should never reach here
          self.characterTokens.append(token)
          # assert False

      def processStartTag(self, token):
          """Flush, restore the original phase and return the token."""
          self._flushAndRestorePhase()
          return token

      def processEndTag(self, token):
          """Flush, restore the original phase and return the token."""
          self._flushAndRestorePhase()
          return token
  class InCaptionPhase(Phase):
      """Phase for tokens inside a table ``caption`` element.

      Most tokens are delegated to the "inBody" phase; table-structure
      tags first close the caption.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          self.startTagHandler = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
                "thead", "tr"), self.startTagTableElement)
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher([
              ("caption", self.endTagCaption),
              ("table", self.endTagTable),
              (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
                "thead", "tr"), self.endTagIgnore)
          ])
          self.endTagHandler.default = self.endTagOther

      def ignoreEndTagCaption(self):
          """Return True when no ``caption`` is in table scope."""
          return not self.tree.elementInScope("caption", variant="table")

      def _closeCaptionAndReprocess(self, token):
          """Imply ``</caption>``; return ``token`` unless that was ignored."""
          self.parser.parseError()
          # XXX Have to duplicate logic here to find out if the tag is ignored
          ignoreEndTag = self.ignoreEndTagCaption()
          self.parser.phase.processEndTag(impliedTagToken("caption"))
          if not ignoreEndTag:
              return token

      def processEOF(self):
          """Delegate EOF handling to the "inBody" phase."""
          self.parser.phases["inBody"].processEOF()

      def processCharacters(self, token):
          """Delegate character handling to the "inBody" phase."""
          return self.parser.phases["inBody"].processCharacters(token)

      def startTagTableElement(self, token):
          """Close the caption before a table-structure start tag."""
          return self._closeCaptionAndReprocess(token)

      def startTagOther(self, token):
          """Delegate any other start tag to the "inBody" phase."""
          return self.parser.phases["inBody"].processStartTag(token)

      def endTagCaption(self, token):
          """Close the caption and switch back to the "inTable" phase."""
          if not self.ignoreEndTagCaption():
              # AT this code is quite similar to endTagTable in "InTable"
              self.tree.generateImpliedEndTags()
              if self.tree.openElements[-1].name != "caption":
                  self.parser.parseError("expected-one-end-tag-but-got-another",
                                         {"gotName": "caption",
                                          "expectedName": self.tree.openElements[-1].name})
              while self.tree.openElements[-1].name != "caption":
                  self.tree.openElements.pop()
              self.tree.openElements.pop()
              self.tree.clearActiveFormattingElements()
              self.parser.phase = self.parser.phases["inTable"]
          else:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()

      def endTagTable(self, token):
          """Close the caption before a ``</table>`` end tag."""
          return self._closeCaptionAndReprocess(token)

      def endTagIgnore(self, token):
          """Report and ignore an end tag that is not allowed here."""
          self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagOther(self, token):
          """Delegate any other end tag to the "inBody" phase."""
          return self.parser.phases["inBody"].processEndTag(token)
  class InColumnGroupPhase(Phase):
      """Phase for tokens inside a ``colgroup`` element.

      ``col`` start tags are inserted in place.  Anything else without a
      dedicated handler first closes the column group.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-column

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          self.startTagHandler = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("col", self.startTagCol)
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher([
              ("colgroup", self.endTagColgroup),
              ("col", self.endTagCol)
          ])
          self.endTagHandler.default = self.endTagOther

      def ignoreEndTagColgroup(self):
          """Return True when the current node is ``html``."""
          return self.tree.openElements[-1].name == "html"

      def _closeColgroupAndReprocess(self, token):
          """Imply ``</colgroup>``; return ``token`` unless that was ignored."""
          ignoreEndTag = self.ignoreEndTagColgroup()
          self.endTagColgroup(impliedTagToken("colgroup"))
          if not ignoreEndTag:
              return token

      def processEOF(self):
          """Close the column group at EOF.

          Returns None when the current node is ``html``, otherwise True
          once the implied ``</colgroup>`` has been processed.
          """
          if self.tree.openElements[-1].name == "html":
              assert self.parser.innerHTML
              return
          else:
              ignoreEndTag = self.ignoreEndTagColgroup()
              self.endTagColgroup(impliedTagToken("colgroup"))
              if not ignoreEndTag:
                  return True

      def processCharacters(self, token):
          """Close the column group before character data."""
          return self._closeColgroupAndReprocess(token)

      def startTagCol(self, token):
          """Insert a ``col`` element, pop it and acknowledge self-closing."""
          self.tree.insertElement(token)
          self.tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

      def startTagOther(self, token):
          """Close the column group before any other start tag."""
          return self._closeColgroupAndReprocess(token)

      def endTagColgroup(self, token):
          """Pop the ``colgroup`` and switch back to the "inTable" phase."""
          if self.ignoreEndTagColgroup():
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
          else:
              self.tree.openElements.pop()
              self.parser.phase = self.parser.phases["inTable"]

      def endTagCol(self, token):
          """Report a ``</col>`` end tag; ``col`` has none."""
          self.parser.parseError("no-end-tag", {"name": "col"})

      def endTagOther(self, token):
          """Close the column group before any other end tag."""
          return self._closeColgroupAndReprocess(token)
  class InTableBodyPhase(Phase):
      """Phase for tokens inside ``tbody``, ``thead`` or ``tfoot``.

      Row and cell start tags open a row.  Tokens without a dedicated
      handler are delegated to the "inTable" phase.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)
          self.startTagHandler = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("tr", self.startTagTr),
              (("td", "th"), self.startTagTableCell),
              (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
               self.startTagTableOther)
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher([
              (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
              ("table", self.endTagTable),
              (("body", "caption", "col", "colgroup", "html", "td", "th",
                "tr"), self.endTagIgnore)
          ])
          self.endTagHandler.default = self.endTagOther

      # helper methods
      def clearStackToTableBodyContext(self):
          """Pop open elements until a row group or ``html`` is current."""
          while self.tree.openElements[-1].name not in ("tbody", "tfoot",
                                                        "thead", "html"):
              # self.parser.parseError("unexpected-implied-end-tag-in-table",
              #  {"name": self.tree.openElements[-1].name})
              self.tree.openElements.pop()
          if self.tree.openElements[-1].name == "html":
              assert self.parser.innerHTML

      def _closeRowGroupAndReprocess(self, token):
          """Close the open row group and return ``token``.

          Shared by ``startTagTableOther`` and ``endTagTable``.  When no
          row group is in table scope a parse error is reported and
          nothing is returned.
          """
          if (self.tree.elementInScope("tbody", variant="table") or
                  self.tree.elementInScope("thead", variant="table") or
                  self.tree.elementInScope("tfoot", variant="table")):
              self.clearStackToTableBodyContext()
              self.endTagTableRowGroup(
                  impliedTagToken(self.tree.openElements[-1].name))
              return token
          else:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()

      # the rest
      def processEOF(self):
          """Delegate EOF handling to the "inTable" phase."""
          self.parser.phases["inTable"].processEOF()

      def processSpaceCharacters(self, token):
          """Delegate space characters to the "inTable" phase."""
          return self.parser.phases["inTable"].processSpaceCharacters(token)

      def processCharacters(self, token):
          """Delegate character data to the "inTable" phase."""
          return self.parser.phases["inTable"].processCharacters(token)

      def startTagTr(self, token):
          """Insert a ``tr`` element and switch to the "inRow" phase."""
          self.clearStackToTableBodyContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inRow"]

      def startTagTableCell(self, token):
          """Imply a ``tr`` start tag, then return the cell token."""
          self.parser.parseError("unexpected-cell-in-table-body",
                                 {"name": token["name"]})
          self.startTagTr(impliedTagToken("tr", "StartTag"))
          return token

      def startTagTableOther(self, token):
          """Close the row group before another table-structure start tag."""
          return self._closeRowGroupAndReprocess(token)

      def startTagOther(self, token):
          """Delegate any other start tag to the "inTable" phase."""
          return self.parser.phases["inTable"].processStartTag(token)

      def endTagTableRowGroup(self, token):
          """Close the named row group and switch back to "inTable"."""
          if self.tree.elementInScope(token["name"], variant="table"):
              self.clearStackToTableBodyContext()
              self.tree.openElements.pop()
              self.parser.phase = self.parser.phases["inTable"]
          else:
              self.parser.parseError("unexpected-end-tag-in-table-body",
                                     {"name": token["name"]})

      def endTagTable(self, token):
          """Close the row group before a ``</table>`` end tag."""
          return self._closeRowGroupAndReprocess(token)

      def endTagIgnore(self, token):
          """Report and ignore an end tag that is not allowed here."""
          self.parser.parseError("unexpected-end-tag-in-table-body",
                                 {"name": token["name"]})

      def endTagOther(self, token):
          """Delegate any other end tag to the "inTable" phase."""
          return self.parser.phases["inTable"].processEndTag(token)
  class InRowPhase(Phase):
      """Phase for tokens inside a ``tr`` element.

      Cell start tags open a cell.  Tokens without a dedicated handler
      are delegated to the "inTable" phase.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-row

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)
          self.startTagHandler = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              (("td", "th"), self.startTagTableCell),
              (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
                "tr"), self.startTagTableOther)
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher([
              ("tr", self.endTagTr),
              ("table", self.endTagTable),
              (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
              (("body", "caption", "col", "colgroup", "html", "td", "th"),
               self.endTagIgnore)
          ])
          self.endTagHandler.default = self.endTagOther

      # helper methods (XXX unify this with other table helper methods)
      def clearStackToTableRowContext(self):
          """Pop open elements until ``tr`` or ``html`` is current.

          Every popped element is reported as a parse error.
          """
          while self.tree.openElements[-1].name not in ("tr", "html"):
              self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                     {"name": self.tree.openElements[-1].name})
              self.tree.openElements.pop()

      def ignoreEndTagTr(self):
          """Return True when no ``tr`` is in table scope."""
          return not self.tree.elementInScope("tr", variant="table")

      def _closeRowAndReprocess(self, token):
          """Imply ``</tr>``; return ``token`` unless that was ignored."""
          ignoreEndTag = self.ignoreEndTagTr()
          self.endTagTr(impliedTagToken("tr"))
          # Reprocess the current tag if the tr end tag was not ignored
          # XXX how are we sure it's always ignored in the innerHTML case?
          if not ignoreEndTag:
              return token

      # the rest
      def processEOF(self):
          """Delegate EOF handling to the "inTable" phase."""
          self.parser.phases["inTable"].processEOF()

      def processSpaceCharacters(self, token):
          """Delegate space characters to the "inTable" phase."""
          return self.parser.phases["inTable"].processSpaceCharacters(token)

      def processCharacters(self, token):
          """Delegate character data to the "inTable" phase."""
          return self.parser.phases["inTable"].processCharacters(token)

      def startTagTableCell(self, token):
          """Insert a cell, switch to "inCell" and push a formatting marker."""
          self.clearStackToTableRowContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inCell"]
          self.tree.activeFormattingElements.append(Marker)

      def startTagTableOther(self, token):
          """Close the row before another table-structure start tag."""
          return self._closeRowAndReprocess(token)

      def startTagOther(self, token):
          """Delegate any other start tag to the "inTable" phase."""
          return self.parser.phases["inTable"].processStartTag(token)

      def endTagTr(self, token):
          """Close the row and switch back to the "inTableBody" phase."""
          if not self.ignoreEndTagTr():
              self.clearStackToTableRowContext()
              self.tree.openElements.pop()
              self.parser.phase = self.parser.phases["inTableBody"]
          else:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()

      def endTagTable(self, token):
          """Close the row before a ``</table>`` end tag."""
          return self._closeRowAndReprocess(token)

      def endTagTableRowGroup(self, token):
          """Close the row, then return the row-group end tag token."""
          if self.tree.elementInScope(token["name"], variant="table"):
              self.endTagTr(impliedTagToken("tr"))
              return token
          else:
              self.parser.parseError()

      def endTagIgnore(self, token):
          """Report and ignore an end tag that is not allowed here."""
          self.parser.parseError("unexpected-end-tag-in-table-row",
                                 {"name": token["name"]})

      def endTagOther(self, token):
          """Delegate any other end tag to the "inTable" phase."""
          return self.parser.phases["inTable"].processEndTag(token)
  class InCellPhase(Phase):
      """Phase for tokens inside a ``td`` or ``th`` element.

      Most tokens are delegated to the "inBody" phase; table-structure
      tags first close the open cell.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)
          self.startTagHandler = _utils.MethodDispatcher([
              ("html", self.startTagHtml),
              (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
                "thead", "tr"), self.startTagTableOther)
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher([
              (("td", "th"), self.endTagTableCell),
              (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
              (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
          ])
          self.endTagHandler.default = self.endTagOther

      # helper
      def closeCell(self):
          """Close whichever of ``td``/``th`` is in table scope (``td`` first)."""
          if self.tree.elementInScope("td", variant="table"):
              self.endTagTableCell(impliedTagToken("td"))
          elif self.tree.elementInScope("th", variant="table"):
              self.endTagTableCell(impliedTagToken("th"))

      # the rest
      def processEOF(self):
          """Delegate EOF handling to the "inBody" phase."""
          self.parser.phases["inBody"].processEOF()

      def processCharacters(self, token):
          """Delegate character data to the "inBody" phase."""
          return self.parser.phases["inBody"].processCharacters(token)

      def startTagTableOther(self, token):
          """Close the cell, then return the table-structure start tag."""
          if (self.tree.elementInScope("td", variant="table") or
                  self.tree.elementInScope("th", variant="table")):
              self.closeCell()
              return token
          else:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()

      def startTagOther(self, token):
          """Delegate any other start tag to the "inBody" phase."""
          return self.parser.phases["inBody"].processStartTag(token)

      def endTagTableCell(self, token):
          """Close the named cell and switch back to the "inRow" phase."""
          if self.tree.elementInScope(token["name"], variant="table"):
              self.tree.generateImpliedEndTags(token["name"])
              if self.tree.openElements[-1].name != token["name"]:
                  self.parser.parseError("unexpected-cell-end-tag",
                                         {"name": token["name"]})
                  # Pop up to and including the cell element.
                  while True:
                      node = self.tree.openElements.pop()
                      if node.name == token["name"]:
                          break
              else:
                  # The cell is the current node: a single pop closes it.
                  self.tree.openElements.pop()
              self.tree.clearActiveFormattingElements()
              self.parser.phase = self.parser.phases["inRow"]
          else:
              self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagIgnore(self, token):
          """Report and ignore an end tag that is not allowed here."""
          self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagImply(self, token):
          """Close the cell, then return the table-structure end tag."""
          if self.tree.elementInScope(token["name"], variant="table"):
              self.closeCell()
              return token
          else:
              # sometimes innerHTML case
              self.parser.parseError()

      def endTagOther(self, token):
          """Delegate any other end tag to the "inBody" phase."""
          return self.parser.phases["inBody"].processEndTag(token)
  1905. class InSelectPhase(Phase):
  def __init__(self, parser, tree):
      """Set up the start-tag and end-tag dispatch tables for this phase.

      :arg parser: passed through to ``Phase.__init__``
      :arg tree: passed through to ``Phase.__init__``
      """
      Phase.__init__(self, parser, tree)

      self.startTagHandler = _utils.MethodDispatcher([
          ("html", self.startTagHtml),
          ("option", self.startTagOption),
          ("optgroup", self.startTagOptgroup),
          ("select", self.startTagSelect),
          (("input", "keygen", "textarea"), self.startTagInput),
          ("script", self.startTagScript)
      ])
      self.startTagHandler.default = self.startTagOther

      self.endTagHandler = _utils.MethodDispatcher([
          ("option", self.endTagOption),
          ("optgroup", self.endTagOptgroup),
          ("select", self.endTagSelect)
      ])
      self.endTagHandler.default = self.endTagOther
  # http://www.whatwg.org/specs/web-apps/current-work/#in-select
  def processEOF(self):
      """Report EOF inside a ``select`` unless the current node is ``html``.

      With ``html`` as the current node the parser is asserted to be in
      innerHTML mode and nothing is reported.
      """
      if self.tree.openElements[-1].name != "html":
          self.parser.parseError("eof-in-select")
      else:
          assert self.parser.innerHTML
  def processCharacters(self, token):
      """Insert character data as text, dropping a lone NUL character.

      :arg token: character token; ``token["data"]`` is the text
      """
      if token["data"] == "\u0000":
          return
      self.tree.insertText(token["data"])
  def startTagOption(self, token):
      """Insert an ``option`` element, closing an open ``option`` first.

      :arg token: the ``option`` start tag token
      """
      # We need to imply </option> if <option> is the current node.
      if self.tree.openElements[-1].name == "option":
          self.tree.openElements.pop()
      self.tree.insertElement(token)
  1938. def startTagOptgroup(self, token):
  1939. if self.tree.openElements[-1].name == "option":
  1940. self.tree.openElements.pop()
  1941. if self.tree.openElements[-1].name == "optgroup":
  1942. self.tree.openElements.pop()
  1943. self.tree.insertElement(token)
  1944. def startTagSelect(self, token):
  1945. self.parser.parseError("unexpected-select-in-select")
  1946. self.endTagSelect(impliedTagToken("select"))
  1947. def startTagInput(self, token):
  1948. self.parser.parseError("unexpected-input-in-select")
  1949. if self.tree.elementInScope("select", variant="select"):
  1950. self.endTagSelect(impliedTagToken("select"))
  1951. return token
  1952. else:
  1953. assert self.parser.innerHTML
  1954. def startTagScript(self, token):
  1955. return self.parser.phases["inHead"].processStartTag(token)
  1956. def startTagOther(self, token):
  1957. self.parser.parseError("unexpected-start-tag-in-select",
  1958. {"name": token["name"]})
  1959. def endTagOption(self, token):
  1960. if self.tree.openElements[-1].name == "option":
  1961. self.tree.openElements.pop()
  1962. else:
  1963. self.parser.parseError("unexpected-end-tag-in-select",
  1964. {"name": "option"})
  1965. def endTagOptgroup(self, token):
  1966. # </optgroup> implicitly closes <option>
  1967. if (self.tree.openElements[-1].name == "option" and
  1968. self.tree.openElements[-2].name == "optgroup"):
  1969. self.tree.openElements.pop()
  1970. # It also closes </optgroup>
  1971. if self.tree.openElements[-1].name == "optgroup":
  1972. self.tree.openElements.pop()
  1973. # But nothing else
  1974. else:
  1975. self.parser.parseError("unexpected-end-tag-in-select",
  1976. {"name": "optgroup"})
  1977. def endTagSelect(self, token):
  1978. if self.tree.elementInScope("select", variant="select"):
  1979. node = self.tree.openElements.pop()
  1980. while node.name != "select":
  1981. node = self.tree.openElements.pop()
  1982. self.parser.resetInsertionMode()
  1983. else:
  1984. # innerHTML case
  1985. assert self.parser.innerHTML
  1986. self.parser.parseError()
  1987. def endTagOther(self, token):
  1988. self.parser.parseError("unexpected-end-tag-in-select",
  1989. {"name": token["name"]})
  class InSelectInTablePhase(Phase):
      """Phase for a ``select`` inside a table.

      Table-related tags get special handling; everything else is delegated
      to the "inSelect" phase.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # The same set of table-related tag names is dispatched for both
          # start and end tags.
          table_tags = ("caption", "table", "tbody", "tfoot", "thead", "tr",
                        "td", "th")

          self.startTagHandler = _utils.MethodDispatcher(
              [(table_tags, self.startTagTable)])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = _utils.MethodDispatcher(
              [(table_tags, self.endTagTable)])
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Run the "inSelect" EOF handling; its result is not returned."""
          in_select = self.parser.phases["inSelect"]
          in_select.processEOF()

      def processCharacters(self, token):
          """Delegate character tokens to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          return in_select.processCharacters(token)

      def startTagTable(self, token):
          """Report the table start tag as an error, process an implied
          ``</select>`` and return the token."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", details)
          self.endTagOther(impliedTagToken("select"))
          return token

      def startTagOther(self, token):
          """Delegate any other start tag to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          return in_select.processStartTag(token)

      def endTagTable(self, token):
          """Report the table end tag as an error; if the named element is in
          table scope, process an implied ``</select>`` and return the token."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", details)
          if not self.tree.elementInScope(token["name"], variant="table"):
              return None
          self.endTagOther(impliedTagToken("select"))
          return token

      def endTagOther(self, token):
          """Delegate any other end tag to the "inSelect" phase."""
          in_select = self.parser.phases["inSelect"]
          return in_select.processEndTag(token)
  class InForeignContentPhase(Phase):
      """Phase that processes tokens while the current node is foreign
      (non-default-namespace) content such as SVG or MathML."""

      # HTML start tags that pop the parser out of foreign content.
      breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                    "center", "code", "dd", "div", "dl", "dt",
                                    "em", "embed", "h1", "h2", "h3",
                                    "h4", "h5", "h6", "head", "hr", "i", "img",
                                    "li", "listing", "menu", "meta", "nobr",
                                    "ol", "p", "pre", "ruby", "s", "small",
                                    "span", "strong", "strike", "sub", "sup",
                                    "table", "tt", "u", "ul", "var"])

      # Lowercased SVG tag name -> mixed-case form. Built once at class
      # creation instead of on every adjustSVGTagNames call.
      svgTagNameReplacements = {
          "altglyph": "altGlyph",
          "altglyphdef": "altGlyphDef",
          "altglyphitem": "altGlyphItem",
          "animatecolor": "animateColor",
          "animatemotion": "animateMotion",
          "animatetransform": "animateTransform",
          "clippath": "clipPath",
          "feblend": "feBlend",
          "fecolormatrix": "feColorMatrix",
          "fecomponenttransfer": "feComponentTransfer",
          "fecomposite": "feComposite",
          "feconvolvematrix": "feConvolveMatrix",
          "fediffuselighting": "feDiffuseLighting",
          "fedisplacementmap": "feDisplacementMap",
          "fedistantlight": "feDistantLight",
          "feflood": "feFlood",
          "fefunca": "feFuncA",
          "fefuncb": "feFuncB",
          "fefuncg": "feFuncG",
          "fefuncr": "feFuncR",
          "fegaussianblur": "feGaussianBlur",
          "feimage": "feImage",
          "femerge": "feMerge",
          "femergenode": "feMergeNode",
          "femorphology": "feMorphology",
          "feoffset": "feOffset",
          "fepointlight": "fePointLight",
          "fespecularlighting": "feSpecularLighting",
          "fespotlight": "feSpotLight",
          "fetile": "feTile",
          "feturbulence": "feTurbulence",
          "foreignobject": "foreignObject",
          "glyphref": "glyphRef",
          "lineargradient": "linearGradient",
          "radialgradient": "radialGradient",
          "textpath": "textPath",
      }

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

      def adjustSVGTagNames(self, token):
          """Rewrite ``token["name"]`` in place to its mixed-case SVG form.

          Names without an entry in ``svgTagNameReplacements`` are left
          untouched.
          """
          adjusted = self.svgTagNameReplacements.get(token["name"])
          if adjusted is not None:
              token["name"] = adjusted

      def processCharacters(self, token):
          """Replace a lone U+0000 with U+FFFD, clear ``framesetOK`` on
          non-space data, then defer to ``Phase.processCharacters``."""
          if token["data"] == "\u0000":
              token["data"] = "\uFFFD"
          elif (self.parser.framesetOK and
                any(char not in spaceCharacters for char in token["data"])):
              self.parser.framesetOK = False
          Phase.processCharacters(self, token)

      def processStartTag(self, token):
          """Insert a foreign element, or break out of foreign content.

          For a breakout element (or a ``font`` tag carrying ``color``,
          ``face`` or ``size``) a parse error is recorded, open elements are
          popped until the current node is in the default namespace or an
          integration point, and the token is returned. Otherwise the token
          is adjusted for the current node's namespace and inserted, and
          nothing is returned.
          """
          currentNode = self.tree.openElements[-1]
          isBreakout = (
              token["name"] in self.breakoutElements or
              (token["name"] == "font" and
               any(attr in token["data"]
                   for attr in ("color", "face", "size"))))

          if isBreakout:
              self.parser.parseError("unexpected-html-element-in-foreign-content",
                                     {"name": token["name"]})
              while True:
                  top = self.tree.openElements[-1]
                  if (top.namespace == self.tree.defaultNamespace or
                          self.parser.isHTMLIntegrationPoint(top) or
                          self.parser.isMathMLTextIntegrationPoint(top)):
                      break
                  self.tree.openElements.pop()
              return token

          if currentNode.namespace == namespaces["mathml"]:
              self.parser.adjustMathMLAttributes(token)
          elif currentNode.namespace == namespaces["svg"]:
              self.adjustSVGTagNames(token)
              self.parser.adjustSVGAttributes(token)
          self.parser.adjustForeignAttributes(token)
          token["namespace"] = currentNode.namespace
          self.tree.insertElement(token)
          if token["selfClosing"]:
              # A self-closing foreign element is popped straight away.
              self.tree.openElements.pop()
              token["selfClosingAcknowledged"] = True
          return None

      def processEndTag(self, token):
          """Close the matching foreign element, or hand the end tag to the
          parser's current phase.

          The stack of open elements is walked from the current node
          downwards, comparing ASCII-lowercased names with the token's name.
          On a match, elements are popped up to and including that node and
          ``None`` is returned. On reaching a default-namespace node first,
          the result of the current phase's ``processEndTag`` is returned.
          """
          nodeIndex = len(self.tree.openElements) - 1
          node = self.tree.openElements[-1]
          if node.name.translate(asciiUpper2Lower) != token["name"]:
              self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

          while True:
              if node.name.translate(asciiUpper2Lower) == token["name"]:
                  # XXX this isn't in the spec but it seems necessary
                  if self.parser.phase == self.parser.phases["inTableText"]:
                      self.parser.phase.flushCharacters()
                      self.parser.phase = self.parser.phase.originalPhase
                  # Pop everything above the matched node, then the node itself.
                  while self.tree.openElements.pop() != node:
                      assert self.tree.openElements
                  new_token = None
                  break
              nodeIndex -= 1

              node = self.tree.openElements[nodeIndex]
              if node.namespace != self.tree.defaultNamespace:
                  continue
              else:
                  new_token = self.parser.phase.processEndTag(token)
                  break
          return new_token
  class AfterBodyPhase(Phase):
      """Phase that processes tokens for the "afterBody" state."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_handlers = [("html", self.startTagHtml)]
          self.startTagHandler = _utils.MethodDispatcher(start_handlers)
          self.startTagHandler.default = self.startTagOther

          end_handlers = [("html", self.endTagHtml)]
          self.endTagHandler = _utils.MethodDispatcher(end_handlers)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Stop parsing; nothing is done at EOF."""
          return None

      def processComment(self, token):
          """Insert the comment under the first open element."""
          # This is needed because data is to be appended to the <html> element
          # here and not to whatever is currently open.
          html_element = self.tree.openElements[0]
          self.tree.insertComment(token, html_element)

      def processCharacters(self, token):
          """Report the characters as an error, switch to "inBody" and
          return the token."""
          parser = self.parser
          parser.parseError("unexpected-char-after-body")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate ``html`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagOther(self, token):
          """Report the start tag as an error, switch to "inBody" and
          return the token."""
          parser = self.parser
          parser.parseError("unexpected-start-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def endTagHtml(self, name):
          """Switch to "afterAfterBody", or report an error when the parser's
          ``innerHTML`` is set."""
          parser = self.parser
          if not parser.innerHTML:
              parser.phase = parser.phases["afterAfterBody"]
          else:
              parser.parseError("unexpected-end-tag-after-body-innerhtml")

      def endTagOther(self, token):
          """Report the end tag as an error, switch to "inBody" and
          return the token."""
          parser = self.parser
          parser.parseError("unexpected-end-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token
  class InFramesetPhase(Phase):
      """Phase that processes tokens for the "in frameset" insertion mode.

      See http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_handlers = [
              ("html", self.startTagHtml),
              ("frameset", self.startTagFrameset),
              ("frame", self.startTagFrame),
              ("noframes", self.startTagNoframes),
          ]
          self.startTagHandler = _utils.MethodDispatcher(start_handlers)
          self.startTagHandler.default = self.startTagOther

          end_handlers = [("frameset", self.endTagFrameset)]
          self.endTagHandler = _utils.MethodDispatcher(end_handlers)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Report EOF inside a frameset unless only ``html`` is the current node."""
          if self.tree.openElements[-1].name == "html":
              assert self.parser.innerHTML
          else:
              self.parser.parseError("eof-in-frameset")

      def processCharacters(self, token):
          """Report character data as a parse error."""
          self.parser.parseError("unexpected-char-in-frameset")

      def startTagFrameset(self, token):
          """Insert a nested ``frameset`` element."""
          self.tree.insertElement(token)

      def startTagFrame(self, token):
          """Insert a ``frame`` element and immediately pop it again."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()

      def startTagNoframes(self, token):
          """Delegate ``noframes`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-in-frameset", details)

      def endTagFrameset(self, token):
          """Pop the current ``frameset`` and possibly move to "afterFrameset"."""
          open_elements = self.tree.openElements
          if open_elements[-1].name != "html":
              open_elements.pop()
          else:
              # innerHTML case
              self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")

          # If we're not in innerHTML mode and the current node is not a
          # "frameset" element (anymore) then switch.
          if self.parser.innerHTML:
              return
          if open_elements[-1].name != "frameset":
              self.parser.phase = self.parser.phases["afterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-in-frameset", details)
  class AfterFramesetPhase(Phase):
      """Phase that processes tokens for the "afterFrameset" state.

      See http://www.whatwg.org/specs/web-apps/current-work/#after3
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_handlers = [
              ("html", self.startTagHtml),
              ("noframes", self.startTagNoframes),
          ]
          self.startTagHandler = _utils.MethodDispatcher(start_handlers)
          self.startTagHandler.default = self.startTagOther

          end_handlers = [("html", self.endTagHtml)]
          self.endTagHandler = _utils.MethodDispatcher(end_handlers)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Stop parsing; nothing is done at EOF."""
          return None

      def processCharacters(self, token):
          """Report character data as a parse error."""
          self.parser.parseError("unexpected-char-after-frameset")

      def startTagNoframes(self, token):
          """Delegate ``noframes`` start tags to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-after-frameset", details)

      def endTagHtml(self, token):
          """Switch the parser to the "afterAfterFrameset" phase."""
          parser = self.parser
          parser.phase = parser.phases["afterAfterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-after-frameset", details)
  class AfterAfterBodyPhase(Phase):
      """Phase that processes tokens for the "afterAfterBody" state."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_handlers = [("html", self.startTagHtml)]
          self.startTagHandler = _utils.MethodDispatcher(start_handlers)
          self.startTagHandler.default = self.startTagOther

      def processEOF(self):
          """Nothing is done at EOF."""
          return None

      def processComment(self, token):
          """Insert the comment under the tree's document node."""
          document = self.tree.document
          self.tree.insertComment(token, document)

      def processSpaceCharacters(self, token):
          """Delegate whitespace tokens to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report the characters as an error, switch to "inBody" and
          return the token."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-char")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate ``html`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagOther(self, token):
          """Report the start tag as an error, switch to "inBody" and
          return the token."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-start-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def processEndTag(self, token):
          """Report the end tag as an error, switch to "inBody" and
          return the token."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-end-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token
  class AfterAfterFramesetPhase(Phase):
      """Phase that processes tokens for the "afterAfterFrameset" state."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          start_handlers = [
              ("html", self.startTagHtml),
              ("noframes", self.startTagNoFrames),
          ]
          self.startTagHandler = _utils.MethodDispatcher(start_handlers)
          self.startTagHandler.default = self.startTagOther

      def processEOF(self):
          """Nothing is done at EOF."""
          return None

      def processComment(self, token):
          """Insert the comment under the tree's document node."""
          document = self.tree.document
          self.tree.insertComment(token, document)

      def processSpaceCharacters(self, token):
          """Delegate whitespace tokens to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report character data as a parse error."""
          self.parser.parseError("expected-eof-but-got-char")

      def startTagHtml(self, token):
          """Delegate ``html`` start tags to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagNoFrames(self, token):
          """Delegate ``noframes`` start tags to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-start-tag", details)

      def processEndTag(self, token):
          """Report any end tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-end-tag", details)
  2291. # pylint:enable=unused-argument
  2292. return {
  2293. "initial": InitialPhase,
  2294. "beforeHtml": BeforeHtmlPhase,
  2295. "beforeHead": BeforeHeadPhase,
  2296. "inHead": InHeadPhase,
  2297. "inHeadNoscript": InHeadNoscriptPhase,
  2298. "afterHead": AfterHeadPhase,
  2299. "inBody": InBodyPhase,
  2300. "text": TextPhase,
  2301. "inTable": InTablePhase,
  2302. "inTableText": InTableTextPhase,
  2303. "inCaption": InCaptionPhase,
  2304. "inColumnGroup": InColumnGroupPhase,
  2305. "inTableBody": InTableBodyPhase,
  2306. "inRow": InRowPhase,
  2307. "inCell": InCellPhase,
  2308. "inSelect": InSelectPhase,
  2309. "inSelectInTable": InSelectInTablePhase,
  2310. "inForeignContent": InForeignContentPhase,
  2311. "afterBody": AfterBodyPhase,
  2312. "inFrameset": InFramesetPhase,
  2313. "afterFrameset": AfterFramesetPhase,
  2314. "afterAfterBody": AfterAfterBodyPhase,
  2315. "afterAfterFrameset": AfterAfterFramesetPhase,
  2316. # XXX after after frameset
  2317. }
  def adjust_attributes(token, replacements):
      """Rename attributes of ``token`` in place according to ``replacements``.

      ``token['data']`` is replaced by a new ``OrderedDict`` only when at
      least one of its keys is also a key of ``replacements``. Keys without
      a replacement are carried over unchanged.
      """
      attributes = token['data']
      if not (viewkeys(attributes) & viewkeys(replacements)):
          return
      token['data'] = OrderedDict(
          (replacements.get(name, name), value)
          for name, value in attributes.items())
  def impliedTagToken(name, type="EndTag", attributes=None,
                      selfClosing=False):
      """Build a tag token dict for a tag the parser implies.

      ``type`` is looked up in ``tokenTypes``. ``attributes`` defaults to a
      fresh empty dict when not supplied.
      """
      token = {
          "type": tokenTypes[type],
          "name": name,
          "data": {} if attributes is None else attributes,
          "selfClosing": selfClosing,
      }
      return token
  class ParseError(Exception):
      """Exception representing an error in the parsed document."""