# _tokenizer.py

from __future__ import absolute_import, division, unicode_literals

from pip._vendor.six import unichr as chr

from collections import deque, OrderedDict
from sys import version_info

from .constants import spaceCharacters
from .constants import entities
from .constants import asciiLetters, asciiUpper2Lower
from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters

from ._inputstream import HTMLInputStream
from ._trie import Trie

entitiesTrie = Trie(entities)

if version_info >= (3, 7):
    attributeMap = dict
else:
    attributeMap = OrderedDict
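
# Note: CPython 3.7+ guarantees that the built-in dict preserves insertion
# order, so it can stand in for OrderedDict when collecting tag attributes.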


class HTMLTokenizer(object):
    """This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.stream
      Points to the HTMLInputStream object.
    """

    def __init__(self, stream, parser=None, **kwargs):
        self.stream = HTMLInputStream(stream, **kwargs)
        self.parser = parser

        # Set up the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.dataState
        self.escape = False

        # The current token being created
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()

    def __iter__(self):
        """This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token, which pauses processing until the next
        token is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
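
    # A minimal usage sketch (the import path assumes pip's vendored copy of
    # html5lib; adjust it to wherever this module lives in your tree):
    #
    #     from pip._vendor.html5lib._tokenizer import HTMLTokenizer
    #
    #     for token in HTMLTokenizer("<p class=x>hi</p>"):
    #         print(token["type"], token.get("name", token.get("data")))
    #
    # Each yielded token is a dict whose "type" is a value from tokenTypes
    # (StartTag, EndTag, Characters, SpaceCharacters, ParseError, ...).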

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If it is not present, a "numeric-entity-without-semicolon" ParseError
        token is appended to self.tokenQueue.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichr only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
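
    # A worked example of the surrogate-pair fallback above on narrow (UCS-2)
    # builds: charAsInt = 0x1F600 gives v = 0xF600, so the high surrogate is
    # 0xD800 | (v >> 10) = 0xD83D and the low surrogate is
    # 0xDC00 | (v & 0x3FF) = 0xDE00, i.e. the pair "\uD83D\uDE00".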

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
                (allowedChar is not None and allowedChar == charStack[0])):
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point in the process we might have a named entity.
            # Entities are stored in the global variable "entities".
            #
            # Consume characters and compare them to substrings of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity.
            # Try to find the longest entity the string will match, to take
            # care of &noti for instance.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
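
    # For example, with the input "&noti" the loop above over-consumes while
    # the trie still has matching prefixes ("notin;" and friends exist), then
    # longest_prefix falls back to the valid entity "not", so the output is
    # "\u00ac" followed by the leftover character "i".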

    def processEntityInAttribute(self, allowedChar):
        """This method replaces the need for "entityInAttributeValueState".
        """
        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)

    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["StartTag"]:
                raw = token["data"]
                data = attributeMap(raw)
                if len(raw) > len(data):
                    # we had some duplicated attributes; fix it so the first wins
                    data.update(raw[::-1])
                token["data"] = data

            if token["type"] == tokenTypes["EndTag"]:
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState
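
    # A sketch of the duplicate-attribute fix above: with
    # raw = [["id", "a"], ["id", "b"]], attributeMap(raw) keeps the later
    # value "b", so data.update(raw[::-1]) rewrites every key in reverse
    # order and the first occurrence "a" wins, as the spec requires.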

    # Below are the various tokenizer states worked out.
    def dataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
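
    # Note on the state-method protocol: each state below consumes input,
    # optionally queues tokens and/or reassigns self.state, then returns True
    # so the __iter__ loop keeps running; only a state that reaches the point
    # where tokenization genuinely ends (EOF) returns False.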

    def entityDataState(self):
        self.consumeEntity()
        self.state = self.dataState
        return True

    def rcdataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def characterReferenceInRcdata(self):
        self.consumeEntity()
        self.state = self.rcdataState
        return True

    def rawtextState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.rawtextLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.scriptDataLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def plaintextState(self):
        data = self.stream.char()
        if data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + self.stream.charsUntil("\u0000")})
        return True

    def tagOpenState(self):
        data = self.stream.char()
        if data == "!":
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True

    def closeTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # XXX data can be _'_...
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True

    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True

    def rcdataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rcdataEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rcdataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True
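
    # Here "appropriate" means the buffered end-tag name matches the name of
    # the most recent start tag (self.currentToken), so e.g. inside <title>
    # only "</title" ends the RCDATA run; any other "</foo" is emitted as
    # plain character data.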

    def rawtextLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rawtextEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rawtextEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def scriptDataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEndTagOpenState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
            self.state = self.scriptDataEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.scriptDataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapeStartDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.state = self.dataState
        else:
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEscapedEndTagOpenState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
            self.temporaryBuffer = data
            self.state = self.scriptDataDoubleEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer = data
            self.state = self.scriptDataEscapedEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataDoubleEscapeStartState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
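
    # The double-escape states cover a "<script>" start tag appearing inside
    # an escaped (<!-- ... -->) section of script data: once the buffered name
    # spells "script", text is tokenized as script data again until
    # scriptDataDoubleEscapeEndState below sees a matching "</script".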

    def scriptDataDoubleEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True

    def scriptDataDoubleEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
            self.temporaryBuffer = ""
            self.state = self.scriptDataDoubleEscapeEndState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapeEndState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True

    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True

    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True

    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True

    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute("'")
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-single-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("'", "&", "\u0000"))
        return True

    def attributeValueUnQuotedState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True
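
    # For example, in all three value states above, <a href="?x=1&amp;y=2">
    # routes "&amp;" through processEntityInAttribute, so the stored value
    # becomes "?x=1&y=2"; an "&" that starts no entity is appended verbatim.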

    def afterAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-EOF-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True

    def selfClosingStartTagState(self):
        data = self.stream.char()
        if data == ">":
            self.currentToken["selfClosing"] = True
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "unexpected-EOF-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True

    def bogusCommentState(self):
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        data = self.stream.charsUntil(">")
        data = data.replace("\u0000", "\uFFFD")
        self.tokenQueue.append(
            {"type": tokenTypes["Comment"], "data": data})

        # Eat the character directly after the bogus comment which is either a
        # ">" or an EOF.
        self.stream.char()
        self.state = self.dataState
        return True
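
    # For example, "<?xml version='1.0'?>" arrives here via tagOpenState and
    # is emitted as a Comment token whose data is "?xml version='1.0'?".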

    def markupDeclarationOpenState(self):
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
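
    # The "[" branch above means "<![CDATA[ ... ]]>" is only honoured while
    # the current node is in foreign (non-HTML) content, e.g. inside <svg> or
    # <math>; in ordinary HTML the same input takes the bogus-comment path.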

    def commentStartState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True

    def commentStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True

    def commentState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True

    def commentEndDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True

    def commentEndState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True

    def commentEndBangState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True
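
    # For example, "<!-- a --!>" ends up here: commentEndState sees the "!",
    # reports the parse error, and the ">" branch above then emits the
    # comment token with data " a ".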
  1213. def doctypeState(self):
  1214. data = self.stream.char()
  1215. if data in spaceCharacters:
  1216. self.state = self.beforeDoctypeNameState
  1217. elif data is EOF:
  1218. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1219. "expected-doctype-name-but-got-eof"})
  1220. self.currentToken["correct"] = False
  1221. self.tokenQueue.append(self.currentToken)
  1222. self.state = self.dataState
  1223. else:
  1224. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1225. "need-space-after-doctype"})
  1226. self.stream.unget(data)
  1227. self.state = self.beforeDoctypeNameState
  1228. return True
    def beforeDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True
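
    # Accumulates the doctype name one character at a time; the name is
    # lower-cased via asciiUpper2Lower on each path that leaves this state.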
    def doctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True
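
    # After the doctype name: look ahead character by character for the
    # case-insensitive keywords PUBLIC or SYSTEM; on a failed partial match
    # the doctype is marked incorrect and treated as bogus.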
    def afterDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be pushed back via unget()
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
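
    # "PUBLIC" has been consumed. A quote directly after the keyword is
    # itself a parse error; quotes and any other non-space character are
    # pushed back and re-examined in the before-public-identifier state.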
    def afterDoctypePublicKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypePublicIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        return True
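
    # An opening quote starts the public identifier and selects the matching
    # single- or double-quoted state; ">", EOF, or any other character aborts
    # into an incorrect or bogus doctype.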
    def beforeDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
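
    # The two quoted public-identifier states are mirror images, differing
    # only in the terminating quote: NULs are replaced with U+FFFD, and a
    # premature ">" or EOF abandons the identifier with correct=False.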
    def doctypePublicIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True

    def doctypePublicIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
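
    # After the public identifier: a system identifier may follow, optionally
    # separated by whitespace; a quote with no intervening space is a parse
    # error but still starts the system identifier.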
    def afterDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
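
    # Between the public and system identifiers: only whitespace, a quoted
    # system identifier, or ">" is valid here.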
    def betweenDoctypePublicAndSystemIdentifiersState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
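
    # The five SYSTEM states below closely mirror the PUBLIC states above,
    # filling in "systemId" instead of "publicId".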
    def afterDoctypeSystemKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        return True

    def beforeDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def doctypeSystemIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True

    def doctypeSystemIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True

    def afterDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.state = self.bogusDoctypeState
        return True
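
    # Bogus doctype state: swallow everything up to ">" or EOF, then emit the
    # doctype token (already flagged incorrect by whichever state got here).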
    def bogusDoctypeState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            # XXX EMIT
            self.stream.unget(data)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            pass
        return True
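
    # CDATA section state: scan in bulk with charsUntil, looking for the
    # "]]>" terminator; NULs are reported and replaced here rather than in
    # the parser, and the collected text is emitted as one Characters token.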
    def cdataSectionState(self):
        data = []
        while True:
            data.append(self.stream.charsUntil("]"))
            data.append(self.stream.charsUntil(">"))
            char = self.stream.char()
            if char is EOF:
                break
            else:
                assert char == ">"
                if data[-1][-2:] == "]]":
                    data[-1] = data[-1][:-2]
                    break
                else:
                    data.append(char)

        data = "".join(data)  # pylint:disable=redefined-variable-type
        # Deal with null here rather than in the parser
        nullCount = data.count("\u0000")
        if nullCount > 0:
            for _ in range(nullCount):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "invalid-codepoint"})
            data = data.replace("\u0000", "\uFFFD")

        if data:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": data})

        self.state = self.dataState
        return True
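
# A minimal usage sketch (an illustration under assumptions, not part of this
# module's API surface): HTMLTokenizer is iterable and yields token dicts, so
# input containing a doctype and a comment exercises the states above.
#
#     from html5lib._tokenizer import HTMLTokenizer
#
#     for token in HTMLTokenizer("<!DOCTYPE html><!--note--><p>hi</p>"):
#         print(token["type"], token.get("data"))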