_tokenizer.py 75 KB


  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import unichr as chr
  3. from collections import deque
  4. from .constants import spaceCharacters
  5. from .constants import entities
  6. from .constants import asciiLetters, asciiUpper2Lower
  7. from .constants import digits, hexDigits, EOF
  8. from .constants import tokenTypes, tagTokenTypes
  9. from .constants import replacementCharacters
  10. from ._inputstream import HTMLInputStream
  11. from ._trie import Trie
  12. entitiesTrie = Trie(entities)
  13. class HTMLTokenizer(object):
  14. """ This class takes care of tokenizing HTML.
  15. * self.currentToken
  16. Holds the token that is currently being processed.
  17. * self.state
  18. Holds a reference to the method to be invoked... XXX
  19. * self.stream
  20. Points to HTMLInputStream object.
  21. """
  def __init__(self, stream, parser=None, **kwargs):
      """Wrap ``stream`` in an HTMLInputStream and reset tokenizer state.

      ``parser`` is stored as given; any extra keyword arguments are passed
      through unchanged to HTMLInputStream.
      """
      self.stream = HTMLInputStream(stream, **kwargs)
      self.parser = parser

      # No token is being built yet.
      self.currentToken = None

      # Tokenization starts in the data state with the escape flags cleared.
      self.state = self.dataState
      self.escape = False
      self.escapeFlag = False
      self.lastFourChars = []

      super(HTMLTokenizer, self).__init__()
  def __iter__(self):
      """Run the state machine, yielding tokens as they become available.

      After every successful call of the current state method, any errors
      recorded on the input stream are yielded as ParseError tokens, followed
      by every token waiting in ``self.tokenQueue``. Because this is a
      generator, processing pauses until the consumer asks for the next token.
      """
      self.tokenQueue = deque()

      while True:
          # A state method returns False once EOF has been reached.
          if not self.state():
              break

          # Stream-level errors are reported ahead of the queued tokens.
          while self.stream.errors:
              yield {"type": tokenTypes["ParseError"],
                     "data": self.stream.errors.pop(0)}

          while self.tokenQueue:
              yield self.tokenQueue.popleft()
  def consumeNumberEntity(self, isHex):
      """Consume the digits of a numeric character reference and decode them.

      The stream is read until a character outside the allowed digit set (or
      EOF) is seen. At least one digit is expected; with none, ``int``
      raises ValueError exactly as before.

      :arg isHex: true to read hexadecimal digits, false for decimal.

      :returns: the decoded character; the mapped replacement for code points
          listed in ``replacementCharacters``; or U+FFFD for surrogates and
          values above 0x10FFFF.

      A ParseError token is queued for illegal code points and when the
      reference is not terminated by ";". A terminating ";" is discarded;
      any other terminator is pushed back onto the stream.
      """
      if isHex:
          allowed, radix = hexDigits, 16
      else:
          allowed, radix = digits, 10

      charStack = []

      # Consume all the characters that are in range while making sure we
      # don't hit an EOF.
      c = self.stream.char()
      while c in allowed and c is not EOF:
          charStack.append(c)
          c = self.stream.char()

      # Convert the consumed digits to an int. Leading zeros carry no value,
      # so drop them first (keeping a single "0" for an all-zero run).
      numberText = "".join(charStack)
      significant = numberText.lstrip("0") or numberText[-1:]
      try:
          charAsInt = int(significant, radix)
      except ValueError:
          if not significant:
              # No digits at all: a caller error, surface it unchanged.
              raise
          # Python 3.11+ refuses to convert very long decimal strings. Such
          # a number is far beyond the Unicode range anyway, so stand in the
          # first out-of-range value instead of crashing on hostile input.
          charAsInt = 0x110000

      # Certain characters get replaced with others
      if charAsInt in replacementCharacters:
          char = replacementCharacters[charAsInt]
          self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                  "illegal-codepoint-for-numeric-entity",
                                  "datavars": {"charAsInt": charAsInt}})
      elif ((0xD800 <= charAsInt <= 0xDFFF) or
            (charAsInt > 0x10FFFF)):
          char = "\uFFFD"
          self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                  "illegal-codepoint-for-numeric-entity",
                                  "datavars": {"charAsInt": charAsInt}})
      else:
          # Control characters and noncharacters are still emitted, but are
          # reported. Within 0..0x10FFFF the plane-final noncharacters
          # (U+xxFFFE / U+xxFFFF) are exactly the values whose low 16 bits,
          # ignoring the last bit, equal 0xFFFE -- no per-call set needed.
          if ((0x0001 <= charAsInt <= 0x0008) or
                  (0x000E <= charAsInt <= 0x001F) or
                  (0x007F <= charAsInt <= 0x009F) or
                  (0xFDD0 <= charAsInt <= 0xFDEF) or
                  charAsInt == 0x000B or
                  (charAsInt & 0xFFFE) == 0xFFFE):
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data":
                                      "illegal-codepoint-for-numeric-entity",
                                      "datavars": {"charAsInt": charAsInt}})
          try:
              # Try/except needed as UCS-2 Python builds' unichar only works
              # within the BMP.
              char = chr(charAsInt)
          except ValueError:
              # Build the surrogate pair by hand.
              v = charAsInt - 0x10000
              char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

      # Discard the ; if present. Otherwise, put the terminator back on the
      # stream and report the missing semicolon.
      if c != ";":
          self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                  "numeric-entity-without-semicolon"})
          self.stream.unget(c)

      return char
  def consumeEntity(self, allowedChar=None, fromAttribute=False):
      """Consume whatever follows an "&" and emit the resulting text.

      :arg allowedChar: if not None, a character that, when it directly
          follows the "&", means no reference is attempted.
      :arg fromAttribute: when true the text is appended to the value of the
          last attribute of ``self.currentToken``; otherwise it is queued as
          a Characters / SpaceCharacters token.
      """
      # Fallback output when nothing after the "&" forms a reference.
      output = "&"

      consumed = [self.stream.char()]
      first = consumed[0]

      if (first in spaceCharacters or first in (EOF, "<", "&") or
              (allowedChar is not None and allowedChar == first)):
          # Not a reference: leave the character for the calling state.
          self.stream.unget(first)

      elif first == "#":
          # Numeric reference; an "x"/"X" right after "#" selects hex.
          consumed.append(self.stream.char())
          isHex = consumed[-1] in ("x", "X")
          if isHex:
              consumed.append(self.stream.char())

          # consumed[-1] should now be the first digit.
          if consumed[-1] in (hexDigits if isHex else digits):
              # At least one digit: hand the whole number to the decoder.
              self.stream.unget(consumed[-1])
              output = self.consumeNumberEntity(isHex)
          else:
              # No digits found.
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data": "expected-numeric-entity"})
              self.stream.unget(consumed.pop())
              output = "&" + "".join(consumed)

      else:
          # Possibly a named entity. Keep reading while the text so far is
          # still a prefix of some known entity name.
          while (consumed[-1] is not EOF and
                 entitiesTrie.has_keys_with_prefix("".join(consumed))):
              consumed.append(self.stream.char())

          # Pick the longest entity name the text starts with, so that
          # e.g. "&noti" still resolves "&not".
          candidate = "".join(consumed[:-1])
          try:
              entityName = entitiesTrie.longest_prefix(candidate)
          except KeyError:
              entityName = None

          if entityName is None:
              self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                      "expected-named-entity"})
              self.stream.unget(consumed.pop())
              output = "&" + "".join(consumed)
          else:
              nameLength = len(entityName)
              unterminated = entityName[-1] != ";"
              if unterminated:
                  self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                          "named-entity-without-semicolon"})

              # Inside an attribute, an unterminated name followed by a
              # letter, digit or "=" is kept as literal text.
              keepLiteral = False
              if unterminated and fromAttribute:
                  following = consumed[nameLength]
                  keepLiteral = (following in asciiLetters or
                                 following in digits or
                                 following == "=")

              if keepLiteral:
                  self.stream.unget(consumed.pop())
                  output = "&" + "".join(consumed)
              else:
                  output = entities[entityName]
                  self.stream.unget(consumed.pop())
                  # Anything read beyond the entity name is plain text.
                  output += "".join(consumed[nameLength:])

      if fromAttribute:
          self.currentToken["data"][-1][1] += output
      else:
          tokenType = ("SpaceCharacters" if output in spaceCharacters
                       else "Characters")
          self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
  def processEntityInAttribute(self, allowedChar):
      """Consume an entity inside an attribute value.

      Stands in for a separate "entityInAttributeValueState": it simply
      delegates to ``consumeEntity`` with ``fromAttribute`` switched on.
      """
      consume = self.consumeEntity
      consume(fromAttribute=True, allowedChar=allowedChar)
  def emitCurrentToken(self):
      """Queue ``self.currentToken`` and switch back to the data state.

      Tag tokens have their name lower-cased through ``asciiUpper2Lower``
      first. End tags carrying attributes or the self-closing flag get a
      ParseError queued ahead of them.
      """
      token = self.currentToken

      if token["type"] in tagTokenTypes:
          token["name"] = token["name"].translate(asciiUpper2Lower)
          if token["type"] == tokenTypes["EndTag"]:
              endTagChecks = (("data", "attributes-in-end-tag"),
                              ("selfClosing", "self-closing-flag-on-end-tag"))
              for field, complaint in endTagChecks:
                  if token[field]:
                      self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                              "data": complaint})

      # Add token to the queue to be yielded
      self.tokenQueue.append(token)
      self.state = self.dataState
  205. # Below are the various tokenizer states worked out.
  def dataState(self):
      """Handle one step in the data state.

      Returns False at EOF (tokenization ends), True otherwise.
      """
      ch = self.stream.char()

      if ch == "&":
          self.state = self.entityDataState
          return True

      if ch == "<":
          self.state = self.tagOpenState
          return True

      if ch == "\u0000":
          # Reported, but passed through unchanged in this state.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\u0000"})
          return True

      if ch is EOF:
          # Tokenization ends.
          return False

      if ch in spaceCharacters:
          # Runs of space characters are emitted as their own token type.
          self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"],
                                  "data": ch + self.stream.charsUntil(
                                      spaceCharacters, True)})
          return True

      # Ordinary text: grab everything up to the next interesting character.
      rest = self.stream.charsUntil(("&", "<", "\u0000"))
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch + rest})
      return True
  def entityDataState(self):
      """Consume an entity met in the data state, then resume that state."""
      self.consumeEntity()
      resume = self.dataState
      self.state = resume
      return True
  def rcdataState(self):
      """Handle one step in the RCDATA state.

      Returns False at EOF (tokenization ends), True otherwise.
      """
      ch = self.stream.char()

      if ch == "&":
          self.state = self.characterReferenceInRcdata
          return True

      if ch == "<":
          self.state = self.rcdataLessThanSignState
          return True

      if ch == EOF:
          # Tokenization ends.
          return False

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
          return True

      if ch in spaceCharacters:
          # Runs of space characters are emitted as their own token type.
          self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"],
                                  "data": ch + self.stream.charsUntil(
                                      spaceCharacters, True)})
          return True

      rest = self.stream.charsUntil(("&", "<", "\u0000"))
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch + rest})
      return True
  def characterReferenceInRcdata(self):
      """Consume an entity met in RCDATA, then resume the RCDATA state."""
      self.consumeEntity()
      resume = self.rcdataState
      self.state = resume
      return True
  def rawtextState(self):
      """Handle one step in the RAWTEXT state.

      Returns False at EOF (tokenization ends), True otherwise.
      """
      ch = self.stream.char()

      if ch == "<":
          self.state = self.rawtextLessThanSignState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
          return True

      if ch == EOF:
          # Tokenization ends.
          return False

      rest = self.stream.charsUntil(("<", "\u0000"))
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch + rest})
      return True
  def scriptDataState(self):
      """Handle one step in the script data state.

      Returns False at EOF (tokenization ends), True otherwise.
      """
      ch = self.stream.char()

      if ch == "<":
          self.state = self.scriptDataLessThanSignState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
          return True

      if ch == EOF:
          # Tokenization ends.
          return False

      rest = self.stream.charsUntil(("<", "\u0000"))
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch + rest})
      return True
  def plaintextState(self):
      """Handle one step in the PLAINTEXT state.

      Returns False at EOF (tokenization ends), True otherwise.
      """
      ch = self.stream.char()

      if ch == EOF:
          # Tokenization ends.
          return False

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
          return True

      # Everything up to the next NUL is plain character data.
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch + self.stream.charsUntil("\u0000")})
      return True
  def tagOpenState(self):
      """Decide what the character following a "<" starts. Always True."""
      ch = self.stream.char()

      if ch == "!":
          self.state = self.markupDeclarationOpenState
      elif ch == "/":
          self.state = self.closeTagOpenState
      elif ch in asciiLetters:
          # First letter of a start tag name.
          startTag = {"type": tokenTypes["StartTag"],
                      "name": ch, "data": [],
                      "selfClosing": False,
                      "selfClosingAcknowledged": False}
          self.currentToken = startTag
          self.state = self.tagNameState
      elif ch == ">":
          # "<>" is reported and then emitted as literal text.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "expected-tag-name-but-got-right-bracket"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<>"})
          self.state = self.dataState
      elif ch == "?":
          # "<?" is reported and handed to the bogus comment state, which
          # gets to see the "?" again.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "expected-tag-name-but-got-question-mark"})
          self.stream.unget(ch)
          self.state = self.bogusCommentState
      else:
          # Anything else: the "<" was just text; reprocess ch as data.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "expected-tag-name"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<"})
          self.stream.unget(ch)
          self.state = self.dataState

      return True
  def closeTagOpenState(self):
      """Handle the character following "</". Always returns True."""
      ch = self.stream.char()

      if ch in asciiLetters:
          # First letter of an end tag name.
          endTag = {"type": tokenTypes["EndTag"], "name": ch,
                    "data": [], "selfClosing": False}
          self.currentToken = endTag
          self.state = self.tagNameState
      elif ch == ">":
          # "</>" is reported and produces no token.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "expected-closing-tag-but-got-right-bracket"})
          self.state = self.dataState
      elif ch is EOF:
          # The "</" already consumed is emitted as text.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "expected-closing-tag-but-got-eof"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</"})
          self.state = self.dataState
      else:
          # Report the unexpected character and let the bogus comment
          # state reprocess it.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "expected-closing-tag-but-got-char",
                                  "datavars": {"data": ch}})
          self.stream.unget(ch)
          self.state = self.bogusCommentState

      return True
  def tagNameState(self):
      """Accumulate one more character of a tag name. Always returns True."""
      ch = self.stream.char()

      if ch in spaceCharacters:
          self.state = self.beforeAttributeNameState
      elif ch == ">":
          self.emitCurrentToken()
      elif ch is EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-tag-name"})
          self.state = self.dataState
      elif ch == "/":
          self.state = self.selfClosingStartTagState
      elif ch == "\u0000":
          # NUL is reported and replaced by U+FFFD in the name.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.currentToken["name"] += "\uFFFD"
      else:
          # One character at a time on purpose: tag names are very short,
          # so charsUntil would not pay for itself here.
          self.currentToken["name"] += ch

      return True
  def rcdataLessThanSignState(self):
      """Handle the character following a "<" in RCDATA. Always True."""
      ch = self.stream.char()

      if ch != "/":
          # Just a literal "<"; reprocess ch in the RCDATA state.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<"})
          self.stream.unget(ch)
          self.state = self.rcdataState
          return True

      # Possible end tag: start collecting its name.
      self.temporaryBuffer = ""
      self.state = self.rcdataEndTagOpenState
      return True
  def rcdataEndTagOpenState(self):
      """Handle the character following "</" in RCDATA. Always True."""
      ch = self.stream.char()

      if ch not in asciiLetters:
          # Not an end tag after all: emit "</" and reprocess ch.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</"})
          self.stream.unget(ch)
          self.state = self.rcdataState
          return True

      self.temporaryBuffer += ch
      self.state = self.rcdataEndTagNameState
      return True
  def rcdataEndTagNameState(self):
      """Collect an end tag name in RCDATA and decide whether it closes it.

      The name in ``self.temporaryBuffer`` only counts when it matches
      (case-insensitively) the name of the current token. Always True.
      """
      current = self.currentToken
      matches = current and current["name"].lower() == self.temporaryBuffer.lower()
      ch = self.stream.char()

      if matches and (ch in spaceCharacters or ch == "/" or ch == ">"):
          # A genuine end tag: build its token, then branch on the delimiter.
          self.currentToken = {"type": tokenTypes["EndTag"],
                               "name": self.temporaryBuffer,
                               "data": [], "selfClosing": False}
          if ch == ">":
              self.emitCurrentToken()
              self.state = self.dataState
          elif ch == "/":
              self.state = self.selfClosingStartTagState
          else:
              self.state = self.beforeAttributeNameState
      elif ch in asciiLetters:
          self.temporaryBuffer += ch
      else:
          # Not an end tag: everything collected so far was plain text.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</" + self.temporaryBuffer})
          self.stream.unget(ch)
          self.state = self.rcdataState

      return True
  def rawtextLessThanSignState(self):
      """Handle the character following a "<" in RAWTEXT. Always True."""
      ch = self.stream.char()

      if ch != "/":
          # Just a literal "<"; reprocess ch in the RAWTEXT state.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<"})
          self.stream.unget(ch)
          self.state = self.rawtextState
          return True

      # Possible end tag: start collecting its name.
      self.temporaryBuffer = ""
      self.state = self.rawtextEndTagOpenState
      return True
  def rawtextEndTagOpenState(self):
      """Handle the character following "</" in RAWTEXT. Always True."""
      ch = self.stream.char()

      if ch not in asciiLetters:
          # Not an end tag after all: emit "</" and reprocess ch.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</"})
          self.stream.unget(ch)
          self.state = self.rawtextState
          return True

      self.temporaryBuffer += ch
      self.state = self.rawtextEndTagNameState
      return True
  def rawtextEndTagNameState(self):
      """Collect an end tag name in RAWTEXT and decide whether it closes it.

      The name in ``self.temporaryBuffer`` only counts when it matches
      (case-insensitively) the name of the current token. Always True.
      """
      current = self.currentToken
      matches = current and current["name"].lower() == self.temporaryBuffer.lower()
      ch = self.stream.char()

      if matches and (ch in spaceCharacters or ch == "/" or ch == ">"):
          # A genuine end tag: build its token, then branch on the delimiter.
          self.currentToken = {"type": tokenTypes["EndTag"],
                               "name": self.temporaryBuffer,
                               "data": [], "selfClosing": False}
          if ch == ">":
              self.emitCurrentToken()
              self.state = self.dataState
          elif ch == "/":
              self.state = self.selfClosingStartTagState
          else:
              self.state = self.beforeAttributeNameState
      elif ch in asciiLetters:
          self.temporaryBuffer += ch
      else:
          # Not an end tag: everything collected so far was plain text.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</" + self.temporaryBuffer})
          self.stream.unget(ch)
          self.state = self.rawtextState

      return True
  def scriptDataLessThanSignState(self):
      """Handle the character following a "<" in script data. Always True."""
      ch = self.stream.char()

      if ch == "/":
          # Possible end tag: start collecting its name.
          self.temporaryBuffer = ""
          self.state = self.scriptDataEndTagOpenState
          return True

      if ch == "!":
          # "<!" may open an escaped section; the text itself is emitted.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<!"})
          self.state = self.scriptDataEscapeStartState
          return True

      # Just a literal "<"; reprocess ch in the script data state.
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": "<"})
      self.stream.unget(ch)
      self.state = self.scriptDataState
      return True
  def scriptDataEndTagOpenState(self):
      """Handle the character following "</" in script data. Always True."""
      ch = self.stream.char()

      if ch not in asciiLetters:
          # Not an end tag after all: emit "</" and reprocess ch.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</"})
          self.stream.unget(ch)
          self.state = self.scriptDataState
          return True

      self.temporaryBuffer += ch
      self.state = self.scriptDataEndTagNameState
      return True
  def scriptDataEndTagNameState(self):
      """Collect an end tag name in script data and decide whether it closes it.

      The name in ``self.temporaryBuffer`` only counts when it matches
      (case-insensitively) the name of the current token. Always True.
      """
      current = self.currentToken
      matches = current and current["name"].lower() == self.temporaryBuffer.lower()
      ch = self.stream.char()

      if matches and (ch in spaceCharacters or ch == "/" or ch == ">"):
          # A genuine end tag: build its token, then branch on the delimiter.
          self.currentToken = {"type": tokenTypes["EndTag"],
                               "name": self.temporaryBuffer,
                               "data": [], "selfClosing": False}
          if ch == ">":
              self.emitCurrentToken()
              self.state = self.dataState
          elif ch == "/":
              self.state = self.selfClosingStartTagState
          else:
              self.state = self.beforeAttributeNameState
      elif ch in asciiLetters:
          self.temporaryBuffer += ch
      else:
          # Not an end tag: everything collected so far was plain text.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</" + self.temporaryBuffer})
          self.stream.unget(ch)
          self.state = self.scriptDataState

      return True
  def scriptDataEscapeStartState(self):
      """Handle the character following "<!" in script data. Always True."""
      ch = self.stream.char()

      if ch != "-":
          # No escape sequence here; reprocess ch as ordinary script data.
          self.stream.unget(ch)
          self.state = self.scriptDataState
          return True

      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": "-"})
      self.state = self.scriptDataEscapeStartDashState
      return True
  def scriptDataEscapeStartDashState(self):
      """Handle the character following "<!-" in script data. Always True."""
      ch = self.stream.char()

      if ch != "-":
          # No escape sequence here; reprocess ch as ordinary script data.
          self.stream.unget(ch)
          self.state = self.scriptDataState
          return True

      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": "-"})
      self.state = self.scriptDataEscapedDashDashState
      return True
  def scriptDataEscapedState(self):
      """Handle one step in the script data escaped state. Always True."""
      ch = self.stream.char()

      if ch == "-":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "-"})
          self.state = self.scriptDataEscapedDashState
          return True

      if ch == "<":
          self.state = self.scriptDataEscapedLessThanSignState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
          return True

      if ch == EOF:
          # Hand over to the data state rather than ending here.
          self.state = self.dataState
          return True

      rest = self.stream.charsUntil(("<", "-", "\u0000"))
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch + rest})
      return True
  def scriptDataEscapedDashState(self):
      """Handle the character after a single "-" in escaped script data.

      Always returns True.
      """
      ch = self.stream.char()

      if ch == "-":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "-"})
          self.state = self.scriptDataEscapedDashDashState
          return True

      if ch == "<":
          self.state = self.scriptDataEscapedLessThanSignState
          return True

      if ch == EOF:
          # Hand over to the data state rather than ending here.
          self.state = self.dataState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
      else:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})

      # Either way the dash run is over.
      self.state = self.scriptDataEscapedState
      return True
  def scriptDataEscapedDashDashState(self):
      """Handle the character after "--" in escaped script data.

      Always returns True.
      """
      ch = self.stream.char()

      if ch == "-":
          # Further dashes are emitted and keep us in this state.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "-"})
          return True

      if ch == "<":
          self.state = self.scriptDataEscapedLessThanSignState
          return True

      if ch == ">":
          # "-->" ends the escaped section.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ">"})
          self.state = self.scriptDataState
          return True

      if ch == EOF:
          # Hand over to the data state rather than ending here.
          self.state = self.dataState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
      else:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})

      # Either way the dash run is over.
      self.state = self.scriptDataEscapedState
      return True
  def scriptDataEscapedLessThanSignState(self):
      """Handle the character after "<" in escaped script data. Always True."""
      ch = self.stream.char()

      if ch == "/":
          # Possible end tag: start collecting its name.
          self.temporaryBuffer = ""
          self.state = self.scriptDataEscapedEndTagOpenState
          return True

      if ch in asciiLetters:
          # Possible nested "<script": emit the text and remember the name.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<" + ch})
          self.temporaryBuffer = ch
          self.state = self.scriptDataDoubleEscapeStartState
          return True

      # Just a literal "<"; reprocess ch in the escaped state.
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": "<"})
      self.stream.unget(ch)
      self.state = self.scriptDataEscapedState
      return True
  def scriptDataEscapedEndTagOpenState(self):
      """Handle the character after "</" in escaped script data. Always True."""
      ch = self.stream.char()

      if ch not in asciiLetters:
          # Not an end tag after all: emit "</" and reprocess ch.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</"})
          self.stream.unget(ch)
          self.state = self.scriptDataEscapedState
          return True

      # The buffer is replaced (not extended) with the first name letter.
      self.temporaryBuffer = ch
      self.state = self.scriptDataEscapedEndTagNameState
      return True
  def scriptDataEscapedEndTagNameState(self):
      """Collect an end tag name in escaped script data.

      The name in ``self.temporaryBuffer`` only counts when it matches
      (case-insensitively) the name of the current token. Always True.
      """
      current = self.currentToken
      matches = current and current["name"].lower() == self.temporaryBuffer.lower()
      ch = self.stream.char()

      if matches and (ch in spaceCharacters or ch == "/" or ch == ">"):
          # A genuine end tag: build its token, then branch on the delimiter.
          self.currentToken = {"type": tokenTypes["EndTag"],
                               "name": self.temporaryBuffer,
                               "data": [], "selfClosing": False}
          if ch == ">":
              self.emitCurrentToken()
              self.state = self.dataState
          elif ch == "/":
              self.state = self.selfClosingStartTagState
          else:
              self.state = self.beforeAttributeNameState
      elif ch in asciiLetters:
          self.temporaryBuffer += ch
      else:
          # Not an end tag: everything collected so far was plain text.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "</" + self.temporaryBuffer})
          self.stream.unget(ch)
          self.state = self.scriptDataEscapedState

      return True
  def scriptDataDoubleEscapeStartState(self):
      """Collect a tag name that may open a double-escaped section.

      Always returns True.
      """
      ch = self.stream.char()
      nameEnders = spaceCharacters | frozenset(("/", ">"))

      if ch in nameEnders:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})
          # Only a completed "script" name enters the double-escaped state.
          if self.temporaryBuffer.lower() == "script":
              self.state = self.scriptDataDoubleEscapedState
          else:
              self.state = self.scriptDataEscapedState
      elif ch in asciiLetters:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})
          self.temporaryBuffer += ch
      else:
          self.stream.unget(ch)
          self.state = self.scriptDataEscapedState

      return True
  def scriptDataDoubleEscapedState(self):
      """Handle one step in the double-escaped script data state.

      Always returns True.
      """
      ch = self.stream.char()

      if ch == "-":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "-"})
          self.state = self.scriptDataDoubleEscapedDashState
          return True

      if ch == "<":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<"})
          self.state = self.scriptDataDoubleEscapedLessThanSignState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
          return True

      if ch == EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-script-in-script"})
          self.state = self.dataState
          return True

      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": ch})
      return True
  def scriptDataDoubleEscapedDashState(self):
      """Handle the character after a single "-" in double-escaped script data.

      Always returns True.
      """
      ch = self.stream.char()

      if ch == "-":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "-"})
          self.state = self.scriptDataDoubleEscapedDashDashState
          return True

      if ch == "<":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<"})
          self.state = self.scriptDataDoubleEscapedLessThanSignState
          return True

      if ch == EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-script-in-script"})
          self.state = self.dataState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
      else:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})

      # Either way the dash run is over.
      self.state = self.scriptDataDoubleEscapedState
      return True
  def scriptDataDoubleEscapedDashDashState(self):
      """Handle the character after "--" in double-escaped script data.

      Always returns True.
      """
      ch = self.stream.char()

      if ch == "-":
          # Further dashes are emitted and keep us in this state.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "-"})
          return True

      if ch == "<":
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "<"})
          self.state = self.scriptDataDoubleEscapedLessThanSignState
          return True

      if ch == ">":
          # "-->" drops all the way back to plain script data.
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ">"})
          self.state = self.scriptDataState
          return True

      if ch == EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-script-in-script"})
          self.state = self.dataState
          return True

      if ch == "\u0000":
          # NUL is reported and replaced by U+FFFD.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": "\uFFFD"})
      else:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})

      # Either way the dash run is over.
      self.state = self.scriptDataDoubleEscapedState
      return True
  def scriptDataDoubleEscapedLessThanSignState(self):
      """Handle the character after "<" in double-escaped script data.

      Always returns True.
      """
      ch = self.stream.char()

      if ch != "/":
          # Nothing special; reprocess ch in the double-escaped state.
          self.stream.unget(ch)
          self.state = self.scriptDataDoubleEscapedState
          return True

      # "</" may close the inner script: emit "/" and collect the name.
      self.tokenQueue.append({"type": tokenTypes["Characters"],
                              "data": "/"})
      self.temporaryBuffer = ""
      self.state = self.scriptDataDoubleEscapeEndState
      return True
  def scriptDataDoubleEscapeEndState(self):
      """Collect a tag name that may close a double-escaped section.

      Always returns True.
      """
      ch = self.stream.char()
      nameEnders = spaceCharacters | frozenset(("/", ">"))

      if ch in nameEnders:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})
          # Only a completed "script" name leaves the double-escaped state.
          if self.temporaryBuffer.lower() == "script":
              self.state = self.scriptDataEscapedState
          else:
              self.state = self.scriptDataDoubleEscapedState
      elif ch in asciiLetters:
          self.tokenQueue.append({"type": tokenTypes["Characters"],
                                  "data": ch})
          self.temporaryBuffer += ch
      else:
          self.stream.unget(ch)
          self.state = self.scriptDataDoubleEscapedState

      return True
  def beforeAttributeNameState(self):
      """Skip whitespace inside a tag and begin the next attribute, if any.

      ">" calls emitCurrentToken(), "/" switches to the self-closing
      start tag state and EOF is reported before returning to the data
      state.  Every other non-space character becomes the first character
      of a new [name, value] pair on the current token; quotes, "=" and
      "<" are reported as parse errors first and U+0000 is reported and
      replaced by U+FFFD.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          # Presumably consumes the rest of the whitespace run (the second
          # argument is passed through to the stream unchanged).
          self.stream.charsUntil(spaceCharacters, True)
          return True

      if char not in asciiLetters:
          if char == ">":
              self.emitCurrentToken()
              return True
          if char == "/":
              self.state = self.selfClosingStartTagState
              return True
          if char is EOF:
              self.tokenQueue.append({
                  "type": tokenTypes["ParseError"],
                  "data": "expected-attribute-name-but-got-eof"})
              self.state = self.dataState
              return True
          if char in ("'", '"', "=", "<"):
              self.tokenQueue.append({
                  "type": tokenTypes["ParseError"],
                  "data": "invalid-character-in-attribute-name"})
          elif char == "\u0000":
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data": "invalid-codepoint"})
              char = "\uFFFD"

      # Whatever is left starts a new attribute with an empty value.
      self.currentToken["data"].append([char, ""])
      self.state = self.attributeNameState
      return True
  def attributeNameState(self):
      """Accumulate the name of the attribute currently being built.

      Characters that belong to the name are appended to the last
      [name, value] pair of the current token and the state is kept.
      "=", whitespace, "/", ">" and EOF end the name: it is then
      lower-cased through asciiUpper2Lower and checked against the
      earlier attributes of the token, reporting "duplicate-attribute"
      on a clash.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      attribute = token["data"][-1]
      error_type = tokenTypes["ParseError"]
      # Emitting is postponed until the name has been normalised below:
      # per the original note, emitting converts the attribute list into
      # a dict, which would break the duplicate check.
      emit_after_check = False

      if char == "=":
          self.state = self.beforeAttributeValueState
      elif char in asciiLetters:
          attribute[0] += char + self.stream.charsUntil(asciiLetters, True)
          return True
      elif char == ">":
          emit_after_check = True
      elif char in spaceCharacters:
          self.state = self.afterAttributeNameState
      elif char == "/":
          self.state = self.selfClosingStartTagState
      elif char == "\u0000":
          self.tokenQueue.append({"type": error_type,
                                  "data": "invalid-codepoint"})
          attribute[0] += "\uFFFD"
          return True
      elif char in ("'", '"', "<"):
          self.tokenQueue.append({
              "type": error_type,
              "data": "invalid-character-in-attribute-name"})
          attribute[0] += char
          return True
      elif char is EOF:
          self.tokenQueue.append({"type": error_type,
                                  "data": "eof-in-attribute-name"})
          self.state = self.dataState
      else:
          attribute[0] += char
          return True

      # The name is complete.  Attributes are not dropped at this stage;
      # that happens when the start tag token is emitted, so values can
      # still be appended safely, but the parse error is reported now.
      attribute[0] = attribute[0].translate(asciiUpper2Lower)
      if any(attribute[0] == earlier for earlier, _ in token["data"][:-1]):
          self.tokenQueue.append({"type": error_type,
                                  "data": "duplicate-attribute"})
      if emit_after_check:
          self.emitCurrentToken()
      return True
  def afterAttributeNameState(self):
      """Decide what follows a completed attribute name.

      Whitespace is skipped, "=" leads to the attribute value, ">" calls
      emitCurrentToken(), "/" switches to the self-closing start tag
      state and EOF is reported before returning to the data state.  Any
      other character starts a new attribute; quotes and "<" are reported
      as parse errors first and U+0000 is reported and replaced by U+FFFD.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          self.stream.charsUntil(spaceCharacters, True)
          return True
      if char == "=":
          self.state = self.beforeAttributeValueState
          return True
      if char == ">":
          self.emitCurrentToken()
          return True

      if char not in asciiLetters:
          if char == "/":
              self.state = self.selfClosingStartTagState
              return True
          if char is EOF:
              self.tokenQueue.append({
                  "type": tokenTypes["ParseError"],
                  "data": "expected-end-of-tag-but-got-eof"})
              self.state = self.dataState
              return True
          if char == "\u0000":
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data": "invalid-codepoint"})
              char = "\uFFFD"
          elif char in ("'", '"', "<"):
              self.tokenQueue.append({
                  "type": tokenTypes["ParseError"],
                  "data": "invalid-character-after-attribute-name"})

      # The previous attribute has no value; this character opens a new one.
      self.currentToken["data"].append([char, ""])
      self.state = self.attributeNameState
      return True
  def beforeAttributeValueState(self):
      """Determine how the value of the current attribute is delimited.

      Whitespace is skipped, a quote selects the matching quoted-value
      state and "&" is pushed back for the unquoted-value state.  ">"
      is reported and calls emitCurrentToken(); EOF is reported and
      returns to the data state.  Anything else becomes the first
      character of an unquoted value ("=", "<" and "`" with a parse
      error, U+0000 reported and replaced by U+FFFD).

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          self.stream.charsUntil(spaceCharacters, True)
          return True

      error_type = tokenTypes["ParseError"]
      if char == '"':
          self.state = self.attributeValueDoubleQuotedState
      elif char == "'":
          self.state = self.attributeValueSingleQuotedState
      elif char == "&":
          # The unquoted state handles the entity itself.
          self.state = self.attributeValueUnQuotedState
          self.stream.unget(char)
      elif char == ">":
          self.tokenQueue.append({
              "type": error_type,
              "data": "expected-attribute-value-but-got-right-bracket"})
          self.emitCurrentToken()
      elif char is EOF:
          self.tokenQueue.append({
              "type": error_type,
              "data": "expected-attribute-value-but-got-eof"})
          self.state = self.dataState
      else:
          if char == "\u0000":
              self.tokenQueue.append({"type": error_type,
                                      "data": "invalid-codepoint"})
              char = "\uFFFD"
          elif char in ("=", "<", "`"):
              self.tokenQueue.append({
                  "type": error_type,
                  "data": "equals-in-unquoted-attribute-value"})
          self.currentToken["data"][-1][1] += char
          self.state = self.attributeValueUnQuotedState
      return True
  def attributeValueDoubleQuotedState(self):
      """Read a double-quoted attribute value.

      The closing quote moves on to the after-attribute-value state,
      "&" is delegated to processEntityInAttribute('"') and EOF is
      reported before returning to the data state.  U+0000 is reported
      and stored as U+FFFD; other text is appended to the value together
      with everything up to the next '"', "&" or U+0000.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char == '"':
          self.state = self.afterAttributeValueState
      elif char == "&":
          self.processEntityInAttribute('"')
      elif char is EOF:
          self.tokenQueue.append({
              "type": tokenTypes["ParseError"],
              "data": "eof-in-attribute-value-double-quote"})
          self.state = self.dataState
      else:
          if char == "\u0000":
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data": "invalid-codepoint"})
              piece = "\uFFFD"
          else:
              # Grab the whole run of ordinary characters in one call.
              piece = char + self.stream.charsUntil(('"', "&", "\u0000"))
          self.currentToken["data"][-1][1] += piece
      return True
  def attributeValueSingleQuotedState(self):
      """Read a single-quoted attribute value.

      Mirrors the double-quoted state with "'" as the delimiter: the
      closing quote moves on, "&" is delegated to
      processEntityInAttribute("'"), U+0000 is reported and stored as
      U+FFFD, EOF is reported and returns to the data state, and other
      text is appended in bulk.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char == "'":
          self.state = self.afterAttributeValueState
          return True
      if char == "&":
          self.processEntityInAttribute("'")
          return True
      if char is EOF:
          self.tokenQueue.append({
              "type": tokenTypes["ParseError"],
              "data": "eof-in-attribute-value-single-quote"})
          self.state = self.dataState
          return True

      value_slot = self.currentToken["data"][-1]
      if char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          value_slot[1] += "\uFFFD"
      else:
          value_slot[1] += char + self.stream.charsUntil(
              ("'", "&", "\u0000"))
      return True
  def attributeValueUnQuotedState(self):
      """Read an attribute value that is not enclosed in quotes.

      Whitespace ends the value, "&" is delegated to
      processEntityInAttribute(">"), ">" calls emitCurrentToken() and
      EOF is reported before returning to the data state.  Quotes, "=",
      "<" and "`" are reported but kept, U+0000 is reported and stored as
      U+FFFD, and ordinary text is appended in bulk up to the next
      special character.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          self.state = self.beforeAttributeNameState
          return True
      if char == "&":
          self.processEntityInAttribute(">")
          return True
      if char == ">":
          self.emitCurrentToken()
          return True
      if char is EOF:
          self.tokenQueue.append({
              "type": tokenTypes["ParseError"],
              "data": "eof-in-attribute-value-no-quotes"})
          self.state = self.dataState
          return True

      if char in ('"', "'", "=", "<", "`"):
          self.tokenQueue.append({
              "type": tokenTypes["ParseError"],
              "data": "unexpected-character-in-unquoted-attribute-value"})
          piece = char
      elif char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          piece = "\uFFFD"
      else:
          # Stop at anything one of the branches above has to look at.
          stop_chars = frozenset(
              ("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters
          piece = char + self.stream.charsUntil(stop_chars)
      self.currentToken["data"][-1][1] += piece
      return True
  def afterAttributeValueState(self):
      """Handle the character right after a quoted attribute value.

      Whitespace leads to the next attribute, ">" calls
      emitCurrentToken() and "/" switches to the self-closing start tag
      state.  EOF and any other character are reported as parse errors
      and pushed back; EOF continues in the data state, other characters
      in the before-attribute-name state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          self.state = self.beforeAttributeNameState
          return True
      if char == ">":
          self.emitCurrentToken()
          return True
      if char == "/":
          self.state = self.selfClosingStartTagState
          return True

      if char is EOF:
          reason = "unexpected-EOF-after-attribute-value"
          following_state = self.dataState
      else:
          reason = "unexpected-character-after-attribute-value"
          following_state = self.beforeAttributeNameState
      self.tokenQueue.append({"type": tokenTypes["ParseError"],
                              "data": reason})
      self.stream.unget(char)
      self.state = following_state
      return True
  def selfClosingStartTagState(self):
      """Handle the character after "/" inside a tag.

      ">" marks the current token as self-closing and calls
      emitCurrentToken().  EOF and any other character are reported as
      parse errors and pushed back; EOF continues in the data state,
      other characters in the before-attribute-name state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char == ">":
          self.currentToken["selfClosing"] = True
          self.emitCurrentToken()
          return True

      if char is EOF:
          reason = "unexpected-EOF-after-solidus-in-tag"
          following_state = self.dataState
      else:
          reason = "unexpected-character-after-solidus-in-tag"
          following_state = self.beforeAttributeNameState
      self.tokenQueue.append({"type": tokenTypes["ParseError"],
                              "data": reason})
      self.stream.unget(char)
      self.state = following_state
      return True
  def bogusCommentState(self):
      """Turn everything up to the next ">" into a Comment token.

      The text is read with charsUntil(">") (which, per the original
      note, also stops at EOF), U+0000 characters are replaced by U+FFFD
      and the comment is queued.  The terminating character is then
      consumed and the tokenizer returns to the data state.

      Returns True so that the tokenizer loop keeps running.
      """
      text = self.stream.charsUntil(">").replace("\u0000", "\uFFFD")
      self.tokenQueue.append({"type": tokenTypes["Comment"], "data": text})

      # Drop the character that ended the bogus comment: either ">" or EOF.
      self.stream.char()
      self.state = self.dataState
      return True
  def markupDeclarationOpenState(self):
      """Work out which construct follows "<!".

      "--" starts a Comment token, a case-insensitive "doctype" starts a
      Doctype token, and "[CDATA[" enters the CDATA section state, but
      only when a parser is attached and its innermost open element is
      outside the tree's default namespace.  If none of these match, a
      parse error is queued, every consumed character is pushed back and
      the input is handled as a bogus comment.

      Returns True so that the tokenizer loop keeps running.
      """
      stream = self.stream
      consumed = [stream.char()]
      first = consumed[0]

      if first == "-":
          consumed.append(stream.char())
          if consumed[-1] == "-":
              self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
              self.state = self.commentStartState
              return True
      elif first in ("d", "D"):
          for accepted in (("o", "O"), ("c", "C"), ("t", "T"),
                           ("y", "Y"), ("p", "P"), ("e", "E")):
              consumed.append(stream.char())
              if consumed[-1] not in accepted:
                  break
          else:
              # All six remaining letters matched.
              self.currentToken = {"type": tokenTypes["Doctype"],
                                   "name": "",
                                   "publicId": None, "systemId": None,
                                   "correct": True}
              self.state = self.doctypeState
              return True
      elif (first == "[" and
            self.parser is not None and
            self.parser.tree.openElements and
            self.parser.tree.openElements[-1].namespace !=
            self.parser.tree.defaultNamespace):
          for wanted in "CDATA[":
              consumed.append(stream.char())
              if consumed[-1] != wanted:
                  break
          else:
              self.state = self.cdataSectionState
              return True

      self.tokenQueue.append({"type": tokenTypes["ParseError"],
                              "data": "expected-dashes-or-doctype"})
      # Push back in reverse order so the stream replays them as read.
      for char in reversed(consumed):
          stream.unget(char)
      self.state = self.bogusCommentState
      return True
  def commentStartState(self):
      """Handle the first character after "<!--".

      "-" moves to the comment-start-dash state.  ">" and EOF are
      reported as parse errors, the comment token is queued and the
      tokenizer returns to the data state.  U+0000 is reported and stored
      as U+FFFD; it and every other character become comment data and
      switch to the comment state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      error_type = tokenTypes["ParseError"]
      if char == "-":
          self.state = self.commentStartDashState
      elif char == "\u0000":
          self.tokenQueue.append({"type": error_type,
                                  "data": "invalid-codepoint"})
          token["data"] += "\uFFFD"
          # The replacement character is ordinary comment content, so go
          # on in the comment state like the comment-end states do after
          # U+0000.  Staying here would let a following ">" terminate the
          # comment as "incorrect-comment".
          self.state = self.commentState
      elif char == ">":
          self.tokenQueue.append({"type": error_type,
                                  "data": "incorrect-comment"})
          self.tokenQueue.append(token)
          self.state = self.dataState
      elif char is EOF:
          self.tokenQueue.append({"type": error_type,
                                  "data": "eof-in-comment"})
          self.tokenQueue.append(token)
          self.state = self.dataState
      else:
          token["data"] += char
          self.state = self.commentState
      return True
  def commentStartDashState(self):
      """Handle the character after "<!---".

      A second "-" moves to the comment-end state.  ">" and EOF are
      reported as parse errors, the comment token is queued and the
      tokenizer returns to the data state.  Otherwise the pending "-"
      plus the character (U+0000 reported and replaced by U+FFFD) become
      comment data and the comment state takes over.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      error_type = tokenTypes["ParseError"]
      if char == "-":
          self.state = self.commentEndState
      elif char == "\u0000":
          self.tokenQueue.append({"type": error_type,
                                  "data": "invalid-codepoint"})
          token["data"] += "-\uFFFD"
          # Once the dash and the replacement character are part of the
          # data the comment has real content, so continue in the comment
          # state, as the comment-end-dash state does after U+0000.
          # Staying here would let a following ">" terminate the comment
          # as "incorrect-comment".
          self.state = self.commentState
      elif char == ">":
          self.tokenQueue.append({"type": error_type,
                                  "data": "incorrect-comment"})
          self.tokenQueue.append(token)
          self.state = self.dataState
      elif char is EOF:
          self.tokenQueue.append({"type": error_type,
                                  "data": "eof-in-comment"})
          self.tokenQueue.append(token)
          self.state = self.dataState
      else:
          token["data"] += "-" + char
          self.state = self.commentState
      return True
  def commentState(self):
      """Accumulate the body of a comment.

      "-" moves to the comment-end-dash state, U+0000 is reported and
      stored as U+FFFD, and EOF is reported before the comment token is
      queued and the data state resumes.  Other text is appended in bulk
      up to the next "-" or U+0000.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      if char == "-":
          self.state = self.commentEndDashState
          return True
      if char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          token["data"] += "\uFFFD"
          return True
      if char is EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-comment"})
          self.tokenQueue.append(token)
          self.state = self.dataState
          return True
      token["data"] += char + self.stream.charsUntil(("-", "\u0000"))
      return True
  def commentEndDashState(self):
      """Handle the character after a single "-" inside a comment.

      A second "-" moves to the comment-end state.  EOF is reported, the
      comment token is queued and the data state resumes.  Otherwise the
      pending "-" plus the character (U+0000 reported and replaced by
      U+FFFD) are appended to the comment and the comment state resumes.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      if char == "-":
          self.state = self.commentEndState
          return True
      if char is EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-comment-end-dash"})
          self.tokenQueue.append(token)
          self.state = self.dataState
          return True

      if char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          char = "\uFFFD"
      # The dash seen earlier turned out to be plain comment text.
      token["data"] += "-" + char
      self.state = self.commentState
      return True
  def commentEndState(self):
      """Handle the character after "--" inside a comment.

      ">" queues the comment token and returns to the data state.  "!"
      is reported and moves to the comment-end-bang state; an extra "-"
      is reported and appended to the data.  EOF is reported and the
      token is queued.  Any other character is reported, appended after
      the pending "--" (U+0000 as U+FFFD) and resumes the comment state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      queue = self.tokenQueue
      error_type = tokenTypes["ParseError"]

      if char == ">":
          queue.append(token)
          self.state = self.dataState
      elif char == "!":
          queue.append({
              "type": error_type,
              "data": "unexpected-bang-after-double-dash-in-comment"})
          self.state = self.commentEndBangState
      elif char == "-":
          queue.append({
              "type": error_type,
              "data": "unexpected-dash-after-double-dash-in-comment"})
          token["data"] += char
      elif char is EOF:
          queue.append({"type": error_type,
                        "data": "eof-in-comment-double-dash"})
          queue.append(token)
          self.state = self.dataState
      else:
          if char == "\u0000":
              reason = "invalid-codepoint"
              char = "\uFFFD"
          else:
              # XXX (marker carried over from the original code)
              reason = "unexpected-char-in-comment"
          queue.append({"type": error_type, "data": reason})
          token["data"] += "--" + char
          self.state = self.commentState
      return True
  def commentEndBangState(self):
      """Handle the character after "--!" inside a comment.

      ">" queues the comment token and returns to the data state.  "-"
      appends "--!" to the data and moves to the comment-end-dash state.
      EOF is reported and the token is queued.  Any other character is
      appended after "--!" (U+0000 reported and replaced by U+FFFD) and
      resumes the comment state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      if char == ">":
          self.tokenQueue.append(token)
          self.state = self.dataState
      elif char == "-":
          token["data"] += "--!"
          self.state = self.commentEndDashState
      elif char is EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-comment-end-bang-state"})
          self.tokenQueue.append(token)
          self.state = self.dataState
      else:
          if char == "\u0000":
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data": "invalid-codepoint"})
              char = "\uFFFD"
          token["data"] += "--!" + char
          self.state = self.commentState
      return True
  def doctypeState(self):
      """Handle the character right after the DOCTYPE keyword.

      Whitespace moves to the before-doctype-name state.  EOF is
      reported, the token is marked incorrect and queued, and the data
      state resumes.  Any other character is reported as a missing space
      and pushed back for the before-doctype-name state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char is EOF:
          self.tokenQueue.append({
              "type": tokenTypes["ParseError"],
              "data": "expected-doctype-name-but-got-eof"})
          self.currentToken["correct"] = False
          self.tokenQueue.append(self.currentToken)
          self.state = self.dataState
          return True

      if char not in spaceCharacters:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "need-space-after-doctype"})
          self.stream.unget(char)
      self.state = self.beforeDoctypeNameState
      return True
  def beforeDoctypeNameState(self):
      """Skip whitespace and read the first character of the doctype name.

      ">" and EOF are reported, mark the token incorrect, queue it and
      return to the data state.  Any other non-space character (U+0000
      reported and replaced by U+FFFD) becomes the start of the name and
      the doctype-name state takes over.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          return True

      token = self.currentToken
      if char == ">" or char is EOF:
          if char == ">":
              reason = "expected-doctype-name-but-got-right-bracket"
          else:
              reason = "expected-doctype-name-but-got-eof"
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": reason})
          token["correct"] = False
          self.tokenQueue.append(token)
          self.state = self.dataState
          return True

      if char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          char = "\uFFFD"
      token["name"] = char
      self.state = self.doctypeNameState
      return True
  def doctypeNameState(self):
      """Accumulate the doctype name.

      Whitespace, ">" and EOF finish the name, which is lower-cased
      through asciiUpper2Lower at that point.  Whitespace moves to the
      after-doctype-name state, ">" queues the token, and EOF is
      reported, marks the token incorrect and queues it.  U+0000 is
      reported and stored as U+FFFD; other characters are appended.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken

      if char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          token["name"] += "\uFFFD"
          self.state = self.doctypeNameState
          return True
      if not (char in spaceCharacters or char == ">" or char is EOF):
          token["name"] += char
          return True

      # The name is complete in all remaining cases.
      if char in spaceCharacters:
          token["name"] = token["name"].translate(asciiUpper2Lower)
          self.state = self.afterDoctypeNameState
          return True
      if char is EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-doctype-name"})
          token["correct"] = False
      token["name"] = token["name"].translate(asciiUpper2Lower)
      self.tokenQueue.append(token)
      self.state = self.dataState
      return True
  def afterDoctypeNameState(self):
      """Look for ">", PUBLIC or SYSTEM after the doctype name.

      Whitespace is skipped and ">" queues the token.  EOF marks the
      token incorrect, is reported and queues the token.  A
      case-insensitive "public" or "system" moves to the matching
      keyword state.  Anything else is pushed back and reported, the
      token is marked incorrect and the bogus-doctype state takes over.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          return True
      if char == ">":
          self.tokenQueue.append(self.currentToken)
          self.state = self.dataState
          return True
      if char is EOF:
          self.currentToken["correct"] = False
          self.stream.unget(char)
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-doctype"})
          self.tokenQueue.append(self.currentToken)
          self.state = self.dataState
          return True

      # Pick the keyword whose first letter has just been read, if any.
      if char in ("p", "P"):
          remainder = "ublic"
          keyword_state = self.afterDoctypePublicKeywordState
      elif char in ("s", "S"):
          remainder = "ystem"
          keyword_state = self.afterDoctypeSystemKeywordState
      else:
          remainder = ""
          keyword_state = None

      if keyword_state is not None:
          for letter in remainder:
              char = self.stream.char()
              if char not in (letter, letter.upper()):
                  break
          else:
              self.state = keyword_state
              return True

      # All the characters read before the current one were [a-zA-Z], so
      # they are garbage in the bogus doctype and can be dropped; only the
      # latest character might be ">" or EOF and has to be pushed back.
      self.stream.unget(char)
      self.tokenQueue.append({
          "type": tokenTypes["ParseError"],
          "data": "expected-space-or-right-bracket-in-doctype",
          "datavars": {"data": char}})
      self.currentToken["correct"] = False
      self.state = self.bogusDoctypeState
      return True
  def afterDoctypePublicKeywordState(self):
      """Handle the character right after the PUBLIC keyword.

      EOF is reported, marks the token incorrect, queues it and returns
      to the data state.  Every other character moves to the
      before-public-identifier state: whitespace is consumed, everything
      else is pushed back first, and a quote additionally produces a
      parse error.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char is EOF:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-doctype"})
          self.currentToken["correct"] = False
          self.tokenQueue.append(self.currentToken)
          self.state = self.dataState
          return True

      if char not in spaceCharacters:
          if char in ("'", '"'):
              self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                      "data": "unexpected-char-in-doctype"})
          self.stream.unget(char)
      self.state = self.beforeDoctypePublicIdentifierState
      return True
  def beforeDoctypePublicIdentifierState(self):
      """Skip whitespace and find the opening quote of the public id.

      A quote resets publicId to "" and selects the matching quoted
      state.  ">" and EOF are reported, mark the token incorrect, queue
      it and return to the data state.  Any other non-space character is
      reported, marks the token incorrect and enters the bogus-doctype
      state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          return True

      token = self.currentToken
      if char == '"' or char == "'":
          token["publicId"] = ""
          if char == '"':
              self.state = self.doctypePublicIdentifierDoubleQuotedState
          else:
              self.state = self.doctypePublicIdentifierSingleQuotedState
          return True

      if char == ">":
          reason, doctype_over = "unexpected-end-of-doctype", True
      elif char is EOF:
          reason, doctype_over = "eof-in-doctype", True
      else:
          reason, doctype_over = "unexpected-char-in-doctype", False
      self.tokenQueue.append({"type": tokenTypes["ParseError"],
                              "data": reason})
      token["correct"] = False
      if doctype_over:
          self.tokenQueue.append(token)
          self.state = self.dataState
      else:
          self.state = self.bogusDoctypeState
      return True
  def doctypePublicIdentifierDoubleQuotedState(self):
      """Read a double-quoted doctype public identifier.

      The closing quote moves to the after-public-identifier state.
      ">" and EOF are reported, mark the token incorrect, queue it and
      return to the data state.  U+0000 is reported and stored as U+FFFD;
      other characters are appended to publicId.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      if char == '"':
          self.state = self.afterDoctypePublicIdentifierState
      elif char == ">" or char is EOF:
          if char == ">":
              reason = "unexpected-end-of-doctype"
          else:
              reason = "eof-in-doctype"
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": reason})
          token["correct"] = False
          self.tokenQueue.append(token)
          self.state = self.dataState
      elif char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          token["publicId"] += "\uFFFD"
      else:
          token["publicId"] += char
      return True
  def doctypePublicIdentifierSingleQuotedState(self):
      """Read a single-quoted doctype public identifier.

      The closing "'" moves to the after-public-identifier state.  ">"
      and EOF are reported, mark the token incorrect, queue it and
      return to the data state.  U+0000 is reported and stored as U+FFFD;
      other characters are appended to publicId.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char == "'":
          self.state = self.afterDoctypePublicIdentifierState
          return True

      token = self.currentToken
      error_type = tokenTypes["ParseError"]
      if char == "\u0000":
          self.tokenQueue.append({"type": error_type,
                                  "data": "invalid-codepoint"})
          token["publicId"] += "\uFFFD"
          return True
      if char != ">" and char is not EOF:
          token["publicId"] += char
          return True

      # The doctype ended before the identifier was closed.
      if char == ">":
          self.tokenQueue.append({"type": error_type,
                                  "data": "unexpected-end-of-doctype"})
      else:
          self.tokenQueue.append({"type": error_type,
                                  "data": "eof-in-doctype"})
      token["correct"] = False
      self.tokenQueue.append(token)
      self.state = self.dataState
      return True
  def afterDoctypePublicIdentifierState(self):
      """Handle the character after the closing quote of the public id.

      Whitespace moves to the between-identifiers state and ">" queues
      the token.  EOF is reported, marks the token incorrect and queues
      it.  A quote is reported but still opens a system identifier.  Any
      other character is reported, marks the token incorrect and enters
      the bogus-doctype state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      queue = self.tokenQueue

      if char in spaceCharacters:
          self.state = self.betweenDoctypePublicAndSystemIdentifiersState
      elif char == ">":
          queue.append(token)
          self.state = self.dataState
      elif char is EOF:
          queue.append({"type": tokenTypes["ParseError"],
                        "data": "eof-in-doctype"})
          token["correct"] = False
          queue.append(token)
          self.state = self.dataState
      else:
          # Every remaining case is a parse error of the same kind.
          queue.append({"type": tokenTypes["ParseError"],
                        "data": "unexpected-char-in-doctype"})
          if char == '"':
              token["systemId"] = ""
              self.state = self.doctypeSystemIdentifierDoubleQuotedState
          elif char == "'":
              token["systemId"] = ""
              self.state = self.doctypeSystemIdentifierSingleQuotedState
          else:
              token["correct"] = False
              self.state = self.bogusDoctypeState
      return True
  def betweenDoctypePublicAndSystemIdentifiersState(self):
      """Handle input between the public and the system identifier.

      Whitespace is skipped and ">" queues the token.  A quote resets
      systemId to "" and selects the matching quoted state.  EOF is
      reported, marks the token incorrect and queues it.  Any other
      character is reported, marks the token incorrect and enters the
      bogus-doctype state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          return True

      token = self.currentToken
      if char == ">":
          self.tokenQueue.append(token)
          self.state = self.dataState
      elif char == '"':
          token["systemId"] = ""
          self.state = self.doctypeSystemIdentifierDoubleQuotedState
      elif char == "'":
          token["systemId"] = ""
          self.state = self.doctypeSystemIdentifierSingleQuotedState
      elif char is EOF:
          # Identity comparison, like the other EOF checks in this file.
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-doctype"})
          token["correct"] = False
          self.tokenQueue.append(token)
          self.state = self.dataState
      else:
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "unexpected-char-in-doctype"})
          token["correct"] = False
          self.state = self.bogusDoctypeState
      return True
  def afterDoctypeSystemKeywordState(self):
      """Handle the character right after the SYSTEM keyword.

      EOF is reported, marks the token incorrect, queues it and returns
      to the data state.  Every other character moves to the
      before-system-identifier state: whitespace is consumed, everything
      else is pushed back first, and a quote additionally produces a
      parse error.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char is EOF:
          token = self.currentToken
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "eof-in-doctype"})
          token["correct"] = False
          self.tokenQueue.append(token)
          self.state = self.dataState
          return True

      is_space = char in spaceCharacters
      if not is_space and char in ("'", '"'):
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "unexpected-char-in-doctype"})
      if not is_space:
          # Let the next state look at this character again.
          self.stream.unget(char)
      self.state = self.beforeDoctypeSystemIdentifierState
      return True
  def beforeDoctypeSystemIdentifierState(self):
      """Skip whitespace and find the opening quote of the system id.

      A quote resets systemId to "" and selects the matching quoted
      state.  ">" and EOF are reported, mark the token incorrect, queue
      it and return to the data state.  Any other non-space character is
      reported, marks the token incorrect and enters the bogus-doctype
      state.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char in spaceCharacters:
          return True

      token = self.currentToken
      if char == '"':
          token["systemId"] = ""
          self.state = self.doctypeSystemIdentifierDoubleQuotedState
          return True
      if char == "'":
          token["systemId"] = ""
          self.state = self.doctypeSystemIdentifierSingleQuotedState
          return True

      # Everything below is an error that marks the doctype incorrect.
      if char is EOF:
          reason = "eof-in-doctype"
      else:
          reason = "unexpected-char-in-doctype"
      self.tokenQueue.append({"type": tokenTypes["ParseError"],
                              "data": reason})
      token["correct"] = False
      if char == ">" or char is EOF:
          self.tokenQueue.append(token)
          self.state = self.dataState
      else:
          self.state = self.bogusDoctypeState
      return True
  def doctypeSystemIdentifierDoubleQuotedState(self):
      """Read a double-quoted doctype system identifier.

      The closing quote moves to the after-system-identifier state.
      ">" and EOF are reported, mark the token incorrect, queue it and
      return to the data state.  U+0000 is reported and stored as U+FFFD;
      other characters are appended to systemId.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      token = self.currentToken
      if char == '"':
          self.state = self.afterDoctypeSystemIdentifierState
      elif char == ">" or char is EOF:
          if char == ">":
              reason = "unexpected-end-of-doctype"
          else:
              reason = "eof-in-doctype"
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": reason})
          token["correct"] = False
          self.tokenQueue.append(token)
          self.state = self.dataState
      elif char == "\u0000":
          self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                  "data": "invalid-codepoint"})
          token["systemId"] += "\uFFFD"
      else:
          token["systemId"] += char
      return True
  def doctypeSystemIdentifierSingleQuotedState(self):
      """Read a single-quoted doctype system identifier.

      The closing "'" moves to the after-system-identifier state.  ">"
      and EOF are reported, mark the token incorrect, queue it and
      return to the data state.  U+0000 is reported and stored as U+FFFD;
      other characters are appended to systemId.

      Returns True so that the tokenizer loop keeps running.
      """
      char = self.stream.char()
      if char == "'":
          self.state = self.afterDoctypeSystemIdentifierState
          return True

      token = self.currentToken
      error_type = tokenTypes["ParseError"]
      if char == "\u0000":
          self.tokenQueue.append({"type": error_type,
                                  "data": "invalid-codepoint"})
          token["systemId"] += "\uFFFD"
          return True
      if char != ">" and char is not EOF:
          token["systemId"] += char
          return True

      # The doctype ended before the identifier was closed.
      if char == ">":
          self.tokenQueue.append({"type": error_type,
                                  "data": "unexpected-end-of-doctype"})
      else:
          self.tokenQueue.append({"type": error_type,
                                  "data": "eof-in-doctype"})
      token["correct"] = False
      self.tokenQueue.append(token)
      self.state = self.dataState
      return True
  1554. def afterDoctypeSystemIdentifierState(self):
  1555. data = self.stream.char()
  1556. if data in spaceCharacters:
  1557. pass
  1558. elif data == ">":
  1559. self.tokenQueue.append(self.currentToken)
  1560. self.state = self.dataState
  1561. elif data is EOF:
  1562. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1563. "eof-in-doctype"})
  1564. self.currentToken["correct"] = False
  1565. self.tokenQueue.append(self.currentToken)
  1566. self.state = self.dataState
  1567. else:
  1568. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1569. "unexpected-char-in-doctype"})
  1570. self.state = self.bogusDoctypeState
  1571. return True
  1572. def bogusDoctypeState(self):
  1573. data = self.stream.char()
  1574. if data == ">":
  1575. self.tokenQueue.append(self.currentToken)
  1576. self.state = self.dataState
  1577. elif data is EOF:
  1578. # XXX EMIT
  1579. self.stream.unget(data)
  1580. self.tokenQueue.append(self.currentToken)
  1581. self.state = self.dataState
  1582. else:
  1583. pass
  1584. return True
  1585. def cdataSectionState(self):
  1586. data = []
  1587. while True:
  1588. data.append(self.stream.charsUntil("]"))
  1589. data.append(self.stream.charsUntil(">"))
  1590. char = self.stream.char()
  1591. if char == EOF:
  1592. break
  1593. else:
  1594. assert char == ">"
  1595. if data[-1][-2:] == "]]":
  1596. data[-1] = data[-1][:-2]
  1597. break
  1598. else:
  1599. data.append(char)
  1600. data = "".join(data) # pylint:disable=redefined-variable-type
  1601. # Deal with null here rather than in the parser
  1602. nullCount = data.count("\u0000")
  1603. if nullCount > 0:
  1604. for _ in range(nullCount):
  1605. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1606. "data": "invalid-codepoint"})
  1607. data = data.replace("\u0000", "\uFFFD")
  1608. if data:
  1609. self.tokenQueue.append({"type": tokenTypes["Characters"],
  1610. "data": data})
  1611. self.state = self.dataState
  1612. return True