12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735 |
- from __future__ import absolute_import, division, unicode_literals
- from pip._vendor.six import unichr as chr
- from collections import deque, OrderedDict
- from sys import version_info
- from .constants import spaceCharacters
- from .constants import entities
- from .constants import asciiLetters, asciiUpper2Lower
- from .constants import digits, hexDigits, EOF
- from .constants import tokenTypes, tagTokenTypes
- from .constants import replacementCharacters
- from ._inputstream import HTMLInputStream
- from ._trie import Trie
# Trie built over the named character references; consumeEntity uses it for
# prefix matching while reading an entity name.
entitiesTrie = Trie(entities)

# Mapping type used for start-tag attributes. From Python 3.7 on the builtin
# dict keeps insertion order, so OrderedDict is only needed before that.
attributeMap = dict if version_info >= (3, 7) else OrderedDict
- class HTMLTokenizer(object):
- """ This class takes care of tokenizing HTML.
- * self.currentToken
- Holds the token that is currently being processed.
- * self.state
- Holds a reference to the method to be invoked... XXX
- * self.stream
- Points to HTMLInputStream object.
- """
def __init__(self, stream, parser=None, **kwargs):
    """Wrap *stream* in an HTMLInputStream and reset the tokenizer state.

    Any extra keyword arguments are forwarded unchanged to HTMLInputStream.
    """
    self.stream = HTMLInputStream(stream, **kwargs)
    self.parser = parser

    # No token is under construction yet.
    self.currentToken = None

    # Tokenization starts in the data state with all escape tracking cleared.
    self.state = self.dataState
    self.escapeFlag = False
    self.escape = False
    self.lastFourChars = []

    super(HTMLTokenizer, self).__init__()
def __iter__(self):
    """Run the state machine lazily, yielding one token at a time.

    Each pass calls the current state method, then drains any errors the
    input stream recorded (as ParseError tokens) followed by the tokens the
    state queued. Iteration stops as soon as a state method returns a false
    value; nothing queued during that final call is yielded by this loop.
    """
    self.tokenQueue = deque()

    while self.state():
        # Stream-level errors are reported before the tokens of this step.
        while self.stream.errors:
            errorType = tokenTypes["ParseError"]
            message = self.stream.errors.pop(0)
            yield {"type": errorType, "data": message}

        while self.tokenQueue:
            yield self.tokenQueue.popleft()
def consumeNumberEntity(self, isHex):
    """Consume the digits of a numeric character reference and decode them.

    Reads characters from the stream while they are valid digits for the
    chosen radix (16 when *isHex* is true, otherwise 10) and returns the
    resulting character: an entry from replacementCharacters, U+FFFD for
    surrogates and out-of-range values, or the code point itself.

    A trailing ";" is swallowed; any other terminator queues a ParseError
    and is pushed back onto the stream. int() raises ValueError if no digit
    was consumed, so callers must only invoke this when a digit is next.
    """
    if isHex:
        allowed, radix = hexDigits, 16
    else:
        allowed, radix = digits, 10

    def reportIllegalCodepoint(value):
        # Same ParseError token for every flavour of bad code point.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "illegal-codepoint-for-numeric-entity",
                                "datavars": {"charAsInt": value}})

    # Gather digits until a non-digit (or EOF) shows up.
    collected = []
    c = self.stream.char()
    while c in allowed and c is not EOF:
        collected.append(c)
        c = self.stream.char()

    charAsInt = int("".join(collected), radix)

    if charAsInt in replacementCharacters:
        # Some code points are mapped to a different character.
        char = replacementCharacters[charAsInt]
        reportIllegalCodepoint(charAsInt)
    elif (0xD800 <= charAsInt <= 0xDFFF) or charAsInt > 0x10FFFF:
        # Surrogates and values beyond Unicode become U+FFFD.
        char = "\uFFFD"
        reportIllegalCodepoint(charAsInt)
    else:
        # Control characters and noncharacters are flagged but still emitted.
        # Within 0..0x10FFFF, "low 16 bits are FFFE or FFFF" selects exactly
        # the last two code points of every plane.
        isDisallowed = (
            (0x0001 <= charAsInt <= 0x0008) or
            (0x000E <= charAsInt <= 0x001F) or
            (0x007F <= charAsInt <= 0x009F) or
            (0xFDD0 <= charAsInt <= 0xFDEF) or
            charAsInt == 0x000B or
            (charAsInt & 0xFFFE) == 0xFFFE
        )
        if isDisallowed:
            reportIllegalCodepoint(charAsInt)
        try:
            # chr() rejects astral code points on UCS-2 builds; fall back
            # to an explicit surrogate pair there.
            char = chr(charAsInt)
        except ValueError:
            offset = charAsInt - 0x10000
            high = chr(0xD800 | (offset >> 10))
            low = chr(0xDC00 | (offset & 0x3FF))
            char = high + low

    # Anything other than ";" is an error and goes back to the stream.
    if c != ";":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "numeric-entity-without-semicolon"})
        self.stream.unget(c)

    return char
def consumeEntity(self, allowedChar=None, fromAttribute=False):
    """Consume a character reference that follows an already-read "&".

    The decoded text (or the literal characters when nothing matches) is
    appended to the value of the last attribute of the current token when
    *fromAttribute* is true; otherwise it is queued as a Characters or
    SpaceCharacters token. *allowedChar*, when given, is an additional
    character that ends the reference immediately.
    """
    # Result when the "&" turns out not to start a reference at all.
    output = "&"

    first = self.stream.char()
    charStack = [first]
    if (first in spaceCharacters or first in (EOF, "<", "&") or
            (allowedChar is not None and allowedChar == first)):
        # Not a reference: give the character back and emit a bare "&".
        self.stream.unget(first)

    elif first == "#":
        # Numeric reference; an "x"/"X" after "#" selects hexadecimal.
        charStack.append(self.stream.char())
        isHex = charStack[-1] in ("x", "X")
        if isHex:
            charStack.append(self.stream.char())

        validDigits = hexDigits if isHex else digits
        if charStack[-1] in validDigits:
            # At least one digit: hand it back and decode the whole number.
            self.stream.unget(charStack[-1])
            output = self.consumeNumberEntity(isHex)
        else:
            # "&#" or "&#x" with no digits: emit what was read literally.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-numeric-entity"})
            self.stream.unget(charStack.pop())
            output = "&" + "".join(charStack)

    else:
        # Possible named reference: keep reading while the consumed text is
        # still a prefix of some known entity name.
        while (charStack[-1] is not EOF and
               entitiesTrie.has_keys_with_prefix("".join(charStack))):
            charStack.append(self.stream.char())

        # The last character read is never part of the match; look for the
        # longest entity name that prefixes the rest.
        try:
            entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
        except KeyError:
            entityName = None
        else:
            entityLength = len(entityName)

        if entityName is None:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-named-entity"})
            self.stream.unget(charStack.pop())
            output = "&" + "".join(charStack)
        else:
            lacksSemicolon = entityName[-1] != ";"
            if lacksSemicolon:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "named-entity-without-semicolon"})
            if (lacksSemicolon and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                # In attributes, an unterminated name followed by a letter,
                # digit or "=" is left undecoded.
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)
            else:
                # Decode the entity and keep any characters read past it.
                output = entities[entityName]
                self.stream.unget(charStack.pop())
                output += "".join(charStack[entityLength:])

    if fromAttribute:
        self.currentToken["data"][-1][1] += output
        return

    tokenType = "SpaceCharacters" if output in spaceCharacters else "Characters"
    self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
- def processEntityInAttribute(self, allowedChar):
- """This method replaces the need for "entityInAttributeValueState".
- """
- self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
def emitCurrentToken(self):
    """Finalize the current token, queue it, and return to the data state.

    Tag tokens get their name lowercased. Start tags have their attribute
    list converted to an attributeMap in which the first occurrence of a
    duplicated name wins. End tags carrying attributes or a self-closing
    flag additionally queue a ParseError ahead of the token itself.
    """
    token = self.currentToken

    if token["type"] in tagTokenTypes:
        token["name"] = token["name"].translate(asciiUpper2Lower)

        if token["type"] == tokenTypes["StartTag"]:
            pairs = token["data"]
            attributes = attributeMap(pairs)
            if len(attributes) < len(pairs):
                # Duplicates collapsed with last-wins semantics; replaying
                # the pairs backwards makes the first occurrence win.
                attributes.update(reversed(pairs))
            token["data"] = attributes

        if token["type"] == tokenTypes["EndTag"]:
            if token["data"]:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "attributes-in-end-tag"})
            if token["selfClosing"]:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "self-closing-flag-on-end-tag"})

    self.tokenQueue.append(token)
    self.state = self.dataState
- # Below are the various tokenizer states worked out.
def dataState(self):
    """Data state: dispatch on the next input character.

    Returns False when EOF is read, which ends tokenization; True otherwise.
    """
    char = self.stream.char()
    if char == "&":
        self.state = self.entityDataState
    elif char == "<":
        self.state = self.tagOpenState
    elif char == "\u0000":
        # NUL is reported but passed through unchanged in this state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\u0000"})
    elif char is EOF:
        return False
    elif char in spaceCharacters:
        # Whitespace gets its own token type. The extra True argument
        # presumably makes charsUntil consume the run of space characters
        # that follows (confirm against HTMLInputStream).
        whitespace = char + self.stream.charsUntil(spaceCharacters, True)
        self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                whitespace})
    else:
        # Ordinary text: take everything up to the next special character.
        text = char + self.stream.charsUntil(("&", "<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                text})
    return True
- def entityDataState(self):
- self.consumeEntity()
- self.state = self.dataState
- return True
def rcdataState(self):
    """RCDATA state: like data, but "<" only opens a potential end tag.

    Returns False when EOF is read, which ends tokenization; True otherwise.
    """
    char = self.stream.char()
    if char == "&":
        self.state = self.characterReferenceInRcdata
    elif char == "<":
        self.state = self.rcdataLessThanSignState
    elif char == EOF:
        return False
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif char in spaceCharacters:
        # Whitespace gets its own token type. The extra True argument
        # presumably makes charsUntil consume the run of space characters
        # that follows (confirm against HTMLInputStream).
        whitespace = char + self.stream.charsUntil(spaceCharacters, True)
        self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                whitespace})
    else:
        # Ordinary text: take everything up to the next special character.
        text = char + self.stream.charsUntil(("&", "<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                text})
    return True
- def characterReferenceInRcdata(self):
- self.consumeEntity()
- self.state = self.rcdataState
- return True
def rawtextState(self):
    """RAWTEXT state: only "<", NUL and EOF are special.

    Returns False when EOF is read, which ends tokenization; True otherwise.
    """
    char = self.stream.char()
    if char == "<":
        self.state = self.rawtextLessThanSignState
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif char == EOF:
        return False
    else:
        # Emit the whole run up to the next special character at once.
        text = char + self.stream.charsUntil(("<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                text})
    return True
def scriptDataState(self):
    """Script data state: only "<", NUL and EOF are special.

    Returns False when EOF is read, which ends tokenization; True otherwise.
    """
    char = self.stream.char()
    if char == "<":
        self.state = self.scriptDataLessThanSignState
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif char == EOF:
        return False
    else:
        # Emit the whole run up to the next special character at once.
        text = char + self.stream.charsUntil(("<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                text})
    return True
def plaintextState(self):
    """PLAINTEXT state: everything up to EOF is character data.

    Returns False when EOF is read, which ends tokenization; True otherwise.
    """
    char = self.stream.char()
    if char == EOF:
        return False

    if char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    else:
        # Only NUL interrupts a run of plain text.
        text = char + self.stream.charsUntil("\u0000")
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                text})
    return True
def tagOpenState(self):
    """Tag open state: decide what the "<" just consumed introduces."""
    char = self.stream.char()
    if char == "!":
        self.state = self.markupDeclarationOpenState
    elif char == "/":
        self.state = self.closeTagOpenState
    elif char in asciiLetters:
        # A letter starts a new start-tag token.
        self.currentToken = {"type": tokenTypes["StartTag"],
                             "name": char, "data": [],
                             "selfClosing": False,
                             "selfClosingAcknowledged": False}
        self.state = self.tagNameState
    elif char == ">":
        # "<>" is an error and is emitted as literal text.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-right-bracket"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
        self.state = self.dataState
    elif char == "?":
        # "<?" is an error; the "?" is handed to the bogus comment state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-question-mark"})
        self.stream.unget(char)
        self.state = self.bogusCommentState
    else:
        # Anything else: emit a literal "<" and reprocess the character
        # in the data state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(char)
        self.state = self.dataState
    return True
def closeTagOpenState(self):
    """Close tag open state: handle the character following "</"."""
    char = self.stream.char()
    if char in asciiLetters:
        # A letter starts a new end-tag token.
        self.currentToken = {"type": tokenTypes["EndTag"], "name": char,
                             "data": [], "selfClosing": False}
        self.state = self.tagNameState
    elif char == ">":
        # "</>" is an error and produces no token.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-right-bracket"})
        self.state = self.dataState
    elif char is EOF:
        # EOF after "</": emit the two characters literally.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-eof"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.state = self.dataState
    else:
        # Any other character is handed to the bogus comment state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-char",
                                "datavars": {"data": char}})
        self.stream.unget(char)
        self.state = self.bogusCommentState
    return True
def tagNameState(self):
    """Tag name state: accumulate the name of the current tag token."""
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
    elif char == ">":
        self.emitCurrentToken()
    elif char is EOF:
        # EOF inside a tag name: report it; the tag is not emitted here.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-tag-name"})
        self.state = self.dataState
    elif char == "/":
        self.state = self.selfClosingStartTagState
    elif char == "\u0000":
        # NUL in a tag name is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["name"] += "\uFFFD"
    else:
        # One character per call, with no bulk read from the stream.
        self.currentToken["name"] += char
    return True
def rcdataLessThanSignState(self):
    """RCDATA less-than sign state: "<" was seen while in RCDATA."""
    char = self.stream.char()
    if char == "/":
        # Possible end tag: start collecting its name from scratch.
        self.temporaryBuffer = ""
        self.state = self.rcdataEndTagOpenState
        return True

    # Not an end tag: the "<" was plain text; reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
def rcdataEndTagOpenState(self):
    """RCDATA end tag open state: "</" was seen while in RCDATA."""
    char = self.stream.char()
    if char in asciiLetters:
        # First letter of a candidate end-tag name.
        self.temporaryBuffer += char
        self.state = self.rcdataEndTagNameState
        return True

    # Not a tag name: emit "</" as text and reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
def rcdataEndTagNameState(self):
    """RCDATA end tag name state: collect and validate an end-tag name.

    The candidate only counts as an end tag when the buffered name matches,
    case-insensitively, the name of the current token.
    """
    openTag = self.currentToken
    appropriate = openTag and openTag["name"].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()

    if appropriate and (char in spaceCharacters or char == "/" or char == ">"):
        # A matching name followed by a terminator: this is a real end tag.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: flush what was read as text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.rcdataState
    return True
def rawtextLessThanSignState(self):
    """RAWTEXT less-than sign state: "<" was seen while in RAWTEXT."""
    char = self.stream.char()
    if char == "/":
        # Possible end tag: start collecting its name from scratch.
        self.temporaryBuffer = ""
        self.state = self.rawtextEndTagOpenState
        return True

    # Not an end tag: the "<" was plain text; reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
def rawtextEndTagOpenState(self):
    """RAWTEXT end tag open state: "</" was seen while in RAWTEXT."""
    char = self.stream.char()
    if char in asciiLetters:
        # First letter of a candidate end-tag name.
        self.temporaryBuffer += char
        self.state = self.rawtextEndTagNameState
        return True

    # Not a tag name: emit "</" as text and reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
def rawtextEndTagNameState(self):
    """RAWTEXT end tag name state: collect and validate an end-tag name.

    The candidate only counts as an end tag when the buffered name matches,
    case-insensitively, the name of the current token.
    """
    openTag = self.currentToken
    appropriate = openTag and openTag["name"].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()

    if appropriate and (char in spaceCharacters or char == "/" or char == ">"):
        # A matching name followed by a terminator: this is a real end tag.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: flush what was read as text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.rawtextState
    return True
def scriptDataLessThanSignState(self):
    """Script data less-than sign state: "<" was seen in script data."""
    char = self.stream.char()
    if char == "/":
        # Possible end tag: start collecting its name from scratch.
        self.temporaryBuffer = ""
        self.state = self.scriptDataEndTagOpenState
    elif char == "!":
        # "<!" may open an escaped section; the text is emitted either way.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
        self.state = self.scriptDataEscapeStartState
    else:
        # The "<" was plain text; reprocess the character as script data.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(char)
        self.state = self.scriptDataState
    return True
def scriptDataEndTagOpenState(self):
    """Script data end tag open state: "</" was seen in script data."""
    char = self.stream.char()
    if char in asciiLetters:
        # First letter of a candidate end-tag name.
        self.temporaryBuffer += char
        self.state = self.scriptDataEndTagNameState
        return True

    # Not a tag name: emit "</" as text and reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
def scriptDataEndTagNameState(self):
    """Script data end tag name state: collect and validate an end-tag name.

    The candidate only counts as an end tag when the buffered name matches,
    case-insensitively, the name of the current token.
    """
    openTag = self.currentToken
    appropriate = openTag and openTag["name"].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()

    if appropriate and (char in spaceCharacters or char == "/" or char == ">"):
        # A matching name followed by a terminator: this is a real end tag.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: flush what was read as text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.scriptDataState
    return True
def scriptDataEscapeStartState(self):
    """Script data escape start state: "<!" was seen in script data."""
    char = self.stream.char()
    if char != "-":
        # No dash: back to script data, reprocessing the character.
        self.stream.unget(char)
        self.state = self.scriptDataState
        return True

    # First dash of a possible "<!--".
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    self.state = self.scriptDataEscapeStartDashState
    return True
def scriptDataEscapeStartDashState(self):
    """Script data escape start dash state: "<!-" was seen in script data."""
    char = self.stream.char()
    if char != "-":
        # No second dash: back to script data, reprocessing the character.
        self.stream.unget(char)
        self.state = self.scriptDataState
        return True

    # Second dash completes "<!--"; continue as if two dashes were just
    # read inside escaped script data.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    self.state = self.scriptDataEscapedDashDashState
    return True
def scriptDataEscapedState(self):
    """Script data escaped state: inside "<!--" within script data."""
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashState
    elif char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif char == EOF:
        # EOF handling is delegated to the data state.
        self.state = self.dataState
    else:
        # Emit the whole run up to the next special character at once.
        text = char + self.stream.charsUntil(("<", "-", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                text})
    return True
def scriptDataEscapedDashState(self):
    """Script data escaped dash state: one "-" seen in escaped script data."""
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashDashState
    elif char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataEscapedState
    elif char == EOF:
        # EOF handling is delegated to the data state.
        self.state = self.dataState
    else:
        # Any other character breaks the dash sequence.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedDashDashState(self):
    """Script data escaped dash dash state: "--" seen in escaped script data."""
    char = self.stream.char()
    if char == "-":
        # Further dashes keep the tokenizer in this state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    elif char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
    elif char == ">":
        # "-->" ends the escaped section.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
        self.state = self.scriptDataState
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataEscapedState
    elif char == EOF:
        # EOF handling is delegated to the data state.
        self.state = self.dataState
    else:
        # Any other character breaks the dash sequence.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedLessThanSignState(self):
    """Script data escaped less-than sign state: "<" in escaped script data."""
    char = self.stream.char()
    if char == "/":
        # Possible end tag: start collecting its name from scratch.
        self.temporaryBuffer = ""
        self.state = self.scriptDataEscapedEndTagOpenState
    elif char in asciiLetters:
        # Possible nested "<script": emit the text and start buffering
        # the name for the double-escape check.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + char})
        self.temporaryBuffer = char
        self.state = self.scriptDataDoubleEscapeStartState
    else:
        # The "<" was plain text; reprocess the character.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(char)
        self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedEndTagOpenState(self):
    """Script data escaped end tag open state: "</" in escaped script data."""
    char = self.stream.char()
    if char in asciiLetters:
        # First letter of a candidate end-tag name; the buffer is replaced
        # (not appended to) here.
        self.temporaryBuffer = char
        self.state = self.scriptDataEscapedEndTagNameState
        return True

    # Not a tag name: emit "</" as text and reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedEndTagNameState(self):
    """Script data escaped end tag name state: validate an end-tag name.

    The candidate only counts as an end tag when the buffered name matches,
    case-insensitively, the name of the current token.
    """
    openTag = self.currentToken
    appropriate = openTag and openTag["name"].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()

    if appropriate and (char in spaceCharacters or char == "/" or char == ">"):
        # A matching name followed by a terminator: this is a real end tag.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: flush what was read as text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.scriptDataEscapedState
    return True
def scriptDataDoubleEscapeStartState(self):
    """Script data double escape start state: reading a name after "<".

    When the buffered name turns out to be "script" (case-insensitively)
    the tokenizer enters the double-escaped state; otherwise it returns to
    the escaped state.
    """
    char = self.stream.char()
    if char in spaceCharacters or char == "/" or char == ">":
        # The name has ended; the terminator itself is still emitted.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        if self.temporaryBuffer.lower() == "script":
            self.state = self.scriptDataDoubleEscapedState
        else:
            self.state = self.scriptDataEscapedState
    elif char in asciiLetters:
        # Letters are emitted as text and also collected for the name check.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        self.temporaryBuffer += char
    else:
        # Anything else: reprocess the character in the escaped state.
        self.stream.unget(char)
        self.state = self.scriptDataEscapedState
    return True
def scriptDataDoubleEscapedState(self):
    """Script data double escaped state: inside a nested script section."""
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataDoubleEscapedDashState
    elif char == "<":
        # Unlike the singly-escaped state, the "<" is emitted right away.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif char == "\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif char == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        # Ordinary characters are emitted one at a time.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    return True
def scriptDataDoubleEscapedDashState(self):
    """Consume the character after a single "-" in double-escaped script data.

    A second "-" is emitted and switches to the dash-dash state; "<" is
    emitted and switches to the less-than-sign state.  U+0000 queues an
    "invalid-codepoint" parse error plus a U+FFFD Characters token, and
    any other character is emitted unchanged; both return to the
    double-escaped state.  EOF queues an "eof-in-script-in-script" parse
    error and returns to the data state.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataDoubleEscapedDashDashState
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
    elif data is EOF:
        # Identity test, the same way the other tokenizer states detect the
        # EOF sentinel handed back by the stream.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True
def scriptDataDoubleEscapedDashDashState(self):
    """Consume the character after "--" in double-escaped script data.

    Further "-" characters are emitted without a state change.  "<" is
    emitted and switches to the less-than-sign state; ">" is emitted and
    switches to the script data state.  U+0000 queues an
    "invalid-codepoint" parse error plus a U+FFFD Characters token, and
    any other character is emitted unchanged; both return to the
    double-escaped state.  EOF queues an "eof-in-script-in-script" parse
    error and returns to the data state.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
        self.state = self.scriptDataState
    elif data == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
    elif data is EOF:
        # Identity test, the same way the other tokenizer states detect the
        # EOF sentinel handed back by the stream.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True
def scriptDataDoubleEscapedLessThanSignState(self):
    """Consume the character after "<" in double-escaped script data.

    "/" is emitted as a Characters token, ``self.temporaryBuffer`` is reset
    and the double-escape-end state takes over.  Anything else is pushed
    back onto the stream and the double-escaped state resumes.

    Always returns True.
    """
    char = self.stream.char()
    if char != "/":
        self.stream.unget(char)
        self.state = self.scriptDataDoubleEscapedState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
    # Start collecting the name that follows "</".
    self.temporaryBuffer = ""
    self.state = self.scriptDataDoubleEscapeEndState
    return True
def scriptDataDoubleEscapeEndState(self):
    """Handle one character of a candidate end-tag name in double-escaped data.

    ASCII letters are echoed as Characters tokens and accumulated in
    ``self.temporaryBuffer``.  Whitespace, "/" or ">" is echoed as well and
    ends the name: the tokenizer drops back to the escaped state when the
    buffer spells "script" (case-insensitively) and stays double-escaped
    otherwise.  Any other input is pushed back onto the stream and the
    double-escaped state resumes.

    Always returns True.
    """
    char = self.stream.char()
    if char in (spaceCharacters | frozenset(("/", ">"))):
        self.tokenQueue.append(
            {"type": tokenTypes["Characters"], "data": char})
        sawScript = self.temporaryBuffer.lower() == "script"
        self.state = (self.scriptDataEscapedState if sawScript
                      else self.scriptDataDoubleEscapedState)
        return True
    if char in asciiLetters:
        self.tokenQueue.append(
            {"type": tokenTypes["Characters"], "data": char})
        self.temporaryBuffer += char
        return True
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
def beforeAttributeNameState(self):
    """Consume one character while waiting for an attribute name to start.

    Whitespace triggers ``charsUntil(spaceCharacters, True)`` (presumably
    skipping the rest of the whitespace run), ">" calls
    ``emitCurrentToken()``, "/" switches to the self-closing start tag
    state and EOF queues a parse error and returns to the data state.
    Every other character opens a new ``[name, value]`` pair on the current
    token and switches to the attribute name state; quotes, "=" and "<"
    additionally queue a parse error, and U+0000 is stored as U+FFFD after
    a parse error.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True

    nameStart = char
    if char not in asciiLetters:
        if char == ">":
            self.emitCurrentToken()
            return True
        if char == "/":
            self.state = self.selfClosingStartTagState
            return True
        if char in ("'", '"', "=", "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
        elif char == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            nameStart = "\uFFFD"
        elif char is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
            return True

    # Begin a new attribute with an empty value.
    self.currentToken["data"].append([nameStart, ""])
    self.state = self.attributeNameState
    return True
def attributeNameState(self):
    """Consume one step of the attribute name currently being built.

    Characters that belong to the name are appended to the last
    ``[name, value]`` pair of the current token and the state is kept.
    "=", ">", whitespace, "/" and EOF end the name: it is then lower-cased
    with ``asciiUpper2Lower``, a "duplicate-attribute" parse error is
    queued if an earlier pair on the token has the same name, and, for
    ">", the token is emitted afterwards.

    Always returns True.
    """
    char = self.stream.char()
    attr = self.currentToken["data"][-1]
    emitAfterCheck = False

    if char == "=":
        self.state = self.beforeAttributeValueState
    elif char in asciiLetters:
        attr[0] += char + self.stream.charsUntil(asciiLetters, True)
        return True
    elif char == ">":
        # Emitting right away would convert the attributes to a dict
        # before the duplicate check below has run, so only remember it.
        emitAfterCheck = True
    elif char in spaceCharacters:
        self.state = self.afterAttributeNameState
    elif char == "/":
        self.state = self.selfClosingStartTagState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        attr[0] += "\uFFFD"
        return True
    elif char in ("'", '"', "<"):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data":
                                "invalid-character-in-attribute-name"})
        attr[0] += char
        return True
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-attribute-name"})
        self.state = self.dataState
    else:
        attr[0] += char
        return True

    # The name is complete.  Attributes are not dropped at this stage; that
    # happens when the start tag token is emitted, so values can still be
    # appended safely, but the parse error should be reported now.
    attr[0] = attr[0].translate(asciiUpper2Lower)
    finishedName = attr[0]
    earlierPairs = self.currentToken["data"][:-1]
    if any(finishedName == name for name, _ in earlierPairs):
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "duplicate-attribute"})
    if emitAfterCheck:
        self.emitCurrentToken()
    return True
def afterAttributeNameState(self):
    """Consume one character following a completed attribute name.

    Whitespace triggers ``charsUntil(spaceCharacters, True)``, "=" switches
    to the before-attribute-value state, ">" calls ``emitCurrentToken()``,
    "/" switches to the self-closing start tag state and EOF queues a parse
    error and returns to the data state.  Every other character opens a new
    ``[name, value]`` pair on the current token and switches to the
    attribute name state; quotes and "<" additionally queue a parse error,
    and U+0000 is stored as U+FFFD after a parse error.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char == "=":
        self.state = self.beforeAttributeValueState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True

    nameStart = char
    if char not in asciiLetters:
        if char == "/":
            self.state = self.selfClosingStartTagState
            return True
        if char == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            nameStart = "\uFFFD"
        elif char in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
        elif char is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
            return True

    # Begin the next attribute with an empty value.
    self.currentToken["data"].append([nameStart, ""])
    self.state = self.attributeNameState
    return True
def beforeAttributeValueState(self):
    """Consume one character while waiting for an attribute value to start.

    Whitespace triggers ``charsUntil(spaceCharacters, True)``; a double or
    single quote selects the matching quoted-value state; "&" is pushed
    back for the unquoted-value state; ">" queues a parse error and calls
    ``emitCurrentToken()``; EOF queues a parse error and returns to the
    data state.  Any other character becomes the first character of an
    unquoted value (U+0000 stored as U+FFFD; "=", "<" and "`" are kept,
    each after a parse error).

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char == "\"":
        self.state = self.attributeValueDoubleQuotedState
        return True
    if char == "&":
        self.state = self.attributeValueUnQuotedState
        self.stream.unget(char)
        return True
    if char == "'":
        self.state = self.attributeValueSingleQuotedState
        return True
    if char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-attribute-value-but-got-right-bracket"})
        self.emitCurrentToken()
        return True

    valueStart = char
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        valueStart = "\uFFFD"
    elif char in ("=", "<", "`"):
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "equals-in-unquoted-attribute-value"})
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-attribute-value-but-got-eof"})
        self.state = self.dataState
        return True

    self.currentToken["data"][-1][1] += valueStart
    self.state = self.attributeValueUnQuotedState
    return True
def attributeValueDoubleQuotedState(self):
    """Consume one step of a double-quoted attribute value.

    The closing quote switches to the after-attribute-value state, "&" is
    delegated to ``processEntityInAttribute('"')``, U+0000 is stored as
    U+FFFD after a parse error, and EOF queues a parse error and returns to
    the data state.  Any other character is appended to the value together
    with whatever ``charsUntil`` returns for the stop set (quote, "&",
    U+0000).

    Always returns True.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterAttributeValueState
        return True
    if char == "&":
        self.processEntityInAttribute('"')
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-attribute-value-double-quote"})
        self.state = self.dataState
        return True
    plainRun = self.stream.charsUntil(("\"", "&", "\u0000"))
    self.currentToken["data"][-1][1] += char + plainRun
    return True
def attributeValueSingleQuotedState(self):
    """Consume one step of a single-quoted attribute value.

    The closing quote switches to the after-attribute-value state, "&" is
    delegated to ``processEntityInAttribute("'")``, U+0000 is stored as
    U+FFFD after a parse error, and EOF queues a parse error and returns to
    the data state.  Any other character is appended to the value together
    with whatever ``charsUntil`` returns for the stop set (quote, "&",
    U+0000).

    Always returns True.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterAttributeValueState
        return True
    if char == "&":
        self.processEntityInAttribute("'")
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-attribute-value-single-quote"})
        self.state = self.dataState
        return True
    plainRun = self.stream.charsUntil(("'", "&", "\u0000"))
    self.currentToken["data"][-1][1] += char + plainRun
    return True
def attributeValueUnQuotedState(self):
    """Consume one step of an unquoted attribute value.

    Whitespace switches to the before-attribute-name state, "&" is
    delegated to ``processEntityInAttribute(">")`` and ">" calls
    ``emitCurrentToken()``.  Quotes, "=", "<" and "`" are appended after a
    parse error, U+0000 is stored as U+FFFD after a parse error, and EOF
    queues a parse error and returns to the data state.  Any other
    character is appended to the value together with whatever
    ``charsUntil`` returns for the set of special characters.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == "&":
        self.processEntityInAttribute(">")
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char in ('"', "'", "=", "<", "`"):
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-character-in-unquoted-attribute-value"})
        self.currentToken["data"][-1][1] += char
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-attribute-value-no-quotes"})
        self.state = self.dataState
        return True
    stopSet = frozenset(
        ("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters
    plainRun = self.stream.charsUntil(stopSet)
    self.currentToken["data"][-1][1] += char + plainRun
    return True
def afterAttributeValueState(self):
    """Consume the character following a quoted attribute value.

    Whitespace switches to the before-attribute-name state, ">" calls
    ``emitCurrentToken()`` and "/" switches to the self-closing start tag
    state.  Anything else queues a parse error and is pushed back onto the
    stream: EOF is then handled by the data state, other characters by the
    before-attribute-name state.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True

    if char is EOF:
        errorCode = "unexpected-EOF-after-attribute-value"
    else:
        errorCode = "unexpected-character-after-attribute-value"
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            errorCode})
    self.stream.unget(char)
    if char is EOF:
        self.state = self.dataState
    else:
        self.state = self.beforeAttributeNameState
    return True
def selfClosingStartTagState(self):
    """Consume the character following "/" inside a tag.

    ">" marks the current token as self-closing and calls
    ``emitCurrentToken()``.  Anything else queues a parse error and is
    pushed back onto the stream: EOF is then handled by the data state,
    other characters by the before-attribute-name state.

    Always returns True.
    """
    char = self.stream.char()
    if char == ">":
        self.currentToken["selfClosing"] = True
        self.emitCurrentToken()
        return True

    if char is EOF:
        errorCode = "unexpected-EOF-after-solidus-in-tag"
    else:
        errorCode = "unexpected-character-after-solidus-in-tag"
    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                            "data": errorCode})
    self.stream.unget(char)
    if char is EOF:
        self.state = self.dataState
    else:
        self.state = self.beforeAttributeNameState
    return True
def bogusCommentState(self):
    """Emit everything up to the next ">" (or EOF) as a Comment token.

    U+0000 characters in the collected text are replaced by U+FFFD.  The
    terminating character is consumed and the tokenizer returns to the
    data state.

    Always returns True.
    """
    # charsUntil also stops at EOF, so no separate EOF check is needed.
    commentText = self.stream.charsUntil(">").replace("\u0000", "\uFFFD")
    commentToken = {"type": tokenTypes["Comment"], "data": commentText}
    self.tokenQueue.append(commentToken)

    # Discard the terminator that follows the bogus comment: either ">" or
    # EOF.
    self.stream.char()
    self.state = self.dataState
    return True
def markupDeclarationOpenState(self):
    """Recognise the start of a comment, DOCTYPE or CDATA section.

    "--" starts a Comment token and switches to the comment start state.
    A case-insensitive "doctype" starts a Doctype token and switches to
    the doctype state.  "[CDATA[" switches to the CDATA section state, but
    only when a parser is attached and its innermost open element is not
    in the tree's default namespace.  In every other case a parse error is
    queued, all characters read here are pushed back onto the stream and
    the bogus comment state takes over.

    Always returns True.
    """
    consumed = [self.stream.char()]
    opener = consumed[0]

    if opener == "-":
        consumed.append(self.stream.char())
        if consumed[-1] == "-":
            self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
            self.state = self.commentStartState
            return True
    elif opener in ('d', 'D'):
        for choices in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                        ('y', 'Y'), ('p', 'P'), ('e', 'E')):
            consumed.append(self.stream.char())
            if consumed[-1] not in choices:
                break
        else:
            # Every letter of the keyword matched.
            self.currentToken = {"type": tokenTypes["Doctype"],
                                 "name": "",
                                 "publicId": None, "systemId": None,
                                 "correct": True}
            self.state = self.doctypeState
            return True
    elif (opener == "[" and
          self.parser is not None and
          self.parser.tree.openElements and
          self.parser.tree.openElements[-1].namespace !=
          self.parser.tree.defaultNamespace):
        for wanted in ["C", "D", "A", "T", "A", "["]:
            consumed.append(self.stream.char())
            if consumed[-1] != wanted:
                break
        else:
            self.state = self.cdataSectionState
            return True

    # No recognised construct: report it and replay the consumed input,
    # last character first, for the bogus comment state.
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-dashes-or-doctype"})
    while consumed:
        self.stream.unget(consumed.pop())
    self.state = self.bogusCommentState
    return True
def commentStartState(self):
    """Consume the first character of a freshly opened comment.

    "-" switches to the comment start dash state.  U+0000 queues an
    "invalid-codepoint" parse error, stores U+FFFD in the comment and
    continues in the comment state.  ">" queues an "incorrect-comment"
    parse error, and EOF an "eof-in-comment" parse error; both emit the
    comment token and return to the data state.  Any other character is
    appended to the comment data and the comment state takes over.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.state = self.commentStartDashState
    elif data == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "\uFFFD"
        # The replacement character is comment content, so carry on in the
        # comment state as the later comment states do after U+0000.
        # Staying here would make a following ">" look like an empty
        # ("incorrect") comment even though data was already collected.
        self.state = self.commentState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += data
        self.state = self.commentState
    return True
def commentStartDashState(self):
    """Consume the character after a "-" at the very start of a comment.

    A second "-" switches to the comment end state.  U+0000 queues an
    "invalid-codepoint" parse error, stores "-" plus U+FFFD in the comment
    and continues in the comment state.  ">" queues an "incorrect-comment"
    parse error, and EOF an "eof-in-comment" parse error; both emit the
    comment token and return to the data state.  Any other character is
    appended after the pending "-" and the comment state takes over.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.state = self.commentEndState
    elif data == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "-\uFFFD"
        # The pending dash has been flushed into the comment data, so carry
        # on in the comment state as the later comment states do after
        # U+0000.  Staying here would treat a following ">" as closing an
        # "incorrect" comment and re-add a dash for any other character.
        self.state = self.commentState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += "-" + data
        self.state = self.commentState
    return True
def commentState(self):
    """Consume one step of comment content.

    "-" switches to the comment end dash state.  U+0000 queues an
    "invalid-codepoint" parse error and stores U+FFFD.  EOF queues an
    "eof-in-comment" parse error, emits the comment token and returns to
    the data state.  Any other character is appended together with
    whatever ``charsUntil`` returns for the stop set ("-", U+0000).

    Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.state = self.commentEndDashState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    plainRun = self.stream.charsUntil(("-", "\u0000"))
    self.currentToken["data"] += char + plainRun
    return True
def commentEndDashState(self):
    """Consume the character after a single "-" inside a comment.

    A second "-" switches to the comment end state.  EOF queues an
    "eof-in-comment-end-dash" parse error, emits the comment token and
    returns to the data state.  Otherwise the pending "-" and the
    character (U+FFFD in place of U+0000, after a parse error) are
    appended and the comment state resumes.

    Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.state = self.commentEndState
        return True

    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "-\uFFFD"
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-comment-end-dash"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    else:
        self.currentToken["data"] += "-" + char
    self.state = self.commentState
    return True
def commentEndState(self):
    """Consume the character after "--" inside a comment.

    ">" emits the comment token and returns to the data state.  "!" queues
    a parse error and switches to the comment end bang state.  A further
    "-" queues a parse error and is appended without a state change.  EOF
    queues a parse error, emits the token and returns to the data state.
    U+0000 (stored as U+FFFD) and every other character queue a parse
    error, are appended after the pending "--" and return to the comment
    state.

    Always returns True.
    """
    char = self.stream.char()
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "--\uFFFD"
        self.state = self.commentState
        return True
    if char == "!":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-bang-after-double-dash-in-comment"})
        self.state = self.commentEndBangState
        return True
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-dash-after-double-dash-in-comment"})
        self.currentToken["data"] += char
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-comment-double-dash"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    # XXX
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "unexpected-char-in-comment"})
    self.currentToken["data"] += "--" + char
    self.state = self.commentState
    return True
def commentEndBangState(self):
    """Consume the character after "--!" inside a comment.

    ">" emits the comment token and returns to the data state.  "-"
    appends "--!" and switches to the comment end dash state.  EOF queues
    a parse error, emits the token and returns to the data state.
    Otherwise "--!" and the character (U+FFFD in place of U+0000, after a
    parse error) are appended and the comment state resumes.

    Always returns True.
    """
    char = self.stream.char()
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == "-":
        self.currentToken["data"] += "--!"
        self.state = self.commentEndDashState
        return True

    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "--!\uFFFD"
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-comment-end-bang-state"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    else:
        self.currentToken["data"] += "--!" + char
    self.state = self.commentState
    return True
def doctypeState(self):
    """Consume the character right after the DOCTYPE keyword.

    Whitespace switches to the before-doctype-name state.  EOF queues a
    parse error, marks the token as not correct, emits it and returns to
    the data state.  Any other character queues a
    "need-space-after-doctype" parse error and is pushed back for the
    before-doctype-name state.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeNameState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-doctype-name-but-got-eof"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "need-space-after-doctype"})
    self.stream.unget(char)
    self.state = self.beforeDoctypeNameState
    return True
def beforeDoctypeNameState(self):
    """Consume one character while waiting for the DOCTYPE name to start.

    Whitespace is ignored.  ">" and EOF each queue a parse error, mark the
    token as not correct, emit it and return to the data state.  Any other
    character becomes the first character of the name (U+FFFD in place of
    U+0000, after a parse error) and the doctype name state takes over.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True

    if char == ">":
        failure = "expected-doctype-name-but-got-right-bracket"
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["name"] = "\uFFFD"
        self.state = self.doctypeNameState
        return True
    elif char is EOF:
        failure = "expected-doctype-name-but-got-eof"
    else:
        self.currentToken["name"] = char
        self.state = self.doctypeNameState
        return True

    # The DOCTYPE ended before any name was seen.
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def doctypeNameState(self):
    """Consume one character of the DOCTYPE name.

    Whitespace, ">" and EOF end the name, which is then lower-cased with
    ``asciiUpper2Lower``: whitespace switches to the after-doctype-name
    state, ">" emits the token, and EOF queues a parse error, marks the
    token as not correct and emits it; the latter two return to the data
    state.  U+0000 is stored as U+FFFD after a parse error; any other
    character is appended as-is.

    Always returns True.
    """
    char = self.stream.char()
    token = self.currentToken
    if char in spaceCharacters:
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.state = self.afterDoctypeNameState
        return True
    if char == ">":
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.tokenQueue.append(token)
        self.state = self.dataState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        token["name"] += "\uFFFD"
        self.state = self.doctypeNameState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype-name"})
        token["correct"] = False
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.tokenQueue.append(token)
        self.state = self.dataState
        return True
    token["name"] += char
    return True
def afterDoctypeNameState(self):
    """Consume input following the DOCTYPE name.

    Whitespace is ignored and ">" emits the token and returns to the data
    state.  EOF marks the token as not correct, is pushed back, queues an
    "eof-in-doctype" parse error, emits the token and returns to the data
    state.  A case-insensitive "public" or "system" keyword switches to
    the matching after-keyword state.  Anything else pushes the last
    character read back, queues a parse error carrying that character in
    "datavars", marks the token as not correct and switches to the bogus
    doctype state.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.currentToken["correct"] = False
        self.stream.unget(char)
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if char in ("p", "P"):
        for choices in (("u", "U"), ("b", "B"), ("l", "L"),
                        ("i", "I"), ("c", "C")):
            char = self.stream.char()
            if char not in choices:
                break
        else:
            self.state = self.afterDoctypePublicKeywordState
            return True
    elif char in ("s", "S"):
        for choices in (("y", "Y"), ("s", "S"), ("t", "T"),
                        ("e", "E"), ("m", "M")):
            char = self.stream.char()
            if char not in choices:
                break
        else:
            self.state = self.afterDoctypeSystemKeywordState
            return True

    # Everything read before the current character was [a-zA-Z]; it is
    # garbage in the bogus doctype and can be discarded.  Only the latest
    # character might be ">" or EOF, so it alone is pushed back.
    self.stream.unget(char)
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-space-or-right-bracket-in-doctype",
                            "datavars": {"data": char}})
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
def afterDoctypePublicKeywordState(self):
    """Consume the character right after the PUBLIC keyword.

    Whitespace switches to the before-public-identifier state.  EOF queues
    an "eof-in-doctype" parse error, marks the token as not correct, emits
    it and returns to the data state.  Every other character is pushed
    back for the before-public-identifier state, with a parse error queued
    first when it is a quote.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True

    if char in ("'", '"'):
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.stream.unget(char)
    self.state = self.beforeDoctypePublicIdentifierState
    return True
def beforeDoctypePublicIdentifierState(self):
    """Consume one character while waiting for the public identifier.

    Whitespace is ignored.  A double or single quote initialises
    ``publicId`` to "" and selects the matching quoted state.  ">", EOF
    and any other character queue a parse error and mark the token as not
    correct; ">" and EOF then emit the token and return to the data state,
    while other characters switch to the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == "\"":
        self.currentToken["publicId"] = ""
        self.state = self.doctypePublicIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.currentToken["publicId"] = ""
        self.state = self.doctypePublicIdentifierSingleQuotedState
        return True

    if char == ">":
        failure = "unexpected-end-of-doctype"
    elif char is EOF:
        failure = "eof-in-doctype"
    else:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
        return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def doctypePublicIdentifierDoubleQuotedState(self):
    """Consume one character of a double-quoted public identifier.

    The closing quote switches to the after-public-identifier state.
    U+0000 is stored as U+FFFD after a parse error.  ">" and EOF each
    queue a parse error, mark the token as not correct, emit it and return
    to the data state.  Any other character is appended to ``publicId``.

    Always returns True.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterDoctypePublicIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["publicId"] += "\uFFFD"
        return True

    if char == ">":
        failure = "unexpected-end-of-doctype"
    elif char is EOF:
        failure = "eof-in-doctype"
    else:
        self.currentToken["publicId"] += char
        return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def doctypePublicIdentifierSingleQuotedState(self):
    """Consume one character of a single-quoted public identifier.

    The closing quote switches to the after-public-identifier state.
    U+0000 is stored as U+FFFD after a parse error.  ">" and EOF each
    queue a parse error, mark the token as not correct, emit it and return
    to the data state.  Any other character is appended to ``publicId``.

    Always returns True.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterDoctypePublicIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["publicId"] += "\uFFFD"
        return True

    if char == ">":
        failure = "unexpected-end-of-doctype"
    elif char is EOF:
        failure = "eof-in-doctype"
    else:
        self.currentToken["publicId"] += char
        return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def afterDoctypePublicIdentifierState(self):
    """Consume the character right after the public identifier.

    Whitespace switches to the between-identifiers state and ">" emits the
    token and returns to the data state.  A quote queues a parse error,
    initialises ``systemId`` to "" and selects the matching quoted
    system-identifier state.  EOF queues a parse error, marks the token as
    not correct, emits it and returns to the data state.  Any other
    character queues a parse error, marks the token as not correct and
    switches to the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == '"':
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "unexpected-char-in-doctype"})
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
def betweenDoctypePublicAndSystemIdentifiersState(self):
    """Consume one character between the public and system identifiers.

    Whitespace is ignored and ">" emits the token and returns to the data
    state.  A double or single quote initialises ``systemId`` to "" and
    selects the matching quoted system-identifier state.  EOF queues an
    "eof-in-doctype" parse error, marks the token as not correct, emits it
    and returns to the data state.  Any other character queues a parse
    error, marks the token as not correct and switches to the bogus
    doctype state.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data == '"':
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif data == "'":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif data is EOF:
        # Identity test, the same way the other DOCTYPE states detect the
        # EOF sentinel handed back by the stream.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
def afterDoctypeSystemKeywordState(self):
    """Consume the character right after the SYSTEM keyword.

    Whitespace switches to the before-system-identifier state.  EOF queues
    an "eof-in-doctype" parse error, marks the token as not correct, emits
    it and returns to the data state.  Every other character is pushed
    back for the before-system-identifier state, with a parse error queued
    first when it is a quote.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    if char in ("'", '"'):
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.stream.unget(char)
    self.state = self.beforeDoctypeSystemIdentifierState
    return True
def beforeDoctypeSystemIdentifierState(self):
    """Consume one character while waiting for the system identifier.

    Whitespace is ignored.  A double or single quote initialises
    ``systemId`` to "" and selects the matching quoted state.  ">", EOF
    and any other character queue a parse error and mark the token as not
    correct; ">" and EOF then emit the token and return to the data state,
    while other characters switch to the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == "\"":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True

    if char == ">":
        failure = "unexpected-char-in-doctype"
    elif char is EOF:
        failure = "eof-in-doctype"
    else:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
        return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def doctypeSystemIdentifierDoubleQuotedState(self):
    """Consume one character of a double-quoted system identifier.

    The closing quote switches to the after-system-identifier state.
    U+0000 is stored as U+FFFD after a parse error.  ">" and EOF each
    queue a parse error, mark the token as not correct, emit it and return
    to the data state.  Any other character is appended to ``systemId``.

    Always returns True.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterDoctypeSystemIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["systemId"] += "\uFFFD"
        return True

    if char == ">":
        failure = "unexpected-end-of-doctype"
    elif char is EOF:
        failure = "eof-in-doctype"
    else:
        self.currentToken["systemId"] += char
        return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def doctypeSystemIdentifierSingleQuotedState(self):
    """Consume one character of a single-quoted system identifier.

    The closing quote switches to the after-system-identifier state.
    U+0000 is stored as U+FFFD after a parse error.  ">" and EOF each
    queue a parse error, mark the token as not correct, emit it and return
    to the data state.  Any other character is appended to ``systemId``.

    Always returns True.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterDoctypeSystemIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["systemId"] += "\uFFFD"
        return True

    if char == ">":
        failure = "unexpected-end-of-doctype"
    elif char is EOF:
        failure = "eof-in-doctype"
    else:
        self.currentToken["systemId"] += char
        return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            failure})
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def afterDoctypeSystemIdentifierState(self):
    """Consume one character following the system identifier.

    Whitespace is ignored and ">" emits the token and returns to the data
    state.  EOF queues an "eof-in-doctype" parse error, marks the token as
    not correct, emits it and returns to the data state.  Any other
    character queues a parse error and switches to the bogus doctype state
    without touching the token's "correct" flag.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "unexpected-char-in-doctype"})
    self.state = self.bogusDoctypeState
    return True
def bogusDoctypeState(self):
    """Discard input until the broken DOCTYPE ends.

    ">" emits the current token and returns to the data state.  EOF does
    the same after being pushed back onto the stream.  Every other
    character is dropped.

    Always returns True.
    """
    char = self.stream.char()
    reachedEnd = char is EOF
    if char != ">" and not reachedEnd:
        # Still inside the bogus DOCTYPE: ignore this character.
        return True
    if reachedEnd:
        # XXX EMIT
        self.stream.unget(char)
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def cdataSectionState(self):
    """Collect a CDATA section and emit it as one Characters token.

    Input is read in chunks until a chunk ending in "]]" is followed by
    ">" (that "]]>" terminator is dropped) or until EOF.  One
    "invalid-codepoint" parse error is queued per U+0000 in the collected
    text, and each is replaced by U+FFFD.  A Characters token is queued
    only when the text is non-empty.  The tokenizer then returns to the
    data state.

    Always returns True.
    """
    data = []
    while True:
        data.append(self.stream.charsUntil("]"))
        data.append(self.stream.charsUntil(">"))
        char = self.stream.char()
        if char is EOF:
            # Identity test, the same way the other tokenizer states detect
            # the EOF sentinel handed back by the stream.
            break
        # charsUntil(">") has just returned, so short of EOF the next
        # character is expected to be ">".
        assert char == ">"
        if data[-1][-2:] == "]]":
            # Found the "]]>" terminator: drop the brackets and stop.
            data[-1] = data[-1][:-2]
            break
        # A lone ">" is ordinary CDATA content.
        data.append(char)

    data = "".join(data)  # pylint:disable=redefined-variable-type
    # Deal with null here rather than in the parser
    nullCount = data.count("\u0000")
    if nullCount > 0:
        for _ in range(nullCount):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
        data = data.replace("\u0000", "\uFFFD")
    if data:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": data})
    self.state = self.dataState
    return True
|