_inputstream.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type, binary_type
  3. from pip._vendor.six.moves import http_client, urllib
  4. import codecs
  5. import re
  6. from pip._vendor import webencodings
  7. from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
  8. from .constants import _ReparseException
  9. from . import _utils
  10. from io import StringIO
  11. try:
  12. from io import BytesIO
  13. except ImportError:
  14. BytesIO = StringIO
  15. # Non-unicode versions of constants for use in the pre-parser
  16. spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
  17. asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
  18. asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
  19. spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
  20. invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
  21. if _utils.supports_lone_surrogates:
  22. # Use one extra step of indirection and create surrogates with
  23. # eval. Not using this indirection would introduce an illegal
  24. # unicode literal on platforms not supporting such lone
  25. # surrogates.
  26. assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
  27. invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
  28. eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
  29. "]")
  30. else:
  31. invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
  32. non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
  33. 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
  34. 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
  35. 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
  36. 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
  37. 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
  38. 0x10FFFE, 0x10FFFF])
  39. ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
  40. # Cache for charsUntil()
  41. charsUntilRegEx = {}
  42. class BufferedStream(object):
  43. """Buffering for streams that do not have buffering of their own
  44. The buffer is implemented as a list of chunks on the assumption that
  45. joining many strings will be slow since it is O(n**2)
  46. """
  47. def __init__(self, stream):
  48. self.stream = stream
  49. self.buffer = []
  50. self.position = [-1, 0] # chunk number, offset
  51. def tell(self):
  52. pos = 0
  53. for chunk in self.buffer[:self.position[0]]:
  54. pos += len(chunk)
  55. pos += self.position[1]
  56. return pos
  57. def seek(self, pos):
  58. assert pos <= self._bufferedBytes()
  59. offset = pos
  60. i = 0
  61. while len(self.buffer[i]) < offset:
  62. offset -= len(self.buffer[i])
  63. i += 1
  64. self.position = [i, offset]
  65. def read(self, bytes):
  66. if not self.buffer:
  67. return self._readStream(bytes)
  68. elif (self.position[0] == len(self.buffer) and
  69. self.position[1] == len(self.buffer[-1])):
  70. return self._readStream(bytes)
  71. else:
  72. return self._readFromBuffer(bytes)
  73. def _bufferedBytes(self):
  74. return sum([len(item) for item in self.buffer])
  75. def _readStream(self, bytes):
  76. data = self.stream.read(bytes)
  77. self.buffer.append(data)
  78. self.position[0] += 1
  79. self.position[1] = len(data)
  80. return data
  81. def _readFromBuffer(self, bytes):
  82. remainingBytes = bytes
  83. rv = []
  84. bufferIndex = self.position[0]
  85. bufferOffset = self.position[1]
  86. while bufferIndex < len(self.buffer) and remainingBytes != 0:
  87. assert remainingBytes > 0
  88. bufferedData = self.buffer[bufferIndex]
  89. if remainingBytes <= len(bufferedData) - bufferOffset:
  90. bytesToRead = remainingBytes
  91. self.position = [bufferIndex, bufferOffset + bytesToRead]
  92. else:
  93. bytesToRead = len(bufferedData) - bufferOffset
  94. self.position = [bufferIndex, len(bufferedData)]
  95. bufferIndex += 1
  96. rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
  97. remainingBytes -= bytesToRead
  98. bufferOffset = 0
  99. if remainingBytes:
  100. rv.append(self._readStream(remainingBytes))
  101. return b"".join(rv)
  102. def HTMLInputStream(source, **kwargs):
  103. # Work around Python bug #20007: read(0) closes the connection.
  104. # http://bugs.python.org/issue20007
  105. if (isinstance(source, http_client.HTTPResponse) or
  106. # Also check for addinfourl wrapping HTTPResponse
  107. (isinstance(source, urllib.response.addbase) and
  108. isinstance(source.fp, http_client.HTTPResponse))):
  109. isUnicode = False
  110. elif hasattr(source, "read"):
  111. isUnicode = isinstance(source.read(0), text_type)
  112. else:
  113. isUnicode = isinstance(source, text_type)
  114. if isUnicode:
  115. encodings = [x for x in kwargs if x.endswith("_encoding")]
  116. if encodings:
  117. raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
  118. return HTMLUnicodeInputStream(source, **kwargs)
  119. else:
  120. return HTMLBinaryInputStream(source, **kwargs)
  121. class HTMLUnicodeInputStream(object):
  122. """Provides a unicode stream of characters to the HTMLTokenizer.
  123. This class takes care of character encoding and removing or replacing
  124. incorrect byte-sequences and also provides column and line tracking.
  125. """
  126. _defaultChunkSize = 10240
  127. def __init__(self, source):
  128. """Initialises the HTMLInputStream.
  129. HTMLInputStream(source, [encoding]) -> Normalized stream from source
  130. for use by html5lib.
  131. source can be either a file-object, local filename or a string.
  132. The optional encoding parameter must be a string that indicates
  133. the encoding. If specified, that encoding will be used,
  134. regardless of any BOM or later declaration (such as in a meta
  135. element)
  136. """
  137. if not _utils.supports_lone_surrogates:
  138. # Such platforms will have already checked for such
  139. # surrogate errors, so no need to do this checking.
  140. self.reportCharacterErrors = None
  141. elif len("\U0010FFFF") == 1:
  142. self.reportCharacterErrors = self.characterErrorsUCS4
  143. else:
  144. self.reportCharacterErrors = self.characterErrorsUCS2
  145. # List of where new lines occur
  146. self.newLines = [0]
  147. self.charEncoding = (lookupEncoding("utf-8"), "certain")
  148. self.dataStream = self.openStream(source)
  149. self.reset()
  150. def reset(self):
  151. self.chunk = ""
  152. self.chunkSize = 0
  153. self.chunkOffset = 0
  154. self.errors = []
  155. # number of (complete) lines in previous chunks
  156. self.prevNumLines = 0
  157. # number of columns in the last line of the previous chunk
  158. self.prevNumCols = 0
  159. # Deal with CR LF and surrogates split over chunk boundaries
  160. self._bufferedCharacter = None
  161. def openStream(self, source):
  162. """Produces a file object from source.
  163. source can be either a file object, local filename or a string.
  164. """
  165. # Already a file object
  166. if hasattr(source, 'read'):
  167. stream = source
  168. else:
  169. stream = StringIO(source)
  170. return stream
  171. def _position(self, offset):
  172. chunk = self.chunk
  173. nLines = chunk.count('\n', 0, offset)
  174. positionLine = self.prevNumLines + nLines
  175. lastLinePos = chunk.rfind('\n', 0, offset)
  176. if lastLinePos == -1:
  177. positionColumn = self.prevNumCols + offset
  178. else:
  179. positionColumn = offset - (lastLinePos + 1)
  180. return (positionLine, positionColumn)
  181. def position(self):
  182. """Returns (line, col) of the current position in the stream."""
  183. line, col = self._position(self.chunkOffset)
  184. return (line + 1, col)
  185. def char(self):
  186. """ Read one character from the stream or queue if available. Return
  187. EOF when EOF is reached.
  188. """
  189. # Read a new chunk from the input stream if necessary
  190. if self.chunkOffset >= self.chunkSize:
  191. if not self.readChunk():
  192. return EOF
  193. chunkOffset = self.chunkOffset
  194. char = self.chunk[chunkOffset]
  195. self.chunkOffset = chunkOffset + 1
  196. return char
  197. def readChunk(self, chunkSize=None):
  198. if chunkSize is None:
  199. chunkSize = self._defaultChunkSize
  200. self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
  201. self.chunk = ""
  202. self.chunkSize = 0
  203. self.chunkOffset = 0
  204. data = self.dataStream.read(chunkSize)
  205. # Deal with CR LF and surrogates broken across chunks
  206. if self._bufferedCharacter:
  207. data = self._bufferedCharacter + data
  208. self._bufferedCharacter = None
  209. elif not data:
  210. # We have no more data, bye-bye stream
  211. return False
  212. if len(data) > 1:
  213. lastv = ord(data[-1])
  214. if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
  215. self._bufferedCharacter = data[-1]
  216. data = data[:-1]
  217. if self.reportCharacterErrors:
  218. self.reportCharacterErrors(data)
  219. # Replace invalid characters
  220. data = data.replace("\r\n", "\n")
  221. data = data.replace("\r", "\n")
  222. self.chunk = data
  223. self.chunkSize = len(data)
  224. return True
  225. def characterErrorsUCS4(self, data):
  226. for _ in range(len(invalid_unicode_re.findall(data))):
  227. self.errors.append("invalid-codepoint")
  228. def characterErrorsUCS2(self, data):
  229. # Someone picked the wrong compile option
  230. # You lose
  231. skip = False
  232. for match in invalid_unicode_re.finditer(data):
  233. if skip:
  234. continue
  235. codepoint = ord(match.group())
  236. pos = match.start()
  237. # Pretty sure there should be endianness issues here
  238. if _utils.isSurrogatePair(data[pos:pos + 2]):
  239. # We have a surrogate pair!
  240. char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
  241. if char_val in non_bmp_invalid_codepoints:
  242. self.errors.append("invalid-codepoint")
  243. skip = True
  244. elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
  245. pos == len(data) - 1):
  246. self.errors.append("invalid-codepoint")
  247. else:
  248. skip = False
  249. self.errors.append("invalid-codepoint")
  250. def charsUntil(self, characters, opposite=False):
  251. """ Returns a string of characters from the stream up to but not
  252. including any character in 'characters' or EOF. 'characters' must be
  253. a container that supports the 'in' method and iteration over its
  254. characters.
  255. """
  256. # Use a cache of regexps to find the required characters
  257. try:
  258. chars = charsUntilRegEx[(characters, opposite)]
  259. except KeyError:
  260. if __debug__:
  261. for c in characters:
  262. assert(ord(c) < 128)
  263. regex = "".join(["\\x%02x" % ord(c) for c in characters])
  264. if not opposite:
  265. regex = "^%s" % regex
  266. chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
  267. rv = []
  268. while True:
  269. # Find the longest matching prefix
  270. m = chars.match(self.chunk, self.chunkOffset)
  271. if m is None:
  272. # If nothing matched, and it wasn't because we ran out of chunk,
  273. # then stop
  274. if self.chunkOffset != self.chunkSize:
  275. break
  276. else:
  277. end = m.end()
  278. # If not the whole chunk matched, return everything
  279. # up to the part that didn't match
  280. if end != self.chunkSize:
  281. rv.append(self.chunk[self.chunkOffset:end])
  282. self.chunkOffset = end
  283. break
  284. # If the whole remainder of the chunk matched,
  285. # use it all and read the next chunk
  286. rv.append(self.chunk[self.chunkOffset:])
  287. if not self.readChunk():
  288. # Reached EOF
  289. break
  290. r = "".join(rv)
  291. return r
  292. def unget(self, char):
  293. # Only one character is allowed to be ungotten at once - it must
  294. # be consumed again before any further call to unget
  295. if char is not None:
  296. if self.chunkOffset == 0:
  297. # unget is called quite rarely, so it's a good idea to do
  298. # more work here if it saves a bit of work in the frequently
  299. # called char and charsUntil.
  300. # So, just prepend the ungotten character onto the current
  301. # chunk:
  302. self.chunk = char + self.chunk
  303. self.chunkSize += 1
  304. else:
  305. self.chunkOffset -= 1
  306. assert self.chunk[self.chunkOffset] == char
  307. class HTMLBinaryInputStream(HTMLUnicodeInputStream):
  308. """Provides a unicode stream of characters to the HTMLTokenizer.
  309. This class takes care of character encoding and removing or replacing
  310. incorrect byte-sequences and also provides column and line tracking.
  311. """
  312. def __init__(self, source, override_encoding=None, transport_encoding=None,
  313. same_origin_parent_encoding=None, likely_encoding=None,
  314. default_encoding="windows-1252", useChardet=True):
  315. """Initialises the HTMLInputStream.
  316. HTMLInputStream(source, [encoding]) -> Normalized stream from source
  317. for use by html5lib.
  318. source can be either a file-object, local filename or a string.
  319. The optional encoding parameter must be a string that indicates
  320. the encoding. If specified, that encoding will be used,
  321. regardless of any BOM or later declaration (such as in a meta
  322. element)
  323. """
  324. # Raw Stream - for unicode objects this will encode to utf-8 and set
  325. # self.charEncoding as appropriate
  326. self.rawStream = self.openStream(source)
  327. HTMLUnicodeInputStream.__init__(self, self.rawStream)
  328. # Encoding Information
  329. # Number of bytes to use when looking for a meta element with
  330. # encoding information
  331. self.numBytesMeta = 1024
  332. # Number of bytes to use when using detecting encoding using chardet
  333. self.numBytesChardet = 100
  334. # Things from args
  335. self.override_encoding = override_encoding
  336. self.transport_encoding = transport_encoding
  337. self.same_origin_parent_encoding = same_origin_parent_encoding
  338. self.likely_encoding = likely_encoding
  339. self.default_encoding = default_encoding
  340. # Determine encoding
  341. self.charEncoding = self.determineEncoding(useChardet)
  342. assert self.charEncoding[0] is not None
  343. # Call superclass
  344. self.reset()
  345. def reset(self):
  346. self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
  347. HTMLUnicodeInputStream.reset(self)
  348. def openStream(self, source):
  349. """Produces a file object from source.
  350. source can be either a file object, local filename or a string.
  351. """
  352. # Already a file object
  353. if hasattr(source, 'read'):
  354. stream = source
  355. else:
  356. stream = BytesIO(source)
  357. try:
  358. stream.seek(stream.tell())
  359. except: # pylint:disable=bare-except
  360. stream = BufferedStream(stream)
  361. return stream
  362. def determineEncoding(self, chardet=True):
  363. # BOMs take precedence over everything
  364. # This will also read past the BOM if present
  365. charEncoding = self.detectBOM(), "certain"
  366. if charEncoding[0] is not None:
  367. return charEncoding
  368. # If we've been overriden, we've been overriden
  369. charEncoding = lookupEncoding(self.override_encoding), "certain"
  370. if charEncoding[0] is not None:
  371. return charEncoding
  372. # Now check the transport layer
  373. charEncoding = lookupEncoding(self.transport_encoding), "certain"
  374. if charEncoding[0] is not None:
  375. return charEncoding
  376. # Look for meta elements with encoding information
  377. charEncoding = self.detectEncodingMeta(), "tentative"
  378. if charEncoding[0] is not None:
  379. return charEncoding
  380. # Parent document encoding
  381. charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
  382. if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
  383. return charEncoding
  384. # "likely" encoding
  385. charEncoding = lookupEncoding(self.likely_encoding), "tentative"
  386. if charEncoding[0] is not None:
  387. return charEncoding
  388. # Guess with chardet, if available
  389. if chardet:
  390. try:
  391. from pip._vendor.chardet.universaldetector import UniversalDetector
  392. except ImportError:
  393. pass
  394. else:
  395. buffers = []
  396. detector = UniversalDetector()
  397. while not detector.done:
  398. buffer = self.rawStream.read(self.numBytesChardet)
  399. assert isinstance(buffer, bytes)
  400. if not buffer:
  401. break
  402. buffers.append(buffer)
  403. detector.feed(buffer)
  404. detector.close()
  405. encoding = lookupEncoding(detector.result['encoding'])
  406. self.rawStream.seek(0)
  407. if encoding is not None:
  408. return encoding, "tentative"
  409. # Try the default encoding
  410. charEncoding = lookupEncoding(self.default_encoding), "tentative"
  411. if charEncoding[0] is not None:
  412. return charEncoding
  413. # Fallback to html5lib's default if even that hasn't worked
  414. return lookupEncoding("windows-1252"), "tentative"
  415. def changeEncoding(self, newEncoding):
  416. assert self.charEncoding[1] != "certain"
  417. newEncoding = lookupEncoding(newEncoding)
  418. if newEncoding is None:
  419. return
  420. if newEncoding.name in ("utf-16be", "utf-16le"):
  421. newEncoding = lookupEncoding("utf-8")
  422. assert newEncoding is not None
  423. elif newEncoding == self.charEncoding[0]:
  424. self.charEncoding = (self.charEncoding[0], "certain")
  425. else:
  426. self.rawStream.seek(0)
  427. self.charEncoding = (newEncoding, "certain")
  428. self.reset()
  429. raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
  430. def detectBOM(self):
  431. """Attempts to detect at BOM at the start of the stream. If
  432. an encoding can be determined from the BOM return the name of the
  433. encoding otherwise return None"""
  434. bomDict = {
  435. codecs.BOM_UTF8: 'utf-8',
  436. codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
  437. codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
  438. }
  439. # Go to beginning of file and read in 4 bytes
  440. string = self.rawStream.read(4)
  441. assert isinstance(string, bytes)
  442. # Try detecting the BOM using bytes from the string
  443. encoding = bomDict.get(string[:3]) # UTF-8
  444. seek = 3
  445. if not encoding:
  446. # Need to detect UTF-32 before UTF-16
  447. encoding = bomDict.get(string) # UTF-32
  448. seek = 4
  449. if not encoding:
  450. encoding = bomDict.get(string[:2]) # UTF-16
  451. seek = 2
  452. # Set the read position past the BOM if one was found, otherwise
  453. # set it to the start of the stream
  454. if encoding:
  455. self.rawStream.seek(seek)
  456. return lookupEncoding(encoding)
  457. else:
  458. self.rawStream.seek(0)
  459. return None
  460. def detectEncodingMeta(self):
  461. """Report the encoding declared by the meta element
  462. """
  463. buffer = self.rawStream.read(self.numBytesMeta)
  464. assert isinstance(buffer, bytes)
  465. parser = EncodingParser(buffer)
  466. self.rawStream.seek(0)
  467. encoding = parser.getEncoding()
  468. if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
  469. encoding = lookupEncoding("utf-8")
  470. return encoding
  471. class EncodingBytes(bytes):
  472. """String-like object with an associated position and various extra methods
  473. If the position is ever greater than the string length then an exception is
  474. raised"""
  475. def __new__(self, value):
  476. assert isinstance(value, bytes)
  477. return bytes.__new__(self, value.lower())
  478. def __init__(self, value):
  479. # pylint:disable=unused-argument
  480. self._position = -1
  481. def __iter__(self):
  482. return self
  483. def __next__(self):
  484. p = self._position = self._position + 1
  485. if p >= len(self):
  486. raise StopIteration
  487. elif p < 0:
  488. raise TypeError
  489. return self[p:p + 1]
  490. def next(self):
  491. # Py2 compat
  492. return self.__next__()
  493. def previous(self):
  494. p = self._position
  495. if p >= len(self):
  496. raise StopIteration
  497. elif p < 0:
  498. raise TypeError
  499. self._position = p = p - 1
  500. return self[p:p + 1]
  501. def setPosition(self, position):
  502. if self._position >= len(self):
  503. raise StopIteration
  504. self._position = position
  505. def getPosition(self):
  506. if self._position >= len(self):
  507. raise StopIteration
  508. if self._position >= 0:
  509. return self._position
  510. else:
  511. return None
  512. position = property(getPosition, setPosition)
  513. def getCurrentByte(self):
  514. return self[self.position:self.position + 1]
  515. currentByte = property(getCurrentByte)
  516. def skip(self, chars=spaceCharactersBytes):
  517. """Skip past a list of characters"""
  518. p = self.position # use property for the error-checking
  519. while p < len(self):
  520. c = self[p:p + 1]
  521. if c not in chars:
  522. self._position = p
  523. return c
  524. p += 1
  525. self._position = p
  526. return None
  527. def skipUntil(self, chars):
  528. p = self.position
  529. while p < len(self):
  530. c = self[p:p + 1]
  531. if c in chars:
  532. self._position = p
  533. return c
  534. p += 1
  535. self._position = p
  536. return None
  537. def matchBytes(self, bytes):
  538. """Look for a sequence of bytes at the start of a string. If the bytes
  539. are found return True and advance the position to the byte after the
  540. match. Otherwise return False and leave the position alone"""
  541. p = self.position
  542. data = self[p:p + len(bytes)]
  543. rv = data.startswith(bytes)
  544. if rv:
  545. self.position += len(bytes)
  546. return rv
  547. def jumpTo(self, bytes):
  548. """Look for the next sequence of bytes matching a given sequence. If
  549. a match is found advance the position to the last byte of the match"""
  550. newPosition = self[self.position:].find(bytes)
  551. if newPosition > -1:
  552. # XXX: This is ugly, but I can't see a nicer way to fix this.
  553. if self._position == -1:
  554. self._position = 0
  555. self._position += (newPosition + len(bytes) - 1)
  556. return True
  557. else:
  558. raise StopIteration
  559. class EncodingParser(object):
  560. """Mini parser for detecting character encoding from meta elements"""
  561. def __init__(self, data):
  562. """string - the data to work on for encoding detection"""
  563. self.data = EncodingBytes(data)
  564. self.encoding = None
  565. def getEncoding(self):
  566. methodDispatch = (
  567. (b"<!--", self.handleComment),
  568. (b"<meta", self.handleMeta),
  569. (b"</", self.handlePossibleEndTag),
  570. (b"<!", self.handleOther),
  571. (b"<?", self.handleOther),
  572. (b"<", self.handlePossibleStartTag))
  573. for _ in self.data:
  574. keepParsing = True
  575. for key, method in methodDispatch:
  576. if self.data.matchBytes(key):
  577. try:
  578. keepParsing = method()
  579. break
  580. except StopIteration:
  581. keepParsing = False
  582. break
  583. if not keepParsing:
  584. break
  585. return self.encoding
  586. def handleComment(self):
  587. """Skip over comments"""
  588. return self.data.jumpTo(b"-->")
  589. def handleMeta(self):
  590. if self.data.currentByte not in spaceCharactersBytes:
  591. # if we have <meta not followed by a space so just keep going
  592. return True
  593. # We have a valid meta element we want to search for attributes
  594. hasPragma = False
  595. pendingEncoding = None
  596. while True:
  597. # Try to find the next attribute after the current position
  598. attr = self.getAttribute()
  599. if attr is None:
  600. return True
  601. else:
  602. if attr[0] == b"http-equiv":
  603. hasPragma = attr[1] == b"content-type"
  604. if hasPragma and pendingEncoding is not None:
  605. self.encoding = pendingEncoding
  606. return False
  607. elif attr[0] == b"charset":
  608. tentativeEncoding = attr[1]
  609. codec = lookupEncoding(tentativeEncoding)
  610. if codec is not None:
  611. self.encoding = codec
  612. return False
  613. elif attr[0] == b"content":
  614. contentParser = ContentAttrParser(EncodingBytes(attr[1]))
  615. tentativeEncoding = contentParser.parse()
  616. if tentativeEncoding is not None:
  617. codec = lookupEncoding(tentativeEncoding)
  618. if codec is not None:
  619. if hasPragma:
  620. self.encoding = codec
  621. return False
  622. else:
  623. pendingEncoding = codec
  624. def handlePossibleStartTag(self):
  625. return self.handlePossibleTag(False)
  626. def handlePossibleEndTag(self):
  627. next(self.data)
  628. return self.handlePossibleTag(True)
  629. def handlePossibleTag(self, endTag):
  630. data = self.data
  631. if data.currentByte not in asciiLettersBytes:
  632. # If the next byte is not an ascii letter either ignore this
  633. # fragment (possible start tag case) or treat it according to
  634. # handleOther
  635. if endTag:
  636. data.previous()
  637. self.handleOther()
  638. return True
  639. c = data.skipUntil(spacesAngleBrackets)
  640. if c == b"<":
  641. # return to the first step in the overall "two step" algorithm
  642. # reprocessing the < byte
  643. data.previous()
  644. else:
  645. # Read all attributes
  646. attr = self.getAttribute()
  647. while attr is not None:
  648. attr = self.getAttribute()
  649. return True
  650. def handleOther(self):
  651. return self.data.jumpTo(b">")
  652. def getAttribute(self):
  653. """Return a name,value pair for the next attribute in the stream,
  654. if one is found, or None"""
  655. data = self.data
  656. # Step 1 (skip chars)
  657. c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
  658. assert c is None or len(c) == 1
  659. # Step 2
  660. if c in (b">", None):
  661. return None
  662. # Step 3
  663. attrName = []
  664. attrValue = []
  665. # Step 4 attribute name
  666. while True:
  667. if c == b"=" and attrName:
  668. break
  669. elif c in spaceCharactersBytes:
  670. # Step 6!
  671. c = data.skip()
  672. break
  673. elif c in (b"/", b">"):
  674. return b"".join(attrName), b""
  675. elif c in asciiUppercaseBytes:
  676. attrName.append(c.lower())
  677. elif c is None:
  678. return None
  679. else:
  680. attrName.append(c)
  681. # Step 5
  682. c = next(data)
  683. # Step 7
  684. if c != b"=":
  685. data.previous()
  686. return b"".join(attrName), b""
  687. # Step 8
  688. next(data)
  689. # Step 9
  690. c = data.skip()
  691. # Step 10
  692. if c in (b"'", b'"'):
  693. # 10.1
  694. quoteChar = c
  695. while True:
  696. # 10.2
  697. c = next(data)
  698. # 10.3
  699. if c == quoteChar:
  700. next(data)
  701. return b"".join(attrName), b"".join(attrValue)
  702. # 10.4
  703. elif c in asciiUppercaseBytes:
  704. attrValue.append(c.lower())
  705. # 10.5
  706. else:
  707. attrValue.append(c)
  708. elif c == b">":
  709. return b"".join(attrName), b""
  710. elif c in asciiUppercaseBytes:
  711. attrValue.append(c.lower())
  712. elif c is None:
  713. return None
  714. else:
  715. attrValue.append(c)
  716. # Step 11
  717. while True:
  718. c = next(data)
  719. if c in spacesAngleBrackets:
  720. return b"".join(attrName), b"".join(attrValue)
  721. elif c in asciiUppercaseBytes:
  722. attrValue.append(c.lower())
  723. elif c is None:
  724. return None
  725. else:
  726. attrValue.append(c)
  727. class ContentAttrParser(object):
  728. def __init__(self, data):
  729. assert isinstance(data, bytes)
  730. self.data = data
  731. def parse(self):
  732. try:
  733. # Check if the attr name is charset
  734. # otherwise return
  735. self.data.jumpTo(b"charset")
  736. self.data.position += 1
  737. self.data.skip()
  738. if not self.data.currentByte == b"=":
  739. # If there is no = sign keep looking for attrs
  740. return None
  741. self.data.position += 1
  742. self.data.skip()
  743. # Look for an encoding between matching quote marks
  744. if self.data.currentByte in (b'"', b"'"):
  745. quoteMark = self.data.currentByte
  746. self.data.position += 1
  747. oldPosition = self.data.position
  748. if self.data.jumpTo(quoteMark):
  749. return self.data[oldPosition:self.data.position]
  750. else:
  751. return None
  752. else:
  753. # Unquoted value
  754. oldPosition = self.data.position
  755. try:
  756. self.data.skipUntil(spaceCharactersBytes)
  757. return self.data[oldPosition:self.data.position]
  758. except StopIteration:
  759. # Return the whole remaining value
  760. return self.data[oldPosition:]
  761. except StopIteration:
  762. return None
  763. def lookupEncoding(encoding):
  764. """Return the python codec name corresponding to an encoding or None if the
  765. string doesn't correspond to a valid encoding."""
  766. if isinstance(encoding, binary_type):
  767. try:
  768. encoding = encoding.decode("ascii")
  769. except UnicodeDecodeError:
  770. return None
  771. if encoding is not None:
  772. try:
  773. return webencodings.lookup(encoding)
  774. except AttributeError:
  775. return None
  776. else:
  777. return None