universaldetector.py
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
  28. """
  29. Module containing the UniversalDetector detector class, which is the primary
  30. class a user of ``chardet`` should use.
  31. :author: Mark Pilgrim (initial port to Python)
  32. :author: Shy Shalom (original C code)
  33. :author: Dan Blanchard (major refactoring for 3.0)
  34. :author: Ian Cordasco
  35. """
import codecs
import logging
import re

from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .mbcsgroupprober import MBCSGroupProber
from .sbcsgroupprober import SBCSGroupProber


class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    MINIMUM_THRESHOLD = 0.20
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    def __init__(self, lang_filter=LanguageFilter.ALL):
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b''
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str):
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not len(byte_str):
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {'encoding': "UTF-8-SIG",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_UTF32_LE,
                                      codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {'encoding': "UTF-32",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {'encoding': "UTF-16",
                               'confidence': 1.0,
                               'language': ''}

            self._got_data = True
            if self.result['encoding'] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            elif self._input_state == InputState.PURE_ASCII and \
                    self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
                               self._esc_charset_prober.get_confidence(),
                               'language':
                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left. The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns: The ``result`` attribute, a ``dict`` with the keys
                  `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug('no data received!')

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {'encoding': 'ascii',
                           'confidence': 1.0,
                           'language': ''}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            prober_confidence = None
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                lower_charset_name = max_prober.charset_name.lower()
                confidence = max_prober.get_confidence()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
                                                            charset_name)
                self.result = {'encoding': charset_name,
                               'confidence': confidence,
                               'language': max_prober.language}

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() == logging.DEBUG:
            if self.result['encoding'] is None:
                self.logger.debug('no probers hit minimum threshold')
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug('%s %s confidence = %s',
                                              prober.charset_name,
                                              prober.language,
                                              prober.get_confidence())
                    else:
                        self.logger.debug('%s %s confidence = %s',
                                          group_prober.charset_name,
                                          group_prober.language,
                                          group_prober.get_confidence())

        return self.result
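

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module): it shows the
# feed()/done/close()/result workflow documented in the docstrings above by
# reading a file in chunks and stopping early once the detector is confident.
# The 4096-byte chunk size and the command-line file path are assumptions made
# for this example only. Because this module uses relative imports, run it as
# ``python -m chardet.universaldetector <path>`` rather than directly.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import sys

    detector = UniversalDetector()
    with open(sys.argv[1], 'rb') as file_handle:
        for chunk in iter(lambda: file_handle.read(4096), b''):
            detector.feed(chunk)
            # ``done`` becomes True as soon as a BOM or a prober gives a
            # definitive answer, so we can stop reading early.
            if detector.done:
                break
    # ``close`` finalizes the prediction and returns the ``result`` dict with
    # 'encoding', 'confidence', and 'language' keys.
    print(detector.close())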