# -*- coding: utf-8 -*-
"""Implements a Jinja / Python combination lexer. The ``Lexer`` class
is used to do some preprocessing. It filters out invalid operators like
the bitshift operators we don't allow in templates. It separates
template code and python code in expressions.
"""
import re
from ast import literal_eval
from collections import deque
from operator import itemgetter

from ._compat import implements_iterator
from ._compat import intern
from ._compat import iteritems
from ._compat import text_type
from .exceptions import TemplateSyntaxError
from .utils import LRUCache

# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r"\s+", re.U)
newline_re = re.compile(r"(\r\n|\r|\n)")
string_re = re.compile(
    r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
)
integer_re = re.compile(r"(\d+_)*\d+")
float_re = re.compile(
    r"""
    (?<!\.)             # doesn't start with a .
    (\d+_)*\d+          # digits, possibly _ separated
    (
        (\.(\d+_)*\d+)?  # optional fractional part
        e[+\-]?(\d+_)*\d+  # exponent part
    |
        \.(\d+_)*\d+    # required fractional part
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)
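

# Illustrative sketch (not part of the original module): what the number
# patterns above accept. Matching a plain integer against ``float_re`` fails
# because a fractional part or an exponent is required.
#
#     integer_re.match("1_000")    # matches: underscore-separated digits
#     float_re.match("3.14")       # matches: required fractional part
#     float_re.match("1_0.5e-2")   # matches: fraction plus exponent
#     float_re.match("10")         # None: no fraction and no exponent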

try:
    # check if this Python supports Unicode identifiers
    compile("föö", "<unknown>", "eval")
except SyntaxError:
    # Python 2, no Unicode support, use ASCII identifiers
    name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")
    check_ident = False
else:
    # Unicode support, import generated re pattern and set flag to use
    # str.isidentifier to validate during lexing.
    from ._identifier import pattern as name_re

    check_ident = True

# intern the tokens and keep references to them
TOKEN_ADD = intern("add")
TOKEN_ASSIGN = intern("assign")
TOKEN_COLON = intern("colon")
TOKEN_COMMA = intern("comma")
TOKEN_DIV = intern("div")
TOKEN_DOT = intern("dot")
TOKEN_EQ = intern("eq")
TOKEN_FLOORDIV = intern("floordiv")
TOKEN_GT = intern("gt")
TOKEN_GTEQ = intern("gteq")
TOKEN_LBRACE = intern("lbrace")
TOKEN_LBRACKET = intern("lbracket")
TOKEN_LPAREN = intern("lparen")
TOKEN_LT = intern("lt")
TOKEN_LTEQ = intern("lteq")
TOKEN_MOD = intern("mod")
TOKEN_MUL = intern("mul")
TOKEN_NE = intern("ne")
TOKEN_PIPE = intern("pipe")
TOKEN_POW = intern("pow")
TOKEN_RBRACE = intern("rbrace")
TOKEN_RBRACKET = intern("rbracket")
TOKEN_RPAREN = intern("rparen")
TOKEN_SEMICOLON = intern("semicolon")
TOKEN_SUB = intern("sub")
TOKEN_TILDE = intern("tilde")
TOKEN_WHITESPACE = intern("whitespace")
TOKEN_FLOAT = intern("float")
TOKEN_INTEGER = intern("integer")
TOKEN_NAME = intern("name")
TOKEN_STRING = intern("string")
TOKEN_OPERATOR = intern("operator")
TOKEN_BLOCK_BEGIN = intern("block_begin")
TOKEN_BLOCK_END = intern("block_end")
TOKEN_VARIABLE_BEGIN = intern("variable_begin")
TOKEN_VARIABLE_END = intern("variable_end")
TOKEN_RAW_BEGIN = intern("raw_begin")
TOKEN_RAW_END = intern("raw_end")
TOKEN_COMMENT_BEGIN = intern("comment_begin")
TOKEN_COMMENT_END = intern("comment_end")
TOKEN_COMMENT = intern("comment")
TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin")
TOKEN_LINESTATEMENT_END = intern("linestatement_end")
TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin")
TOKEN_LINECOMMENT_END = intern("linecomment_end")
TOKEN_LINECOMMENT = intern("linecomment")
TOKEN_DATA = intern("data")
TOKEN_INITIAL = intern("initial")
TOKEN_EOF = intern("eof")

# bind operators to token types
operators = {
    "+": TOKEN_ADD,
    "-": TOKEN_SUB,
    "/": TOKEN_DIV,
    "//": TOKEN_FLOORDIV,
    "*": TOKEN_MUL,
    "%": TOKEN_MOD,
    "**": TOKEN_POW,
    "~": TOKEN_TILDE,
    "[": TOKEN_LBRACKET,
    "]": TOKEN_RBRACKET,
    "(": TOKEN_LPAREN,
    ")": TOKEN_RPAREN,
    "{": TOKEN_LBRACE,
    "}": TOKEN_RBRACE,
    "==": TOKEN_EQ,
    "!=": TOKEN_NE,
    ">": TOKEN_GT,
    ">=": TOKEN_GTEQ,
    "<": TOKEN_LT,
    "<=": TOKEN_LTEQ,
    "=": TOKEN_ASSIGN,
    ".": TOKEN_DOT,
    ":": TOKEN_COLON,
    "|": TOKEN_PIPE,
    ",": TOKEN_COMMA,
    ";": TOKEN_SEMICOLON,
}

reverse_operators = dict([(v, k) for k, v in iteritems(operators)])
assert len(operators) == len(reverse_operators), "operators dropped"
operator_re = re.compile(
    "(%s)" % "|".join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))
)

ignored_tokens = frozenset(
    [
        TOKEN_COMMENT_BEGIN,
        TOKEN_COMMENT,
        TOKEN_COMMENT_END,
        TOKEN_WHITESPACE,
        TOKEN_LINECOMMENT_BEGIN,
        TOKEN_LINECOMMENT_END,
        TOKEN_LINECOMMENT,
    ]
)
ignore_if_empty = frozenset(
    [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT]
)


def _describe_token_type(token_type):
    if token_type in reverse_operators:
        return reverse_operators[token_type]

    return {
        TOKEN_COMMENT_BEGIN: "begin of comment",
        TOKEN_COMMENT_END: "end of comment",
        TOKEN_COMMENT: "comment",
        TOKEN_LINECOMMENT: "comment",
        TOKEN_BLOCK_BEGIN: "begin of statement block",
        TOKEN_BLOCK_END: "end of statement block",
        TOKEN_VARIABLE_BEGIN: "begin of print statement",
        TOKEN_VARIABLE_END: "end of print statement",
        TOKEN_LINESTATEMENT_BEGIN: "begin of line statement",
        TOKEN_LINESTATEMENT_END: "end of line statement",
        TOKEN_DATA: "template data / text",
        TOKEN_EOF: "end of template",
    }.get(token_type, token_type)


def describe_token(token):
    """Returns a description of the token."""
    if token.type == TOKEN_NAME:
        return token.value

    return _describe_token_type(token.type)


def describe_token_expr(expr):
    """Like `describe_token` but for token expressions."""
    if ":" in expr:
        type, value = expr.split(":", 1)

        if type == TOKEN_NAME:
            return value
    else:
        type = expr

    return _describe_token_type(type)


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))


def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (
            len(environment.comment_start_string),
            TOKEN_COMMENT_BEGIN,
            e(environment.comment_start_string),
        ),
        (
            len(environment.block_start_string),
            TOKEN_BLOCK_BEGIN,
            e(environment.block_start_string),
        ),
        (
            len(environment.variable_start_string),
            TOKEN_VARIABLE_BEGIN,
            e(environment.variable_start_string),
        ),
    ]

    if environment.line_statement_prefix is not None:
        rules.append(
            (
                len(environment.line_statement_prefix),
                TOKEN_LINESTATEMENT_BEGIN,
                r"^[ \t\v]*" + e(environment.line_statement_prefix),
            )
        )
    if environment.line_comment_prefix is not None:
        rules.append(
            (
                len(environment.line_comment_prefix),
                TOKEN_LINECOMMENT_BEGIN,
                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
            )
        )

    return [x[1:] for x in sorted(rules, reverse=True)]
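

# Illustrative sketch (not part of the original module): with the default
# delimiters and neither line statements nor line comments configured,
# ``compile_rules`` drops the length prefix after sorting and returns
# (token type, escaped delimiter) pairs, longest and reverse-alphabetical
# first. ``Environment`` is assumed to come from ``jinja2``.
#
#     compile_rules(Environment())
#     # [('variable_begin', re.escape('{{')),
#     #  ('comment_begin', re.escape('{#')),
#     #  ('block_begin', re.escape('{%'))]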


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
    """Token class."""

    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == "name":
            return self.value

        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ":" in expr:
            return expr.split(":", 1) == [self.type, self.value]

        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True

        return False

    def __repr__(self):
        return "Token(%r, %r, %r)" % (self.lineno, self.type, self.value)
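

# Illustrative sketch (not part of the original module): how ``Token.test``
# and ``Token.test_any`` match a token by type alone or by ``'type:value'``.
#
#     tok = Token(1, TOKEN_NAME, "endfor")
#     tok.test("name")                            # True: type matches
#     tok.test("name:endfor")                     # True: type and value match
#     tok.test_any("name:endif", "name:endfor")   # True: one expression matches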


@implements_iterator
class TokenStreamIterator(object):
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def __next__(self):
        token = self.stream.current

        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()

        next(self.stream)
        return token


@implements_iterator
class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._iter = iter(generator)
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, "")
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __bool__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    __nonzero__ = __bool__  # py2

    @property
    def eos(self):
        """Are we at the end of the stream?"""
        return not self

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for _ in range(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self):
        """Go one token ahead and return the old one.

        Use the built-in :func:`next` instead of calling this directly.
        """
        rv = self.current

        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()

        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)

            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError(
                    "unexpected end of template, expected %r." % expr,
                    self.current.lineno,
                    self.name,
                    self.filename,
                )

            raise TemplateSyntaxError(
                "expected token %r, got %r" % (expr, describe_token(self.current)),
                self.current.lineno,
                self.name,
                self.filename,
            )

        try:
            return self.current
        finally:
            next(self)
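

# Illustrative sketch (not part of the original module): how a parser-style
# consumer walks a TokenStream. ``Environment`` and the template text are
# assumptions for the example; ``Environment.lexer`` is the cached lexer for
# that environment's settings.
#
#     from jinja2 import Environment
#
#     stream = Environment().lexer.tokenize("{{ user.name }}")
#     stream.expect("variable_begin")
#     stream.expect("name")            # -> Token(1, 'name', 'user')
#     if stream.skip_if("dot"):
#         stream.expect("name")        # -> Token(1, 'name', 'name')
#     stream.expect("variable_end")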


def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (
        environment.block_start_string,
        environment.block_end_string,
        environment.variable_start_string,
        environment.variable_end_string,
        environment.comment_start_string,
        environment.comment_end_string,
        environment.line_statement_prefix,
        environment.line_comment_prefix,
        environment.trim_blocks,
        environment.lstrip_blocks,
        environment.newline_sequence,
        environment.keep_trailing_newline,
    )
    lexer = _lexer_cache.get(key)

    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer

    return lexer
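

# Illustrative sketch (not part of the original module): environments whose
# lexer-relevant settings are identical share one cached Lexer, while a
# different delimiter produces a different instance. ``Environment`` is
# assumed to come from ``jinja2``.
#
#     from jinja2 import Environment
#
#     a, b = Environment(), Environment()
#     c = Environment(block_start_string="<%")
#     assert get_lexer(a) is get_lexer(b)
#     assert get_lexer(a) is not get_lexer(c)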


class OptionalLStrip(tuple):
    """A special tuple for marking a point in the state that can have
    lstrip applied.
    """

    __slots__ = ()

    # Even though it looks like a no-op, creating instances fails
    # without this.
    def __new__(cls, *members, **kwargs):
        return super(OptionalLStrip, cls).__new__(cls, members)


class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        e = re.escape

        def c(x):
            return re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None),
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and "\\n?" or ""

        # If lstrip is enabled, it should not be applied if there is any
        # non-whitespace between the newline and block.
        self.lstrip_unless_re = c(r"[^ \t]") if environment.lstrip_blocks else None

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        # global lexing rules
        self.rules = {
            "root": [
                # directives
                (
                    c(
                        "(.*?)(?:%s)"
                        % "|".join(
                            [
                                r"(?P<raw_begin>%s(\-|\+|)\s*raw\s*(?:\-%s\s*|%s))"
                                % (
                                    e(environment.block_start_string),
                                    e(environment.block_end_string),
                                    e(environment.block_end_string),
                                )
                            ]
                            + [
                                r"(?P<%s>%s(\-|\+|))" % (n, r)
                                for n, r in root_tag_rules
                            ]
                        )
                    ),
                    OptionalLStrip(TOKEN_DATA, "#bygroup"),
                    "#bygroup",
                ),
                # data
                (c(".+"), TOKEN_DATA, None),
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (
                    c(
                        r"(.*?)((?:\-%s\s*|%s)%s)"
                        % (
                            e(environment.comment_end_string),
                            e(environment.comment_end_string),
                            block_suffix_re,
                        )
                    ),
                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
                    "#pop",
                ),
                (c("(.)"), (Failure("Missing end of comment tag"),), None),
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (
                    c(
                        r"(?:\-%s\s*|%s)%s"
                        % (
                            e(environment.block_end_string),
                            e(environment.block_end_string),
                            block_suffix_re,
                        )
                    ),
                    TOKEN_BLOCK_END,
                    "#pop",
                ),
            ]
            + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (
                    c(
                        r"\-%s\s*|%s"
                        % (
                            e(environment.variable_end_string),
                            e(environment.variable_end_string),
                        )
                    ),
                    TOKEN_VARIABLE_END,
                    "#pop",
                )
            ]
            + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (
                    c(
                        r"(.*?)((?:%s(\-|\+|))\s*endraw\s*(?:\-%s\s*|%s%s))"
                        % (
                            e(environment.block_start_string),
                            e(environment.block_end_string),
                            e(environment.block_end_string),
                            block_suffix_re,
                        )
                    ),
                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),
                    "#pop",
                ),
                (c("(.)"), (Failure("Missing end of raw directive"),), None),
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
            ]
            + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (
                    c(r"(.*?)()(?=\n|$)"),
                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                    "#pop",
                )
            ],
        }
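
    # Illustrative sketch (not part of the original module): every state in
    # ``self.rules`` maps to ``(regex, token or tokens, new_state)`` triples.
    # ``"#pop"`` leaves the current state and ``"#bygroup"`` switches to the
    # state named after whichever named group matched. ``Environment`` is an
    # assumption for the example.
    #
    #     lexer = get_lexer(Environment())
    #     regex, tokens, new_state = lexer.rules["root"][0]
    #     # tokens == OptionalLStrip('data', '#bygroup'), new_state == '#bygroup'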

    def _normalize_newlines(self, value):
        """Replace all newlines with the configured newline sequence in
        strings and template data.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps the result in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokenize` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == TOKEN_LINESTATEMENT_BEGIN:
                token = TOKEN_BLOCK_BEGIN
            elif token == TOKEN_LINESTATEMENT_END:
                token = TOKEN_BLOCK_END
            # we are not interested in those tokens in the parser
            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
                continue
            elif token == TOKEN_DATA:
                value = self._normalize_newlines(value)
            elif token == "keyword":
                token = value
            elif token == TOKEN_NAME:
                value = str(value)

                if check_ident and not value.isidentifier():
                    raise TemplateSyntaxError(
                        "Invalid character in identifier", lineno, name, filename
                    )
            elif token == TOKEN_STRING:
                # try to unescape string
                try:
                    value = (
                        self._normalize_newlines(value[1:-1])
                        .encode("ascii", "backslashreplace")
                        .decode("unicode-escape")
                    )
                except Exception as e:
                    msg = str(e).split(":")[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
            elif token == TOKEN_INTEGER:
                value = int(value.replace("_", ""))
            elif token == TOKEN_FLOAT:
                # remove all "_" first to support more Python versions
                value = literal_eval(value.replace("_", ""))
            elif token == TOKEN_OPERATOR:
                token = operators[value]

            yield Token(lineno, token, value)
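
    # Illustrative sketch (not part of the original module): ``tokeniter``
    # yields plain ``(lineno, token, value)`` tuples including whitespace,
    # while ``tokenize``/``wrap`` drop ignored tokens and convert values,
    # e.g. the operator "+" becomes the type ``add`` and "1" becomes int 1.
    # ``Environment`` and the template text are assumptions.
    #
    #     lexer = get_lexer(Environment())
    #     list(lexer.tokeniter("{{ 1 + 2 }}", "demo"))
    #     # raw tuples, including ('whitespace', ' ') entries and string values
    #     list(lexer.tokenize("{{ 1 + 2 }}", "demo"))
    #     # [Token(1, 'variable_begin', '{{'), Token(1, 'integer', 1),
    #     #  Token(1, 'add', '+'), Token(1, 'integer', 2),
    #     #  Token(1, 'variable_end', '}}')]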

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = text_type(source)
        lines = source.splitlines()

        if self.keep_trailing_newline and source:
            for newline in ("\r\n", "\r", "\n"):
                if source.endswith(newline):
                    lines.append("")
                    break

        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        balancing_stack = []
        lstrip_unless_re = self.lstrip_unless_re
        newlines_stripped = 0
        line_starting = True

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]

                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = (stripped,) + groups[1:]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and lstrip_unless_re is not None
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if not lstrip_unless_re.search(text, l_pos):
                                    groups = (text[:l_pos],) + groups[1:]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in iteritems(m.groupdict()):
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    "%r wanted to resolve "
                                    "the token dynamically"
                                    " but no group matched" % regex
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0
                # strings as tokens are just yielded as-is.
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    "unexpected '%s'" % data, lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    "unexpected '%s', "
                                    "expected '%s'" % (data, expected_op),
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                line_starting = m.group()[-1:] == "\n"

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in iteritems(m.groupdict()):
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                "%r wanted to resolve the "
                                "new state dynamically but"
                                " no group matched" % regex
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        "%r yielded empty string without stack change" % regex
                    )

                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    "unexpected char %r at %d" % (source[pos], pos),
                    lineno,
                    name,
                    filename,
                )
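

# Illustrative sketch (not part of the original module): end-to-end view of
# whitespace handling. With ``trim_blocks`` and ``lstrip_blocks`` enabled,
# block tags on their own lines leave no stray newline or indentation in the
# surrounding data tokens. ``Environment`` and the template text are
# assumptions for the example.
#
#     from jinja2 import Environment
#
#     env = Environment(trim_blocks=True, lstrip_blocks=True)
#     source = "  {% if x %}\nhi\n  {% endif %}\n"
#     for tok in env.lexer.tokenize(source):
#         print(tok.lineno, tok.type, repr(tok.value))
#     # the only data token is 'hi\n'; the indentation before each tag and
#     # the newline after each tag are consumed by lstrip/trim.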