1 # -*- coding: utf-8 -*- 2 """ 3 jinja2.lexer 4 ~~~~~~~~~~~~ 5 6 This module implements a Jinja / Python combination lexer. The 7 `Lexer` class provided by this module is used to do some preprocessing 8 for Jinja. 9 10 On the one hand it filters out invalid operators like the bitshift 11 operators we don't allow in templates. On the other hand it separates 12 template code and python code in expressions. 13 14 :copyright: (c) 2010 by the Jinja Team. 15 :license: BSD, see LICENSE for more details. 16 """ 17 import re 18 from operator import itemgetter 19 from collections import deque 20 from jinja2.exceptions import TemplateSyntaxError 21 from jinja2.utils import LRUCache, next 22 23 24 # cache for the lexers. Exists in order to be able to have multiple 25 # environments with the same lexer 26 _lexer_cache = LRUCache(50) 27 28 # static regular expressions 29 whitespace_re = re.compile(r'\s+', re.U) 30 string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'" 31 r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S) 32 integer_re = re.compile(r'\d+') 33 34 # we use the unicode identifier rule if this python version is able 35 # to handle unicode identifiers, otherwise the standard ASCII one. 36 try: 37 compile('f', '<unknown>', 'eval') 38 except SyntaxError: 39 name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b') 40 else: 41 from jinja2 import _stringdefs 42 name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start, 43 _stringdefs.xid_continue)) 44 45 float_re = re.compile(r'(?<!\.)\d+\.\d+') 46 newline_re = re.compile(r'(\r\n|\r|\n)') 47 48 # internal the tokens and keep references to them 49 TOKEN_ADD = intern('add') 50 TOKEN_ASSIGN = intern('assign') 51 TOKEN_COLON = intern('colon') 52 TOKEN_COMMA = intern('comma') 53 TOKEN_DIV = intern('div') 54 TOKEN_DOT = intern('dot') 55 TOKEN_EQ = intern('eq') 56 TOKEN_FLOORDIV = intern('floordiv') 57 TOKEN_GT = intern('gt') 58 TOKEN_GTEQ = intern('gteq') 59 TOKEN_LBRACE = intern('lbrace') 60 TOKEN_LBRACKET = intern('lbracket') 61 TOKEN_LPAREN = intern('lparen') 62 TOKEN_LT = intern('lt') 63 TOKEN_LTEQ = intern('lteq') 64 TOKEN_MOD = intern('mod') 65 TOKEN_MUL = intern('mul') 66 TOKEN_NE = intern('ne') 67 TOKEN_PIPE = intern('pipe') 68 TOKEN_POW = intern('pow') 69 TOKEN_RBRACE = intern('rbrace') 70 TOKEN_RBRACKET = intern('rbracket') 71 TOKEN_RPAREN = intern('rparen') 72 TOKEN_SEMICOLON = intern('semicolon') 73 TOKEN_SUB = intern('sub') 74 TOKEN_TILDE = intern('tilde') 75 TOKEN_WHITESPACE = intern('whitespace') 76 TOKEN_FLOAT = intern('float') 77 TOKEN_INTEGER = intern('integer') 78 TOKEN_NAME = intern('name') 79 TOKEN_STRING = intern('string') 80 TOKEN_OPERATOR = intern('operator') 81 TOKEN_BLOCK_BEGIN = intern('block_begin') 82 TOKEN_BLOCK_END = intern('block_end') 83 TOKEN_VARIABLE_BEGIN = intern('variable_begin') 84 TOKEN_VARIABLE_END = intern('variable_end') 85 TOKEN_RAW_BEGIN = intern('raw_begin') 86 TOKEN_RAW_END = intern('raw_end') 87 TOKEN_COMMENT_BEGIN = intern('comment_begin') 88 TOKEN_COMMENT_END = intern('comment_end') 89 TOKEN_COMMENT = intern('comment') 90 TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin') 91 TOKEN_LINESTATEMENT_END = intern('linestatement_end') 92 TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin') 93 TOKEN_LINECOMMENT_END = intern('linecomment_end') 94 TOKEN_LINECOMMENT = intern('linecomment') 95 TOKEN_DATA = intern('data') 96 TOKEN_INITIAL = intern('initial') 97 TOKEN_EOF = intern('eof') 98 99 # bind operators to token types 100 operators = { 101 '+': TOKEN_ADD, 102 '-': TOKEN_SUB, 103 '/': TOKEN_DIV, 104 '//': TOKEN_FLOORDIV, 105 '*': TOKEN_MUL, 106 '%': TOKEN_MOD, 107 '**': TOKEN_POW, 108 '~': TOKEN_TILDE, 109 '[': TOKEN_LBRACKET, 110 ']': TOKEN_RBRACKET, 111 '(': TOKEN_LPAREN, 112 ')': TOKEN_RPAREN, 113 '{': TOKEN_LBRACE, 114 '}': TOKEN_RBRACE, 115 '==': TOKEN_EQ, 116 '!=': TOKEN_NE, 117 '>': TOKEN_GT, 118 '>=': TOKEN_GTEQ, 119 '<': TOKEN_LT, 120 '<=': TOKEN_LTEQ, 121 '=': TOKEN_ASSIGN, 122 '.': TOKEN_DOT, 123 ':': TOKEN_COLON, 124 '|': TOKEN_PIPE, 125 ',': TOKEN_COMMA, 126 ';': TOKEN_SEMICOLON 127 } 128 129 reverse_operators = dict([(v, k) for k, v in operators.iteritems()]) 130 assert len(operators) == len(reverse_operators), 'operators dropped' 131 operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in 132 sorted(operators, key=lambda x: -len(x)))) 133 134 ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT, 135 TOKEN_COMMENT_END, TOKEN_WHITESPACE, 136 TOKEN_WHITESPACE, TOKEN_LINECOMMENT_BEGIN, 137 TOKEN_LINECOMMENT_END, TOKEN_LINECOMMENT]) 138 ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA, 139 TOKEN_COMMENT, TOKEN_LINECOMMENT]) 140 141 142 def _describe_token_type(token_type): 143 if token_type in reverse_operators: 144 return reverse_operators[token_type] 145 return { 146 TOKEN_COMMENT_BEGIN: 'begin of comment', 147 TOKEN_COMMENT_END: 'end of comment', 148 TOKEN_COMMENT: 'comment', 149 TOKEN_LINECOMMENT: 'comment', 150 TOKEN_BLOCK_BEGIN: 'begin of statement block', 151 TOKEN_BLOCK_END: 'end of statement block', 152 TOKEN_VARIABLE_BEGIN: 'begin of print statement', 153 TOKEN_VARIABLE_END: 'end of print statement', 154 TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement', 155 TOKEN_LINESTATEMENT_END: 'end of line statement', 156 TOKEN_DATA: 'template data / text', 157 TOKEN_EOF: 'end of template' 158 }.get(token_type, token_type) 159 160 161 def describe_token(token): 162 """Returns a description of the token.""" 163 if token.type == 'name': 164 return token.value 165 return _describe_token_type(token.type) 166 167 168 def describe_token_expr(expr): 169 """Like `describe_token` but for token expressions.""" 170 if ':' in expr: 171 type, value = expr.split(':', 1) 172 if type == 'name': 173 return value 174 else: 175 type = expr 176 return _describe_token_type(type) 177 178 179 def count_newlines(value): 180 """Count the number of newline characters in the string. This is 181 useful for extensions that filter a stream. 182 """ 183 return len(newline_re.findall(value)) 184 185 186 def compile_rules(environment): 187 """Compiles all the rules from the environment into a list of rules.""" 188 e = re.escape 189 rules = [ 190 (len(environment.comment_start_string), 'comment', 191 e(environment.comment_start_string)), 192 (len(environment.block_start_string), 'block', 193 e(environment.block_start_string)), 194 (len(environment.variable_start_string), 'variable', 195 e(environment.variable_start_string)) 196 ] 197 198 if environment.line_statement_prefix is not None: 199 rules.append((len(environment.line_statement_prefix), 'linestatement', 200 r'^\s*' + e(environment.line_statement_prefix))) 201 if environment.line_comment_prefix is not None: 202 rules.append((len(environment.line_comment_prefix), 'linecomment', 203 r'(?:^|(?<=\S))[^\S\r\n]*' + 204 e(environment.line_comment_prefix))) 205 206 return [x[1:] for x in sorted(rules, reverse=True)] 207 208 209 class Failure(object): 210 """Class that raises a `TemplateSyntaxError` if called. 211 Used by the `Lexer` to specify known errors. 212 """ 213 214 def __init__(self, message, cls=TemplateSyntaxError): 215 self.message = message 216 self.error_class = cls 217 218 def __call__(self, lineno, filename): 219 raise self.error_class(self.message, lineno, filename) 220 221 222 class Token(tuple): 223 """Token class.""" 224 __slots__ = () 225 lineno, type, value = (property(itemgetter(x)) for x in range(3)) 226 227 def __new__(cls, lineno, type, value): 228 return tuple.__new__(cls, (lineno, intern(str(type)), value)) 229 230 def __str__(self): 231 if self.type in reverse_operators: 232 return reverse_operators[self.type] 233 elif self.type == 'name': 234 return self.value 235 return self.type 236 237 def test(self, expr): 238 """Test a token against a token expression. This can either be a 239 token type or ``'token_type:token_value'``. This can only test 240 against string values and types. 241 """ 242 # here we do a regular string equality check as test_any is usually 243 # passed an iterable of not interned strings. 244 if self.type == expr: 245 return True 246 elif ':' in expr: 247 return expr.split(':', 1) == [self.type, self.value] 248 return False 249 250 def test_any(self, *iterable): 251 """Test against multiple token expressions.""" 252 for expr in iterable: 253 if self.test(expr): 254 return True 255 return False 256 257 def __repr__(self): 258 return 'Token(%r, %r, %r)' % ( 259 self.lineno, 260 self.type, 261 self.value 262 ) 263 264 265 class TokenStreamIterator(object): 266 """The iterator for tokenstreams. Iterate over the stream 267 until the eof token is reached. 268 """ 269 270 def __init__(self, stream): 271 self.stream = stream 272 273 def __iter__(self): 274 return self 275 276 def next(self): 277 token = self.stream.current 278 if token.type is TOKEN_EOF: 279 self.stream.close() 280 raise StopIteration() 281 next(self.stream) 282 return token 283 284 285 class TokenStream(object): 286 """A token stream is an iterable that yields :class:`Token`\s. The 287 parser however does not iterate over it but calls :meth:`next` to go 288 one token ahead. The current active token is stored as :attr:`current`. 289 """ 290 291 def __init__(self, generator, name, filename): 292 self._next = iter(generator).next 293 self._pushed = deque() 294 self.name = name 295 self.filename = filename 296 self.closed = False 297 self.current = Token(1, TOKEN_INITIAL, '') 298 next(self) 299 300 def __iter__(self): 301 return TokenStreamIterator(self) 302 303 def __nonzero__(self): 304 return bool(self._pushed) or self.current.type is not TOKEN_EOF 305 306 eos = property(lambda x: not x, doc="Are we at the end of the stream?") 307 308 def push(self, token): 309 """Push a token back to the stream.""" 310 self._pushed.append(token) 311 312 def look(self): 313 """Look at the next token.""" 314 old_token = next(self) 315 result = self.current 316 self.push(result) 317 self.current = old_token 318 return result 319 320 def skip(self, n=1): 321 """Got n tokens ahead.""" 322 for x in xrange(n): 323 next(self) 324 325 def next_if(self, expr): 326 """Perform the token test and return the token if it matched. 327 Otherwise the return value is `None`. 328 """ 329 if self.current.test(expr): 330 return next(self) 331 332 def skip_if(self, expr): 333 """Like :meth:`next_if` but only returns `True` or `False`.""" 334 return self.next_if(expr) is not None 335 336 def next(self): 337 """Go one token ahead and return the old one""" 338 rv = self.current 339 if self._pushed: 340 self.current = self._pushed.popleft() 341 elif self.current.type is not TOKEN_EOF: 342 try: 343 self.current = self._next() 344 except StopIteration: 345 self.close() 346 return rv 347 348 def close(self): 349 """Close the stream.""" 350 self.current = Token(self.current.lineno, TOKEN_EOF, '') 351 self._next = None 352 self.closed = True 353 354 def expect(self, expr): 355 """Expect a given token type and return it. This accepts the same 356 argument as :meth:`jinja2.lexer.Token.test`. 357 """ 358 if not self.current.test(expr): 359 expr = describe_token_expr(expr) 360 if self.current.type is TOKEN_EOF: 361 raise TemplateSyntaxError('unexpected end of template, ' 362 'expected %r.' % expr, 363 self.current.lineno, 364 self.name, self.filename) 365 raise TemplateSyntaxError("expected token %r, got %r" % 366 (expr, describe_token(self.current)), 367 self.current.lineno, 368 self.name, self.filename) 369 try: 370 return self.current 371 finally: 372 next(self) 373 374 375 def get_lexer(environment): 376 """Return a lexer which is probably cached.""" 377 key = (environment.block_start_string, 378 environment.block_end_string, 379 environment.variable_start_string, 380 environment.variable_end_string, 381 environment.comment_start_string, 382 environment.comment_end_string, 383 environment.line_statement_prefix, 384 environment.line_comment_prefix, 385 environment.trim_blocks, 386 environment.newline_sequence) 387 lexer = _lexer_cache.get(key) 388 if lexer is None: 389 lexer = Lexer(environment) 390 _lexer_cache[key] = lexer 391 return lexer 392 393 394 class Lexer(object): 395 """Class that implements a lexer for a given environment. Automatically 396 created by the environment class, usually you don't have to do that. 397 398 Note that the lexer is not automatically bound to an environment. 399 Multiple environments can share the same lexer. 400 """ 401 402 def __init__(self, environment): 403 # shortcuts 404 c = lambda x: re.compile(x, re.M | re.S) 405 e = re.escape 406 407 # lexing rules for tags 408 tag_rules = [ 409 (whitespace_re, TOKEN_WHITESPACE, None), 410 (float_re, TOKEN_FLOAT, None), 411 (integer_re, TOKEN_INTEGER, None), 412 (name_re, TOKEN_NAME, None), 413 (string_re, TOKEN_STRING, None), 414 (operator_re, TOKEN_OPERATOR, None) 415 ] 416 417 # assamble the root lexing rule. because "|" is ungreedy 418 # we have to sort by length so that the lexer continues working 419 # as expected when we have parsing rules like <% for block and 420 # <%= for variables. (if someone wants asp like syntax) 421 # variables are just part of the rules if variable processing 422 # is required. 423 root_tag_rules = compile_rules(environment) 424 425 # block suffix if trimming is enabled 426 block_suffix_re = environment.trim_blocks and '\\n?' or '' 427 428 self.newline_sequence = environment.newline_sequence 429 430 # global lexing rules 431 self.rules = { 432 'root': [ 433 # directives 434 (c('(.*?)(?:%s)' % '|'.join( 435 [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % ( 436 e(environment.block_start_string), 437 e(environment.block_start_string), 438 e(environment.block_end_string), 439 e(environment.block_end_string) 440 )] + [ 441 r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r) 442 for n, r in root_tag_rules 443 ])), (TOKEN_DATA, '#bygroup'), '#bygroup'), 444 # data 445 (c('.+'), TOKEN_DATA, None) 446 ], 447 # comments 448 TOKEN_COMMENT_BEGIN: [ 449 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % ( 450 e(environment.comment_end_string), 451 e(environment.comment_end_string), 452 block_suffix_re 453 )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'), 454 (c('(.)'), (Failure('Missing end of comment tag'),), None) 455 ], 456 # blocks 457 TOKEN_BLOCK_BEGIN: [ 458 (c('(?:\-%s\s*|%s)%s' % ( 459 e(environment.block_end_string), 460 e(environment.block_end_string), 461 block_suffix_re 462 )), TOKEN_BLOCK_END, '#pop'), 463 ] + tag_rules, 464 # variables 465 TOKEN_VARIABLE_BEGIN: [ 466 (c('\-%s\s*|%s' % ( 467 e(environment.variable_end_string), 468 e(environment.variable_end_string) 469 )), TOKEN_VARIABLE_END, '#pop') 470 ] + tag_rules, 471 # raw block 472 TOKEN_RAW_BEGIN: [ 473 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % ( 474 e(environment.block_start_string), 475 e(environment.block_start_string), 476 e(environment.block_end_string), 477 e(environment.block_end_string), 478 block_suffix_re 479 )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'), 480 (c('(.)'), (Failure('Missing end of raw directive'),), None) 481 ], 482 # line statements 483 TOKEN_LINESTATEMENT_BEGIN: [ 484 (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop') 485 ] + tag_rules, 486 # line comments 487 TOKEN_LINECOMMENT_BEGIN: [ 488 (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT, 489 TOKEN_LINECOMMENT_END), '#pop') 490 ] 491 } 492 493 def _normalize_newlines(self, value): 494 """Called for strings and template data to normlize it to unicode.""" 495 return newline_re.sub(self.newline_sequence, value) 496 497 def tokenize(self, source, name=None, filename=None, state=None): 498 """Calls tokeniter + tokenize and wraps it in a token stream. 499 """ 500 stream = self.tokeniter(source, name, filename, state) 501 return TokenStream(self.wrap(stream, name, filename), name, filename) 502 503 def wrap(self, stream, name=None, filename=None): 504 """This is called with the stream as returned by `tokenize` and wraps 505 every token in a :class:`Token` and converts the value. 506 """ 507 for lineno, token, value in stream: 508 if token in ignored_tokens: 509 continue 510 elif token == 'linestatement_begin': 511 token = 'block_begin' 512 elif token == 'linestatement_end': 513 token = 'block_end' 514 # we are not interested in those tokens in the parser 515 elif token in ('raw_begin', 'raw_end'): 516 continue 517 elif token == 'data': 518 value = self._normalize_newlines(value) 519 elif token == 'keyword': 520 token = value 521 elif token == 'name': 522 value = str(value) 523 elif token == 'string': 524 # try to unescape string 525 try: 526 value = self._normalize_newlines(value[1:-1]) \ 527 .encode('ascii', 'backslashreplace') \ 528 .decode('unicode-escape') 529 except Exception, e: 530 msg = str(e).split(':')[-1].strip() 531 raise TemplateSyntaxError(msg, lineno, name, filename) 532 # if we can express it as bytestring (ascii only) 533 # we do that for support of semi broken APIs 534 # as datetime.datetime.strftime. On python 3 this 535 # call becomes a noop thanks to 2to3 536 try: 537 value = str(value) 538 except UnicodeError: 539 pass 540 elif token == 'integer': 541 value = int(value) 542 elif token == 'float': 543 value = float(value) 544 elif token == 'operator': 545 token = operators[value] 546 yield Token(lineno, token, value) 547 548 def tokeniter(self, source, name, filename=None, state=None): 549 """This method tokenizes the text and returns the tokens in a 550 generator. Use this method if you just want to tokenize a template. 551 """ 552 source = '\n'.join(unicode(source).splitlines()) 553 pos = 0 554 lineno = 1 555 stack = ['root'] 556 if state is not None and state != 'root': 557 assert state in ('variable', 'block'), 'invalid state' 558 stack.append(state + '_begin') 559 else: 560 state = 'root' 561 statetokens = self.rules[stack[-1]] 562 source_length = len(source) 563 564 balancing_stack = [] 565 566 while 1: 567 # tokenizer loop 568 for regex, tokens, new_state in statetokens: 569 m = regex.match(source, pos) 570 # if no match we try again with the next rule 571 if m is None: 572 continue 573 574 # we only match blocks and variables if brances / parentheses 575 # are balanced. continue parsing with the lower rule which 576 # is the operator rule. do this only if the end tags look 577 # like operators 578 if balancing_stack and \ 579 tokens in ('variable_end', 'block_end', 580 'linestatement_end'): 581 continue 582 583 # tuples support more options 584 if isinstance(tokens, tuple): 585 for idx, token in enumerate(tokens): 586 # failure group 587 if token.__class__ is Failure: 588 raise token(lineno, filename) 589 # bygroup is a bit more complex, in that case we 590 # yield for the current token the first named 591 # group that matched 592 elif token == '#bygroup': 593 for key, value in m.groupdict().iteritems(): 594 if value is not None: 595 yield lineno, key, value 596 lineno += value.count('\n') 597 break 598 else: 599 raise RuntimeError('%r wanted to resolve ' 600 'the token dynamically' 601 ' but no group matched' 602 % regex) 603 # normal group 604 else: 605 data = m.group(idx + 1) 606 if data or token not in ignore_if_empty: 607 yield lineno, token, data 608 lineno += data.count('\n') 609 610 # strings as token just are yielded as it. 611 else: 612 data = m.group() 613 # update brace/parentheses balance 614 if tokens == 'operator': 615 if data == '{': 616 balancing_stack.append('}') 617 elif data == '(': 618 balancing_stack.append(')') 619 elif data == '[': 620 balancing_stack.append(']') 621 elif data in ('}', ')', ']'): 622 if not balancing_stack: 623 raise TemplateSyntaxError('unexpected \'%s\'' % 624 data, lineno, name, 625 filename) 626 expected_op = balancing_stack.pop() 627 if expected_op != data: 628 raise TemplateSyntaxError('unexpected \'%s\', ' 629 'expected \'%s\'' % 630 (data, expected_op), 631 lineno, name, 632 filename) 633 # yield items 634 if data or tokens not in ignore_if_empty: 635 yield lineno, tokens, data 636 lineno += data.count('\n') 637 638 # fetch new position into new variable so that we can check 639 # if there is a internal parsing error which would result 640 # in an infinite loop 641 pos2 = m.end() 642 643 # handle state changes 644 if new_state is not None: 645 # remove the uppermost state 646 if new_state == '#pop': 647 stack.pop() 648 # resolve the new state by group checking 649 elif new_state == '#bygroup': 650 for key, value in m.groupdict().iteritems(): 651 if value is not None: 652 stack.append(key) 653 break 654 else: 655 raise RuntimeError('%r wanted to resolve the ' 656 'new state dynamically but' 657 ' no group matched' % 658 regex) 659 # direct state name given 660 else: 661 stack.append(new_state) 662 statetokens = self.rules[stack[-1]] 663 # we are still at the same position and no stack change. 664 # this means a loop without break condition, avoid that and 665 # raise error 666 elif pos2 == pos: 667 raise RuntimeError('%r yielded empty string without ' 668 'stack change' % regex) 669 # publish new function and start again 670 pos = pos2 671 break 672 # if loop terminated without break we havn't found a single match 673 # either we are at the end of the file or we have a problem 674 else: 675 # end of text 676 if pos >= source_length: 677 return 678 # something went wrong 679 raise TemplateSyntaxError('unexpected char %r at %d' % 680 (source[pos], pos), lineno, 681 name, filename) 682