# mako/lexer.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php

"""provides the Lexer class for parsing template strings into parse trees."""

import re
import codecs
from mako import parsetree, exceptions, compat
from mako.pygen import adjust_whitespace

# compiled patterns, keyed on (pattern source, flags); shared by all Lexers
_regexp_cache = {}


class Lexer(object):
    """scans template text and builds a tree of ``parsetree`` nodes.

    The lexer keeps a cursor (``match_position``) into ``text`` and a set of
    stacks (``tag``, ``control_line``, ``ternary_stack``) describing the
    constructs that are currently open.  ``parse()`` drives the scan and
    returns the populated ``parsetree.TemplateNode``.

    """

    def __init__(self, text, filename=None,
                 disable_unicode=False,
                 input_encoding=None, preprocessor=None):
        """set up scanning state for ``text``.

        :param text: the template source, as text or as raw bytes.
        :param filename: name stored on nodes and in error messages.
        :param disable_unicode: when true, ``parse()`` does not decode a
         raw byte stream; rejected on Python 3.
        :param input_encoding: encoding to fall back on when the source
         carries no magic encoding comment.
        :param preprocessor: a single callable, or an iterable of callables,
         each applied to the text before lexing starts.
        :raises exceptions.UnsupportedError: if ``disable_unicode`` is
         requested under Python 3.

        """
        self.text = text
        self.filename = filename
        self.template = parsetree.TemplateNode(self.filename)
        self.matched_lineno = 1
        self.matched_charpos = 0
        self.lineno = 1
        self.match_position = 0
        self.tag = []
        self.control_line = []
        self.ternary_stack = []
        self.disable_unicode = disable_unicode
        self.encoding = input_encoding

        if compat.py3k and disable_unicode:
            raise exceptions.UnsupportedError(
                "Mako for Python 3 does not "
                "support disabling Unicode")

        # normalize the preprocessor argument to an iterable of callables
        if preprocessor is None:
            self.preprocessor = []
        elif not hasattr(preprocessor, '__iter__'):
            self.preprocessor = [preprocessor]
        else:
            self.preprocessor = preprocessor

    @property
    def exception_kwargs(self):
        """keyword arguments locating the most recent match, for use when
        constructing exceptions."""
        return {'source': self.text,
                'lineno': self.matched_lineno,
                'pos': self.matched_charpos,
                'filename': self.filename}

    def match(self, regexp, flags=None):
        """compile the given regexp, cache the reg, and call match_reg()."""

        try:
            reg = _regexp_cache[(regexp, flags)]
        except KeyError:
            if flags:
                reg = re.compile(regexp, flags)
            else:
                reg = re.compile(regexp)
            _regexp_cache[(regexp, flags)] = reg

        return self.match_reg(reg)

    def match_reg(self, reg):
        """match the given regular expression object to the current text
        position.

        if a match occurs, update the current text and line position.

        Returns the match object, or ``None`` when the pattern does not
        match at the current position.

        """

        mp = self.match_position

        match = reg.match(self.text, self.match_position)
        if match:
            (start, end) = match.span()
            # a zero-width match still moves the cursor forward by one;
            # presumably so that repeated matching always makes progress
            if end == start:
                self.match_position = end + 1
            else:
                self.match_position = end
            self.matched_lineno = self.lineno

            # walk back from the start of the match to the preceding
            # newline (or start of text) to derive the column
            cp = mp - 1
            while (cp >= 0 and cp < self.textlength and
                    self.text[cp] != '\n'):
                cp -= 1
            self.matched_charpos = mp - cp

            # advance the running line counter past the consumed text
            self.lineno += self.text[mp:self.match_position].count('\n')
        return match

    def parse_until_text(self, *text):
        """consume text up to one of the given terminators.

        Each argument is a regular expression fragment.  Comments running
        to end of line, quoted string literals and brace-balanced regions
        are skipped over, so a ``}`` terminator seen while braces opened
        inside the consumed text are still open is not treated as the end.

        :return: a tuple ``(consumed_text, terminator)`` where
         ``terminator`` is the text that matched.
        :raises exceptions.SyntaxException: if no terminator is found.

        """
        startpos = self.match_position
        text_re = r'|'.join(text)
        brace_level = 0
        while True:
            # skip a "#" comment through the end of its line
            match = self.match(r'#.*\n')
            if match:
                continue
            # skip a complete single- or triple-quoted string literal
            match = self.match(r'(\"\"\"|\'\'\'|\"|\')((?<!\\)\\\1|.)*?\1',
                               re.S)
            if match:
                continue
            match = self.match(r'(%s)' % text_re)
            if match:
                if match.group(1) == '}' and brace_level > 0:
                    # closes a brace opened inside the consumed text
                    brace_level -= 1
                    continue
                return \
                    self.text[startpos:
                              self.match_position - len(match.group(1))],\
                    match.group(1)
            # consume plain text up to the next quote, comment or terminator
            match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
            if match:
                brace_level += match.group(1).count('{')
                brace_level -= match.group(1).count('}')
                continue
            raise exceptions.SyntaxException(
                "Expected: %s" %
                ','.join(text),
                **self.exception_kwargs)

    def append_node(self, nodecls, *args, **kwargs):
        """construct a ``nodecls`` node and attach it to the tree.

        ``source``, ``lineno`` and ``pos`` default to the position of the
        most recent match; ``filename`` is always set to the lexer's
        filename.  The node is appended to the innermost open tag, or to
        the template when no tag is open, and the tag / control line /
        ternary stacks are updated according to the node's type.

        :raises exceptions.SyntaxException: if a control line keyword is
         not a legal ternary for the currently open control keyword.

        """
        kwargs.setdefault('source', self.text)
        kwargs.setdefault('lineno', self.matched_lineno)
        kwargs.setdefault('pos', self.matched_charpos)
        kwargs['filename'] = self.filename
        node = nodecls(*args, **kwargs)
        if self.tag:
            self.tag[-1].nodes.append(node)
        else:
            self.template.nodes.append(node)
        # build a set of child nodes for the control line
        # (used for loop variable detection)
        # also build a set of child nodes on ternary control lines
        # (used for determining if a pass needs to be auto-inserted)
        if self.control_line:
            control_frame = self.control_line[-1]
            control_frame.nodes.append(node)
            if not (isinstance(node, parsetree.ControlLine) and
                    control_frame.is_ternary(node.keyword)):
                if self.ternary_stack and self.ternary_stack[-1]:
                    self.ternary_stack[-1][-1].nodes.append(node)
        if isinstance(node, parsetree.Tag):
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
        elif isinstance(node, parsetree.ControlLine):
            if node.isend:
                self.control_line.pop()
                self.ternary_stack.pop()
            elif node.is_primary:
                self.control_line.append(node)
                self.ternary_stack.append([])
            elif self.control_line and \
                    self.control_line[-1].is_ternary(node.keyword):
                self.ternary_stack[-1].append(node)
            elif self.control_line and \
                    not self.control_line[-1].is_ternary(node.keyword):
                raise exceptions.SyntaxException(
                    "Keyword '%s' not a legal ternary for keyword '%s'" %
                    (node.keyword, self.control_line[-1].keyword),
                    **self.exception_kwargs)

    # magic encoding comment, e.g. "# -*- coding: utf-8 -*-"
    _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')

    def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
        """given string/unicode or bytes/string, determine encoding
           from magic encoding comment, return body as unicode
           or raw if decode_raw=False

        :param text: template source, already-decoded text or raw bytes.
        :param decode_raw: whether a raw byte stream should be decoded.
        :param known_encoding: encoding used when no magic comment is
         present; ``'ascii'`` is used if this is empty as well.
        :param filename: filename reported in raised exceptions.
        :return: a tuple ``(encoding, text)``.
        :raises exceptions.CompileException: if a UTF-8 BOM conflicts with
         the magic encoding comment, or if decoding fails.

        """
        if isinstance(text, compat.text_type):
            # already text: only report which encoding applies
            m = self._coding_re.match(text)
            if m:
                encoding = m.group(1)
            else:
                encoding = known_encoding or 'ascii'
            return encoding, text

        if text.startswith(codecs.BOM_UTF8):
            text = text[len(codecs.BOM_UTF8):]
            parsed_encoding = 'utf-8'
            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
            if m is not None and m.group(1) != 'utf-8':
                raise exceptions.CompileException(
                    "Found utf-8 BOM in file, with conflicting "
                    "magic encoding comment of '%s'" % m.group(1),
                    text.decode('utf-8', 'ignore'),
                    0, 0, filename)
        else:
            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
            if m:
                parsed_encoding = m.group(1)
            else:
                parsed_encoding = known_encoding or 'ascii'

        if decode_raw:
            try:
                text = text.decode(parsed_encoding)
            except UnicodeDecodeError:
                raise exceptions.CompileException(
                    "Unicode decode operation of encoding '%s' failed" %
                    parsed_encoding,
                    text.decode('utf-8', 'ignore'),
                    0, 0, filename)

        return parsed_encoding, text

    def parse(self):
        """lex the whole template and return the resulting
        ``parsetree.TemplateNode``.

        Decodes the source, runs the preprocessors, then repeatedly tries
        each ``match_*`` method at the current position until the end of
        the text is reached.

        :raises exceptions.SyntaxException: if a tag or control keyword is
         still open when the text ends.
        :raises exceptions.CompileException: if nothing matches at the
         current position.

        """
        self.encoding, self.text = self.decode_raw_stream(
            self.text,
            not self.disable_unicode,
            self.encoding,
            self.filename)

        for preproc in self.preprocessor:
            self.text = preproc(self.text)

        # push the match marker past the
        # encoding comment.
        self.match_reg(self._coding_re)

        self.textlength = len(self.text)

        while True:
            if self.match_position > self.textlength:
                break

            if self.match_end():
                break
            if self.match_expression():
                continue
            if self.match_control_line():
                continue
            if self.match_comment():
                continue
            if self.match_tag_start():
                continue
            if self.match_tag_end():
                continue
            if self.match_python_block():
                continue
            if self.match_text():
                continue

            if self.match_position > self.textlength:
                break
            # the exception needs the source position like every other
            # raise in this class; a bare message is not a valid call
            raise exceptions.CompileException(
                "assertion failed",
                **self.exception_kwargs)

        if self.tag:
            raise exceptions.SyntaxException(
                "Unclosed tag: <%%%s>" %
                self.tag[-1].keyword,
                **self.exception_kwargs)
        if self.control_line:
            raise exceptions.SyntaxException(
                "Unterminated control keyword: '%s'" %
                self.control_line[-1].keyword,
                self.text,
                self.control_line[-1].lineno,
                self.control_line[-1].pos, self.filename)
        return self.template

    def match_tag_start(self):
        """match an opening ``<%keyword attr="value" ...>`` tag.

        Appends a ``parsetree.Tag`` node built from the keyword and its
        quoted attributes.  A self-closing tag (``/>``) is popped from the
        tag stack right away.  The body of a ``<%text>`` tag is consumed
        verbatim as a ``parsetree.Text`` node up to its closing tag.

        :return: true if a tag was matched, false otherwise.
        :raises exceptions.SyntaxException: if a ``<%text>`` tag has no
         closing tag.

        """
        match = self.match(r'''
            \<%     # opening tag

            ([\w\.\:]+)   # keyword

            ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = \
                                               #        sign, string expression

            \s*     # more whitespace

            (/)?>   # closing

            ''',

                           re.I | re.S | re.X)

        if not match:
            return False

        keyword, attr, isend = match.groups()
        self.keyword = keyword
        attributes = {}
        if attr:
            for att in re.findall(
                    r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
                key, val1, val2 = att
                # only one of the two quote styles captured a value
                text = val1 or val2
                text = text.replace('\r\n', '\n')
                attributes[key] = text
        self.append_node(parsetree.Tag, keyword, attributes)
        if isend:
            self.tag.pop()
        elif keyword == 'text':
            match = self.match(r'(.*?)(?=\</%text>)', re.S)
            if not match:
                raise exceptions.SyntaxException(
                    "Unclosed tag: <%%%s>" %
                    self.tag[-1].keyword,
                    **self.exception_kwargs)
            self.append_node(parsetree.Text, match.group(1))
            return self.match_tag_end()
        return True

    def match_tag_end(self):
        """match a closing ``</%keyword>`` tag and pop the tag stack.

        :return: true if a closing tag was matched, false otherwise.
        :raises exceptions.SyntaxException: if no tag is open, or if the
         keyword differs from that of the innermost open tag.

        """
        match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
        if not match:
            return False

        if not self.tag:
            raise exceptions.SyntaxException(
                "Closing tag without opening tag: </%%%s>" %
                match.group(1),
                **self.exception_kwargs)
        elif self.tag[-1].keyword != match.group(1):
            raise exceptions.SyntaxException(
                "Closing tag </%%%s> does not match tag: <%%%s>" %
                (match.group(1), self.tag[-1].keyword),
                **self.exception_kwargs)
        self.tag.pop()
        return True

    def match_end(self):
        """match the end of the text.

        :return: the matched string if it is non-empty, otherwise ``True``
         on a match; ``False`` when not at the end of the text.

        """
        match = self.match(r'\Z', re.S)
        if not match:
            return False
        return match.group() or True

    def match_text(self):
        """match plain template text up to the next construct.

        Appends a ``parsetree.Text`` node when the consumed text is
        non-empty.

        :return: true if the pattern matched, false otherwise.

        """
        match = self.match(r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
                                             # comment preceded by a
                                             # consumed newline and whitespace
                 |
                 (?=\${)      # an expression
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                              # - don't consume
                 |
                 (\\\r?\n)    # an escaped newline  - throw away
                 |
                 \Z           # end of string
                )""", re.X | re.S)

        if not match:
            return False

        text = match.group(1)
        if text:
            self.append_node(parsetree.Text, text)
        return True

    def match_python_block(self):
        """match a ``<% ... %>`` or ``<%! ... %>`` block.

        Appends a ``parsetree.Code`` node holding the whitespace-adjusted
        block body; its second argument is true for the ``<%!`` form.

        :return: true if a block was matched, false otherwise.

        """
        match = self.match(r"<%(!)?")
        if not match:
            return False

        # remember where the block opened; parse_until_text moves the
        # matched position forward
        line, pos = self.matched_lineno, self.matched_charpos
        text, _ = self.parse_until_text(r'%>')
        # the trailing newline helps
        # compiler.parse() not complain about indentation
        text = adjust_whitespace(text) + "\n"
        self.append_node(
            parsetree.Code,
            text,
            match.group(1) == '!', lineno=line, pos=pos)
        return True

    def match_expression(self):
        """match a ``${expression | escapes}`` substitution.

        Appends a ``parsetree.Expression`` node carrying the expression
        text and the stripped escapes text (empty when no ``|`` part is
        present).

        :return: true if an expression was matched, false otherwise.

        """
        match = self.match(r"\${")
        if not match:
            return False

        line, pos = self.matched_lineno, self.matched_charpos
        text, end = self.parse_until_text(r'\|', r'}')
        if end == '|':
            escapes, end = self.parse_until_text(r'}')
        else:
            escapes = ""
        text = text.replace('\r\n', '\n')
        self.append_node(
            parsetree.Expression,
            text, escapes.strip(),
            lineno=line, pos=pos)
        return True

    def match_control_line(self):
        """match a ``%`` control line or a ``##`` comment line.

        A ``%`` line appends a ``parsetree.ControlLine`` node; a ``##``
        line appends a ``parsetree.Comment`` node.

        :return: true if a line was matched, false otherwise.
        :raises exceptions.SyntaxException: if the control line is
         malformed, or an ``end`` keyword has no matching open keyword.

        """
        match = self.match(
            r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
            r"(?:\r?\n|\Z)", re.M)
        if not match:
            return False

        operator = match.group(1)
        text = match.group(2)
        if operator == '%':
            m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
            if not m2:
                raise exceptions.SyntaxException(
                    "Invalid control line: '%s'" %
                    text,
                    **self.exception_kwargs)
            isend, keyword = m2.group(1, 2)
            isend = (isend is not None)

            if isend:
                if not self.control_line:
                    raise exceptions.SyntaxException(
                        "No starting keyword '%s' for '%s'" %
                        (keyword, text),
                        **self.exception_kwargs)
                elif self.control_line[-1].keyword != keyword:
                    raise exceptions.SyntaxException(
                        "Keyword '%s' doesn't match keyword '%s'" %
                        (text, self.control_line[-1].keyword),
                        **self.exception_kwargs)
            self.append_node(parsetree.ControlLine, keyword, isend, text)
        else:
            self.append_node(parsetree.Comment, text)
        return True

    def match_comment(self):
        """matches the multiline version of a comment"""
        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
        if not match:
            return False

        self.append_node(parsetree.Comment, match.group(1))
        return True