Home | History | Annotate | Download | only in mako
      1 # mako/lexer.py
      2 # Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
      3 #
      4 # This module is part of Mako and is released under
      5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
      6 
      7 """provides the Lexer class for parsing template strings into parse trees."""
      8 
      9 import re
     10 import codecs
     11 from mako import parsetree, exceptions, compat
     12 from mako.pygen import adjust_whitespace
     13 
# module-wide cache of compiled regular expression objects, keyed by
# the (pattern, flags) tuple handed to Lexer.match().
_regexp_cache = {}
     15 
     16 class Lexer(object):
     17     def __init__(self, text, filename=None,
     18                         disable_unicode=False,
     19                         input_encoding=None, preprocessor=None):
     20         self.text = text
     21         self.filename = filename
     22         self.template = parsetree.TemplateNode(self.filename)
     23         self.matched_lineno = 1
     24         self.matched_charpos = 0
     25         self.lineno = 1
     26         self.match_position = 0
     27         self.tag = []
     28         self.control_line = []
     29         self.ternary_stack = []
     30         self.disable_unicode = disable_unicode
     31         self.encoding = input_encoding
     32 
     33         if compat.py3k and disable_unicode:
     34             raise exceptions.UnsupportedError(
     35                                     "Mako for Python 3 does not "
     36                                     "support disabling Unicode")
     37 
     38         if preprocessor is None:
     39             self.preprocessor = []
     40         elif not hasattr(preprocessor, '__iter__'):
     41             self.preprocessor = [preprocessor]
     42         else:
     43             self.preprocessor = preprocessor
     44 
     45     @property
     46     def exception_kwargs(self):
     47         return {'source': self.text,
     48                 'lineno': self.matched_lineno,
     49                 'pos': self.matched_charpos,
     50                 'filename': self.filename}
     51 
     52     def match(self, regexp, flags=None):
     53         """compile the given regexp, cache the reg, and call match_reg()."""
     54 
     55         try:
     56             reg = _regexp_cache[(regexp, flags)]
     57         except KeyError:
     58             if flags:
     59                 reg = re.compile(regexp, flags)
     60             else:
     61                 reg = re.compile(regexp)
     62             _regexp_cache[(regexp, flags)] = reg
     63 
     64         return self.match_reg(reg)
     65 
     66     def match_reg(self, reg):
     67         """match the given regular expression object to the current text
     68         position.
     69 
     70         if a match occurs, update the current text and line position.
     71 
     72         """
     73 
     74         mp = self.match_position
     75 
     76         match = reg.match(self.text, self.match_position)
     77         if match:
     78             (start, end) = match.span()
     79             if end == start:
     80                 self.match_position = end + 1
     81             else:
     82                 self.match_position = end
     83             self.matched_lineno = self.lineno
     84             lines = re.findall(r"\n", self.text[mp:self.match_position])
     85             cp = mp - 1
     86             while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'):
     87                 cp -= 1
     88             self.matched_charpos = mp - cp
     89             self.lineno += len(lines)
     90             #print "MATCHED:", match.group(0), "LINE START:",
     91             # self.matched_lineno, "LINE END:", self.lineno
     92         #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \
     93         #          (match and "TRUE" or "FALSE")
     94         return match
     95 
    def parse_until_text(self, *text):
        """consume source from the current position until one of the
        given terminators is matched, and return a tuple of
        (text consumed before the terminator, terminator matched).

        each element of ``text`` is used as a regular expression
        fragment; the fragments are joined with ``|``.  ``#`` comments
        running to end of line and quoted strings are skipped whole,
        so terminators inside them are not seen.  a ``}`` terminator
        is ignored while unbalanced ``{`` have been counted.

        raises SyntaxException when nothing can be matched at the
        current position.

        """
        startpos = self.match_position
        text_re = r'|'.join(text)
        # count of "{" seen so far that have not been closed
        brace_level = 0
        while True:
            # skip a "#" comment through its newline
            match = self.match(r'#.*\n')
            if match:
                continue
            # skip a complete single-, double- or triple-quoted string
            match = self.match(r'(\"\"\"|\'\'\'|\"|\')((?<!\\)\\\1|.)*?\1',
                               re.S)
            if match:
                continue
            # one of the requested terminators at the current position
            match = self.match(r'(%s)' % text_re)
            if match:
                # a "}" that closes a nested "{" is not the terminator
                if match.group(1) == '}' and brace_level > 0:
                    brace_level -= 1
                    continue
                # everything since startpos, minus the terminator itself
                return \
                    self.text[startpos:
                              self.match_position - len(match.group(1))],\
                    match.group(1)
            # otherwise consume up to the next quote, "#" or terminator,
            # adjusting the brace count for what was consumed
            match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
            if match:
                brace_level += match.group(1).count('{')
                brace_level -= match.group(1).count('}')
                continue
            raise exceptions.SyntaxException(
                        "Expected: %s" %
                        ','.join(text),
                        **self.exception_kwargs)
    126 
    def append_node(self, nodecls, *args, **kwargs):
        """construct a node of type nodecls and attach it to the tree.

        source, lineno and pos default to the current match state
        unless supplied by the caller; filename is always taken from
        the lexer.  the new node is appended to the innermost open
        tag (or to the template when no tag is open), and the tag /
        control line / ternary stacks are updated according to the
        kind of node created.

        raises SyntaxException when a non-primary control line is
        not a legal ternary of the enclosing control line.

        """
        kwargs.setdefault('source', self.text)
        kwargs.setdefault('lineno', self.matched_lineno)
        kwargs.setdefault('pos', self.matched_charpos)
        kwargs['filename'] = self.filename
        node = nodecls(*args, **kwargs)

        parent = self.tag[-1] if self.tag else self.template
        parent.nodes.append(node)

        # build a set of child nodes for the control line
        # (used for loop variable detection)
        # also build a set of child nodes on ternary control lines
        # (used for determining if a pass needs to be auto-inserted)
        if self.control_line:
            frame = self.control_line[-1]
            frame.nodes.append(node)
            is_ternary_line = isinstance(node, parsetree.ControlLine) and \
                frame.is_ternary(node.keyword)
            if not is_ternary_line and self.ternary_stack \
                    and self.ternary_stack[-1]:
                self.ternary_stack[-1][-1].nodes.append(node)

        if isinstance(node, parsetree.Tag):
            # a new tag becomes the innermost open tag
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
            return

        if not isinstance(node, parsetree.ControlLine):
            return

        if node.isend:
            self.control_line.pop()
            self.ternary_stack.pop()
        elif node.is_primary:
            self.control_line.append(node)
            self.ternary_stack.append([])
        elif self.control_line and \
                self.control_line[-1].is_ternary(node.keyword):
            self.ternary_stack[-1].append(node)
        elif self.control_line and \
                not self.control_line[-1].is_ternary(node.keyword):
            raise exceptions.SyntaxException(
                    "Keyword '%s' not a legal ternary for keyword '%s'" %
                    (node.keyword, self.control_line[-1].keyword),
                    **self.exception_kwargs)
    168 
    # matches a "magic" encoding comment line such as
    # "# -*- coding: utf-8 -*-", capturing the encoding name in
    # group 1; the line's trailing newline is part of the match.
    _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
    170 
    def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
        """given string/unicode or bytes/string, determine encoding
           from magic encoding comment, return body as unicode
           or raw if decode_raw=False

        returns a tuple of (encoding, text).  the encoding is taken
        from the magic comment when present, otherwise from
        known_encoding, otherwise 'ascii'.  a leading utf-8 BOM is
        stripped and forces the encoding to 'utf-8'.

        raises CompileException when a utf-8 BOM is combined with a
        magic comment naming a different encoding, or when decoding
        with the detected encoding fails.

        """
        if isinstance(text, compat.text_type):
            # already text: only report the encoding, nothing to decode
            m = self._coding_re.match(text)
            if m:
                encoding = m.group(1)
            else:
                encoding = known_encoding or 'ascii'
            return encoding, text

        if text.startswith(codecs.BOM_UTF8):
            text = text[len(codecs.BOM_UTF8):]
            parsed_encoding = 'utf-8'
            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
            if m is not None:
                # compare codec identities rather than spellings, so
                # "UTF-8" or "utf8" are not reported as a conflict
                try:
                    agrees = codecs.lookup(m.group(1)).name == 'utf-8'
                except LookupError:
                    agrees = False
                if not agrees:
                    raise exceptions.CompileException(
                                "Found utf-8 BOM in file, with conflicting "
                                "magic encoding comment of '%s'" % m.group(1),
                                text.decode('utf-8', 'ignore'),
                                0, 0, filename)
        else:
            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
            if m:
                parsed_encoding = m.group(1)
            else:
                parsed_encoding = known_encoding or 'ascii'

        if decode_raw:
            try:
                text = text.decode(parsed_encoding)
            except UnicodeDecodeError:
                raise exceptions.CompileException(
                        "Unicode decode operation of encoding '%s' failed" %
                        parsed_encoding,
                        text.decode('utf-8', 'ignore'),
                        0, 0, filename)

        return parsed_encoding, text
    210 
    def parse(self):
        """lex the template source and return the populated template
        node.

        the source is decoded via decode_raw_stream(), run through
        each preprocessor, and then consumed by trying each match_*
        method in a fixed order until the end of the text is reached.

        raises SyntaxException when a tag or control line is still
        open at the end of the source.

        """
        self.encoding, self.text = self.decode_raw_stream(
                                        self.text,
                                        not self.disable_unicode,
                                        self.encoding,
                                        self.filename)

        for preproc in self.preprocessor:
            self.text = preproc(self.text)

        # set before the first match_reg() call so that position
        # bookkeeping never sees the attribute missing
        self.textlength = len(self.text)

        # push the match marker past the
        # encoding comment.
        self.match_reg(self._coding_re)

        # tried in this order at every position; first hit wins
        matchers = (
            self.match_expression,
            self.match_control_line,
            self.match_comment,
            self.match_tag_start,
            self.match_tag_end,
            self.match_python_block,
            self.match_text,
        )

        while self.match_position <= self.textlength:
            if self.match_end():
                break
            if any(matcher() for matcher in matchers):
                continue

            if self.match_position > self.textlength:
                break
            # nothing matched: report it with the same location
            # arguments every other raise site in this class passes
            raise exceptions.CompileException(
                                    "assertion failed",
                                    self.text,
                                    self.matched_lineno,
                                    self.matched_charpos,
                                    self.filename)

        if self.tag:
            raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
                                                self.tag[-1].keyword,
                                                **self.exception_kwargs)
        if self.control_line:
            raise exceptions.SyntaxException(
                                    "Unterminated control keyword: '%s'" %
                                    self.control_line[-1].keyword,
                                    self.text,
                                    self.control_line[-1].lineno,
                                    self.control_line[-1].pos, self.filename)
        return self.template
    263 
    264     def match_tag_start(self):
    265         match = self.match(r'''
    266             \<%     # opening tag
    267 
    268             ([\w\.\:]+)   # keyword
    269 
    270             ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = \
    271                                                #        sign, string expression
    272 
    273             \s*     # more whitespace
    274 
    275             (/)?>   # closing
    276 
    277             ''',
    278 
    279             re.I | re.S | re.X)
    280 
    281         if match:
    282             keyword, attr, isend = match.groups()
    283             self.keyword = keyword
    284             attributes = {}
    285             if attr:
    286                 for att in re.findall(
    287                            r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
    288                     key, val1, val2 = att
    289                     text = val1 or val2
    290                     text = text.replace('\r\n', '\n')
    291                     attributes[key] = text
    292             self.append_node(parsetree.Tag, keyword, attributes)
    293             if isend:
    294                 self.tag.pop()
    295             else:
    296                 if keyword == 'text':
    297                     match = self.match(r'(.*?)(?=\</%text>)',  re.S)
    298                     if not match:
    299                         raise exceptions.SyntaxException(
    300                                             "Unclosed tag: <%%%s>" %
    301                                             self.tag[-1].keyword,
    302                                             **self.exception_kwargs)
    303                     self.append_node(parsetree.Text, match.group(1))
    304                     return self.match_tag_end()
    305             return True
    306         else:
    307             return False
    308 
    309     def match_tag_end(self):
    310         match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
    311         if match:
    312             if not len(self.tag):
    313                 raise exceptions.SyntaxException(
    314                                 "Closing tag without opening tag: </%%%s>" %
    315                                 match.group(1),
    316                                 **self.exception_kwargs)
    317             elif self.tag[-1].keyword != match.group(1):
    318                 raise exceptions.SyntaxException(
    319                             "Closing tag </%%%s> does not match tag: <%%%s>" %
    320                             (match.group(1), self.tag[-1].keyword),
    321                             **self.exception_kwargs)
    322             self.tag.pop()
    323             return True
    324         else:
    325             return False
    326 
    327     def match_end(self):
    328         match = self.match(r'\Z', re.S)
    329         if match:
    330             string = match.group()
    331             if string:
    332                 return string
    333             else:
    334                 return True
    335         else:
    336             return False
    337 
    338     def match_text(self):
    339         match = self.match(r"""
    340                 (.*?)         # anything, followed by:
    341                 (
    342                  (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
    343                                              # comment preceded by a
    344                                              # consumed newline and whitespace
    345                  |
    346                  (?=\${)      # an expression
    347                  |
    348                  (?=</?[%&])  # a substitution or block or call start or end
    349                               # - don't consume
    350                  |
    351                  (\\\r?\n)    # an escaped newline  - throw away
    352                  |
    353                  \Z           # end of string
    354                 )""", re.X | re.S)
    355 
    356         if match:
    357             text = match.group(1)
    358             if text:
    359                 self.append_node(parsetree.Text, text)
    360             return True
    361         else:
    362             return False
    363 
    364     def match_python_block(self):
    365         match = self.match(r"<%(!)?")
    366         if match:
    367             line, pos = self.matched_lineno, self.matched_charpos
    368             text, end = self.parse_until_text(r'%>')
    369             # the trailing newline helps
    370             # compiler.parse() not complain about indentation
    371             text = adjust_whitespace(text) + "\n"
    372             self.append_node(
    373                         parsetree.Code,
    374                         text,
    375                         match.group(1) == '!', lineno=line, pos=pos)
    376             return True
    377         else:
    378             return False
    379 
    380     def match_expression(self):
    381         match = self.match(r"\${")
    382         if match:
    383             line, pos = self.matched_lineno, self.matched_charpos
    384             text, end = self.parse_until_text(r'\|', r'}')
    385             if end == '|':
    386                 escapes, end = self.parse_until_text(r'}')
    387             else:
    388                 escapes = ""
    389             text = text.replace('\r\n', '\n')
    390             self.append_node(
    391                             parsetree.Expression,
    392                             text, escapes.strip(),
    393                             lineno=line, pos=pos)
    394             return True
    395         else:
    396             return False
    397 
    398     def match_control_line(self):
    399         match = self.match(
    400                     r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
    401                     r"(?:\r?\n|\Z)", re.M)
    402         if match:
    403             operator = match.group(1)
    404             text = match.group(2)
    405             if operator == '%':
    406                 m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
    407                 if not m2:
    408                     raise exceptions.SyntaxException(
    409                                 "Invalid control line: '%s'" %
    410                                 text,
    411                                 **self.exception_kwargs)
    412                 isend, keyword = m2.group(1, 2)
    413                 isend = (isend is not None)
    414 
    415                 if isend:
    416                     if not len(self.control_line):
    417                         raise exceptions.SyntaxException(
    418                                 "No starting keyword '%s' for '%s'" %
    419                                 (keyword, text),
    420                                 **self.exception_kwargs)
    421                     elif self.control_line[-1].keyword != keyword:
    422                         raise exceptions.SyntaxException(
    423                                 "Keyword '%s' doesn't match keyword '%s'" %
    424                                 (text, self.control_line[-1].keyword),
    425                                 **self.exception_kwargs)
    426                 self.append_node(parsetree.ControlLine, keyword, isend, text)
    427             else:
    428                 self.append_node(parsetree.Comment, text)
    429             return True
    430         else:
    431             return False
    432 
    433     def match_comment(self):
    434         """matches the multiline version of a comment"""
    435         match = self.match(r"<%doc>(.*?)</%doc>", re.S)
    436         if match:
    437             self.append_node(parsetree.Comment, match.group(1))
    438             return True
    439         else:
    440             return False
    441 
    442