Home | History | Annotate | Download | only in parse
      1 # Copyright 2014 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import imp
      6 import os.path
      7 import sys
      8 
      9 def _GetDirAbove(dirname):
     10   """Returns the directory "above" this file containing |dirname| (which must
     11   also be "above" this file)."""
     12   path = os.path.abspath(__file__)
     13   while True:
     14     path, tail = os.path.split(path)
     15     assert tail
     16     if tail == dirname:
     17       return path
     18 
# Make sure the "ply" package can be imported: probe for a module named "ply"
# and, if it cannot be found, append the "third_party" directory located next
# to the "mojo" directory above this file to the module search path before
# importing from ply below.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
from ply.lex import TOKEN
     24 
     25 from ..error import Error
     26 
     27 
class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    """Forwards |filename| and |message| positionally, and |lineno| by keyword,
    to Error.__init__."""
    Error.__init__(self, filename, message, lineno=lineno)
     33 
     34 
     35 # We have methods which look like they could be functions:
     36 # pylint: disable=R0201
     37 class Lexer(object):
     38 
     39   def __init__(self, filename):
     40     self.filename = filename
     41 
     42   ######################--   PRIVATE   --######################
     43 
     44   ##
     45   ## Internal auxiliary methods
     46   ##
     47   def _error(self, msg, token):
     48     raise LexError(self.filename, msg, token.lineno)
     49 
     50   ##
     51   ## Reserved keywords
     52   ##
     53   keywords = (
     54     'HANDLE',
     55 
     56     'IMPORT',
     57     'MODULE',
     58     'STRUCT',
     59     'UNION',
     60     'INTERFACE',
     61     'ENUM',
     62     'CONST',
     63     'TRUE',
     64     'FALSE',
     65     'DEFAULT',
     66     'ARRAY',
     67     'MAP',
     68     'ASSOCIATED'
     69   )
     70 
     71   keyword_map = {}
     72   for keyword in keywords:
     73     keyword_map[keyword.lower()] = keyword
     74 
     75   ##
     76   ## All the tokens recognized by the lexer
     77   ##
     78   tokens = keywords + (
     79     # Identifiers
     80     'NAME',
     81 
     82     # Constants
     83     'ORDINAL',
     84     'INT_CONST_DEC', 'INT_CONST_HEX',
     85     'FLOAT_CONST',
     86 
     87     # String literals
     88     'STRING_LITERAL',
     89 
     90     # Operators
     91     'MINUS',
     92     'PLUS',
     93     'AMP',
     94     'QSTN',
     95 
     96     # Assignment
     97     'EQUALS',
     98 
     99     # Request / response
    100     'RESPONSE',
    101 
    102     # Delimiters
    103     'LPAREN', 'RPAREN',         # ( )
    104     'LBRACKET', 'RBRACKET',     # [ ]
    105     'LBRACE', 'RBRACE',         # { }
    106     'LANGLE', 'RANGLE',         # < >
    107     'SEMI',                     # ;
    108     'COMMA', 'DOT'              # , .
    109   )
    110 
    111   ##
    112   ## Regexes for use in tokens
    113   ##
    114 
    115   # valid C identifiers (K&R2: A.2.3)
    116   identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'
    117 
    118   hex_prefix = '0[xX]'
    119   hex_digits = '[0-9a-fA-F]+'
    120 
    121   # integer constants (K&R2: A.2.5.1)
    122   decimal_constant = '0|([1-9][0-9]*)'
    123   hex_constant = hex_prefix+hex_digits
    124   # Don't allow octal constants (even invalid octal).
    125   octal_constant_disallowed = '0[0-9]+'
    126 
    127   # character constants (K&R2: A.2.5.2)
    128   # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    129   # directives with Windows paths as filenames (..\..\dir\file)
    130   # For the same reason, decimal_escape allows all digit sequences. We want to
    131   # parse all correct code, even if it means to sometimes parse incorrect
    132   # code.
    133   #
    134   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    135   decimal_escape = r"""(\d+)"""
    136   hex_escape = r"""(x[0-9a-fA-F]+)"""
    137   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    138 
    139   escape_sequence = \
    140       r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
    141 
    142   # string literals (K&R2: A.2.6)
    143   string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    144   string_literal = '"'+string_char+'*"'
    145   bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
    146 
    147   # floating constants (K&R2: A.2.5.3)
    148   exponent_part = r"""([eE][-+]?[0-9]+)"""
    149   fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    150   floating_constant = \
    151       '(((('+fractional_constant+')'+ \
    152       exponent_part+'?)|([0-9]+'+exponent_part+')))'
    153 
    154   # Ordinals
    155   ordinal = r'@[0-9]+'
    156   missing_ordinal_value = r'@'
    157   # Don't allow ordinal values in octal (even invalid octal, like 09) or
    158   # hexadecimal.
    159   octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'
    160 
    161   ##
    162   ## Rules for the normal state
    163   ##
    164   t_ignore = ' \t\r'
    165 
    166   # Newlines
    167   def t_NEWLINE(self, t):
    168     r'\n+'
    169     t.lexer.lineno += len(t.value)
    170 
    171   # Operators
    172   t_MINUS             = r'-'
    173   t_PLUS              = r'\+'
    174   t_AMP               = r'&'
    175   t_QSTN              = r'\?'
    176 
    177   # =
    178   t_EQUALS            = r'='
    179 
    180   # =>
    181   t_RESPONSE          = r'=>'
    182 
    183   # Delimiters
    184   t_LPAREN            = r'\('
    185   t_RPAREN            = r'\)'
    186   t_LBRACKET          = r'\['
    187   t_RBRACKET          = r'\]'
    188   t_LBRACE            = r'\{'
    189   t_RBRACE            = r'\}'
    190   t_LANGLE            = r'<'
    191   t_RANGLE            = r'>'
    192   t_COMMA             = r','
    193   t_DOT               = r'\.'
    194   t_SEMI              = r';'
    195 
    196   t_STRING_LITERAL    = string_literal
    197 
    198   # The following floating and integer constants are defined as
    199   # functions to impose a strict order (otherwise, decimal
    200   # is placed before the others because its regex is longer,
    201   # and this is bad)
    202   #
    203   @TOKEN(floating_constant)
    204   def t_FLOAT_CONST(self, t):
    205     return t
    206 
    207   @TOKEN(hex_constant)
    208   def t_INT_CONST_HEX(self, t):
    209     return t
    210 
    211   @TOKEN(octal_constant_disallowed)
    212   def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    213     msg = "Octal values not allowed"
    214     self._error(msg, t)
    215 
    216   @TOKEN(decimal_constant)
    217   def t_INT_CONST_DEC(self, t):
    218     return t
    219 
    220   # unmatched string literals are caught by the preprocessor
    221 
    222   @TOKEN(bad_string_literal)
    223   def t_BAD_STRING_LITERAL(self, t):
    224     msg = "String contains invalid escape code"
    225     self._error(msg, t)
    226 
    227   # Handle ordinal-related tokens in the right order:
    228   @TOKEN(octal_or_hex_ordinal_disallowed)
    229   def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    230     msg = "Octal and hexadecimal ordinal values not allowed"
    231     self._error(msg, t)
    232 
    233   @TOKEN(ordinal)
    234   def t_ORDINAL(self, t):
    235     return t
    236 
    237   @TOKEN(missing_ordinal_value)
    238   def t_BAD_ORDINAL(self, t):
    239     msg = "Missing ordinal value"
    240     self._error(msg, t)
    241 
    242   @TOKEN(identifier)
    243   def t_NAME(self, t):
    244     t.type = self.keyword_map.get(t.value, "NAME")
    245     return t
    246 
    247   # Ignore C and C++ style comments
    248   def t_COMMENT(self, t):
    249     r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    250     t.lexer.lineno += t.value.count("\n")
    251 
    252   def t_error(self, t):
    253     msg = "Illegal character %s" % repr(t.value[0])
    254     self._error(msg, t)
    255