Home | History | Annotate | Download | only in parse
      1 # Copyright 2014 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import imp
      6 import os.path
      7 import sys
      8 
      9 # Disable lint check for finding modules:
     10 # pylint: disable=F0401
     11 
     12 def _GetDirAbove(dirname):
     13   """Returns the directory "above" this file containing |dirname| (which must
     14   also be "above" this file)."""
     15   path = os.path.abspath(__file__)
     16   while True:
     17     path, tail = os.path.split(path)
     18     assert tail
     19     if tail == dirname:
     20       return path
     21 
     22 try:
     23   imp.find_module("ply")
     24 except ImportError:
     25   sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
     26 from ply.lex import TOKEN
     27 
     28 from ..error import Error
     29 
     30 
     31 # Disable lint check for exceptions deriving from Exception:
     32 # pylint: disable=W0710
     33 class LexError(Error):
     34   """Class for errors from the lexer."""
     35 
     36   def __init__(self, filename, message, lineno):
     37     Error.__init__(self, filename, message, lineno=lineno)
     38 
     39 
     40 # We have methods which look like they could be functions:
     41 # pylint: disable=R0201
     42 class Lexer(object):
     43 
     44   def __init__(self, filename):
     45     self.filename = filename
     46 
     47   ######################--   PRIVATE   --######################
     48 
     49   ##
     50   ## Internal auxiliary methods
     51   ##
     52   def _error(self, msg, token):
     53     raise LexError(self.filename, msg, token.lineno)
     54 
     55   ##
     56   ## Reserved keywords
     57   ##
     58   keywords = (
     59     'HANDLE',
     60 
     61     'IMPORT',
     62     'MODULE',
     63     'STRUCT',
     64     'INTERFACE',
     65     'ENUM',
     66     'CONST',
     67     'TRUE',
     68     'FALSE',
     69     'DEFAULT',
     70   )
     71 
     72   keyword_map = {}
     73   for keyword in keywords:
     74     keyword_map[keyword.lower()] = keyword
     75 
     76   ##
     77   ## All the tokens recognized by the lexer
     78   ##
     79   tokens = keywords + (
     80     # Identifiers
     81     'NAME',
     82 
     83     # Constants
     84     'ORDINAL',
     85     'INT_CONST_DEC', 'INT_CONST_HEX',
     86     'FLOAT_CONST',
     87     'CHAR_CONST',
     88 
     89     # String literals
     90     'STRING_LITERAL',
     91 
     92     # Operators
     93     'MINUS',
     94     'PLUS',
     95     'AMP',
     96 
     97     # Assignment
     98     'EQUALS',
     99 
    100     # Request / response
    101     'RESPONSE',
    102 
    103     # Delimiters
    104     'LPAREN', 'RPAREN',         # ( )
    105     'LBRACKET', 'RBRACKET',     # [ ]
    106     'LBRACE', 'RBRACE',         # { }
    107     'LANGLE', 'RANGLE',         # < >
    108     'SEMI',                     # ;
    109     'COMMA', 'DOT'              # , .
    110   )
    111 
    112   ##
    113   ## Regexes for use in tokens
    114   ##
    115 
    116   # valid C identifiers (K&R2: A.2.3)
    117   identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'
    118 
    119   hex_prefix = '0[xX]'
    120   hex_digits = '[0-9a-fA-F]+'
    121 
    122   # integer constants (K&R2: A.2.5.1)
    123   decimal_constant = '0|([1-9][0-9]*)'
    124   hex_constant = hex_prefix+hex_digits
    125   # Don't allow octal constants (even invalid octal).
    126   octal_constant_disallowed = '0[0-9]+'
    127 
    128   # character constants (K&R2: A.2.5.2)
    129   # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    130   # directives with Windows paths as filenames (..\..\dir\file)
    131   # For the same reason, decimal_escape allows all digit sequences. We want to
    132   # parse all correct code, even if it means to sometimes parse incorrect
    133   # code.
    134   #
    135   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    136   decimal_escape = r"""(\d+)"""
    137   hex_escape = r"""(x[0-9a-fA-F]+)"""
    138   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    139 
    140   escape_sequence = \
    141       r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
    142   cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    143   char_const = "'"+cconst_char+"'"
    144   unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    145   bad_char_const = \
    146       r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
    147       bad_escape+r"""[^'\n]*')"""
    148 
    149   # string literals (K&R2: A.2.6)
    150   string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    151   string_literal = '"'+string_char+'*"'
    152   bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
    153 
    154   # floating constants (K&R2: A.2.5.3)
    155   exponent_part = r"""([eE][-+]?[0-9]+)"""
    156   fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    157   floating_constant = \
    158       '(((('+fractional_constant+')'+ \
    159       exponent_part+'?)|([0-9]+'+exponent_part+')))'
    160 
    161   # Ordinals
    162   ordinal = r'@[0-9]+'
    163   missing_ordinal_value = r'@'
    164   # Don't allow ordinal values in octal (even invalid octal, like 09) or
    165   # hexadecimal.
    166   octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'
    167 
    168   ##
    169   ## Rules for the normal state
    170   ##
    171   t_ignore = ' \t\r'
    172 
    173   # Newlines
    174   def t_NEWLINE(self, t):
    175     r'\n+'
    176     t.lexer.lineno += len(t.value)
    177 
    178   # Operators
    179   t_MINUS             = r'-'
    180   t_PLUS              = r'\+'
    181   t_AMP               = r'&'
    182 
    183   # =
    184   t_EQUALS            = r'='
    185 
    186   # =>
    187   t_RESPONSE          = r'=>'
    188 
    189   # Delimiters
    190   t_LPAREN            = r'\('
    191   t_RPAREN            = r'\)'
    192   t_LBRACKET          = r'\['
    193   t_RBRACKET          = r'\]'
    194   t_LBRACE            = r'\{'
    195   t_RBRACE            = r'\}'
    196   t_LANGLE            = r'<'
    197   t_RANGLE            = r'>'
    198   t_COMMA             = r','
    199   t_DOT               = r'\.'
    200   t_SEMI              = r';'
    201 
    202   t_STRING_LITERAL    = string_literal
    203 
    204   # The following floating and integer constants are defined as
    205   # functions to impose a strict order (otherwise, decimal
    206   # is placed before the others because its regex is longer,
    207   # and this is bad)
    208   #
    209   @TOKEN(floating_constant)
    210   def t_FLOAT_CONST(self, t):
    211     return t
    212 
    213   @TOKEN(hex_constant)
    214   def t_INT_CONST_HEX(self, t):
    215     return t
    216 
    217   @TOKEN(octal_constant_disallowed)
    218   def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    219     msg = "Octal values not allowed"
    220     self._error(msg, t)
    221 
    222   @TOKEN(decimal_constant)
    223   def t_INT_CONST_DEC(self, t):
    224     return t
    225 
    226   # Must come before bad_char_const, to prevent it from
    227   # catching valid char constants as invalid
    228   #
    229   @TOKEN(char_const)
    230   def t_CHAR_CONST(self, t):
    231     return t
    232 
    233   @TOKEN(unmatched_quote)
    234   def t_UNMATCHED_QUOTE(self, t):
    235     msg = "Unmatched '"
    236     self._error(msg, t)
    237 
    238   @TOKEN(bad_char_const)
    239   def t_BAD_CHAR_CONST(self, t):
    240     msg = "Invalid char constant %s" % t.value
    241     self._error(msg, t)
    242 
    243   # unmatched string literals are caught by the preprocessor
    244 
    245   @TOKEN(bad_string_literal)
    246   def t_BAD_STRING_LITERAL(self, t):
    247     msg = "String contains invalid escape code"
    248     self._error(msg, t)
    249 
    250   # Handle ordinal-related tokens in the right order:
    251   @TOKEN(octal_or_hex_ordinal_disallowed)
    252   def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    253     msg = "Octal and hexadecimal ordinal values not allowed"
    254     self._error(msg, t)
    255 
    256   @TOKEN(ordinal)
    257   def t_ORDINAL(self, t):
    258     return t
    259 
    260   @TOKEN(missing_ordinal_value)
    261   def t_BAD_ORDINAL(self, t):
    262     msg = "Missing ordinal value"
    263     self._error(msg, t)
    264 
    265   @TOKEN(identifier)
    266   def t_NAME(self, t):
    267     t.type = self.keyword_map.get(t.value, "NAME")
    268     return t
    269 
    270   # Ignore C and C++ style comments
    271   def t_COMMENT(self, t):
    272     r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    273     t.lexer.lineno += t.value.count("\n")
    274 
    275   def t_error(self, t):
    276     msg = "Illegal character %s" % repr(t.value[0])
    277     self._error(msg, t)
    278