#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

WebIDL and the WebIDL regular expressions can be found at:
   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import os.path
import sys

#
# Try to load the ply module; if it is not found, assume it lives in the
# third_party directory.
#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
      'float',
      'integer',
      'string',

    # Symbol and keyword types
      'COMMENT',
      'identifier',

    # MultiChar operators
      'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type.  All tokens matching
  # KEYWORD_OR_SYMBOL are matched against the keywords dictionary to
  # determine if the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'ByteString' : 'BYTESTRING',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum'  : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial'  : 'PARTIAL',
    'readonly' : 'READONLY',
    'RegExp' : 'REGEXP',
    'sequence' : 'SEQUENCE',
    'serializer' : 'SERIALIZER',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
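  # For example, the t_ELLIPSIS rule below matches the literal '...' and
  # emits a token whose type is 'ELLIPSIS' and whose value is '...'.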
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Regex needs to be in the docstring
  # pylint: disable=C0301
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t
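  # For example, '-1.5e3' and '.25' match t_float, while '42', '0x1F' and
  # '0755' match t_integer.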


  # A line ending ('\n'); we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings.  Strings are used exclusively
  # for attributes and enums, not as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
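  # For example, 'interface' lexes as an INTERFACE keyword token, while
  # '_interface' lexes as an 'identifier' token whose value is 'interface'.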

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for the previous lines are unused.  We
    # still fill the array, however, to make sure the line count is correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

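  # Combined, ErrorMessage() below produces text of roughly this shape:
  #   <filename>(<line>) : <msg>
  #   <offending source line>
  #       ^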
  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

  #
  # Tokenizer
  #
  # The token function returns the next token provided by IDLLexer for
  # matching against the leaf patterns.
  #
  def token(self):
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok


  def GetTokens(self):
    # Return all remaining tokens from the current input as a list.
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    # Prime the lexer with new input; tokens are then retrieved via token()
    # or GetTokens().
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')

  def KnownTokens(self):
    return self.tokens

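  # Lazily construct the underlying PLY lexer from this object's t_* rules on
  # first use, then return the cached instance.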
  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    self.index = [0]
    self._lex_errors = 0
    self.lines = []
    self.filename = None
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None
    self.last = None

# If run by itself, attempt to build the lexer
if __name__ == '__main__':
  lexer_object = IDLLexer()
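  # A minimal usage sketch (the sample IDL string below is illustrative only):
  # tokenize a small input and print the type and value of each token.
  sample = 'interface Foo { attribute long bar; };'
  lexer_object.Tokenize(sample, filename='<sample>')
  for tok in lexer_object.GetTokens():
    print('%s %s' % (tok.type, tok.value))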