#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and WebIDL regular expressions can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

import os.path
import re
import sys

#
# Try to load the ply module; if it is not installed, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
      'COMMENT',
      'DESCRIBE',
      'ENUM',
      'LABEL',
      'SYMBOL',
      'INLINE',
      'INTERFACE',
      'STRUCT',
      'TYPEDEF',
      'OR',

    # Extra WebIDL keywords
      'CALLBACK',
      'DICTIONARY',
      'OPTIONAL',
      'STATIC',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT',
      'OCT',
      'INT',
      'HEX',
      'STRING',

    # Operators
      'LSHIFT',
      'RSHIFT'
  ]

  # 'keywords' is a map from string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum'  : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',

    'or' : 'OR',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
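  #
  # As an illustration of this convention (hypothetical; not part of the
  # PPAPI grammar), a boolean rule could be written either as a plain
  # pattern:
  #
  #   t_BOOLEAN = r'true|false'
  #
  # or as a method, which also lets the token value be rewritten before it
  # is emitted:
  #
  #   def t_BOOLEAN(self, t):
  #     r'true|false'
  #     t.value = (t.value == 'true')
  #     return t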

  # 't_ignore' is a special value naming characters to skip between tokens.
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
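
  # For example, the input 'dictionary' produces a DICTIONARY token, while
  # '_dictionary' produces a SYMBOL token whose value is 'dictionary'.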

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any line except the last one,
    # so the values recorded for the earlier lines are unused.  We still fill
    # the array, however, to keep the line count correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file:  return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    # Start with line 1, not zero.
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
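
# Example of driving IDLLexer by hand (a sketch; the file name and IDL text
# below are made up for illustration):
#
#   lexer = IDLLexer()
#   lexer.SetData('example.idl', 'interface Foo { };')
#   while True:
#     tok = lexer.lexobj.token()
#     if tok is None: break
#     print('%s %r' % (tok.type, tok.value))
#
# This should yield INTERFACE 'interface', SYMBOL 'Foo', and then the
# literal tokens '{', '}' and ';'.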


#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while True:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist
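
# For instance (a sketch; 'graphics_2d.idl' is an arbitrary example file name):
#
#   tokens = FilesToTokens(['graphics_2d.idl'])
#   for tok in tokens:
#     print('%s %r' % (tok.type, tok.value))   # each tok is a PLY LexToken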


#
# TokensFromText
#
# From a block of text, generate a list of token values.
#
def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while True:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while True:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist
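
# For example, TextToTokens('enum { A = 1 };') should yield the token values
# ['enum', '{', 'A', '=', '1', '}', ';'].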


#
# TestSame
#
# From a set of token values, generate a new source text by joining them with
# newlines.  The new source is then tokenized and compared against the old
# set.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We use newlines instead of spaces
  # since the '//' and #inline regexes are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    sys.stdout.write('Size mismatch original %d vs %d\n' % (count1, count2))
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      sys.stdout.write('%d >>%s<< >>%s<<\n' % (i, values1[i], values2[i]))

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  sys.stdout.write('****************\n%s\n%s***************\n' % (src1, src2))
  sys.stdout.write('Same: Failed\n')
  return -1
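
# A quick round trip such as TestSame(TextToTokens('struct Point { int x; };'))
# should print 'Same: Pass' and return 0.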


#
# TestExpect
#
# From a list of tokens, treated as consecutive pairs, verify that the value
# of the first token in each pair names the type of the second.  For example
# the source text:
#   INT 123 FLOAT 1.1
# generates a passing test, since the first token is the SYMBOL 'INT', the
# second is the INT 123, the third is the SYMBOL 'FLOAT' and the fourth is
# the FLOAT 1.1, etc...
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    expect_type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if expect_type != token.type:
      sys.stderr.write('Mismatch:  Expected %s, but got %s = %s.\n' %
                       (expect_type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1
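
# For instance (a sketch; 'expectations.idl' is a hypothetical file whose
# contents might be just 'INT 123 FLOAT 1.1'):
#
#   tokens = FilesToTokens(['expectations.idl'])
#   TestExpect(tokens)   # prints 'Expect: Pass' and returns 0 on success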


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
  return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))