#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and the WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

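#
# As a rough illustration (not a fragment taken from the real PPAPI IDL
# files), input such as:
#
#   /* A comment */
#   interface Foo { };
#
# is lexed into a COMMENT token, the INTERFACE keyword, a SYMBOL token with
# the value 'Foo', and the literal tokens '{', '}' and ';' defined below.
#
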
import os.path
import re
import sys

#
# Try to load the ply module; if it is not found, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
      'COMMENT',
      'DESCRIBE',
      'ENUM',
      'LABEL',
      'SYMBOL',
      'INLINE',
      'INTERFACE',
      'READONLY',
      'STRUCT',
      'TYPEDEF',

    # Extra WebIDL keywords
      'CALLBACK',
      'DICTIONARY',
      'OPTIONAL',
      'STATIC',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT',
      'OCT',
      'INT',
      'HEX',
      'STRING',

    # Operators
      'LSHIFT',
      'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' contains the characters to ignore between tokens.
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'
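
  # A few illustrative literals (not drawn from the WebIDL spec text): the
  # FLOAT pattern covers values such as '1.5', '-.5' and '2e10'; INT covers
  # '123' and '456u'; OCT covers C-style octal values such as '017'; and HEX
  # covers values such as '0x1F'.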

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings.  Strings are used exclusively
  # for attributes, not as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block.
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip the leading underscore so that you can specify symbols with
    # the same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
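
  # For example, 'interface' lexes as the INTERFACE keyword, while
  # '_interface' lexes as a SYMBOL whose value is 'interface'.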

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position.
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for the previous lines are unused.  We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file:  return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))
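
  # Taken together, these helpers report an error roughly in the form
  # "<file>(<line>) : <message>", followed by the offending source line with
  # a '^' caret marking the approximate error position.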

  def SetData(self, filename, data):
    # Start with line 1, not zero.
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)


#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


#
# TokensFromText
#
# From a block of text, generate a list of token values.
#
def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist

#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist
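
# As an illustrative example (assuming the keyword and literal definitions
# above), TextToTokens('interface Foo {}') should yield the value list
# ['interface', 'Foo', '{', '}'].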


#
# TestSame
#
# From a set of token values, generate a new source text by joining the
# values, then tokenize the new text and compare it against the old set.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We join with newlines instead of
  # spaces since the '//' and #inline regexes are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch: original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  src2 = '\n'.join(values2)
  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


#
# TestExpect
#
# From a set of token pairs, verify that the type of the second token in each
# pair matches the value of the first, so that:
#   INT 123 FLOAT 1.1
# generates a passing test, where the first token is the SYMBOL 'INT', the
# second token is the INT 123, the third token is the SYMBOL 'FLOAT', and the
# fourth is the FLOAT 1.1, etc...
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch:  Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
  return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))