#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and the WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

import os.path
import re
import sys

#
# Try to load the ply module; if that fails, assume it is in the third_party
# directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
      'COMMENT',
      'DESCRIBE',
      'ENUM',
      'LABEL',
      'READONLY',
      'SYMBOL',
      'INLINE',
      'INTERFACE',
      'STRUCT',
      'TYPEDEF',

    # Extra WebIDL keywords
      'CALLBACK',
      'DICTIONARY',
      'OPTIONAL',
      'STATIC',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT',
      'OCT',
      'INT',
      'HEX',
      'STRING',

    # Operators
      'LSHIFT',
      'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of characters to ignore.
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

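  # As an illustration (the sample text below is hypothetical, not taken from
  # any .idl file), the patterns above split
  #   1 1.5e3 0x1F << >>
  # into the tokens INT, FLOAT, HEX, LSHIFT and RSHIFT; TextToTokens() below
  # shows how to drive the lexer over a string like this.
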
  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block.
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position.
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based, while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    # Start with line 1, not zero.
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)


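#
# ExampleTokenTypes
#
# Illustration only: this helper is not part of the generator flow and the
# default IDL text is made up.  It shows how IDLLexer is driven directly:
# SetData() primes the lexer and lexobj.token() is called until it returns
# None.  Note that '_interface' comes back as the SYMBOL 'interface' (the
# leading underscore is stripped), not as the INTERFACE keyword.
#
def ExampleTokenTypes(text='interface _interface { enum Color; };'):
  lexer = IDLLexer()
  lexer.SetData('example', text)
  pairs = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    pairs.append((t.type, t.value))
  return pairs

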
#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write(' Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


#
# TokensFromText
#
# From a block of text, generate a list of token values.
#
def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TestSame
#
# From a set of token values, generate a new source text by joining with a
# single newline.  The new source is then tokenized and compared against the
# old set.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We use newline instead of a space
  # since the '//' and #inline regexes are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


#
# TestExpect
#
# From a set of token pairs, verify that the type field of the second token
# matches the value of the first, so that the input:
#   INT 123 FLOAT 1.1
# generates a passing test, where the first token is the SYMBOL 'INT', the
# second token is the INT 123, the third token is the SYMBOL 'FLOAT' and the
# fourth is the FLOAT 1.1, etc...
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
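
#
# Usage note: Main() lexes the files named on the command line.  With the
# 'test' option set ('test' and 'verbose' are read via GetOption but are not
# registered in this file), TestSame() re-lexes the joined token values and
# TestExpect() treats the input as alternating type/value pairs, so a
# hypothetical expectation file containing
#   INT 123 FLOAT 1.1
# passes because 123 lexes as an INT and 1.1 lexes as a FLOAT.
#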