# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import imp
import os.path
import sys

# Disable lint check for finding modules:
# pylint: disable=F0401

def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file).

  Args:
    dirname: Name of a single path component to look for among the ancestors
        of this file's absolute path.

  Returns:
    The path of the directory that directly contains |dirname|.

  Raises:
    AssertionError: if no component of this file's path is named |dirname|.
  """
  path = os.path.abspath(__file__)
  while True:
    path, tail = os.path.split(path)
    # An empty tail means the filesystem root was reached without finding
    # |dirname|. Raise explicitly instead of using `assert`: asserts are
    # stripped under -O, which would leave this loop spinning forever.
    if not tail:
      raise AssertionError(
          'No directory named "%s" above %s' % (dirname, __file__))
    if tail == dirname:
      return path

# Fall back to the checked-in copy under third_party/ when "ply" cannot be
# found on the current module search path.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
from ply.lex import TOKEN

from ..error import Error


# Disable lint check for exceptions deriving from Exception:
# pylint: disable=W0710
class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    """Forwards |filename|, |message| and |lineno| to Error.__init__."""
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):
  """Token definitions (keywords, token names, regexes and t_* rules) written
  in the form expected by ply.lex.

  Rules defined as functions are kept in a deliberate order (see the comments
  next to them); do not reorder them without checking those comments.
  """

  def __init__(self, filename):
    """Remembers |filename| so that it can be reported in LexError."""
    self.filename = filename

  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    """Raises a LexError for |msg| at the line number recorded on |token|."""
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
  )

  # Maps the lowercase spelling of each keyword to its token type. Built with
  # a comprehension so that no loop variable is left behind as a class
  # attribute.
  keyword_map = {keyword.lower(): keyword for keyword in keywords}

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',
    'CHAR_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  # The excluded set must cover every character that can start a valid escape.
  # decimal_escape accepts any digit, so all of 0-9 are excluded here;
  # otherwise "\8" and "\9" would be reported as bad escapes inside string
  # literals even though they are valid escape sequences.
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  char_const = "'"+cconst_char+"'"
  unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  bad_char_const = \
      r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
      bad_escape+r"""[^'\n]*')"""

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines
  # (The docstring below is the rule's regex; it only advances the line
  # counter and returns no token.)
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS = r'-'
  t_PLUS = r'\+'
  t_AMP = r'&'

  # =
  t_EQUALS = r'='

  # =>
  t_RESPONSE = r'=>'

  # Delimiters
  t_LPAREN = r'\('
  t_RPAREN = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE = r'\{'
  t_RBRACE = r'\}'
  t_LANGLE = r'<'
  t_RANGLE = r'>'
  t_COMMA = r','
  t_DOT = r'\.'
  t_SEMI = r';'

  t_STRING_LITERAL = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Rejects a leading 0 followed by more digits, before the decimal rule can
  # match the lone "0".
  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Must come before bad_char_const, to prevent it from
  # catching valid char constants as invalid
  #
  @TOKEN(char_const)
  def t_CHAR_CONST(self, t):
    return t

  @TOKEN(unmatched_quote)
  def t_UNMATCHED_QUOTE(self, t):
    msg = "Unmatched '"
    self._error(msg, t)

  @TOKEN(bad_char_const)
  def t_BAD_CHAR_CONST(self, t):
    msg = "Invalid char constant %s" % t.value
    self._error(msg, t)

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # An identifier whose exact spelling is a key of keyword_map (the lowercase
  # keywords) gets that keyword's token type; anything else is a NAME.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  # (The docstring below is the rule's regex; the line counter is advanced by
  # the newlines inside the comment and no token is returned.)
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Reports the first character of the remaining input as illegal.
  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)