# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import imp
import os.path
import sys

def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file).

  Args:
    dirname: Name of a single path component to search for among the
        ancestors of this file's absolute path.

  Returns:
    The path of the directory that contains the component |dirname|.

  Raises:
    AssertionError: If no ancestor component is named |dirname|.
  """
  start = os.path.abspath(__file__)
  path = start
  while True:
    path, tail = os.path.split(path)
    # Raised explicitly instead of via an `assert` statement: asserts are
    # stripped under `python -O`, and without this check the loop would never
    # terminate once the filesystem root is reached (the tail stays empty).
    if not tail:
      raise AssertionError(
          'No directory named "%s" above %s' % (dirname, start))
    if tail == dirname:
      return path

# Use an importable ply if there is one; otherwise fall back to the copy under
# the "third_party" directory that sits next to the enclosing "mojo" directory.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
from ply.lex import TOKEN

from ..error import Error


class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    """Records the file name, message and line number of a lexing error."""
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):
  """Token definitions (ply.lex style) for the lexer.

  The class only declares the token names and the `t_*` rules; errors are
  reported by raising LexError tagged with the file name given at
  construction.

  NOTE: the `t_*` functions must not be given docstrings of their own. A rule
  function's docstring is its regular expression (see t_NEWLINE and
  t_COMMENT), so explanatory text goes in `#` comments instead.
  """

  def __init__(self, filename):
    """Remembers |filename| so that it can be included in error reports."""
    self.filename = filename

  ######################--   PRIVATE   --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    """Raises a LexError for |msg| at the line number carried by |token|."""
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'UNION',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
    'ARRAY',
    'MAP',
    'ASSOCIATED',
  )

  # Maps the lower-case spelling found in source text to the token type name.
  keyword_map = {name.lower(): name for name in keywords}

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',
    'QSTN',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and the punctuation '._~^!=&-' are allowed as escape chars to
  # support #line directives with Windows paths as filenames
  # (..\..\dir\file).
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  # Every digit is excluded here (0-9, not just 0-7): decimal_escape accepts
  # any digit sequence, and the bad-string rule is a function rule, which is
  # tried before the plain-string STRING_LITERAL rule. Excluding only 0-7
  # would make "\8" and "\9" be reported as invalid escapes.
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines
  # Produces no token; only advances the line counter by the number of
  # newlines consumed.
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS             = r'-'
  t_PLUS              = r'\+'
  t_AMP               = r'&'
  t_QSTN              = r'\?'

  # =
  t_EQUALS            = r'='

  # =>
  t_RESPONSE          = r'=>'

  # Delimiters
  t_LPAREN            = r'\('
  t_RPAREN            = r'\)'
  t_LBRACKET          = r'\['
  t_RBRACKET          = r'\]'
  t_LBRACE            = r'\{'
  t_RBRACE            = r'\}'
  t_LANGLE            = r'<'
  t_RANGLE            = r'>'
  t_COMMA             = r','
  t_DOT               = r'\.'
  t_SEMI              = r';'

  t_STRING_LITERAL    = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Must stay ahead of t_INT_CONST_DEC so that a leading zero followed by
  # more digits is rejected instead of being lexed as the constant "0".
  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  # A bare "@" that was not consumed by either ordinal rule above.
  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # Identifiers whose exact (lower-case) spelling is a reserved keyword are
  # retyped to that keyword's token; everything else stays a NAME.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  # Produces no token, but keeps the line counter in step with any newlines
  # inside the comment text.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Reports the first character of the remaining input, which no rule matched.
  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)