# ----------------------------------------------------------------------
# clex.py
#
# A lexer for ANSI C.
#
# This module is a PLY lexer specification: PLY discovers the `tokens`
# tuple and every module-level name starting with `t_`.  For function
# rules the docstring IS the regular expression, so those docstrings
# must not be replaced with prose.
# ----------------------------------------------------------------------

import sys
sys.path.insert(0, "../..")

import ply.lex as lex

# Reserved words
reserved = (
    'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE',
    'ELSE', 'ENUM', 'EXTERN', 'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER',
    'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT', 'SWITCH', 'TYPEDEF',
    'UNION', 'UNSIGNED', 'VOID', 'VOLATILE', 'WHILE',
)

# Every token type the lexer may emit.  NOTE: no rule visible in this file
# produces TYPEID; it is only declared here.
tokens = reserved + (
    # Literals (identifier, integer constant, float constant, string constant,
    # char const)
    'ID', 'TYPEID', 'ICONST', 'FCONST', 'SCONST', 'CCONST',

    # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
    'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
    'LOR', 'LAND', 'LNOT',
    'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

    # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
    'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
    'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL',

    # Increment/decrement (++,--)
    'PLUSPLUS', 'MINUSMINUS',

    # Structure dereference (->)
    'ARROW',

    # Conditional operator (?)
    'CONDOP',

    # Delimiters ( ) [ ] { } , . ; :
    'LPAREN', 'RPAREN',
    'LBRACKET', 'RBRACKET',
    'LBRACE', 'RBRACE',
    'COMMA', 'PERIOD', 'SEMI', 'COLON',

    # Ellipsis (...)
    'ELLIPSIS',
)

# Completely ignored characters (space, tab, form feed)
t_ignore = ' \t\x0c'

# Newlines


def t_NEWLINE(t):
    r'\n+'
    # Keep the line counter in step with the input.  Nothing is returned,
    # so the run of newlines produces no token.
    t.lexer.lineno += t.value.count("\n")

# Operators
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_MOD = r'%'
t_OR = r'\|'
t_AND = r'&'
t_NOT = r'~'
t_XOR = r'\^'
t_LSHIFT = r'<<'
t_RSHIFT = r'>>'
t_LOR = r'\|\|'
t_LAND = r'&&'
t_LNOT = r'!'
t_LT = r'<'
t_GT = r'>'
t_LE = r'<='
t_GE = r'>='
t_EQ = r'=='
t_NE = r'!='

# Assignment operators

t_EQUALS = r'='
t_TIMESEQUAL = r'\*='
t_DIVEQUAL = r'/='
t_MODEQUAL = r'%='
t_PLUSEQUAL = r'\+='
t_MINUSEQUAL = r'-='
t_LSHIFTEQUAL = r'<<='
t_RSHIFTEQUAL = r'>>='
t_ANDEQUAL = r'&='
t_OREQUAL = r'\|='
t_XOREQUAL = r'\^='

# Increment/decrement
t_PLUSPLUS = r'\+\+'
t_MINUSMINUS = r'--'

# ->
t_ARROW = r'->'

# ?
t_CONDOP = r'\?'

# Delimiters
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
t_LBRACE = r'\{'
t_RBRACE = r'\}'
t_COMMA = r','
t_PERIOD = r'\.'
t_SEMI = r';'
t_COLON = r':'
t_ELLIPSIS = r'\.\.\.'

# Identifiers and reserved words

# Maps the lowercase source spelling of each keyword ('while') to its
# token type ('WHILE').
reserved_map = {}
for r in reserved:
    reserved_map[r.lower()] = r


def t_ID(t):
    r'[A-Za-z_][\w_]*'
    # Keywords match the identifier pattern too; reclassify them here.
    # The lookup is case-sensitive, so 'While' stays a plain ID.
    t.type = reserved_map.get(t.value, "ID")
    return t

# Integer literal
# The two-letter suffixes must come first: regex alternation is ordered, so
# with [uU] listed first "10UL" would match only "10U" and leave a stray "L"
# to be lexed as an identifier.
t_ICONST = r'\d+([uU][lL]|[lL][uU]|[uU]|[lL])?'

# Floating literal
# Accepts "1.5", "1.", ".5", each with an optional exponent, or digits with a
# mandatory exponent ("1e5"); the exponent marker may be 'e' or 'E'.  The
# spaces around '|' rely on PLY compiling rules with re.VERBOSE (its default).
# PLY tries string rules in order of decreasing regex length, so this rule
# must stay longer than t_ICONST for "1.5" not to be split at the "1".
t_FCONST = r'((\d+\.\d*|\.\d+)([eE][+-]?\d+)? | \d+[eE][+-]?\d+)([lL]|[fF])?'

# String literal: a double-quoted run of ordinary characters or backslash
# escapes, not spanning a raw newline.
t_SCONST = r'\"([^\\\n]|(\\.))*?\"'

# Character constant 'c' or L'c'
# NOTE(review): t_ID is a function rule, and PLY tries function rules before
# string rules, so for input like L'c' the leading L is lexed as an ID first
# and the optional (L) prefix below is effectively never matched.  Fixing
# that needs a change to the rule ordering, not to this pattern.
t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\''

# Comments


def t_comment(t):
    r'/\*(.|\n)*?\*/'
    # Block comments may span lines; count them so later line numbers stay
    # right.  Nothing is returned, so the comment produces no token.
    t.lexer.lineno += t.value.count('\n')

# Preprocessor directive (ignored)


def t_preprocessor(t):
    r'\#([^\\\n]|\\(.|\n))*\n?'
    # The whole directive is discarded.  A backslash followed by a newline
    # continues the directive onto the next line, so those lines are
    # swallowed too instead of being lexed as C code.  The terminating
    # newline is optional so a directive on the last line of a file without
    # a trailing newline is still recognised.
    t.lexer.lineno += t.value.count('\n')


def t_error(t):
    # Report the offending character and resume lexing just after it.
    print("Illegal character %s" % repr(t.value[0]))
    t.lexer.skip(1)

# Build the lexer from the rules defined in this module.
lexer = lex.lex()
if __name__ == "__main__":
    lex.runmain(lexer)