Home | History | Annotate | Download | only in ansic
      1 # ----------------------------------------------------------------------
      2 # clex.py
      3 #
      4 # A lexer for ANSI C.
      5 # ----------------------------------------------------------------------
      6 
      7 import sys
      8 sys.path.insert(0, "../..")
      9 
     10 import ply.lex as lex
     11 
     12 # Reserved words
     13 reserved = (
     14     'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE',
     15     'ELSE', 'ENUM', 'EXTERN', 'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER',
     16     'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT', 'SWITCH', 'TYPEDEF',
     17     'UNION', 'UNSIGNED', 'VOID', 'VOLATILE', 'WHILE',
     18 )
     19 
     20 tokens = reserved + (
     21     # Literals (identifier, integer constant, float constant, string constant,
     22     # char const)
     23     'ID', 'TYPEID', 'ICONST', 'FCONST', 'SCONST', 'CCONST',
     24 
     25     # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
     26     'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
     27     'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
     28     'LOR', 'LAND', 'LNOT',
     29     'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',
     30 
     31     # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
     32     'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
     33     'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL',
     34 
     35     # Increment/decrement (++,--)
     36     'PLUSPLUS', 'MINUSMINUS',
     37 
     38     # Structure dereference (->)
     39     'ARROW',
     40 
     41     # Conditional operator (?)
     42     'CONDOP',
     43 
     44     # Delimeters ( ) [ ] { } , . ; :
     45     'LPAREN', 'RPAREN',
     46     'LBRACKET', 'RBRACKET',
     47     'LBRACE', 'RBRACE',
     48     'COMMA', 'PERIOD', 'SEMI', 'COLON',
     49 
     50     # Ellipsis (...)
     51     'ELLIPSIS',
     52 )
     53 
     54 # Completely ignored characters
     55 t_ignore = ' \t\x0c'
     56 
     57 # Newlines
     58 
     59 
     60 def t_NEWLINE(t):
     61     r'\n+'
     62     t.lexer.lineno += t.value.count("\n")
     63 
# Operators
#
# Each t_NAME string is the regular expression for token NAME.  Several
# patterns are prefixes of others ('<' / '<<' / '<<=', '+' / '++' / '+=',
# '.' / '...').  According to the PLY documentation, string-defined rules are
# tried in order of decreasing pattern length, which is what lets the longer
# operator win regardless of the order of the assignments below.
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_MOD = r'%'
t_OR = r'\|'
t_AND = r'&'
t_NOT = r'~'
t_XOR = r'\^'
t_LSHIFT = r'<<'
t_RSHIFT = r'>>'
t_LOR = r'\|\|'
t_LAND = r'&&'
t_LNOT = r'!'
t_LT = r'<'
t_GT = r'>'
t_LE = r'<='
t_GE = r'>='
t_EQ = r'=='
t_NE = r'!='

# Assignment operators (plain '=' and the compound forms)

t_EQUALS = r'='
t_TIMESEQUAL = r'\*='
t_DIVEQUAL = r'/='
t_MODEQUAL = r'%='
t_PLUSEQUAL = r'\+='
t_MINUSEQUAL = r'-='
t_LSHIFTEQUAL = r'<<='
t_RSHIFTEQUAL = r'>>='
t_ANDEQUAL = r'&='
t_OREQUAL = r'\|='
t_XOREQUAL = r'\^='

# Increment/decrement
t_PLUSPLUS = r'\+\+'
t_MINUSMINUS = r'--'

# Structure dereference: ->
t_ARROW = r'->'

# Conditional operator: ?
t_CONDOP = r'\?'

# Delimiters
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
t_LBRACE = r'\{'
t_RBRACE = r'\}'
t_COMMA = r','
t_PERIOD = r'\.'
t_SEMI = r';'
t_COLON = r':'
# Three consecutive periods, as in a variadic parameter list.
t_ELLIPSIS = r'\.\.\.'
    122 
    123 # Identifiers and reserved words
    124 
    125 reserved_map = {}
    126 for r in reserved:
    127     reserved_map[r.lower()] = r
    128 
    129 
    130 def t_ID(t):
    131     r'[A-Za-z_][\w_]*'
    132     t.type = reserved_map.get(t.value, "ID")
    133     return t
    134 
    135 # Integer literal
    136 t_ICONST = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
    137 
    138 # Floating literal
    139 t_FCONST = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
    140 
# String literal: a double-quoted run of characters on one line.  Each
# element is either any character other than a backslash or newline, or a
# backslash followed by one character (an escape).  The repetition is
# non-greedy, so the literal ends at the first quote that is not part of an
# escape.
t_SCONST = r'\"([^\\\n]|(\\.))*?\"'

# Character constant 'c' or L'c'; same body as the string rule but delimited
# by single quotes.
# NOTE(review): the optional L prefix can only match here if the identifier
# rule has not already consumed the 'L' (PLY tries function rules such as
# t_ID before string rules) -- verify that L'c' lexes as a single CCONST.
t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\''
    146 
    147 # Comments
    148 
    149 
    150 def t_comment(t):
    151     r'/\*(.|\n)*?\*/'
    152     t.lexer.lineno += t.value.count('\n')
    153 
    154 # Preprocessor directive (ignored)
    155 
    156 
    157 def t_preprocessor(t):
    158     r'\#(.)*?\n'
    159     t.lexer.lineno += 1
    160 
    161 
    162 def t_error(t):
    163     print("Illegal character %s" % repr(t.value[0]))
    164     t.lexer.skip(1)
    165 
# Build the lexer object.  lex.lex() is called with no arguments, at module
# level, after every t_* rule above has been defined; keep it last.
lexer = lex.lex()
# When the file is run as a script, hand the lexer to PLY's runmain() helper.
# NOTE(review): presumably this tokenizes a file named on the command line
# (or standard input) and prints the tokens -- confirm against the PLY docs.
if __name__ == "__main__":
    lex.runmain(lexer)
    169