#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')
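# NOTE: INT_OR_FLOAT_DIGITS includes the exponent and sign characters, so
# a literal such as 1e-5 scans as a single CONSTANT in GetTokens below.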


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
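# A prefix is only significant when immediately followed by a quote,
# e.g. u8"text" or L"wide"; see the identifier branch of GetTokens below.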


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source.
    end contains the index just past the last char of the token in the
    source.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
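    """Returns the index just past the closing '"' of a string literal.

    i points at the opening '"'; any encoding prefix (e.g. L) sits
    between start and i.  A quote preceded by an odd number of
    backslashes is escaped and does not terminate the literal.
    """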
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
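    """Returns the index just past the closing "'" of a character literal.

    i points at the opening "'".  Falls back to start + 1 when the
    literal is unterminated (e.g. a stray apostrophe in an #if 0 block).
    """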
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in an #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')
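    # Adding '.' lets a literal such as 3.14 scan as a single CONSTANT.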

    # Only ignore errors while in an #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find an identifier (NAME).
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # A string prefix must be followed by a double quote.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i)
            if i == -1:  # Handle an unterminated comment.
                i = end
            else:
                i += 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer is in pretty good shape, so the raise below
            # is conditionally disabled to let bogus code in an #if 0
            # block through.  Since we will ignore that code anyway,
            # returning the bogus char is probably fine.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
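        # Illustrative invocation (file names are hypothetical):
        #     python tokenize.py foo.cc foo.h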
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)