Home | History | Annotate | Download | only in closure_linter
      1 #!/usr/bin/env python
      2 #
      3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #      http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 # Unless required by applicable law or agreed to in writing, software
     12 # distributed under the License is distributed on an "AS-IS" BASIS,
     13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 # See the License for the specific language governing permissions and
     15 # limitations under the License.
     16 
     17 """Regular expression based JavaScript parsing classes."""
     18 
     19 __author__ = ('robbyw (at] google.com (Robert Walker)',
     20               'ajp (at] google.com (Andy Perelson)')
     21 
     22 import copy
     23 import re
     24 
     25 from closure_linter import javascripttokens
     26 from closure_linter.common import matcher
     27 from closure_linter.common import tokenizer
     28 
# Shorthand aliases used throughout this module: `Type` is the JavaScript
# token-type enumeration and `Matcher` is the regex-to-token-type matcher class.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher
     32 
     33 
     34 class JavaScriptModes(object):
     35   """Enumeration of the different matcher modes used for JavaScript."""
     36   TEXT_MODE = 'text'
     37   SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
     38   DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
     39   BLOCK_COMMENT_MODE = 'block_comment'
     40   DOC_COMMENT_MODE = 'doc_comment'
     41   DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
     42   LINE_COMMENT_MODE = 'line_comment'
     43   PARAMETER_MODE = 'parameter'
     44   FUNCTION_MODE = 'function'
     45 
     46 
     47 class JavaScriptTokenizer(tokenizer.Tokenizer):
     48   """JavaScript tokenizer.
     49 
     50   Convert JavaScript code in to an array of tokens.
     51   """
     52 
     53   # Useful patterns for JavaScript parsing.
     54   IDENTIFIER_CHAR = r'A-Za-z0-9_$'
     55 
     56   # Number patterns based on:
     57   # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
     58   MANTISSA = r"""
     59              (\d+(?!\.)) |                # Matches '10'
     60              (\d+\.(?!\d)) |              # Matches '10.'
     61              (\d*\.\d+)                   # Matches '.5' or '10.5'
     62              """
     63   DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
     64   HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
     65   NUMBER = re.compile(r"""
     66                       ((%s)|(%s))
     67                       """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)
     68 
     69   # Strings come in three parts - first we match the start of the string, then
     70   # the contents, then the end.  The contents consist of any character except a
     71   # backslash or end of string, or a backslash followed by any character, or a
     72   # backslash followed by end of line to support correct parsing of multi-line
     73   # strings.
     74   SINGLE_QUOTE = re.compile(r"'")
     75   SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
     76   DOUBLE_QUOTE = re.compile(r'"')
     77   DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
     78 
     79   START_SINGLE_LINE_COMMENT = re.compile(r'//')
     80   END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')
     81 
     82   START_DOC_COMMENT = re.compile(r'/\*\*')
     83   START_BLOCK_COMMENT = re.compile(r'/\*')
     84   END_BLOCK_COMMENT = re.compile(r'\*/')
     85   BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')
     86 
     87   # Comment text is anything that we are not going to parse into another special
     88   # token like (inline) flags or end comments. Complicated regex to match
     89   # most normal characters, and '*', '{', '}', and '@' when we are sure that
     90   # it is safe. Expression [^*{\s]@ must come first, or the other options will
     91   # match everything before @, and we won't match @'s that aren't part of flags
     92   # like in email addresses in the @author tag.
     93   DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
     94   DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
     95   # Match anything that is allowed in a type definition, except for tokens
     96   # needed to parse it (and the lookahead assertion for "*/").
     97   DOC_COMMENT_TYPE_TEXT = re.compile(r'([^*|!?=<>(){}:,\s]|\*(?!/))+')
     98 
     99   # Match the prefix ' * ' that starts every line of jsdoc. Want to include
    100   # spaces after the '*', but nothing else that occurs after a '*', and don't
    101   # want to match the '*' in '*/'.
    102   DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')
    103 
    104   START_BLOCK = re.compile('{')
    105   END_BLOCK = re.compile('}')
    106 
    107   REGEX_CHARACTER_CLASS = r"""
    108                           \[               # Opening bracket
    109                           ([^\]\\]|\\.)*   # Anything but a ] or \,
    110                                            # or a backslash followed by anything
    111                           \]               # Closing bracket
    112                           """
    113   # We ensure the regex is followed by one of the above tokens to avoid
    114   # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
    115   POST_REGEX_LIST = [
    116       ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']
    117 
    118   REGEX = re.compile(r"""
    119                      /                      # opening slash
    120                      (?!\*)                 # not the start of a comment
    121                      (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
    122                                             # or anything but a / or [ or \,
    123                                             # or a character class
    124                      /                      # closing slash
    125                      [gimsx]*               # optional modifiers
    126                      (?=\s*(%s))
    127                      """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
    128                      re.VERBOSE)
    129 
    130   ANYTHING = re.compile(r'.*')
    131   PARAMETERS = re.compile(r'[^\)]+')
    132   CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')
    133 
    134   FUNCTION_DECLARATION = re.compile(r'\bfunction\b')
    135 
    136   OPENING_PAREN = re.compile(r'\(')
    137   CLOSING_PAREN = re.compile(r'\)')
    138 
    139   OPENING_BRACKET = re.compile(r'\[')
    140   CLOSING_BRACKET = re.compile(r'\]')
    141 
    142   # We omit these JS keywords from the list:
    143   #   function - covered by FUNCTION_DECLARATION.
    144   #   delete, in, instanceof, new, typeof - included as operators.
    145   #   this - included in identifiers.
    146   #   null, undefined - not included, should go in some "special constant" list.
    147   KEYWORD_LIST = [
    148       'break',
    149       'case',
    150       'catch',
    151       'continue',
    152       'default',
    153       'do',
    154       'else',
    155       'finally',
    156       'for',
    157       'if',
    158       'return',
    159       'switch',
    160       'throw',
    161       'try',
    162       'var',
    163       'while',
    164       'with',
    165   ]
    166 
    167   # List of regular expressions to match as operators.  Some notes: for our
    168   # purposes, the comma behaves similarly enough to a normal operator that we
    169   # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
    170   # characters - this may not match some very esoteric uses of the in operator.
    171   # Operators that are subsets of larger operators must come later in this list
    172   # for proper matching, e.g., '>>' must come AFTER '>>>'.
    173   OPERATOR_LIST = [
    174       ',',
    175       r'\+\+',
    176       '===',
    177       '!==',
    178       '>>>=',
    179       '>>>',
    180       '==',
    181       '>=',
    182       '<=',
    183       '!=',
    184       '<<=',
    185       '>>=',
    186       '<<',
    187       '>>',
    188       '=>',
    189       '>',
    190       '<',
    191       r'\+=',
    192       r'\+',
    193       '--',
    194       r'\^=',
    195       '-=',
    196       '-',
    197       '/=',
    198       '/',
    199       r'\*=',
    200       r'\*',
    201       '%=',
    202       '%',
    203       '&&',
    204       r'\|\|',
    205       '&=',
    206       '&',
    207       r'\|=',
    208       r'\|',
    209       '=',
    210       '!',
    211       ':',
    212       r'\?',
    213       r'\^',
    214       r'\bdelete\b',
    215       r'\bin\b',
    216       r'\binstanceof\b',
    217       r'\bnew\b',
    218       r'\btypeof\b',
    219       r'\bvoid\b',
    220       r'\.',
    221   ]
    222   OPERATOR = re.compile('|'.join(OPERATOR_LIST))
    223 
    224   WHITESPACE = re.compile(r'\s+')
    225   SEMICOLON = re.compile(r';')
    226   # Technically JavaScript identifiers can't contain '.', but we treat a set of
    227   # nested identifiers as a single identifier, except for trailing dots.
    228   NESTED_IDENTIFIER = r'[a-zA-Z_$]([%s]|\.[a-zA-Z_$])*' % IDENTIFIER_CHAR
    229   IDENTIFIER = re.compile(NESTED_IDENTIFIER)
    230 
    231   SIMPLE_LVALUE = re.compile(r"""
    232                              (?P<identifier>%s)      # a valid identifier
    233                              (?=\s*                  # optional whitespace
    234                              \=                      # look ahead to equal sign
    235                              (?!=))                  # not follwed by equal
    236                              """ % NESTED_IDENTIFIER, re.VERBOSE)
    237 
    238   # A doc flag is a @ sign followed by non-space characters that appears at the
    239   # beginning of the line, after whitespace, or after a '{'.  The look-behind
    240   # check is necessary to not match someone (at] google.com as a flag.
    241   DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
    242   # To properly parse parameter names and complex doctypes containing
    243   # whitespace, we need to tokenize whitespace into a token after certain
    244   # doctags. All statetracker.HAS_TYPE that are not listed here must not contain
    245   # any whitespace in their types.
    246   DOC_FLAG_LEX_SPACES = re.compile(
    247       r'(^|(?<=\s))@(?P<name>%s)\b' %
    248       '|'.join([
    249           'const',
    250           'enum',
    251           'export',
    252           'extends',
    253           'final',
    254           'implements',
    255           'package',
    256           'param',
    257           'private',
    258           'protected',
    259           'public',
    260           'return',
    261           'type',
    262           'typedef'
    263       ]))
    264 
    265   DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')
    266 
    267   DOC_TYPE_BLOCK_START = re.compile(r'[<(]')
    268   DOC_TYPE_BLOCK_END = re.compile(r'[>)]')
    269   DOC_TYPE_MODIFIERS = re.compile(r'[!?|,:=]')
    270 
    271   # Star followed by non-slash, i.e a star that does not end a comment.
    272   # This is used for TYPE_GROUP below.
    273   SAFE_STAR = r'(\*(?!/))'
    274 
    275   COMMON_DOC_MATCHERS = [
    276       # Find the end of the comment.
    277       Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
    278               JavaScriptModes.TEXT_MODE),
    279 
    280       # Tokenize documented flags like @private.
    281       Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
    282       Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
    283               JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),
    284 
    285       # Encountering a doc flag should leave lex spaces mode.
    286       Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),
    287 
    288       # Tokenize braces so we can find types.
    289       Matcher(START_BLOCK, Type.DOC_START_BRACE),
    290       Matcher(END_BLOCK, Type.DOC_END_BRACE),
    291 
    292       # And some more to parse types.
    293       Matcher(DOC_TYPE_BLOCK_START, Type.DOC_TYPE_START_BLOCK),
    294       Matcher(DOC_TYPE_BLOCK_END, Type.DOC_TYPE_END_BLOCK),
    295 
    296       Matcher(DOC_TYPE_MODIFIERS, Type.DOC_TYPE_MODIFIER),
    297       Matcher(DOC_COMMENT_TYPE_TEXT, Type.COMMENT),
    298 
    299       Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]
    300 
    301   # When text is not matched, it is given this default type based on mode.
    302   # If unspecified in this map, the default default is Type.NORMAL.
    303   JAVASCRIPT_DEFAULT_TYPES = {
    304       JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
    305       JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
    306   }
    307 
    308   @classmethod
    309   def BuildMatchers(cls):
    310     """Builds the token matcher group.
    311 
    312     The token matcher groups work as follows: it is a list of Matcher objects.
    313     The matchers will be tried in this order, and the first to match will be
    314     returned.  Hence the order is important because the matchers that come first
    315     overrule the matchers that come later.
    316 
    317     Returns:
    318       The completed token matcher group.
    319     """
    320     # Match a keyword string followed by a non-identifier character in order to
    321     # not match something like doSomething as do + Something.
    322     keyword = re.compile('(%s)((?=[^%s])|$)' % (
    323         '|'.join(cls.KEYWORD_LIST), cls.IDENTIFIER_CHAR))
    324     return {
    325 
    326         # Matchers for basic text mode.
    327         JavaScriptModes.TEXT_MODE: [
    328             # Check a big group - strings, starting comments, and regexes - all
    329             # of which could be intertwined.  'string with /regex/',
    330             # /regex with 'string'/, /* comment with /regex/ and string */ (and
    331             # so on)
    332             Matcher(cls.START_DOC_COMMENT, Type.START_DOC_COMMENT,
    333                     JavaScriptModes.DOC_COMMENT_MODE),
    334             Matcher(cls.START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
    335                     JavaScriptModes.BLOCK_COMMENT_MODE),
    336             Matcher(cls.END_OF_LINE_SINGLE_LINE_COMMENT,
    337                     Type.START_SINGLE_LINE_COMMENT),
    338             Matcher(cls.START_SINGLE_LINE_COMMENT,
    339                     Type.START_SINGLE_LINE_COMMENT,
    340                     JavaScriptModes.LINE_COMMENT_MODE),
    341             Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
    342                     JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
    343             Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
    344                     JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
    345             Matcher(cls.REGEX, Type.REGEX),
    346 
    347             # Next we check for start blocks appearing outside any of the items
    348             # above.
    349             Matcher(cls.START_BLOCK, Type.START_BLOCK),
    350             Matcher(cls.END_BLOCK, Type.END_BLOCK),
    351 
    352             # Then we search for function declarations.
    353             Matcher(cls.FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
    354                     JavaScriptModes.FUNCTION_MODE),
    355 
    356             # Next, we convert non-function related parens to tokens.
    357             Matcher(cls.OPENING_PAREN, Type.START_PAREN),
    358             Matcher(cls.CLOSING_PAREN, Type.END_PAREN),
    359 
    360             # Next, we convert brackets to tokens.
    361             Matcher(cls.OPENING_BRACKET, Type.START_BRACKET),
    362             Matcher(cls.CLOSING_BRACKET, Type.END_BRACKET),
    363 
    364             # Find numbers.  This has to happen before operators because
    365             # scientific notation numbers can have + and - in them.
    366             Matcher(cls.NUMBER, Type.NUMBER),
    367 
    368             # Find operators and simple assignments
    369             Matcher(cls.SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
    370             Matcher(cls.OPERATOR, Type.OPERATOR),
    371 
    372             # Find key words and whitespace.
    373             Matcher(keyword, Type.KEYWORD),
    374             Matcher(cls.WHITESPACE, Type.WHITESPACE),
    375 
    376             # Find identifiers.
    377             Matcher(cls.IDENTIFIER, Type.IDENTIFIER),
    378 
    379             # Finally, we convert semicolons to tokens.
    380             Matcher(cls.SEMICOLON, Type.SEMICOLON)],
    381 
    382         # Matchers for single quote strings.
    383         JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
    384             Matcher(cls.SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
    385             Matcher(cls.SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
    386                     JavaScriptModes.TEXT_MODE)],
    387 
    388         # Matchers for double quote strings.
    389         JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
    390             Matcher(cls.DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
    391             Matcher(cls.DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
    392                     JavaScriptModes.TEXT_MODE)],
    393 
    394         # Matchers for block comments.
    395         JavaScriptModes.BLOCK_COMMENT_MODE: [
    396             # First we check for exiting a block comment.
    397             Matcher(cls.END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
    398                     JavaScriptModes.TEXT_MODE),
    399 
    400             # Match non-comment-ending text..
    401             Matcher(cls.BLOCK_COMMENT_TEXT, Type.COMMENT)],
    402 
    403         # Matchers for doc comments.
    404         JavaScriptModes.DOC_COMMENT_MODE: cls.COMMON_DOC_MATCHERS + [
    405             Matcher(cls.DOC_COMMENT_TEXT, Type.COMMENT)],
    406 
    407         JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: cls.COMMON_DOC_MATCHERS + [
    408             Matcher(cls.WHITESPACE, Type.COMMENT),
    409             Matcher(cls.DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],
    410 
    411         # Matchers for single line comments.
    412         JavaScriptModes.LINE_COMMENT_MODE: [
    413             # We greedy match until the end of the line in line comment mode.
    414             Matcher(cls.ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],
    415 
    416         # Matchers for code after the function keyword.
    417         JavaScriptModes.FUNCTION_MODE: [
    418             # Must match open paren before anything else and move into parameter
    419             # mode, otherwise everything inside the parameter list is parsed
    420             # incorrectly.
    421             Matcher(cls.OPENING_PAREN, Type.START_PARAMETERS,
    422                     JavaScriptModes.PARAMETER_MODE),
    423             Matcher(cls.WHITESPACE, Type.WHITESPACE),
    424             Matcher(cls.IDENTIFIER, Type.FUNCTION_NAME)],
    425 
    426         # Matchers for function parameters
    427         JavaScriptModes.PARAMETER_MODE: [
    428             # When in function parameter mode, a closing paren is treated
    429             # specially. Everything else is treated as lines of parameters.
    430             Matcher(cls.CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
    431                     JavaScriptModes.TEXT_MODE),
    432             Matcher(cls.PARAMETERS, Type.PARAMETERS,
    433                     JavaScriptModes.PARAMETER_MODE)]}
    434 
    435   def __init__(self, parse_js_doc=True):
    436     """Create a tokenizer object.
    437 
    438     Args:
    439       parse_js_doc: Whether to do detailed parsing of javascript doc comments,
    440           or simply treat them as normal comments.  Defaults to parsing JsDoc.
    441     """
    442     matchers = self.BuildMatchers()
    443     if not parse_js_doc:
    444       # Make a copy so the original doesn't get modified.
    445       matchers = copy.deepcopy(matchers)
    446       matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
    447           JavaScriptModes.BLOCK_COMMENT_MODE]
    448 
    449     tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
    450         self.JAVASCRIPT_DEFAULT_TYPES)
    451 
    452   def _CreateToken(self, string, token_type, line, line_number, values=None):
    453     """Creates a new JavaScriptToken object.
    454 
    455     Args:
    456       string: The string of input the token contains.
    457       token_type: The type of token.
    458       line: The text of the line this token is in.
    459       line_number: The line number of the token.
    460       values: A dict of named values within the token.  For instance, a
    461         function declaration may have a value called 'name' which captures the
    462         name of the function.
    463     """
    464     return javascripttokens.JavaScriptToken(string, token_type, line,
    465                                             line_number, values, line_number)
    466