Home | History | Annotate | Download | only in closure_linter
      1 #!/usr/bin/env python
      2 #
      3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #      http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 # Unless required by applicable law or agreed to in writing, software
     12 # distributed under the License is distributed on an "AS-IS" BASIS,
     13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 # See the License for the specific language governing permissions and
     15 # limitations under the License.
     16 
     17 """Regular expression based JavaScript parsing classes."""
     18 
     19 __author__ = ('robbyw (at] google.com (Robert Walker)',
     20               'ajp (at] google.com (Andy Perelson)')
     21 
     22 import copy
     23 import re
     24 
     25 from closure_linter import javascripttokens
     26 from closure_linter.common import matcher
     27 from closure_linter.common import tokenizer
     28 
# Shorthand aliases: the token-type enumeration and the Matcher class are
# referenced on almost every line of the matcher tables below.
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher
     32 
     33 
     34 class JavaScriptModes(object):
     35   """Enumeration of the different matcher modes used for JavaScript."""
     36   TEXT_MODE = 'text'
     37   SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
     38   DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
     39   BLOCK_COMMENT_MODE = 'block_comment'
     40   DOC_COMMENT_MODE = 'doc_comment'
     41   DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
     42   LINE_COMMENT_MODE = 'line_comment'
     43   PARAMETER_MODE = 'parameter'
     44   FUNCTION_MODE = 'function'
     45 
     46 
     47 class JavaScriptTokenizer(tokenizer.Tokenizer):
     48   """JavaScript tokenizer.
     49 
     50   Convert JavaScript code in to an array of tokens.
     51   """
     52 
     53   # Useful patterns for JavaScript parsing.
     54   IDENTIFIER_CHAR = r'A-Za-z0-9_$.'
     55 
     56   # Number patterns based on:
     57   # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
     58   MANTISSA = r"""
     59              (\d+(?!\.)) |                # Matches '10'
     60              (\d+\.(?!\d)) |              # Matches '10.'
     61              (\d*\.\d+)                   # Matches '.5' or '10.5'
     62              """
     63   DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
     64   HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
     65   NUMBER = re.compile(r"""
     66                       ((%s)|(%s))
     67                       """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)
     68 
     69   # Strings come in three parts - first we match the start of the string, then
     70   # the contents, then the end.  The contents consist of any character except a
     71   # backslash or end of string, or a backslash followed by any character, or a
     72   # backslash followed by end of line to support correct parsing of multi-line
     73   # strings.
     74   SINGLE_QUOTE = re.compile(r"'")
     75   SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
     76   DOUBLE_QUOTE = re.compile(r'"')
     77   DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
     78 
     79   START_SINGLE_LINE_COMMENT = re.compile(r'//')
     80   END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')
     81 
     82   START_DOC_COMMENT = re.compile(r'/\*\*')
     83   START_BLOCK_COMMENT = re.compile(r'/\*')
     84   END_BLOCK_COMMENT = re.compile(r'\*/')
     85   BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')
     86 
     87   # Comment text is anything that we are not going to parse into another special
     88   # token like (inline) flags or end comments. Complicated regex to match
     89   # most normal characters, and '*', '{', '}', and '@' when we are sure that
     90   # it is safe. Expression [^*{\s]@ must come first, or the other options will
     91   # match everything before @, and we won't match @'s that aren't part of flags
     92   # like in email addresses in the @author tag.
     93   DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
     94   DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
     95 
     96   # Match the prefix ' * ' that starts every line of jsdoc. Want to include
     97   # spaces after the '*', but nothing else that occurs after a '*', and don't
     98   # want to match the '*' in '*/'.
     99   DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')
    100 
    101   START_BLOCK = re.compile('{')
    102   END_BLOCK = re.compile('}')
    103 
    104   REGEX_CHARACTER_CLASS = r"""
    105                           \[               # Opening bracket
    106                           ([^\]\\]|\\.)*   # Anything but a ] or \,
    107                                            # or a backslash followed by anything
    108                           \]               # Closing bracket
    109                           """
    110   # We ensure the regex is followed by one of the above tokens to avoid
    111   # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
    112   POST_REGEX_LIST = [
    113       ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']
    114 
    115   REGEX = re.compile(r"""
    116                      /                      # opening slash
    117                      (?!\*)                 # not the start of a comment
    118                      (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
    119                                             # or anything but a / or [ or \,
    120                                             # or a character class
    121                      /                      # closing slash
    122                      [gimsx]*               # optional modifiers
    123                      (?=\s*(%s))
    124                      """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
    125                      re.VERBOSE)
    126 
    127   ANYTHING = re.compile(r'.*')
    128   PARAMETERS = re.compile(r'[^\)]+')
    129   CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')
    130 
    131   FUNCTION_DECLARATION = re.compile(r'\bfunction\b')
    132 
    133   OPENING_PAREN = re.compile(r'\(')
    134   CLOSING_PAREN = re.compile(r'\)')
    135 
    136   OPENING_BRACKET = re.compile(r'\[')
    137   CLOSING_BRACKET = re.compile(r'\]')
    138 
    139   # We omit these JS keywords from the list:
    140   #   function - covered by FUNCTION_DECLARATION.
    141   #   delete, in, instanceof, new, typeof - included as operators.
    142   #   this - included in identifiers.
    143   #   null, undefined - not included, should go in some "special constant" list.
    144   KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
    145       'finally', 'for', 'if', 'return', 'switch', 'throw', 'try', 'var',
    146       'while', 'with']
    147   # Match a keyword string followed by a non-identifier character in order to
    148   # not match something like doSomething as do + Something.
    149   KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
    150       '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))
    151 
    152   # List of regular expressions to match as operators.  Some notes: for our
    153   # purposes, the comma behaves similarly enough to a normal operator that we
    154   # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
    155   # characters - this may not match some very esoteric uses of the in operator.
    156   # Operators that are subsets of larger operators must come later in this list
    157   # for proper matching, e.g., '>>' must come AFTER '>>>'.
    158   OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
    159                    '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
    160                    '--', '\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
    161                    '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':', '\?',
    162                    r'\bdelete\b', r'\bin\b', r'\binstanceof\b', r'\bnew\b',
    163                    r'\btypeof\b', r'\bvoid\b']
    164   OPERATOR = re.compile('|'.join(OPERATOR_LIST))
    165 
    166   WHITESPACE = re.compile(r'\s+')
    167   SEMICOLON = re.compile(r';')
    168   # Technically JavaScript identifiers can't contain '.', but we treat a set of
    169   # nested identifiers as a single identifier.
    170   NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
    171   IDENTIFIER = re.compile(NESTED_IDENTIFIER)
    172 
    173   SIMPLE_LVALUE = re.compile(r"""
    174                              (?P<identifier>%s)      # a valid identifier
    175                              (?=\s*                  # optional whitespace
    176                              \=                      # look ahead to equal sign
    177                              (?!=))                  # not follwed by equal
    178                              """ % NESTED_IDENTIFIER, re.VERBOSE)
    179 
    180   # A doc flag is a @ sign followed by non-space characters that appears at the
    181   # beginning of the line, after whitespace, or after a '{'.  The look-behind
    182   # check is necessary to not match someone (at] google.com as a flag.
    183   DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
    184   # To properly parse parameter names, we need to tokenize whitespace into a
    185   # token.
    186   DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
    187                                      '|'.join(['param']))
    188 
    189   DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')
    190 
    191   # Star followed by non-slash, i.e a star that does not end a comment.
    192   # This is used for TYPE_GROUP below.
    193   SAFE_STAR = r'(\*(?!/))'
    194 
    195   COMMON_DOC_MATCHERS = [
    196       # Find the end of the comment.
    197       Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
    198               JavaScriptModes.TEXT_MODE),
    199 
    200       # Tokenize documented flags like @private.
    201       Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
    202       Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
    203               JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),
    204 
    205       # Encountering a doc flag should leave lex spaces mode.
    206       Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),
    207 
    208       # Tokenize braces so we can find types.
    209       Matcher(START_BLOCK, Type.DOC_START_BRACE),
    210       Matcher(END_BLOCK, Type.DOC_END_BRACE),
    211       Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]
    212 
    213 
    214   # The token matcher groups work as follows: it is an list of  Matcher objects.
    215   # The matchers will be tried in this order, and the first to match will be
    216   # returned.  Hence the order is important because the matchers that come first
    217   # overrule the matchers that come later.
    218   JAVASCRIPT_MATCHERS = {
    219       # Matchers for basic text mode.
    220       JavaScriptModes.TEXT_MODE: [
    221         # Check a big group - strings, starting comments, and regexes - all
    222         # of which could be intertwined.  'string with /regex/',
    223         # /regex with 'string'/, /* comment with /regex/ and string */ (and so
    224         # on)
    225         Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
    226                 JavaScriptModes.DOC_COMMENT_MODE),
    227         Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
    228                 JavaScriptModes.BLOCK_COMMENT_MODE),
    229         Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
    230                 Type.START_SINGLE_LINE_COMMENT),
    231         Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
    232                 JavaScriptModes.LINE_COMMENT_MODE),
    233         Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
    234                 JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
    235         Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
    236                 JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
    237         Matcher(REGEX, Type.REGEX),
    238 
    239         # Next we check for start blocks appearing outside any of the items
    240         # above.
    241         Matcher(START_BLOCK, Type.START_BLOCK),
    242         Matcher(END_BLOCK, Type.END_BLOCK),
    243 
    244         # Then we search for function declarations.
    245         Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
    246                 JavaScriptModes.FUNCTION_MODE),
    247 
    248         # Next, we convert non-function related parens to tokens.
    249         Matcher(OPENING_PAREN, Type.START_PAREN),
    250         Matcher(CLOSING_PAREN, Type.END_PAREN),
    251 
    252         # Next, we convert brackets to tokens.
    253         Matcher(OPENING_BRACKET, Type.START_BRACKET),
    254         Matcher(CLOSING_BRACKET, Type.END_BRACKET),
    255 
    256         # Find numbers.  This has to happen before operators because scientific
    257         # notation numbers can have + and - in them.
    258         Matcher(NUMBER, Type.NUMBER),
    259 
    260         # Find operators and simple assignments
    261         Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
    262         Matcher(OPERATOR, Type.OPERATOR),
    263 
    264         # Find key words and whitespace.
    265         Matcher(KEYWORD, Type.KEYWORD),
    266         Matcher(WHITESPACE, Type.WHITESPACE),
    267 
    268         # Find identifiers.
    269         Matcher(IDENTIFIER, Type.IDENTIFIER),
    270 
    271         # Finally, we convert semicolons to tokens.
    272         Matcher(SEMICOLON, Type.SEMICOLON)],
    273 
    274       # Matchers for single quote strings.
    275       JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
    276           Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
    277           Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
    278               JavaScriptModes.TEXT_MODE)],
    279 
    280       # Matchers for double quote strings.
    281       JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
    282           Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
    283           Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
    284               JavaScriptModes.TEXT_MODE)],
    285 
    286       # Matchers for block comments.
    287       JavaScriptModes.BLOCK_COMMENT_MODE: [
    288         # First we check for exiting a block comment.
    289         Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
    290                 JavaScriptModes.TEXT_MODE),
    291 
    292         # Match non-comment-ending text..
    293         Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],
    294 
    295       # Matchers for doc comments.
    296       JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
    297         Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],
    298 
    299       JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
    300         Matcher(WHITESPACE, Type.COMMENT),
    301         Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],
    302 
    303       # Matchers for single line comments.
    304       JavaScriptModes.LINE_COMMENT_MODE: [
    305         # We greedy match until the end of the line in line comment mode.
    306         Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],
    307 
    308       # Matchers for code after the function keyword.
    309       JavaScriptModes.FUNCTION_MODE: [
    310         # Must match open paren before anything else and move into parameter
    311         # mode, otherwise everything inside the parameter list is parsed
    312         # incorrectly.
    313         Matcher(OPENING_PAREN, Type.START_PARAMETERS,
    314                 JavaScriptModes.PARAMETER_MODE),
    315         Matcher(WHITESPACE, Type.WHITESPACE),
    316         Matcher(IDENTIFIER, Type.FUNCTION_NAME)],
    317 
    318       # Matchers for function parameters
    319       JavaScriptModes.PARAMETER_MODE: [
    320         # When in function parameter mode, a closing paren is treated specially.
    321         # Everything else is treated as lines of parameters.
    322         Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
    323                 JavaScriptModes.TEXT_MODE),
    324         Matcher(PARAMETERS, Type.PARAMETERS, JavaScriptModes.PARAMETER_MODE)]}
    325 
    326   # When text is not matched, it is given this default type based on mode.
    327   # If unspecified in this map, the default default is Type.NORMAL.
    328   JAVASCRIPT_DEFAULT_TYPES = {
    329     JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
    330     JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
    331   }
    332 
    333   def __init__(self, parse_js_doc = True):
    334     """Create a tokenizer object.
    335 
    336     Args:
    337       parse_js_doc: Whether to do detailed parsing of javascript doc comments,
    338           or simply treat them as normal comments.  Defaults to parsing JsDoc.
    339     """
    340     matchers = self.JAVASCRIPT_MATCHERS
    341     if not parse_js_doc:
    342       # Make a copy so the original doesn't get modified.
    343       matchers = copy.deepcopy(matchers)
    344       matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
    345           JavaScriptModes.BLOCK_COMMENT_MODE]
    346 
    347     tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
    348         self.JAVASCRIPT_DEFAULT_TYPES)
    349 
    350   def _CreateToken(self, string, token_type, line, line_number, values=None):
    351     """Creates a new JavaScriptToken object.
    352 
    353     Args:
    354       string: The string of input the token contains.
    355       token_type: The type of token.
    356       line: The text of the line this token is in.
    357       line_number: The line number of the token.
    358       values: A dict of named values within the token.  For instance, a
    359         function declaration may have a value called 'name' which captures the
    360         name of the function.
    361     """
    362     return javascripttokens.JavaScriptToken(string, token_type, line,
    363                                             line_number, values)
    364