#!/usr/bin/env python
#
# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regular expression based JavaScript parsing classes."""

__author__ = ('robbyw (at] google.com (Robert Walker)',
              'ajp (at] google.com (Andy Perelson)')

import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer

# Shorthand
Type = javascripttokens.JavaScriptTokenType
Matcher = matcher.Matcher


class JavaScriptModes(object):
  """Enumeration of the different matcher modes used for JavaScript."""
  TEXT_MODE = 'text'
  SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
  DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'
  BLOCK_COMMENT_MODE = 'block_comment'
  DOC_COMMENT_MODE = 'doc_comment'
  DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
  LINE_COMMENT_MODE = 'line_comment'
  PARAMETER_MODE = 'parameter'
  FUNCTION_MODE = 'function'


class JavaScriptTokenizer(tokenizer.Tokenizer):
  """JavaScript tokenizer.

  Convert JavaScript code in to an array of tokens.

  The class body defines the regular expressions used for lexing and groups
  them, per mode of JavaScriptModes, into the JAVASCRIPT_MATCHERS table that
  is handed to the base tokenizer.
  """

  # Useful patterns for JavaScript parsing.
  IDENTIFIER_CHAR = r'A-Za-z0-9_$.'

  # Number patterns based on:
  # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
  # MANTISSA contains verbose-mode comments, so every pattern built from it
  # must be compiled with re.VERBOSE.
  MANTISSA = r"""
             (\d+(?!\.)) |                # Matches '10'
             (\d+\.(?!\d)) |              # Matches '10.'
             (\d*\.\d+)                   # Matches '.5' or '10.5'
             """
  DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
  HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
  NUMBER = re.compile(r"""
                      ((%s)|(%s))
                      """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)

  # Strings come in three parts - first we match the start of the string, then
  # the contents, then the end.  The contents consist of any character except a
  # backslash or end of string, or a backslash followed by any character, or a
  # backslash followed by end of line to support correct parsing of multi-line
  # strings.
  SINGLE_QUOTE = re.compile(r"'")
  SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
  DOUBLE_QUOTE = re.compile(r'"')
  DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')

  START_SINGLE_LINE_COMMENT = re.compile(r'//')
  END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')

  START_DOC_COMMENT = re.compile(r'/\*\*')
  START_BLOCK_COMMENT = re.compile(r'/\*')
  END_BLOCK_COMMENT = re.compile(r'\*/')
  BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')

  # Comment text is anything that we are not going to parse into another special
  # token like (inline) flags or end comments. Complicated regex to match
  # most normal characters, and '*', '{', '}', and '@' when we are sure that
  # it is safe. Expression [^*{}\s]@ must come first, or the other options will
  # match everything before @, and we won't match @'s that aren't part of flags
  # like in email addresses in the @author tag.
  DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
  DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')

  # Match the prefix ' * ' that starts every line of jsdoc. Want to include
  # spaces after the '*', but nothing else that occurs after a '*', and don't
  # want to match the '*' in '*/'.
  DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')

  START_BLOCK = re.compile('{')
  END_BLOCK = re.compile('}')

  REGEX_CHARACTER_CLASS = r"""
                          \[               # Opening bracket
                          ([^\]\\]|\\.)*   # Anything but a ] or \,
                                           # or a backslash followed by anything
                          \]               # Closing bracket
                          """
  # We ensure the regex is followed by one of the tokens listed below to avoid
  # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
  POST_REGEX_LIST = [
      ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']

  REGEX = re.compile(r"""
                     /                      # opening slash
                     (?!\*)                 # not the start of a comment
                     (\\.|[^\[\/\\]|(%s))*  # a backslash followed by anything,
                                            # or anything but a / or [ or \,
                                            # or a character class
                     /                      # closing slash
                     [gimsx]*               # optional modifiers
                     (?=\s*(%s))
                     """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
                     re.VERBOSE)

  ANYTHING = re.compile(r'.*')
  PARAMETERS = re.compile(r'[^\)]+')
  CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')

  FUNCTION_DECLARATION = re.compile(r'\bfunction\b')

  OPENING_PAREN = re.compile(r'\(')
  CLOSING_PAREN = re.compile(r'\)')

  OPENING_BRACKET = re.compile(r'\[')
  CLOSING_BRACKET = re.compile(r'\]')

  # We omit these JS keywords from the list:
  #   function - covered by FUNCTION_DECLARATION.
  #   delete, in, instanceof, new, typeof - included as operators.
  #   this - included in identifiers.
  #   null, undefined - not included, should go in some "special constant" list.
  KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
                  'finally', 'for', 'if', 'return', 'switch', 'throw', 'try',
                  'var', 'while', 'with']
  # Match a keyword string followed by a non-identifier character in order to
  # not match something like doSomething as do + Something.
  KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
      '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))

  # List of regular expressions to match as operators.  Some notes: for our
  # purposes, the comma behaves similarly enough to a normal operator that we
  # include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
  # characters - this may not match some very esoteric uses of the in operator.
  # Operators that are subsets of larger operators must come later in this list
  # for proper matching, e.g., '>>' must come AFTER '>>>'.
  # Every entry that contains a backslash is a raw string so that no invalid
  # string escape sequences (such as '\^' or '\?') reach the Python parser.
  OPERATOR_LIST = [',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
                   '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
                   '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=',
                   '%', '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!',
                   ':', r'\?',
                   r'\bdelete\b', r'\bin\b', r'\binstanceof\b', r'\bnew\b',
                   r'\btypeof\b', r'\bvoid\b']
  OPERATOR = re.compile('|'.join(OPERATOR_LIST))

  WHITESPACE = re.compile(r'\s+')
  SEMICOLON = re.compile(r';')
  # Technically JavaScript identifiers can't contain '.', but we treat a set of
  # nested identifiers as a single identifier.
  NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
  IDENTIFIER = re.compile(NESTED_IDENTIFIER)

  SIMPLE_LVALUE = re.compile(r"""
                             (?P<identifier>%s)      # a valid identifier
                             (?=\s*                  # optional whitespace
                             \=                      # look ahead to equal sign
                             (?!=))                  # not follwed by equal
                             """ % NESTED_IDENTIFIER, re.VERBOSE)

  # A doc flag is a @ sign followed by letters that appears at the beginning of
  # the line or after whitespace.  The look-behind check is necessary to not
  # match someone (at] google.com as a flag.  Flags that directly follow a '{'
  # are matched separately by DOC_INLINE_FLAG below.
  DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
  # To properly parse parameter names, we need to tokenize whitespace into a
  # token.
  DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
                                   '|'.join(['param']))

  DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')

  # Star followed by non-slash, i.e a star that does not end a comment.
  # This is used for TYPE_GROUP below.
  SAFE_STAR = r'(\*(?!/))'

  COMMON_DOC_MATCHERS = [
      # Find the end of the comment.
      Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
              JavaScriptModes.TEXT_MODE),

      # Tokenize documented flags like @private.
      Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
      Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
              JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),

      # Encountering a doc flag should leave lex spaces mode.
      Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),

      # Tokenize braces so we can find types.
      Matcher(START_BLOCK, Type.DOC_START_BRACE),
      Matcher(END_BLOCK, Type.DOC_END_BRACE),
      Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]

  # The token matcher groups work as follows: it is a list of Matcher objects.
  # The matchers will be tried in this order, and the first to match will be
  # returned.  Hence the order is important because the matchers that come first
  # overrule the matchers that come later.
  JAVASCRIPT_MATCHERS = {
      # Matchers for basic text mode.
      JavaScriptModes.TEXT_MODE: [
          # Check a big group - strings, starting comments, and regexes - all
          # of which could be intertwined.  'string with /regex/',
          # /regex with 'string'/, /* comment with /regex/ and string */ (and so
          # on)
          Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
                  JavaScriptModes.DOC_COMMENT_MODE),
          Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
                  JavaScriptModes.BLOCK_COMMENT_MODE),
          Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
                  Type.START_SINGLE_LINE_COMMENT),
          Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
                  JavaScriptModes.LINE_COMMENT_MODE),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
                  JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
                  JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
          Matcher(REGEX, Type.REGEX),

          # Next we check for start blocks appearing outside any of the items
          # above.
          Matcher(START_BLOCK, Type.START_BLOCK),
          Matcher(END_BLOCK, Type.END_BLOCK),

          # Then we search for function declarations.
          Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
                  JavaScriptModes.FUNCTION_MODE),

          # Next, we convert non-function related parens to tokens.
          Matcher(OPENING_PAREN, Type.START_PAREN),
          Matcher(CLOSING_PAREN, Type.END_PAREN),

          # Next, we convert brackets to tokens.
          Matcher(OPENING_BRACKET, Type.START_BRACKET),
          Matcher(CLOSING_BRACKET, Type.END_BRACKET),

          # Find numbers.  This has to happen before operators because
          # scientific notation numbers can have + and - in them.
          Matcher(NUMBER, Type.NUMBER),

          # Find operators and simple assignments
          Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
          Matcher(OPERATOR, Type.OPERATOR),

          # Find key words and whitespace.
          Matcher(KEYWORD, Type.KEYWORD),
          Matcher(WHITESPACE, Type.WHITESPACE),

          # Find identifiers.
          Matcher(IDENTIFIER, Type.IDENTIFIER),

          # Finally, we convert semicolons to tokens.
          Matcher(SEMICOLON, Type.SEMICOLON)],

      # Matchers for single quote strings.
      JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
          Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for double quote strings.
      JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
          Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
          Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
                  JavaScriptModes.TEXT_MODE)],

      # Matchers for block comments.
      JavaScriptModes.BLOCK_COMMENT_MODE: [
          # First we check for exiting a block comment.
          Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
                  JavaScriptModes.TEXT_MODE),

          # Match non-comment-ending text..
          Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],

      # Matchers for doc comments.
      JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
          Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],

      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
          Matcher(WHITESPACE, Type.COMMENT),
          Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],

      # Matchers for single line comments.
      JavaScriptModes.LINE_COMMENT_MODE: [
          # We greedy match until the end of the line in line comment mode.
          Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],

      # Matchers for code after the function keyword.
      JavaScriptModes.FUNCTION_MODE: [
          # Must match open paren before anything else and move into parameter
          # mode, otherwise everything inside the parameter list is parsed
          # incorrectly.
          Matcher(OPENING_PAREN, Type.START_PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE),
          Matcher(WHITESPACE, Type.WHITESPACE),
          Matcher(IDENTIFIER, Type.FUNCTION_NAME)],

      # Matchers for function parameters
      JavaScriptModes.PARAMETER_MODE: [
          # When in function parameter mode, a closing paren is treated
          # specially.  Everything else is treated as lines of parameters.
          Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
                  JavaScriptModes.TEXT_MODE),
          Matcher(PARAMETERS, Type.PARAMETERS,
                  JavaScriptModes.PARAMETER_MODE)]}

  # When text is not matched, it is given this default type based on mode.
  # If unspecified in this map, the default default is Type.NORMAL.
  JAVASCRIPT_DEFAULT_TYPES = {
      JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
      JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
  }

  def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments. Defaults to parsing JsDoc.
    """
    matchers = self.JAVASCRIPT_MATCHERS
    if not parse_js_doc:
      # Only the mode -> matcher-list mapping is rebound, so a shallow copy is
      # enough to leave the class-level table untouched.  A deep copy would
      # needlessly duplicate every Matcher, and compiled patterns cannot be
      # deep-copied before Python 3.7.
      matchers = copy.copy(matchers)
      # Lex doc comments with the plain block comment matchers.
      matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
          JavaScriptModes.BLOCK_COMMENT_MODE]

    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)

  def _CreateToken(self, string, token_type, line, line_number, values=None):
    """Creates a new JavaScriptToken object.

    Args:
      string: The string of input the token contains.
      token_type: The type of token.
      line: The text of the line this token is in.
      line_number: The line number of the token.
      values: A dict of named values within the token.  For instance, a
        function declaration may have a value called 'name' which captures the
        name of the function.

    Returns:
      The newly constructed javascripttokens.JavaScriptToken.
    """
    return javascripttokens.JavaScriptToken(string, token_type, line,
                                            line_number, values)