// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#pragma once

#include <functional>
#include <string>
#include <vector>

class GURL;

// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><path> | '<all_urls>'
// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <anychar except '/' and '*'>+
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
// * '<all_urls>' is a special pattern that matches any URL that contains a
//   valid scheme (as specified by valid_schemes_).
// * The '*' scheme pattern excludes file URLs.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - file://monkey*
// - http://127.0.0.1/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
// - chrome:// -- we don't support chrome internal URLs
//
// Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example "This extension will interact with the site
//   'www.google.com'."
// * We'd like to be able to convert as many existing Greasemonkey @include
//   patterns to URLPatterns as possible. Greasemonkey @include patterns are
//   simple globs, so this won't be perfect.
// * Although we would like to support any scheme, it isn't clear what to tell
//   users about URLPatterns that affect data or javascript URLs, so those are
//   left out for now.
//
// From a 2008-ish crawl of userscripts.org, the following patterns were found
// in @include lines:
// - total lines                    : 24471
// - @include *                     : 919
// - @include http://[^\*]+?/       : 11128 (no star in host)
// - @include http://\*\.[^\*]+?/   : 2325 (host prefixed by *.)
// - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many
//                                    appear to only need subdomain
//                                    matching, not real prefix matching)
// - @include http://[^\*/]+\*/     : 320 (host suffixed by *)
// - @include contains .tld         : 297 (host suffixed by .tld -- a special
//                                    Greasemonkey domain component that
//                                    tries to match all valid registry-
//                                    controlled suffixes)
// - @include http://\*/            : 228 (host is * exactly, but there is
//                                    more to the pattern)
//
// So, we can support at least half of current @include lines without supporting
// subdomain matching. We can pick up at least another 10% by supporting
// subdomain matching. It is probably possible to coerce more of the existing
// patterns to URLPattern, but the resulting pattern will be more restrictive
// than the original glob, which is probably better than nothing.
class URLPattern {
 public:
  // A collection of scheme bitmasks for use with valid_schemes.
  enum SchemeMasks {
    SCHEME_NONE       = 0,
    SCHEME_HTTP       = 1 << 0,
    SCHEME_HTTPS      = 1 << 1,
    SCHEME_FILE       = 1 << 2,
    SCHEME_FTP        = 1 << 3,
    SCHEME_CHROMEUI   = 1 << 4,
    SCHEME_FILESYSTEM = 1 << 5,
    // SCHEME_ALL will match every scheme, including chrome://, chrome-
    // extension://, about:, etc. Because this has lots of security
    // implications, third-party extensions should never be able to get access
    // to URL patterns initialized this way. It should only be used for internal
    // Chrome code.
    SCHEME_ALL        = -1,
  };

  // Options for URLPattern::Parse().
  enum ParseOption {
    PARSE_LENIENT,
    PARSE_STRICT
  };

  // Error codes returned from Parse().
  enum ParseResult {
    PARSE_SUCCESS = 0,
    PARSE_ERROR_MISSING_SCHEME_SEPARATOR,
    PARSE_ERROR_INVALID_SCHEME,
    PARSE_ERROR_WRONG_SCHEME_SEPARATOR,
    PARSE_ERROR_EMPTY_HOST,
    PARSE_ERROR_INVALID_HOST_WILDCARD,
    PARSE_ERROR_EMPTY_PATH,
    PARSE_ERROR_HAS_COLON,  // Only checked when strict checks are enabled.
    NUM_PARSE_RESULTS
  };

  // The <all_urls> string pattern.
  static const char kAllUrlsPattern[];

  // Construct an URLPattern with the given set of allowable schemes. See
  // valid_schemes_ for more info.
  explicit URLPattern(int valid_schemes);

  // Convenience to construct a URLPattern from a string. The string is expected
  // to be a valid pattern. If the string is not known ahead of time, use
  // Parse() instead, which returns success or failure.
  URLPattern(int valid_schemes, const std::string& pattern);

#if defined(_MSC_VER) && _MSC_VER >= 1600
  // Note: don't use this directly. This exists so URLPattern can be used
  // with STL containers. Starting with Visual Studio 2010, we can't have this
  // method private and use "friend class std::vector<URLPattern>;" as we used
  // to do.
  URLPattern();
#endif

  ~URLPattern();

  // Gets the bitmask of valid schemes.
  int valid_schemes() const { return valid_schemes_; }
  void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }
  void set_host(const std::string& host) { host_ = host; }

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }
  void set_match_subdomains(bool val) { match_subdomains_ = val; }

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }
  void SetPath(const std::string& path);

  // Returns true if this pattern matches all urls.
  bool match_all_urls() const { return match_all_urls_; }
  void set_match_all_urls(bool val) { match_all_urls_ = val; }

  // Initializes this instance by parsing the provided string. Returns
  // URLPattern::PARSE_SUCCESS on success, or an error code otherwise. On
  // failure, this instance will have some intermediate values and is in an
  // invalid state. Adding error checks to URLPattern::Parse() can cause
  // patterns in installed extensions to fail. If an installed extension
  // uses a pattern that was valid but fails a new error check, the
  // extension will fail to load when chrome is auto-updated. To avoid
  // this, new parse checks are enabled only when |strictness| is
  // PARSE_STRICT. PARSE_STRICT should be used when loading in developer
  // mode, or when an extension's patterns are controlled by chrome (such
  // as component extensions).
  ParseResult Parse(const std::string& pattern_str,
                    ParseOption strictness);

  // Sets the scheme for pattern matches. This can be a single '*' if the
  // pattern matches all valid schemes (as defined by the valid_schemes_
  // property). Returns false on failure (if the scheme is not valid).
  bool SetScheme(const std::string& scheme);
  // Note: You should use MatchesScheme() instead of this getter unless you
  // absolutely need the exact scheme. This is exposed for testing.
  const std::string& scheme() const { return scheme_; }

  // Returns true if the specified scheme can be used in this URL pattern, and
  // false otherwise. Uses valid_schemes_ to determine validity.
  bool IsValidScheme(const std::string& scheme) const;

  // Returns true if this instance matches the specified URL.
  bool MatchesUrl(const GURL& url) const;

  // Returns true if |test| matches our scheme.
  bool MatchesScheme(const std::string& test) const;

  // Returns true if |test| matches our host.
  bool MatchesHost(const std::string& test) const;
  bool MatchesHost(const GURL& test) const;

  // Returns true if |test| matches our path.
  bool MatchesPath(const std::string& test) const;

  // Returns a string representing this instance.
  std::string GetAsString() const;

  // Determine whether there is a URL that would match this instance and another
  // instance. This method is symmetrical: Calling other.OverlapsWith(this)
  // would result in the same answer.
  bool OverlapsWith(const URLPattern& other) const;

  // Convert this URLPattern into an equivalent set of URLPatterns that don't
  // use a wildcard in the scheme component. If this URLPattern doesn't use a
  // wildcard scheme, then the returned set will contain one element that is
  // equivalent to this instance.
  std::vector<URLPattern> ConvertToExplicitSchemes() const;

  // "Less than" predicate that orders patterns by host only. Two patterns
  // that are both <all_urls> patterns never compare less than one another
  // (i.e. they are treated as equivalent); in every other case |a| is ordered
  // before |b| iff a's host string compares lexicographically less than b's.
  // The scheme, path and match_subdomains flag are not consulted.
  static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
    if (a.match_all_urls_ && b.match_all_urls_)
      return false;
    return a.host_.compare(b.host_) < 0;
  }

  // Used for origin comparisons in a std::set. Forwards to
  // EffectiveHostCompare().
  class EffectiveHostCompareFunctor {
   public:
    bool operator()(const URLPattern& a, const URLPattern& b) const {
      return EffectiveHostCompare(a, b);
    }
  };

  // Get an error string for a ParseResult.
  static const char* GetParseResultString(URLPattern::ParseResult parse_result);

 private:
#if !(defined(_MSC_VER) && _MSC_VER >= 1600)
  friend class std::vector<URLPattern>;

  // Note: don't use this directly. This exists so URLPattern can be used
  // with STL containers.
  URLPattern();
#endif

  // A bitmask containing the schemes which are considered valid for this
  // pattern. Parse() uses this to decide whether a pattern contains a valid
  // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
  // matches a given test scheme.
  int valid_schemes_;

  // True if this is a special-case "<all_urls>" pattern.
  bool match_all_urls_;

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPattern() function.
  std::string path_escaped_;
};

typedef std::vector<URLPattern> URLPatternList;

#endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_