Home | History | Annotate | Download | only in extensions
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 #ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
      5 #define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
      6 #pragma once
      7 
      8 #include <functional>
      9 #include <string>
     10 #include <vector>
     11 
     12 class GURL;
     13 
     14 // A pattern that can be used to match URLs. A URLPattern is a very restricted
     15 // subset of URL syntax:
     16 //
     17 // <url-pattern> := <scheme>://<host><path> | '<all_urls>'
     18 // <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
     19 // <host> := '*' | '*.' <anychar except '/' and '*'>+
     20 // <path> := '/' <any chars>
     21 //
     22 // * Host is not used when the scheme is 'file'.
     23 // * The path can have embedded '*' characters which act as glob wildcards.
     24 // * '<all_urls>' is a special pattern that matches any URL that contains a
     25 //   valid scheme (as specified by valid_schemes_).
     26 // * The '*' scheme pattern excludes file URLs.
     27 //
     28 // Examples of valid patterns:
     29 // - http://*/*
     30 // - http://*/foo*
     31 // - https://*.google.com/foo*bar
     32 // - file://monkey*
     33 // - http://127.0.0.1/*
     34 //
     35 // Examples of invalid patterns:
     36 // - http://* -- path not specified
     37 // - http://*foo/bar -- * not allowed as substring of host component
     38 // - http://foo.*.bar/baz -- * must be first component
     39 // - http:/bar -- scheme separator not found
     40 // - foo://* -- invalid scheme
     41 // - chrome:// -- we don't support chrome internal URLs
     42 //
     43 // Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example "This extension will interact with the site
//   'www.google.com'."
     47 // * We'd like to be able to convert as many existing Greasemonkey @include
     48 //   patterns to URLPatterns as possible. Greasemonkey @include patterns are
     49 //   simple globs, so this won't be perfect.
     50 // * Although we would like to support any scheme, it isn't clear what to tell
     51 //   users about URLPatterns that affect data or javascript URLs, so those are
     52 //   left out for now.
     53 //
     54 // From a 2008-ish crawl of userscripts.org, the following patterns were found
     55 // in @include lines:
     56 // - total lines                    : 24471
     57 // - @include *                     :   919
     58 // - @include http://[^\*]+?/       : 11128 (no star in host)
     59 // - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
     60 // - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
     61 //                                           appear to only need subdomain
     62 //                                           matching, not real prefix matching)
     63 // - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
     64 // - @include contains .tld         :   297 (host suffixed by .tld -- a special
     65 //                                           Greasemonkey domain component that
     66 //                                           tries to match all valid registry-
     67 //                                           controlled suffixes)
     68 // - @include http://\*/            :   228 (host is * exactly, but there is
     69 //                                           more to the pattern)
     70 //
     71 // So, we can support at least half of current @include lines without supporting
     72 // subdomain matching. We can pick up at least another 10% by supporting
     73 // subdomain matching. It is probably possible to coerce more of the existing
     74 // patterns to URLPattern, but the resulting pattern will be more restrictive
     75 // than the original glob, which is probably better than nothing.
     76 class URLPattern {
     77  public:
     78   // A collection of scheme bitmasks for use with valid_schemes.
     79   enum SchemeMasks {
     80     SCHEME_NONE       = 0,
     81     SCHEME_HTTP       = 1 << 0,
     82     SCHEME_HTTPS      = 1 << 1,
     83     SCHEME_FILE       = 1 << 2,
     84     SCHEME_FTP        = 1 << 3,
     85     SCHEME_CHROMEUI   = 1 << 4,
     86     SCHEME_FILESYSTEM = 1 << 5,
     87     // SCHEME_ALL will match every scheme, including chrome://, chrome-
     88     // extension://, about:, etc. Because this has lots of security
     89     // implications, third-party extensions should never be able to get access
     90     // to URL patterns initialized this way. It should only be used for internal
     91     // Chrome code.
     92     SCHEME_ALL      = -1,
     93   };
     94 
     95   // Options for URLPattern::Parse().
     96   enum ParseOption {
     97     PARSE_LENIENT,
     98     PARSE_STRICT
     99   };
    100 
    101   // Error codes returned from Parse().
    102   enum ParseResult {
    103     PARSE_SUCCESS = 0,
    104     PARSE_ERROR_MISSING_SCHEME_SEPARATOR,
    105     PARSE_ERROR_INVALID_SCHEME,
    106     PARSE_ERROR_WRONG_SCHEME_SEPARATOR,
    107     PARSE_ERROR_EMPTY_HOST,
    108     PARSE_ERROR_INVALID_HOST_WILDCARD,
    109     PARSE_ERROR_EMPTY_PATH,
    110     PARSE_ERROR_HAS_COLON,  // Only checked when strict checks are enabled.
    111     NUM_PARSE_RESULTS
    112   };
    113 
    114   // The <all_urls> string pattern.
    115   static const char kAllUrlsPattern[];
    116 
    117   // Construct an URLPattern with the given set of allowable schemes. See
    118   // valid_schemes_ for more info.
    119   explicit URLPattern(int valid_schemes);
    120 
    121   // Convenience to construct a URLPattern from a string. The string is expected
    122   // to be a valid pattern. If the string is not known ahead of time, use
    123   // Parse() instead, which returns success or failure.
    124   URLPattern(int valid_schemes, const std::string& pattern);
    125 
    126 #if defined(_MSC_VER) && _MSC_VER >= 1600
    127   // Note: don't use this directly. This exists so URLPattern can be used
    128   // with STL containers.  Starting with Visual Studio 2010, we can't have this
    129   // method private and use "friend class std::vector<URLPattern>;" as we used
    130   // to do.
    131   URLPattern();
    132 #endif
    133 
    134   ~URLPattern();
    135 
    136   // Gets the bitmask of valid schemes.
    137   int valid_schemes() const { return valid_schemes_; }
    138   void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }
    139 
    140   // Gets the host the pattern matches. This can be an empty string if the
    141   // pattern matches all hosts (the input was <scheme>://*/<whatever>).
    142   const std::string& host() const { return host_; }
    143   void set_host(const std::string& host) { host_ = host; }
    144 
    145   // Gets whether to match subdomains of host().
    146   bool match_subdomains() const { return match_subdomains_; }
    147   void set_match_subdomains(bool val) { match_subdomains_ = val; }
    148 
    149   // Gets the path the pattern matches with the leading slash. This can have
    150   // embedded asterisks which are interpreted using glob rules.
    151   const std::string& path() const { return path_; }
    152   void SetPath(const std::string& path);
    153 
    154   // Returns true if this pattern matches all urls.
    155   bool match_all_urls() const { return match_all_urls_; }
    156   void set_match_all_urls(bool val) { match_all_urls_ = val; }
    157 
    158   // Initializes this instance by parsing the provided string. Returns
    159   // URLPattern::PARSE_SUCCESS on success, or an error code otherwise. On
    160   // failure, this instance will have some intermediate values and is in an
    161   // invalid state.  Adding error checks to URLPattern::Parse() can cause
    162   // patterns in installed extensions to fail.  If an installed extension
    163   // uses a pattern that was valid but fails a new error check, the
    164   // extension will fail to load when chrome is auto-updated.  To avoid
    165   // this, new parse checks are enabled only when |strictness| is
    166   // OPTION_STRICT.  OPTION_STRICT should be used when loading in developer
    167   // mode, or when an extension's patterns are controlled by chrome (such
    168   // as component extensions).
    169   ParseResult Parse(const std::string& pattern_str,
    170                     ParseOption strictness);
    171 
    172   // Sets the scheme for pattern matches. This can be a single '*' if the
    173   // pattern matches all valid schemes (as defined by the valid_schemes_
    174   // property). Returns false on failure (if the scheme is not valid).
    175   bool SetScheme(const std::string& scheme);
    176   // Note: You should use MatchesScheme() instead of this getter unless you
    177   // absolutely need the exact scheme. This is exposed for testing.
    178   const std::string& scheme() const { return scheme_; }
    179 
    180   // Returns true if the specified scheme can be used in this URL pattern, and
    181   // false otherwise. Uses valid_schemes_ to determine validity.
    182   bool IsValidScheme(const std::string& scheme) const;
    183 
    184   // Returns true if this instance matches the specified URL.
    185   bool MatchesUrl(const GURL& url) const;
    186 
    187   // Returns true if |test| matches our scheme.
    188   bool MatchesScheme(const std::string& test) const;
    189 
    190   // Returns true if |test| matches our host.
    191   bool MatchesHost(const std::string& test) const;
    192   bool MatchesHost(const GURL& test) const;
    193 
    194   // Returns true if |test| matches our path.
    195   bool MatchesPath(const std::string& test) const;
    196 
    197   // Returns a string representing this instance.
    198   std::string GetAsString() const;
    199 
    200   // Determine whether there is a URL that would match this instance and another
    201   // instance. This method is symmetrical: Calling other.OverlapsWith(this)
    202   // would result in the same answer.
    203   bool OverlapsWith(const URLPattern& other) const;
    204 
    205   // Convert this URLPattern into an equivalent set of URLPatterns that don't
    206   // use a wildcard in the scheme component. If this URLPattern doesn't use a
    207   // wildcard scheme, then the returned set will contain one element that is
    208   // equivalent to this instance.
    209   std::vector<URLPattern> ConvertToExplicitSchemes() const;
    210 
    211   static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
    212     if (a.match_all_urls_ && b.match_all_urls_)
    213       return false;
    214     return a.host_.compare(b.host_) < 0;
    215   };
    216 
    217   // Used for origin comparisons in a std::set.
    218   class EffectiveHostCompareFunctor {
    219    public:
    220     bool operator()(const URLPattern& a, const URLPattern& b) const {
    221       return EffectiveHostCompare(a, b);
    222     };
    223   };
    224 
    225   // Get an error string for a ParseResult.
    226   static const char* GetParseResultString(URLPattern::ParseResult parse_result);
    227 
    228  private:
    229 #if !(defined(_MSC_VER) && _MSC_VER >= 1600)
    230   friend class std::vector<URLPattern>;
    231 
    232   // Note: don't use this directly. This exists so URLPattern can be used
    233   // with STL containers.
    234   URLPattern();
    235 #endif
    236 
    237   // A bitmask containing the schemes which are considered valid for this
    238   // pattern. Parse() uses this to decide whether a pattern contains a valid
    239   // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
    240   // matches a given test scheme.
    241   int valid_schemes_;
    242 
    243   // True if this is a special-case "<all_urls>" pattern.
    244   bool match_all_urls_;
    245 
    246   // The scheme for the pattern.
    247   std::string scheme_;
    248 
    249   // The host without any leading "*" components.
    250   std::string host_;
    251 
    252   // Whether we should match subdomains of the host. This is true if the first
    253   // component of the pattern's host was "*".
    254   bool match_subdomains_;
    255 
    256   // The path to match. This is everything after the host of the URL, or
    257   // everything after the scheme in the case of file:// URLs.
    258   std::string path_;
    259 
    260   // The path with "?" and "\" characters escaped for use with the
    261   // MatchPattern() function.
    262   std::string path_escaped_;
    263 };
    264 
// A list of URLPatterns, stored in a std::vector.
typedef std::vector<URLPattern> URLPatternList;
    266 
    267 #endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
    268