Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
      6 #define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
      7 
      8 #include <limits>
      9 #include <map>
     10 #include <set>
     11 #include <string>
     12 #include <vector>
     13 
     14 #include "base/basictypes.h"
     15 #include "components/url_matcher/string_pattern.h"
     16 #include "components/url_matcher/url_matcher_export.h"
     17 
     18 namespace url_matcher {
     19 
     20 // Class that store a set of string patterns and can find for a string S,
     21 // which string patterns occur in S.
     22 class URL_MATCHER_EXPORT SubstringSetMatcher {
     23  public:
     24   SubstringSetMatcher();
     25   ~SubstringSetMatcher();
     26 
     27   // Registers all |patterns|. The ownership remains with the caller.
     28   // The same pattern cannot be registered twice and each pattern needs to have
     29   // a unique ID.
     30   // Ownership of the patterns remains with the caller.
     31   void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
     32 
     33   // Unregisters the passed |patterns|.
     34   void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
     35 
     36   // Analogous to RegisterPatterns and UnregisterPatterns but executes both
     37   // operations in one step, which is cheaper in the execution.
     38   void RegisterAndUnregisterPatterns(
     39       const std::vector<const StringPattern*>& to_register,
     40       const std::vector<const StringPattern*>& to_unregister);
     41 
     42   // Matches |text| against all registered StringPatterns. Stores the IDs
     43   // of matching patterns in |matches|. |matches| is not cleared before adding
     44   // to it.
     45   bool Match(const std::string& text,
     46              std::set<StringPattern::ID>* matches) const;
     47 
     48   // Returns true if this object retains no allocated data. Only for debugging.
     49   bool IsEmpty() const;
     50 
     51  private:
     52   // A node of an Aho Corasick Tree. This is implemented according to
     53   // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
     54   //
     55   // The algorithm is based on the idea of building a trie of all registered
     56   // patterns. Each node of the tree is annotated with a set of pattern
     57   // IDs that are used to report matches.
     58   //
     59   // The root of the trie represents an empty match. If we were looking whether
     60   // any registered pattern matches a text at the beginning of the text (i.e.
     61   // whether any pattern is a prefix of the text), we could just follow
     62   // nodes in the trie according to the matching characters in the text.
     63   // E.g., if text == "foobar", we would follow the trie from the root node
     64   // to its child labeled 'f', from there to child 'o', etc. In this process we
     65   // would report all pattern IDs associated with the trie nodes as matches.
     66   //
     67   // As we are not looking for all prefix matches but all substring matches,
     68   // this algorithm would need to compare text.substr(0), text.substr(1), ...
     69   // against the trie, which is in O(|text|^2).
     70   //
     71   // The Aho Corasick algorithm improves this runtime by using failure edges.
     72   // In case we have found a partial match of length k in the text
     73   // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
     74   // a node at depth k, but cannot find a match in the trie for character
     75   // text[i + k] at depth k + 1, we follow a failure edge. This edge
     76   // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
     77   // is a prefix of any registered pattern.
     78   //
     79   // If your brain thinks "Forget it, let's go shopping.", don't worry.
     80   // Take a nap and read an introductory text on the Aho Corasick algorithm.
     81   // It will make sense. Eventually.
     82   class AhoCorasickNode {
     83    public:
     84     // Key: label of the edge, value: node index in |tree_| of parent class.
     85     typedef std::map<char, uint32> Edges;
     86     typedef std::set<StringPattern::ID> Matches;
     87 
     88     static const uint32 kNoSuchEdge;  // Represents an invalid node index.
     89 
     90     AhoCorasickNode();
     91     ~AhoCorasickNode();
     92     AhoCorasickNode(const AhoCorasickNode& other);
     93     AhoCorasickNode& operator=(const AhoCorasickNode& other);
     94 
     95     uint32 GetEdge(char c) const;
     96     void SetEdge(char c, uint32 node);
     97     const Edges& edges() const { return edges_; }
     98 
     99     uint32 failure() const { return failure_; }
    100     void set_failure(uint32 failure) { failure_ = failure; }
    101 
    102     void AddMatch(StringPattern::ID id);
    103     void AddMatches(const Matches& matches);
    104     const Matches& matches() const { return matches_; }
    105 
    106    private:
    107     // Outgoing edges of current node.
    108     Edges edges_;
    109 
    110     // Node index that failure edge leads to.
    111     uint32 failure_;
    112 
    113     // Identifiers of matches.
    114     Matches matches_;
    115   };
    116 
    117   typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
    118   typedef std::vector<const StringPattern*> SubstringPatternVector;
    119 
    120   // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
    121   void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
    122 
    123   // Inserts a path for |pattern->pattern()| into the tree and adds
    124   // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
    125   // the caller.
    126   void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
    127   void CreateFailureEdges();
    128 
    129   // Set of all registered StringPatterns. Used to regenerate the
    130   // Aho-Corasick tree in case patterns are registered or unregistered.
    131   SubstringPatternMap patterns_;
    132 
    133   // The nodes of a Aho-Corasick tree.
    134   std::vector<AhoCorasickNode> tree_;
    135 
    136   DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
    137 };
    138 
    139 }  // namespace url_matcher
    140 
    141 #endif  // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
    142