Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/url_matcher/substring_set_matcher.h"
      6 
      7 #include <algorithm>
      8 #include <queue>
      9 
     10 #include "base/logging.h"
     11 #include "base/stl_util.h"
     12 
     13 namespace url_matcher {
     14 
     15 namespace {
     16 
     17 // Compare StringPattern instances based on their string patterns.
     18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
     19   return a->pattern() < b->pattern();
     20 }
     21 
     22 // Given the set of patterns, compute how many nodes will the corresponding
     23 // Aho-Corasick tree have. Note that |patterns| need to be sorted.
     24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) {
     25   uint32 result = 1u;  // 1 for the root node.
     26   if (patterns.empty())
     27     return result;
     28 
     29   std::vector<const StringPattern*>::const_iterator last = patterns.begin();
     30   std::vector<const StringPattern*>::const_iterator current = last + 1;
     31   // For the first pattern, each letter is a label of an edge to a new node.
     32   result += (*last)->pattern().size();
     33 
     34   // For the subsequent patterns, only count the edges which were not counted
     35   // yet. For this it suffices to test against the previous pattern, because the
     36   // patterns are sorted.
     37   for (; current != patterns.end(); ++last, ++current) {
     38     const std::string& last_pattern = (*last)->pattern();
     39     const std::string& current_pattern = (*current)->pattern();
     40     const uint32 prefix_bound =
     41         std::min(last_pattern.size(), current_pattern.size());
     42 
     43     uint32 common_prefix = 0;
     44     while (common_prefix < prefix_bound &&
     45            last_pattern[common_prefix] == current_pattern[common_prefix])
     46       ++common_prefix;
     47     result += current_pattern.size() - common_prefix;
     48   }
     49   return result;
     50 }
     51 
     52 }  // namespace
     53 
     54 //
     55 // SubstringSetMatcher
     56 //
     57 
     58 SubstringSetMatcher::SubstringSetMatcher() {
     59   RebuildAhoCorasickTree(SubstringPatternVector());
     60 }
     61 
     62 SubstringSetMatcher::~SubstringSetMatcher() {}
     63 
     64 void SubstringSetMatcher::RegisterPatterns(
     65     const std::vector<const StringPattern*>& patterns) {
     66   RegisterAndUnregisterPatterns(patterns,
     67                                 std::vector<const StringPattern*>());
     68 }
     69 
     70 void SubstringSetMatcher::UnregisterPatterns(
     71     const std::vector<const StringPattern*>& patterns) {
     72   RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(),
     73                                 patterns);
     74 }
     75 
     76 void SubstringSetMatcher::RegisterAndUnregisterPatterns(
     77       const std::vector<const StringPattern*>& to_register,
     78       const std::vector<const StringPattern*>& to_unregister) {
     79   // Register patterns.
     80   for (std::vector<const StringPattern*>::const_iterator i =
     81       to_register.begin(); i != to_register.end(); ++i) {
     82     DCHECK(patterns_.find((*i)->id()) == patterns_.end());
     83     patterns_[(*i)->id()] = *i;
     84   }
     85 
     86   // Unregister patterns
     87   for (std::vector<const StringPattern*>::const_iterator i =
     88       to_unregister.begin(); i != to_unregister.end(); ++i) {
     89     patterns_.erase((*i)->id());
     90   }
     91 
     92   // Now we compute the total number of tree nodes needed.
     93   SubstringPatternVector sorted_patterns;
     94   sorted_patterns.resize(patterns_.size());
     95 
     96   size_t next = 0;
     97   for (SubstringPatternMap::const_iterator i = patterns_.begin();
     98        i != patterns_.end();
     99        ++i, ++next) {
    100     sorted_patterns[next] = i->second;
    101   }
    102 
    103   std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
    104   tree_.reserve(TreeSize(sorted_patterns));
    105 
    106   RebuildAhoCorasickTree(sorted_patterns);
    107 }
    108 
    109 bool SubstringSetMatcher::Match(const std::string& text,
    110                                 std::set<StringPattern::ID>* matches) const {
    111   const size_t old_number_of_matches = matches->size();
    112 
    113   // Handle patterns matching the empty string.
    114   matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
    115 
    116   uint32 current_node = 0;
    117   for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
    118     uint32 edge_from_current = tree_[current_node].GetEdge(*i);
    119     while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
    120            current_node != 0) {
    121       current_node = tree_[current_node].failure();
    122       edge_from_current = tree_[current_node].GetEdge(*i);
    123     }
    124     if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
    125       current_node = edge_from_current;
    126       matches->insert(tree_[current_node].matches().begin(),
    127                       tree_[current_node].matches().end());
    128     } else {
    129       DCHECK_EQ(0u, current_node);
    130     }
    131   }
    132 
    133   return old_number_of_matches != matches->size();
    134 }
    135 
    136 bool SubstringSetMatcher::IsEmpty() const {
    137   // An empty tree consists of only the root node.
    138   return patterns_.empty() && tree_.size() == 1u;
    139 }
    140 
    141 void SubstringSetMatcher::RebuildAhoCorasickTree(
    142     const SubstringPatternVector& sorted_patterns) {
    143   tree_.clear();
    144 
    145   // Initialize root note of tree.
    146   AhoCorasickNode root;
    147   root.set_failure(0);
    148   tree_.push_back(root);
    149 
    150   // Insert all patterns.
    151   for (SubstringPatternVector::const_iterator i = sorted_patterns.begin();
    152        i != sorted_patterns.end();
    153        ++i) {
    154     InsertPatternIntoAhoCorasickTree(*i);
    155   }
    156 
    157   CreateFailureEdges();
    158 }
    159 
    160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
    161     const StringPattern* pattern) {
    162   const std::string& text = pattern->pattern();
    163   const std::string::const_iterator text_end = text.end();
    164 
    165   // Iterators on the tree and the text.
    166   uint32 current_node = 0;
    167   std::string::const_iterator i = text.begin();
    168 
    169   // Follow existing paths for as long as possible.
    170   while (i != text_end) {
    171     uint32 edge_from_current = tree_[current_node].GetEdge(*i);
    172     if (edge_from_current == AhoCorasickNode::kNoSuchEdge)
    173       break;
    174     current_node = edge_from_current;
    175     ++i;
    176   }
    177 
    178   // Create new nodes if necessary.
    179   while (i != text_end) {
    180     tree_.push_back(AhoCorasickNode());
    181     tree_[current_node].SetEdge(*i, tree_.size() - 1);
    182     current_node = tree_.size() - 1;
    183     ++i;
    184   }
    185 
    186   // Register match.
    187   tree_[current_node].AddMatch(pattern->id());
    188 }
    189 
    190 void SubstringSetMatcher::CreateFailureEdges() {
    191   typedef AhoCorasickNode::Edges Edges;
    192 
    193   std::queue<uint32> queue;
    194 
    195   AhoCorasickNode& root = tree_[0];
    196   root.set_failure(0);
    197   const Edges& root_edges = root.edges();
    198   for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
    199        ++e) {
    200     const uint32& leads_to = e->second;
    201     tree_[leads_to].set_failure(0);
    202     queue.push(leads_to);
    203   }
    204 
    205   while (!queue.empty()) {
    206     AhoCorasickNode& current_node = tree_[queue.front()];
    207     queue.pop();
    208     for (Edges::const_iterator e = current_node.edges().begin();
    209          e != current_node.edges().end(); ++e) {
    210       const char& edge_label = e->first;
    211       const uint32& leads_to = e->second;
    212       queue.push(leads_to);
    213 
    214       uint32 failure = current_node.failure();
    215       uint32 edge_from_failure = tree_[failure].GetEdge(edge_label);
    216       while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
    217              failure != 0) {
    218         failure = tree_[failure].failure();
    219         edge_from_failure = tree_[failure].GetEdge(edge_label);
    220       }
    221 
    222       const uint32 follow_in_case_of_failure =
    223           edge_from_failure != AhoCorasickNode::kNoSuchEdge
    224               ? edge_from_failure
    225               : 0;
    226       tree_[leads_to].set_failure(follow_in_case_of_failure);
    227       tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
    228     }
    229   }
    230 }
    231 
    232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF;
    233 
    234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
    235     : failure_(kNoSuchEdge) {}
    236 
    237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
    238 
    239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
    240     const SubstringSetMatcher::AhoCorasickNode& other)
    241     : edges_(other.edges_),
    242       failure_(other.failure_),
    243       matches_(other.matches_) {}
    244 
    245 SubstringSetMatcher::AhoCorasickNode&
    246 SubstringSetMatcher::AhoCorasickNode::operator=(
    247     const SubstringSetMatcher::AhoCorasickNode& other) {
    248   edges_ = other.edges_;
    249   failure_ = other.failure_;
    250   matches_ = other.matches_;
    251   return *this;
    252 }
    253 
    254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
    255   Edges::const_iterator i = edges_.find(c);
    256   return i == edges_.end() ? kNoSuchEdge : i->second;
    257 }
    258 
    259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) {
    260   edges_[c] = node;
    261 }
    262 
    263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
    264   matches_.insert(id);
    265 }
    266 
    267 void SubstringSetMatcher::AhoCorasickNode::AddMatches(
    268     const SubstringSetMatcher::AhoCorasickNode::Matches& matches) {
    269   matches_.insert(matches.begin(), matches.end());
    270 }
    271 
    272 }  // namespace url_matcher
    273