1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/url_matcher/substring_set_matcher.h" 6 7 #include <algorithm> 8 #include <queue> 9 10 #include "base/logging.h" 11 #include "base/stl_util.h" 12 13 namespace url_matcher { 14 15 namespace { 16 17 // Compare StringPattern instances based on their string patterns. 18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { 19 return a->pattern() < b->pattern(); 20 } 21 22 // Given the set of patterns, compute how many nodes will the corresponding 23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. 24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { 25 uint32 result = 1u; // 1 for the root node. 26 if (patterns.empty()) 27 return result; 28 29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); 30 std::vector<const StringPattern*>::const_iterator current = last + 1; 31 // For the first pattern, each letter is a label of an edge to a new node. 32 result += (*last)->pattern().size(); 33 34 // For the subsequent patterns, only count the edges which were not counted 35 // yet. For this it suffices to test against the previous pattern, because the 36 // patterns are sorted. 37 for (; current != patterns.end(); ++last, ++current) { 38 const std::string& last_pattern = (*last)->pattern(); 39 const std::string& current_pattern = (*current)->pattern(); 40 const uint32 prefix_bound = 41 std::min(last_pattern.size(), current_pattern.size()); 42 43 uint32 common_prefix = 0; 44 while (common_prefix < prefix_bound && 45 last_pattern[common_prefix] == current_pattern[common_prefix]) 46 ++common_prefix; 47 result += current_pattern.size() - common_prefix; 48 } 49 return result; 50 } 51 52 } // namespace 53 54 // 55 // SubstringSetMatcher 56 // 57 58 SubstringSetMatcher::SubstringSetMatcher() { 59 RebuildAhoCorasickTree(SubstringPatternVector()); 60 } 61 62 SubstringSetMatcher::~SubstringSetMatcher() {} 63 64 void SubstringSetMatcher::RegisterPatterns( 65 const std::vector<const StringPattern*>& patterns) { 66 RegisterAndUnregisterPatterns(patterns, 67 std::vector<const StringPattern*>()); 68 } 69 70 void SubstringSetMatcher::UnregisterPatterns( 71 const std::vector<const StringPattern*>& patterns) { 72 RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(), 73 patterns); 74 } 75 76 void SubstringSetMatcher::RegisterAndUnregisterPatterns( 77 const std::vector<const StringPattern*>& to_register, 78 const std::vector<const StringPattern*>& to_unregister) { 79 // Register patterns. 80 for (std::vector<const StringPattern*>::const_iterator i = 81 to_register.begin(); i != to_register.end(); ++i) { 82 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); 83 patterns_[(*i)->id()] = *i; 84 } 85 86 // Unregister patterns 87 for (std::vector<const StringPattern*>::const_iterator i = 88 to_unregister.begin(); i != to_unregister.end(); ++i) { 89 patterns_.erase((*i)->id()); 90 } 91 92 // Now we compute the total number of tree nodes needed. 93 SubstringPatternVector sorted_patterns; 94 sorted_patterns.resize(patterns_.size()); 95 96 size_t next = 0; 97 for (SubstringPatternMap::const_iterator i = patterns_.begin(); 98 i != patterns_.end(); 99 ++i, ++next) { 100 sorted_patterns[next] = i->second; 101 } 102 103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); 104 tree_.reserve(TreeSize(sorted_patterns)); 105 106 RebuildAhoCorasickTree(sorted_patterns); 107 } 108 109 bool SubstringSetMatcher::Match(const std::string& text, 110 std::set<StringPattern::ID>* matches) const { 111 const size_t old_number_of_matches = matches->size(); 112 113 // Handle patterns matching the empty string. 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); 115 116 uint32 current_node = 0; 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && 120 current_node != 0) { 121 current_node = tree_[current_node].failure(); 122 edge_from_current = tree_[current_node].GetEdge(*i); 123 } 124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { 125 current_node = edge_from_current; 126 matches->insert(tree_[current_node].matches().begin(), 127 tree_[current_node].matches().end()); 128 } else { 129 DCHECK_EQ(0u, current_node); 130 } 131 } 132 133 return old_number_of_matches != matches->size(); 134 } 135 136 bool SubstringSetMatcher::IsEmpty() const { 137 // An empty tree consists of only the root node. 138 return patterns_.empty() && tree_.size() == 1u; 139 } 140 141 void SubstringSetMatcher::RebuildAhoCorasickTree( 142 const SubstringPatternVector& sorted_patterns) { 143 tree_.clear(); 144 145 // Initialize root note of tree. 146 AhoCorasickNode root; 147 root.set_failure(0); 148 tree_.push_back(root); 149 150 // Insert all patterns. 151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); 152 i != sorted_patterns.end(); 153 ++i) { 154 InsertPatternIntoAhoCorasickTree(*i); 155 } 156 157 CreateFailureEdges(); 158 } 159 160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( 161 const StringPattern* pattern) { 162 const std::string& text = pattern->pattern(); 163 const std::string::const_iterator text_end = text.end(); 164 165 // Iterators on the tree and the text. 166 uint32 current_node = 0; 167 std::string::const_iterator i = text.begin(); 168 169 // Follow existing paths for as long as possible. 170 while (i != text_end) { 171 uint32 edge_from_current = tree_[current_node].GetEdge(*i); 172 if (edge_from_current == AhoCorasickNode::kNoSuchEdge) 173 break; 174 current_node = edge_from_current; 175 ++i; 176 } 177 178 // Create new nodes if necessary. 179 while (i != text_end) { 180 tree_.push_back(AhoCorasickNode()); 181 tree_[current_node].SetEdge(*i, tree_.size() - 1); 182 current_node = tree_.size() - 1; 183 ++i; 184 } 185 186 // Register match. 187 tree_[current_node].AddMatch(pattern->id()); 188 } 189 190 void SubstringSetMatcher::CreateFailureEdges() { 191 typedef AhoCorasickNode::Edges Edges; 192 193 std::queue<uint32> queue; 194 195 AhoCorasickNode& root = tree_[0]; 196 root.set_failure(0); 197 const Edges& root_edges = root.edges(); 198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); 199 ++e) { 200 const uint32& leads_to = e->second; 201 tree_[leads_to].set_failure(0); 202 queue.push(leads_to); 203 } 204 205 while (!queue.empty()) { 206 AhoCorasickNode& current_node = tree_[queue.front()]; 207 queue.pop(); 208 for (Edges::const_iterator e = current_node.edges().begin(); 209 e != current_node.edges().end(); ++e) { 210 const char& edge_label = e->first; 211 const uint32& leads_to = e->second; 212 queue.push(leads_to); 213 214 uint32 failure = current_node.failure(); 215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); 216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && 217 failure != 0) { 218 failure = tree_[failure].failure(); 219 edge_from_failure = tree_[failure].GetEdge(edge_label); 220 } 221 222 const uint32 follow_in_case_of_failure = 223 edge_from_failure != AhoCorasickNode::kNoSuchEdge 224 ? edge_from_failure 225 : 0; 226 tree_[leads_to].set_failure(follow_in_case_of_failure); 227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); 228 } 229 } 230 } 231 232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; 233 234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() 235 : failure_(kNoSuchEdge) {} 236 237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} 238 239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( 240 const SubstringSetMatcher::AhoCorasickNode& other) 241 : edges_(other.edges_), 242 failure_(other.failure_), 243 matches_(other.matches_) {} 244 245 SubstringSetMatcher::AhoCorasickNode& 246 SubstringSetMatcher::AhoCorasickNode::operator=( 247 const SubstringSetMatcher::AhoCorasickNode& other) { 248 edges_ = other.edges_; 249 failure_ = other.failure_; 250 matches_ = other.matches_; 251 return *this; 252 } 253 254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { 255 Edges::const_iterator i = edges_.find(c); 256 return i == edges_.end() ? kNoSuchEdge : i->second; 257 } 258 259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { 260 edges_[c] = node; 261 } 262 263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { 264 matches_.insert(id); 265 } 266 267 void SubstringSetMatcher::AhoCorasickNode::AddMatches( 268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { 269 matches_.insert(matches.begin(), matches.end()); 270 } 271 272 } // namespace url_matcher 273