1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/query_parser.h" 6 7 #include <algorithm> 8 9 #include "base/compiler_specific.h" 10 #include "base/i18n/break_iterator.h" 11 #include "base/i18n/case_conversion.h" 12 #include "base/logging.h" 13 #include "base/stl_util.h" 14 #include "base/strings/utf_string_conversions.h" 15 16 namespace { 17 18 // Returns true if |mp1.first| is less than |mp2.first|. This is used to 19 // sort match positions. 20 int CompareMatchPosition(const Snippet::MatchPosition& mp1, 21 const Snippet::MatchPosition& mp2) { 22 return mp1.first < mp2.first; 23 } 24 25 // Returns true if |mp2| intersects |mp1|. This is intended for use by 26 // CoalesceMatchesFrom and isn't meant as a general intersection comparison 27 // function. 28 bool SnippetIntersects(const Snippet::MatchPosition& mp1, 29 const Snippet::MatchPosition& mp2) { 30 return mp2.first >= mp1.first && mp2.first <= mp1.second; 31 } 32 33 // Coalesces match positions in |matches| after index that intersect the match 34 // position at |index|. 35 void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) { 36 Snippet::MatchPosition& mp = (*matches)[index]; 37 for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1; 38 i != matches->end(); ) { 39 if (SnippetIntersects(mp, *i)) { 40 mp.second = std::max(mp.second, i->second); 41 i = matches->erase(i); 42 } else { 43 return; 44 } 45 } 46 } 47 48 // Sorts the match positions in |matches| by their first index, then coalesces 49 // any match positions that intersect each other. 50 void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) { 51 std::sort(matches->begin(), matches->end(), &CompareMatchPosition); 52 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove 53 // from matches. 54 for (size_t i = 0; i < matches->size(); ++i) 55 CoalesceMatchesFrom(i, matches); 56 } 57 58 // Returns true if the character is considered a quote. 59 bool IsQueryQuote(wchar_t ch) { 60 return ch == '"' || 61 ch == 0xab || // left pointing double angle bracket 62 ch == 0xbb || // right pointing double angle bracket 63 ch == 0x201c || // left double quotation mark 64 ch == 0x201d || // right double quotation mark 65 ch == 0x201e; // double low-9 quotation mark 66 } 67 68 } // namespace 69 70 // Inheritance structure: 71 // Queries are represented as trees of QueryNodes. 72 // QueryNodes are either a collection of subnodes (a QueryNodeList) 73 // or a single word (a QueryNodeWord). 74 75 // A QueryNodeWord is a single word in the query. 76 class QueryNodeWord : public QueryNode { 77 public: 78 explicit QueryNodeWord(const base::string16& word); 79 virtual ~QueryNodeWord(); 80 81 const base::string16& word() const { return word_; } 82 83 void set_literal(bool literal) { literal_ = literal; } 84 85 // QueryNode: 86 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE; 87 virtual bool IsWord() const OVERRIDE; 88 virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE; 89 virtual bool HasMatchIn( 90 const std::vector<QueryWord>& words, 91 Snippet::MatchPositions* match_positions) const OVERRIDE; 92 virtual bool HasMatchIn( 93 const std::vector<QueryWord>& words) const OVERRIDE; 94 virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE; 95 96 private: 97 base::string16 word_; 98 bool literal_; 99 100 DISALLOW_COPY_AND_ASSIGN(QueryNodeWord); 101 }; 102 103 QueryNodeWord::QueryNodeWord(const base::string16& word) 104 : word_(word), 105 literal_(false) {} 106 107 QueryNodeWord::~QueryNodeWord() {} 108 109 int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const { 110 query->append(word_); 111 112 // Use prefix search if we're not literal and long enough. 113 if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 114 *query += L'*'; 115 return 1; 116 } 117 118 bool QueryNodeWord::IsWord() const { 119 return true; 120 } 121 122 bool QueryNodeWord::Matches(const base::string16& word, bool exact) const { 123 if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 124 return word == word_; 125 return word.size() >= word_.size() && 126 (word_.compare(0, word_.size(), word, 0, word_.size()) == 0); 127 } 128 129 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words, 130 Snippet::MatchPositions* match_positions) const { 131 bool matched = false; 132 for (size_t i = 0; i < words.size(); ++i) { 133 if (Matches(words[i].word, false)) { 134 size_t match_start = words[i].position; 135 match_positions->push_back( 136 Snippet::MatchPosition(match_start, 137 match_start + static_cast<int>(word_.size()))); 138 matched = true; 139 } 140 } 141 return matched; 142 } 143 144 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words) const { 145 for (size_t i = 0; i < words.size(); ++i) { 146 if (Matches(words[i].word, false)) 147 return true; 148 } 149 return false; 150 } 151 152 void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const { 153 words->push_back(word_); 154 } 155 156 // A QueryNodeList has a collection of QueryNodes which are deleted in the end. 157 class QueryNodeList : public QueryNode { 158 public: 159 typedef std::vector<QueryNode*> QueryNodeVector; 160 161 QueryNodeList(); 162 virtual ~QueryNodeList(); 163 164 QueryNodeVector* children() { return &children_; } 165 166 void AddChild(QueryNode* node); 167 168 // Remove empty subnodes left over from other parsing. 169 void RemoveEmptySubnodes(); 170 171 // QueryNode: 172 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE; 173 virtual bool IsWord() const OVERRIDE; 174 virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE; 175 virtual bool HasMatchIn( 176 const std::vector<QueryWord>& words, 177 Snippet::MatchPositions* match_positions) const OVERRIDE; 178 virtual bool HasMatchIn( 179 const std::vector<QueryWord>& words) const OVERRIDE; 180 virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE; 181 182 protected: 183 int AppendChildrenToString(base::string16* query) const; 184 185 QueryNodeVector children_; 186 187 private: 188 DISALLOW_COPY_AND_ASSIGN(QueryNodeList); 189 }; 190 191 QueryNodeList::QueryNodeList() {} 192 193 QueryNodeList::~QueryNodeList() { 194 STLDeleteElements(&children_); 195 } 196 197 void QueryNodeList::AddChild(QueryNode* node) { 198 children_.push_back(node); 199 } 200 201 void QueryNodeList::RemoveEmptySubnodes() { 202 for (size_t i = 0; i < children_.size(); ++i) { 203 if (children_[i]->IsWord()) 204 continue; 205 206 QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]); 207 list_node->RemoveEmptySubnodes(); 208 if (list_node->children()->empty()) { 209 children_.erase(children_.begin() + i); 210 --i; 211 delete list_node; 212 } 213 } 214 } 215 216 int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const { 217 return AppendChildrenToString(query); 218 } 219 220 bool QueryNodeList::IsWord() const { 221 return false; 222 } 223 224 bool QueryNodeList::Matches(const base::string16& word, bool exact) const { 225 NOTREACHED(); 226 return false; 227 } 228 229 bool QueryNodeList::HasMatchIn(const std::vector<QueryWord>& words, 230 Snippet::MatchPositions* match_positions) const { 231 NOTREACHED(); 232 return false; 233 } 234 235 bool QueryNodeList::HasMatchIn(const std::vector<QueryWord>& words) const { 236 NOTREACHED(); 237 return false; 238 } 239 240 void QueryNodeList::AppendWords(std::vector<base::string16>* words) const { 241 for (size_t i = 0; i < children_.size(); ++i) 242 children_[i]->AppendWords(words); 243 } 244 245 int QueryNodeList::AppendChildrenToString(base::string16* query) const { 246 int num_words = 0; 247 for (QueryNodeVector::const_iterator node = children_.begin(); 248 node != children_.end(); ++node) { 249 if (node != children_.begin()) 250 query->push_back(L' '); 251 num_words += (*node)->AppendToSQLiteQuery(query); 252 } 253 return num_words; 254 } 255 256 // A QueryNodePhrase is a phrase query ("quoted"). 257 class QueryNodePhrase : public QueryNodeList { 258 public: 259 QueryNodePhrase(); 260 virtual ~QueryNodePhrase(); 261 262 // QueryNodeList: 263 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE; 264 virtual bool HasMatchIn( 265 const std::vector<QueryWord>& words, 266 Snippet::MatchPositions* match_positions) const OVERRIDE; 267 virtual bool HasMatchIn( 268 const std::vector<QueryWord>& words) const OVERRIDE; 269 270 private: 271 bool MatchesAll(const std::vector<QueryWord>& words, 272 const QueryWord** first_word, 273 const QueryWord** last_word) const; 274 DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase); 275 }; 276 277 QueryNodePhrase::QueryNodePhrase() {} 278 279 QueryNodePhrase::~QueryNodePhrase() {} 280 281 int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const { 282 query->push_back(L'"'); 283 int num_words = AppendChildrenToString(query); 284 query->push_back(L'"'); 285 return num_words; 286 } 287 288 bool QueryNodePhrase::MatchesAll(const std::vector<QueryWord>& words, 289 const QueryWord** first_word, 290 const QueryWord** last_word) const { 291 if (words.size() < children_.size()) 292 return false; 293 294 for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) { 295 bool matched_all = true; 296 for (size_t j = 0; j < children_.size(); ++j) { 297 if (!children_[j]->Matches(words[i + j].word, true)) { 298 matched_all = false; 299 break; 300 } 301 } 302 if (matched_all) { 303 *first_word = &words[i]; 304 *last_word = &words[i + children_.size() - 1]; 305 return true; 306 } 307 } 308 return false; 309 } 310 311 bool QueryNodePhrase::HasMatchIn( 312 const std::vector<QueryWord>& words, 313 Snippet::MatchPositions* match_positions) const { 314 const QueryWord* first_word; 315 const QueryWord* last_word; 316 317 if (MatchesAll(words, &first_word, &last_word)) { 318 match_positions->push_back( 319 Snippet::MatchPosition(first_word->position, 320 last_word->position + last_word->word.length())); 321 return true; 322 } 323 return false; 324 } 325 326 bool QueryNodePhrase::HasMatchIn(const std::vector<QueryWord>& words) const { 327 const QueryWord* first_word; 328 const QueryWord* last_word; 329 return MatchesAll(words, &first_word, &last_word); 330 } 331 332 QueryParser::QueryParser() {} 333 334 // static 335 bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16& word) { 336 DCHECK(!word.empty()); 337 size_t minimum_length = 3; 338 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility) 339 // because they 'behave like' Latin letters. Moreover, we should 340 // normalize the former before reaching here. 341 if (0xAC00 <= word[0] && word[0] <= 0xD7A3) 342 minimum_length = 2; 343 return word.size() >= minimum_length; 344 } 345 346 int QueryParser::ParseQuery(const base::string16& query, 347 base::string16* sqlite_query) { 348 QueryNodeList root; 349 if (!ParseQueryImpl(query, &root)) 350 return 0; 351 return root.AppendToSQLiteQuery(sqlite_query); 352 } 353 354 void QueryParser::ParseQueryWords(const base::string16& query, 355 std::vector<base::string16>* words) { 356 QueryNodeList root; 357 if (!ParseQueryImpl(query, &root)) 358 return; 359 root.AppendWords(words); 360 } 361 362 void QueryParser::ParseQueryNodes(const base::string16& query, 363 std::vector<QueryNode*>* nodes) { 364 QueryNodeList root; 365 if (ParseQueryImpl(base::i18n::ToLower(query), &root)) 366 nodes->swap(*root.children()); 367 } 368 369 bool QueryParser::DoesQueryMatch(const base::string16& text, 370 const std::vector<QueryNode*>& query_nodes, 371 Snippet::MatchPositions* match_positions) { 372 if (query_nodes.empty()) 373 return false; 374 375 std::vector<QueryWord> query_words; 376 base::string16 lower_text = base::i18n::ToLower(text); 377 ExtractQueryWords(lower_text, &query_words); 378 379 if (query_words.empty()) 380 return false; 381 382 Snippet::MatchPositions matches; 383 for (size_t i = 0; i < query_nodes.size(); ++i) { 384 if (!query_nodes[i]->HasMatchIn(query_words, &matches)) 385 return false; 386 } 387 if (lower_text.length() != text.length()) { 388 // The lower case string differs from the original string. The matches are 389 // meaningless. 390 // TODO(sky): we need a better way to align the positions so that we don't 391 // completely punt here. 392 match_positions->clear(); 393 } else { 394 CoalseAndSortMatchPositions(&matches); 395 match_positions->swap(matches); 396 } 397 return true; 398 } 399 400 bool QueryParser::DoesQueryMatch(const std::vector<QueryWord>& query_words, 401 const std::vector<QueryNode*>& query_nodes) { 402 if (query_nodes.empty() || query_words.empty()) 403 return false; 404 405 for (size_t i = 0; i < query_nodes.size(); ++i) { 406 if (!query_nodes[i]->HasMatchIn(query_words)) 407 return false; 408 } 409 return true; 410 } 411 412 bool QueryParser::ParseQueryImpl(const base::string16& query, 413 QueryNodeList* root) { 414 base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD); 415 // TODO(evanm): support a locale here 416 if (!iter.Init()) 417 return false; 418 419 // To handle nesting, we maintain a stack of QueryNodeLists. 420 // The last element (back) of the stack contains the current, deepest node. 421 std::vector<QueryNodeList*> query_stack; 422 query_stack.push_back(root); 423 424 bool in_quotes = false; // whether we're currently in a quoted phrase 425 while (iter.Advance()) { 426 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 427 // is not necessarily a word, but could also be a sequence of punctuation 428 // or whitespace. 429 if (iter.IsWord()) { 430 QueryNodeWord* word_node = new QueryNodeWord(iter.GetString()); 431 if (in_quotes) 432 word_node->set_literal(true); 433 query_stack.back()->AddChild(word_node); 434 } else { // Punctuation. 435 if (IsQueryQuote(query[iter.prev()])) { 436 if (!in_quotes) { 437 QueryNodeList* quotes_node = new QueryNodePhrase; 438 query_stack.back()->AddChild(quotes_node); 439 query_stack.push_back(quotes_node); 440 in_quotes = true; 441 } else { 442 query_stack.pop_back(); // Stop adding to the quoted phrase. 443 in_quotes = false; 444 } 445 } 446 } 447 } 448 449 root->RemoveEmptySubnodes(); 450 return true; 451 } 452 453 void QueryParser::ExtractQueryWords(const base::string16& text, 454 std::vector<QueryWord>* words) { 455 base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD); 456 // TODO(evanm): support a locale here 457 if (!iter.Init()) 458 return; 459 460 while (iter.Advance()) { 461 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 462 // is not necessarily a word, but could also be a sequence of punctuation 463 // or whitespace. 464 if (iter.IsWord()) { 465 base::string16 word = iter.GetString(); 466 if (!word.empty()) { 467 words->push_back(QueryWord()); 468 words->back().word = word; 469 words->back().position = iter.prev(); 470 } 471 } 472 } 473 } 474