1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/in_memory_url_index.h" 6 7 #include <algorithm> 8 #include <iterator> 9 #include <limits> 10 #include <numeric> 11 12 #include "base/file_util.h" 13 #include "base/i18n/break_iterator.h" 14 #include "base/metrics/histogram.h" 15 #include "base/string_util.h" 16 #include "base/time.h" 17 #include "base/utf_string_conversions.h" 18 #include "chrome/browser/autocomplete/autocomplete.h" 19 #include "chrome/browser/autocomplete/history_provider_util.h" 20 #include "chrome/browser/history/url_database.h" 21 #include "chrome/browser/profiles/profile.h" 22 #include "chrome/common/url_constants.h" 23 #include "googleurl/src/url_util.h" 24 #include "net/base/escape.h" 25 #include "net/base/net_util.h" 26 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" 27 #include "ui/base/l10n/l10n_util.h" 28 29 using google::protobuf::RepeatedField; 30 using google::protobuf::RepeatedPtrField; 31 using in_memory_url_index::InMemoryURLIndexCacheItem; 32 33 namespace history { 34 35 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; 36 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; 37 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; 38 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; 39 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry 40 CharWordMapEntry; 41 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem 42 WordIDHistoryMapItem; 43 typedef imui:: 44 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry 45 WordIDHistoryMapEntry; 46 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; 47 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry 48 HistoryInfoMapEntry; 49 50 const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1; 51 52 // Score ranges used to get a 'base' score for each of the scoring factors 53 // (such as recency of last visit, times visited, times the URL was typed, 54 // and the quality of the string match). There is a matching value range for 55 // each of these scores for each factor. 56 const int kScoreRank[] = { 1425, 1200, 900, 400 }; 57 58 ScoredHistoryMatch::ScoredHistoryMatch() 59 : raw_score(0), 60 prefix_adjust(0) {} 61 62 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info) 63 : HistoryMatch(url_info, 0, false, false), 64 raw_score(0), 65 prefix_adjust(0) {} 66 67 ScoredHistoryMatch::~ScoredHistoryMatch() {} 68 69 // Comparison function for sorting ScoredMatches by their scores. 70 bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1, 71 const ScoredHistoryMatch& m2) { 72 return m1.raw_score >= m2.raw_score; 73 } 74 75 struct InMemoryURLIndex::TermCharWordSet { 76 TermCharWordSet() // Required for STL resize(). 77 : char_(0), 78 word_id_set_(), 79 used_(false) {} 80 TermCharWordSet(const char16& uni_char, 81 const WordIDSet& word_id_set, 82 bool used) 83 : char_(uni_char), 84 word_id_set_(word_id_set), 85 used_(used) {} 86 87 // Predicate for STL algorithm use. 88 bool is_not_used() const { return !used_; } 89 90 char16 char_; 91 WordIDSet word_id_set_; 92 bool used_; // true if this set has been used for the current term search. 93 }; 94 95 // Comparison function for sorting TermMatches by their offsets. 96 bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) { 97 return m1.offset < m2.offset; 98 } 99 100 // std::accumulate helper function to add up TermMatches' lengths. 101 int AccumulateMatchLength(int total, const TermMatch& match) { 102 return total + match.length; 103 } 104 105 // Converts a raw value for some particular scoring factor into a score 106 // component for that factor. The conversion function is piecewise linear, with 107 // input values provided in |value_ranks| and resulting output scores from 108 // |kScoreRank| (mathematically, f(value_rank[i]) = kScoreRank[i]). A score 109 // cannot be higher than kScoreRank[0], and drops directly to 0 if lower than 110 // kScoreRank[3]. 111 // 112 // For example, take |value| == 70 and |value_ranks| == { 100, 50, 30, 10 }. 113 // Because 70 falls between ranks 0 (100) and 1 (50), the score is given by the 114 // linear function: 115 // score = m * value + b, where 116 // m = (kScoreRank[0] - kScoreRank[1]) / (value_ranks[0] - value_ranks[1]) 117 // b = value_ranks[1] 118 // Any value higher than 100 would be scored as if it were 100, and any value 119 // lower than 10 scored 0. 120 int ScoreForValue(int value, const int* value_ranks) { 121 int i = 0; 122 int rank_count = arraysize(kScoreRank); 123 while ((i < rank_count) && ((value_ranks[0] < value_ranks[1]) ? 124 (value > value_ranks[i]) : (value < value_ranks[i]))) 125 ++i; 126 if (i >= rank_count) 127 return 0; 128 int score = kScoreRank[i]; 129 if (i > 0) { 130 score += (value - value_ranks[i]) * 131 (kScoreRank[i - 1] - kScoreRank[i]) / 132 (value_ranks[i - 1] - value_ranks[i]); 133 } 134 return score; 135 } 136 137 InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir) 138 : history_dir_(history_dir), 139 history_item_count_(0) { 140 } 141 142 // Called only by unit tests. 143 InMemoryURLIndex::InMemoryURLIndex() 144 : history_item_count_(0) { 145 } 146 147 InMemoryURLIndex::~InMemoryURLIndex() {} 148 149 // Indexing 150 151 bool InMemoryURLIndex::Init(history::URLDatabase* history_db, 152 const std::string& languages) { 153 // TODO(mrossetti): Register for profile/language change notifications. 154 languages_ = languages; 155 return ReloadFromHistory(history_db, false); 156 } 157 158 void InMemoryURLIndex::ShutDown() { 159 // Write our cache. 160 SaveToCacheFile(); 161 } 162 163 bool InMemoryURLIndex::IndexRow(const URLRow& row) { 164 const GURL& gurl(row.url()); 165 string16 url(net::FormatUrl(gurl, languages_, 166 net::kFormatUrlOmitUsernamePassword, 167 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 168 NULL, NULL, NULL)); 169 170 HistoryID history_id = static_cast<HistoryID>(row.id()); 171 DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max()); 172 173 // Add the row for quick lookup in the history info store. 174 URLRow new_row(GURL(url), row.id()); 175 new_row.set_visit_count(row.visit_count()); 176 new_row.set_typed_count(row.typed_count()); 177 new_row.set_last_visit(row.last_visit()); 178 new_row.set_title(row.title()); 179 history_info_map_[history_id] = new_row; 180 181 // Split URL into individual, unique words then add in the title words. 182 url = l10n_util::ToLower(url); 183 String16Set url_words = WordSetFromString16(url); 184 String16Set title_words = WordSetFromString16(row.title()); 185 String16Set words; 186 std::set_union(url_words.begin(), url_words.end(), 187 title_words.begin(), title_words.end(), 188 std::insert_iterator<String16Set>(words, words.begin())); 189 for (String16Set::iterator word_iter = words.begin(); 190 word_iter != words.end(); ++word_iter) 191 AddWordToIndex(*word_iter, history_id); 192 193 ++history_item_count_; 194 return true; 195 } 196 197 bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db, 198 bool clear_cache) { 199 ClearPrivateData(); 200 201 if (!history_db) 202 return false; 203 204 if (clear_cache || !RestoreFromCacheFile()) { 205 base::TimeTicks beginning_time = base::TimeTicks::Now(); 206 // The index has to be built from scratch. 207 URLDatabase::URLEnumerator history_enum; 208 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) 209 return false; 210 URLRow row; 211 while (history_enum.GetNextURL(&row)) { 212 if (!IndexRow(row)) 213 return false; 214 } 215 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", 216 base::TimeTicks::Now() - beginning_time); 217 SaveToCacheFile(); 218 } 219 return true; 220 } 221 222 void InMemoryURLIndex::ClearPrivateData() { 223 history_item_count_ = 0; 224 word_list_.clear(); 225 word_map_.clear(); 226 char_word_map_.clear(); 227 word_id_history_map_.clear(); 228 term_char_word_set_cache_.clear(); 229 history_info_map_.clear(); 230 } 231 232 bool InMemoryURLIndex::RestoreFromCacheFile() { 233 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date. 234 // That is: ensure that the database has not been modified since the cache 235 // was last saved. DB file modification date is inadequate. There are no 236 // SQLite table checksums automatically stored. 237 base::TimeTicks beginning_time = base::TimeTicks::Now(); 238 FilePath file_path; 239 if (!GetCacheFilePath(&file_path) || !file_util::PathExists(file_path)) 240 return false; 241 std::string data; 242 if (!file_util::ReadFileToString(file_path, &data)) { 243 LOG(WARNING) << "Failed to read InMemoryURLIndex cache from " 244 << file_path.value(); 245 return false; 246 } 247 248 InMemoryURLIndexCacheItem index_cache; 249 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { 250 LOG(WARNING) << "Failed to parse InMemoryURLIndex cache data read from " 251 << file_path.value(); 252 return false; 253 } 254 255 if (!RestorePrivateData(index_cache)) { 256 ClearPrivateData(); // Back to square one -- must build from scratch. 257 return false; 258 } 259 260 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", 261 base::TimeTicks::Now() - beginning_time); 262 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", history_item_count_); 263 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); 264 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", word_map_.size()); 265 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", char_word_map_.size()); 266 return true; 267 } 268 269 bool InMemoryURLIndex::SaveToCacheFile() { 270 base::TimeTicks beginning_time = base::TimeTicks::Now(); 271 InMemoryURLIndexCacheItem index_cache; 272 SavePrivateData(&index_cache); 273 std::string data; 274 if (!index_cache.SerializeToString(&data)) { 275 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; 276 return false; 277 } 278 279 // Write the cache to a file then swap it for the old cache, if any, and 280 // delete the old cache. 281 FilePath file_path; 282 if (!GetCacheFilePath(&file_path)) 283 return false; 284 file_util::ScopedFILE file(file_util::OpenFile(file_path, "w")); 285 if (!file.get()) 286 return false; 287 288 int size = data.size(); 289 if (file_util::WriteFile(file_path, data.c_str(), size) != size) { 290 LOG(WARNING) << "Failed to write " << file_path.value(); 291 return false; 292 } 293 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", 294 base::TimeTicks::Now() - beginning_time); 295 return true; 296 } 297 298 void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) { 299 // The row may or may not already be in our index. If it is not already 300 // indexed and it qualifies then it gets indexed. If it is already 301 // indexed and still qualifies then it gets updated, otherwise it 302 // is deleted from the index. 303 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); 304 if (row_pos == history_info_map_.end()) { 305 // This new row should be indexed if it qualifies. 306 if (RowQualifiesAsSignificant(row, base::Time())) 307 IndexRow(row); 308 } else if (RowQualifiesAsSignificant(row, base::Time())) { 309 // This indexed row still qualifies and will be re-indexed. 310 // The url won't have changed but the title, visit count, etc. 311 // might have changed. 312 URLRow& old_row = row_pos->second; 313 old_row.set_visit_count(row.visit_count()); 314 old_row.set_typed_count(row.typed_count()); 315 old_row.set_last_visit(row.last_visit()); 316 // TODO(mrossetti): When we start indexing the title the next line 317 // will need attention. 318 old_row.set_title(row.title()); 319 } else { 320 // This indexed row no longer qualifies and will be de-indexed. 321 history_info_map_.erase(row_id); 322 } 323 // This invalidates the cache. 324 term_char_word_set_cache_.clear(); 325 // TODO(mrossetti): Record this transaction in the cache. 326 } 327 328 void InMemoryURLIndex::DeleteURL(URLID row_id) { 329 // Note that this does not remove any reference to this row from the 330 // word_id_history_map_. That map will continue to contain (and return) 331 // hits against this row until that map is rebuilt, but since the 332 // history_info_map_ no longer references the row no erroneous results 333 // will propagate to the user. 334 history_info_map_.erase(row_id); 335 // This invalidates the word cache. 336 term_char_word_set_cache_.clear(); 337 // TODO(mrossetti): Record this transaction in the cache. 338 } 339 340 // Searching 341 342 ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( 343 const String16Vector& terms) { 344 ScoredHistoryMatches scored_items; 345 if (!terms.empty()) { 346 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- 347 // and-sweep approach. 348 ResetTermCharWordSetCache(); 349 350 // Lowercase the terms. 351 // TODO(mrossetti): Another opportunity for a transform algorithm. 352 String16Vector lower_terms; 353 for (String16Vector::const_iterator term_iter = terms.begin(); 354 term_iter != terms.end(); ++term_iter) 355 lower_terms.push_back(l10n_util::ToLower(*term_iter)); 356 357 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); 358 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 359 360 // Don't perform any scoring (and don't return any matches) if the 361 // candidate pool is large. (See comments in header.) 362 const size_t kItemsToScoreLimit = 500; 363 if (history_id_set.size() <= kItemsToScoreLimit) { 364 // Pass over all of the candidates filtering out any without a proper 365 // substring match, inserting those which pass in order by score. 366 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 367 AddHistoryMatch(*this, lower_terms)).ScoredMatches(); 368 369 // Select and sort only the top kMaxMatches results. 370 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { 371 std::partial_sort(scored_items.begin(), 372 scored_items.begin() + 373 AutocompleteProvider::kMaxMatches, 374 scored_items.end(), 375 ScoredHistoryMatch::MatchScoreGreater); 376 scored_items.resize(AutocompleteProvider::kMaxMatches); 377 } else { 378 std::sort(scored_items.begin(), scored_items.end(), 379 ScoredHistoryMatch::MatchScoreGreater); 380 } 381 } 382 } 383 384 // Remove any stale TermCharWordSet's. 385 term_char_word_set_cache_.erase( 386 std::remove_if(term_char_word_set_cache_.begin(), 387 term_char_word_set_cache_.end(), 388 std::mem_fun_ref(&TermCharWordSet::is_not_used)), 389 term_char_word_set_cache_.end()); 390 return scored_items; 391 } 392 393 void InMemoryURLIndex::ResetTermCharWordSetCache() { 394 // TODO(mrossetti): Consider keeping more of the cache around for possible 395 // repeat searches, except a 'shortcuts' approach might be better for that. 396 // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. 397 for (TermCharWordSetVector::iterator iter = 398 term_char_word_set_cache_.begin(); 399 iter != term_char_word_set_cache_.end(); ++iter) 400 iter->used_ = false; 401 } 402 403 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( 404 const string16& uni_string) { 405 // Break the terms down into individual terms (words), get the candidate 406 // set for each term, and intersect each to get a final candidate list. 407 // Note that a single 'term' from the user's perspective might be 408 // a string like "http://www.somewebsite.com" which, from our perspective, 409 // is four words: 'http', 'www', 'somewebsite', and 'com'. 410 HistoryIDSet history_id_set; 411 String16Set words = WordSetFromString16(uni_string); 412 bool first_word = true; 413 for (String16Set::iterator iter = words.begin(); 414 iter != words.end(); ++iter) { 415 String16Set::value_type uni_word = *iter; 416 HistoryIDSet term_history_id_set = 417 InMemoryURLIndex::HistoryIDsForTerm(uni_word); 418 if (first_word) { 419 history_id_set.swap(term_history_id_set); 420 first_word = false; 421 } else { 422 HistoryIDSet old_history_id_set(history_id_set); 423 history_id_set.clear(); 424 std::set_intersection(old_history_id_set.begin(), 425 old_history_id_set.end(), 426 term_history_id_set.begin(), 427 term_history_id_set.end(), 428 std::inserter(history_id_set, 429 history_id_set.begin())); 430 } 431 } 432 return history_id_set; 433 } 434 435 InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 436 const string16& uni_word) { 437 InMemoryURLIndex::HistoryIDSet history_id_set; 438 439 // For each unique character in the word, in order of first appearance, get 440 // the char/word_id map entry and intersect with the set in an incremental 441 // manner. 442 Char16Vector uni_chars = Char16VectorFromString16(uni_word); 443 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); 444 445 // TODO(mrossetti): At this point, as a possible optimization, we could 446 // scan through all candidate words and make sure the |uni_word| is a 447 // substring within the candidate words, eliminating those which aren't. 448 // I'm not sure it would be worth the effort. And remember, we've got to do 449 // a final substring match in order to get the highlighting ranges later 450 // in the process in any case. 451 452 // If any words resulted then we can compose a set of history IDs by unioning 453 // the sets from each word. 454 if (!word_id_set.empty()) { 455 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 456 word_id_iter != word_id_set.end(); ++word_id_iter) { 457 WordID word_id = *word_id_iter; 458 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 459 if (word_iter != word_id_history_map_.end()) { 460 HistoryIDSet& word_history_id_set(word_iter->second); 461 history_id_set.insert(word_history_id_set.begin(), 462 word_history_id_set.end()); 463 } 464 } 465 } 466 467 return history_id_set; 468 } 469 470 // Utility Functions 471 472 // static 473 InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( 474 const string16& uni_string) { 475 String16Vector words = WordVectorFromString16(uni_string, false); 476 String16Set word_set; 477 for (String16Vector::const_iterator iter = words.begin(); iter != words.end(); 478 ++iter) 479 word_set.insert(l10n_util::ToLower(*iter)); 480 return word_set; 481 } 482 483 // static 484 InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16( 485 const string16& uni_string, 486 bool break_on_space) { 487 base::BreakIterator iter(&uni_string, break_on_space ? 488 base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD); 489 String16Vector words; 490 if (!iter.Init()) 491 return words; 492 while (iter.Advance()) { 493 if (break_on_space || iter.IsWord()) { 494 string16 word = iter.GetString(); 495 if (break_on_space) 496 TrimWhitespace(word, TRIM_ALL, &word); 497 if (!word.empty()) 498 words.push_back(word); 499 } 500 } 501 return words; 502 } 503 504 // static 505 InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( 506 const string16& uni_word) { 507 InMemoryURLIndex::Char16Vector characters; 508 InMemoryURLIndex::Char16Set unique_characters; 509 for (string16::const_iterator iter = uni_word.begin(); 510 iter != uni_word.end(); ++iter) { 511 if (!unique_characters.count(*iter)) { 512 unique_characters.insert(*iter); 513 characters.push_back(*iter); 514 } 515 } 516 return characters; 517 } 518 519 // static 520 InMemoryURLIndex::Char16Set InMemoryURLIndex::Char16SetFromString16( 521 const string16& uni_word) { 522 Char16Set characters; 523 for (string16::const_iterator iter = uni_word.begin(); 524 iter != uni_word.end(); ++iter) 525 characters.insert(*iter); 526 return characters; 527 } 528 529 void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, 530 HistoryID history_id) { 531 WordMap::iterator word_pos = word_map_.find(uni_word); 532 if (word_pos != word_map_.end()) 533 UpdateWordHistory(word_pos->second, history_id); 534 else 535 AddWordHistory(uni_word, history_id); 536 } 537 538 void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { 539 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); 540 DCHECK(history_pos != word_id_history_map_.end()); 541 HistoryIDSet& history_id_set(history_pos->second); 542 history_id_set.insert(history_id); 543 } 544 545 // Add a new word to the word list and the word map, and then create a 546 // new entry in the word/history map. 547 void InMemoryURLIndex::AddWordHistory(const string16& uni_word, 548 HistoryID history_id) { 549 word_list_.push_back(uni_word); 550 WordID word_id = word_list_.size() - 1; 551 word_map_[uni_word] = word_id; 552 HistoryIDSet history_id_set; 553 history_id_set.insert(history_id); 554 word_id_history_map_[word_id] = history_id_set; 555 // For each character in the newly added word (i.e. a word that is not 556 // already in the word index), add the word to the character index. 557 Char16Set characters = Char16SetFromString16(uni_word); 558 for (Char16Set::iterator uni_char_iter = characters.begin(); 559 uni_char_iter != characters.end(); ++uni_char_iter) { 560 Char16Set::value_type uni_char = *uni_char_iter; 561 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 562 if (char_iter != char_word_map_.end()) { 563 // Update existing entry in the char/word index. 564 WordIDSet& word_id_set(char_iter->second); 565 word_id_set.insert(word_id); 566 } else { 567 // Create a new entry in the char/word index. 568 WordIDSet word_id_set; 569 word_id_set.insert(word_id); 570 char_word_map_[uni_char] = word_id_set; 571 } 572 } 573 } 574 575 InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( 576 const InMemoryURLIndex::Char16Vector& uni_chars) { 577 size_t index = CachedResultsIndexForTerm(uni_chars); 578 579 // If there were no unprocessed characters in the search term |uni_chars| 580 // then we can use the cached one as-is as the results with no further 581 // filtering. 582 if (index != kNoCachedResultForTerm && index == uni_chars.size() - 1) 583 return term_char_word_set_cache_[index].word_id_set_; 584 585 // Some or all of the characters remain to be indexed so trim the cache. 586 if (index + 1 < term_char_word_set_cache_.size()) 587 term_char_word_set_cache_.resize(index + 1); 588 WordIDSet word_id_set; 589 // Take advantage of our cached starting point, if any. 590 Char16Vector::const_iterator c_iter = uni_chars.begin(); 591 if (index != kNoCachedResultForTerm) { 592 word_id_set = term_char_word_set_cache_[index].word_id_set_; 593 c_iter += index + 1; 594 } 595 // Now process the remaining characters in the search term. 596 for (; c_iter != uni_chars.end(); ++c_iter) { 597 Char16Vector::value_type uni_char = *c_iter; 598 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 599 if (char_iter == char_word_map_.end()) { 600 // A character was not found so there are no matching results: bail. 601 word_id_set.clear(); 602 break; 603 } 604 WordIDSet& char_word_id_set(char_iter->second); 605 // It is possible for there to no longer be any words associated with 606 // a particular character. Give up in that case. 607 if (char_word_id_set.empty()) { 608 word_id_set.clear(); 609 break; 610 } 611 612 if (word_id_set.empty()) { 613 word_id_set = char_word_id_set; 614 } else { 615 WordIDSet old_word_id_set(word_id_set); 616 word_id_set.clear(); 617 std::set_intersection(old_word_id_set.begin(), 618 old_word_id_set.end(), 619 char_word_id_set.begin(), 620 char_word_id_set.end(), 621 std::inserter(word_id_set, 622 word_id_set.begin())); 623 } 624 // Add this new char/set instance to the cache. 625 term_char_word_set_cache_.push_back(TermCharWordSet( 626 uni_char, word_id_set, true)); 627 } 628 return word_id_set; 629 } 630 631 size_t InMemoryURLIndex::CachedResultsIndexForTerm( 632 const InMemoryURLIndex::Char16Vector& uni_chars) { 633 TermCharWordSetVector::iterator t_iter = term_char_word_set_cache_.begin(); 634 for (Char16Vector::const_iterator c_iter = uni_chars.begin(); 635 (c_iter != uni_chars.end()) && 636 (t_iter != term_char_word_set_cache_.end()) && 637 (*c_iter == t_iter->char_); 638 ++c_iter, ++t_iter) 639 t_iter->used_ = true; // Update the cache. 640 return t_iter - term_char_word_set_cache_.begin() - 1; 641 } 642 643 // static 644 TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, 645 const string16& string, 646 int term_num) { 647 TermMatches matches; 648 for (size_t location = string.find(term); location != string16::npos; 649 location = string.find(term, location + 1)) 650 matches.push_back(TermMatch(term_num, location, term.size())); 651 return matches; 652 } 653 654 // static 655 TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) { 656 if (matches.empty()) 657 return matches; 658 TermMatches sorted_matches = matches; 659 std::sort(sorted_matches.begin(), sorted_matches.end(), 660 MatchOffsetLess); 661 TermMatches clean_matches; 662 TermMatch last_match = sorted_matches[0]; 663 clean_matches.push_back(last_match); 664 for (TermMatches::const_iterator iter = sorted_matches.begin() + 1; 665 iter != sorted_matches.end(); ++iter) { 666 if (iter->offset >= last_match.offset + last_match.length) { 667 last_match = *iter; 668 clean_matches.push_back(last_match); 669 } 670 } 671 return clean_matches; 672 } 673 674 // static 675 std::vector<size_t> InMemoryURLIndex::OffsetsFromTermMatches( 676 const TermMatches& matches) { 677 std::vector<size_t> offsets; 678 for (TermMatches::const_iterator i = matches.begin(); i != matches.end(); ++i) 679 offsets.push_back(i->offset); 680 return offsets; 681 } 682 683 // static 684 TermMatches InMemoryURLIndex::ReplaceOffsetsInTermMatches( 685 const TermMatches& matches, 686 const std::vector<size_t>& offsets) { 687 DCHECK_EQ(matches.size(), offsets.size()); 688 TermMatches new_matches; 689 std::vector<size_t>::const_iterator offset_iter = offsets.begin(); 690 for (TermMatches::const_iterator term_iter = matches.begin(); 691 term_iter != matches.end(); ++term_iter, ++offset_iter) { 692 if (*offset_iter != string16::npos) { 693 TermMatch new_match(*term_iter); 694 new_match.offset = *offset_iter; 695 new_matches.push_back(new_match); 696 } 697 } 698 return new_matches; 699 } 700 701 // static 702 ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL( 703 const URLRow& row, 704 const String16Vector& terms) { 705 ScoredHistoryMatch match(row); 706 GURL gurl = row.url(); 707 if (!gurl.is_valid()) 708 return match; 709 710 // Figure out where each search term appears in the URL and/or page title 711 // so that we can score as well as provide autocomplete highlighting. 712 string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec())); 713 // Strip any 'http://' prefix before matching. 714 if (url_util::FindAndCompareScheme(url, chrome::kHttpScheme, NULL)) { 715 match.prefix_adjust = strlen(chrome::kHttpScheme) + 3; // Allow for '://'. 716 url = url.substr(match.prefix_adjust); 717 } 718 719 string16 title = l10n_util::ToLower(row.title()); 720 int term_num = 0; 721 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 722 ++iter, ++term_num) { 723 string16 term = *iter; 724 TermMatches url_term_matches = MatchTermInString(term, url, term_num); 725 TermMatches title_term_matches = MatchTermInString(term, title, term_num); 726 if (url_term_matches.empty() && title_term_matches.empty()) 727 return match; // A term was not found in either URL or title - reject. 728 match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(), 729 url_term_matches.end()); 730 match.title_matches.insert(match.title_matches.end(), 731 title_term_matches.begin(), 732 title_term_matches.end()); 733 } 734 735 // Sort matches by offset and eliminate any which overlap. 736 match.url_matches = SortAndDeoverlap(match.url_matches); 737 match.title_matches = SortAndDeoverlap(match.title_matches); 738 739 // Get partial scores based on term matching. Note that the score for 740 // each of the URL and title are adjusted by the fraction of the 741 // terms appearing in each. 742 int url_score = ScoreComponentForMatches(match.url_matches, url.size()) * 743 match.url_matches.size() / terms.size(); 744 int title_score = 745 ScoreComponentForMatches(match.title_matches, title.size()) * 746 static_cast<int>(match.title_matches.size()) / 747 static_cast<int>(terms.size()); 748 // Arbitrarily pick the best. 749 // TODO(mrossetti): It might make sense that a term which appears in both the 750 // URL and the Title should boost the score a bit. 751 int term_score = std::max(url_score, title_score); 752 if (term_score == 0) 753 return match; 754 755 // Factor in recency of visit, visit count and typed count attributes of the 756 // URLRow. 757 const int kDaysAgoLevel[] = { 0, 10, 20, 30 }; 758 int score = ScoreForValue((base::Time::Now() - 759 row.last_visit()).InDays(), kDaysAgoLevel); 760 const int kVisitCountLevel[] = { 30, 10, 5, 3 }; 761 int visit_count_value = ScoreForValue(row.visit_count(), kVisitCountLevel); 762 const int kTypedCountLevel[] = { 10, 5, 3, 1 }; 763 int typed_count_value = ScoreForValue(row.typed_count(), kTypedCountLevel); 764 765 // Determine how many of the factors comprising the final score are 766 // significant by summing the relative factors for each and subtracting how 767 // many will be 'discarded' even if they are low. 768 const int kVisitCountMultiplier = 2; 769 const int kTypedCountMultiplier = 3; 770 const int kSignificantFactors = 771 kVisitCountMultiplier + // Visit count factor plus 772 kTypedCountMultiplier + // typed count factor plus 773 2 - // one each for string match and last visit 774 2; // minus 2 insignificant factors. 775 // The following, in effect, discards up to |kSignificantFactors| low scoring 776 // elements which contribute little to the score but which can inordinately 777 // drag down an otherwise good score. 778 match.raw_score = std::min(kScoreRank[0], (term_score + score + 779 (visit_count_value * kVisitCountMultiplier) + (typed_count_value * 780 kTypedCountMultiplier)) / kSignificantFactors); 781 782 return match; 783 } 784 785 int InMemoryURLIndex::ScoreComponentForMatches(const TermMatches& matches, 786 size_t max_length) { 787 // TODO(mrossetti): This is good enough for now but must be fine-tuned. 788 if (matches.empty()) 789 return 0; 790 791 // Score component for whether the input terms (if more than one) were found 792 // in the same order in the match. Start with kOrderMaxValue points divided 793 // equally among (number of terms - 1); then discount each of those terms that 794 // is out-of-order in the match. 795 const int kOrderMaxValue = 250; 796 int order_value = kOrderMaxValue; 797 if (matches.size() > 1) { 798 int max_possible_out_of_order = matches.size() - 1; 799 int out_of_order = 0; 800 for (size_t i = 1; i < matches.size(); ++i) { 801 if (matches[i - 1].term_num > matches[i].term_num) 802 ++out_of_order; 803 } 804 order_value = (max_possible_out_of_order - out_of_order) * kOrderMaxValue / 805 max_possible_out_of_order; 806 } 807 808 // Score component for how early in the match string the first search term 809 // appears. Start with kStartMaxValue points and discount by 810 // 1/kMaxSignificantStart points for each character later than the first at 811 // which the term begins. No points are earned if the start of the match 812 // occurs at or after kMaxSignificantStart. 813 const size_t kMaxSignificantStart = 20; 814 const int kStartMaxValue = 250; 815 int start_value = (kMaxSignificantStart - 816 std::min(kMaxSignificantStart, matches[0].offset)) * kStartMaxValue / 817 kMaxSignificantStart; 818 819 // Score component for how much of the matched string the input terms cover. 820 // kCompleteMaxValue points times the fraction of the URL/page title string 821 // that was matched. 822 size_t term_length_total = std::accumulate(matches.begin(), matches.end(), 823 0, AccumulateMatchLength); 824 const int kCompleteMaxValue = 500; 825 int complete_value = term_length_total * kCompleteMaxValue / max_length; 826 827 int raw_score = order_value + start_value + complete_value; 828 const int kTermScoreLevel[] = { 1000, 650, 500, 200 }; 829 830 // Scale the sum of the three components above into a single score component 831 // on the same scale as that used in ScoredMatchForURL(). 832 return ScoreForValue(raw_score, kTermScoreLevel); 833 } 834 835 InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 836 const InMemoryURLIndex& index, 837 const String16Vector& lower_terms) 838 : index_(index), 839 lower_terms_(lower_terms) { 840 } 841 842 InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 843 844 void InMemoryURLIndex::AddHistoryMatch::operator()( 845 const InMemoryURLIndex::HistoryID history_id) { 846 HistoryInfoMap::const_iterator hist_pos = 847 index_.history_info_map_.find(history_id); 848 // Note that a history_id may be present in the word_id_history_map_ yet not 849 // be found in the history_info_map_. This occurs when an item has been 850 // deleted by the user or the item no longer qualifies as a quick result. 851 if (hist_pos != index_.history_info_map_.end()) { 852 const URLRow& hist_item = hist_pos->second; 853 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_)); 854 if (match.raw_score > 0) 855 scored_matches_.push_back(match); 856 } 857 } 858 859 bool InMemoryURLIndex::GetCacheFilePath(FilePath* file_path) { 860 if (history_dir_.empty()) 861 return false; 862 *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache")); 863 return true; 864 } 865 866 void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { 867 DCHECK(cache); 868 cache->set_timestamp(base::Time::Now().ToInternalValue()); 869 cache->set_history_item_count(history_item_count_); 870 SaveWordList(cache); 871 SaveWordMap(cache); 872 SaveCharWordMap(cache); 873 SaveWordIDHistoryMap(cache); 874 SaveHistoryInfoMap(cache); 875 } 876 877 bool InMemoryURLIndex::RestorePrivateData( 878 const InMemoryURLIndexCacheItem& cache) { 879 last_saved_ = base::Time::FromInternalValue(cache.timestamp()); 880 history_item_count_ = cache.history_item_count(); 881 return (history_item_count_ == 0) || (RestoreWordList(cache) && 882 RestoreWordMap(cache) && RestoreCharWordMap(cache) && 883 RestoreWordIDHistoryMap(cache) && RestoreHistoryInfoMap(cache)); 884 } 885 886 887 void InMemoryURLIndex::SaveWordList(InMemoryURLIndexCacheItem* cache) const { 888 if (word_list_.empty()) 889 return; 890 WordListItem* list_item = cache->mutable_word_list(); 891 list_item->set_word_count(word_list_.size()); 892 for (String16Vector::const_iterator iter = word_list_.begin(); 893 iter != word_list_.end(); ++iter) 894 list_item->add_word(UTF16ToUTF8(*iter)); 895 } 896 897 bool InMemoryURLIndex::RestoreWordList(const InMemoryURLIndexCacheItem& cache) { 898 if (!cache.has_word_list()) 899 return false; 900 const WordListItem& list_item(cache.word_list()); 901 uint32 expected_item_count = list_item.word_count(); 902 uint32 actual_item_count = list_item.word_size(); 903 if (actual_item_count == 0 || actual_item_count != expected_item_count) 904 return false; 905 const RepeatedPtrField<std::string>& words(list_item.word()); 906 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); 907 iter != words.end(); ++iter) 908 word_list_.push_back(UTF8ToUTF16(*iter)); 909 return true; 910 } 911 912 void InMemoryURLIndex::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { 913 if (word_map_.empty()) 914 return; 915 WordMapItem* map_item = cache->mutable_word_map(); 916 map_item->set_item_count(word_map_.size()); 917 for (WordMap::const_iterator iter = word_map_.begin(); 918 iter != word_map_.end(); ++iter) { 919 WordMapEntry* map_entry = map_item->add_word_map_entry(); 920 map_entry->set_word(UTF16ToUTF8(iter->first)); 921 map_entry->set_word_id(iter->second); 922 } 923 } 924 925 bool InMemoryURLIndex::RestoreWordMap(const InMemoryURLIndexCacheItem& cache) { 926 if (!cache.has_word_map()) 927 return false; 928 const WordMapItem& list_item(cache.word_map()); 929 uint32 expected_item_count = list_item.item_count(); 930 uint32 actual_item_count = list_item.word_map_entry_size(); 931 if (actual_item_count == 0 || actual_item_count != expected_item_count) 932 return false; 933 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); 934 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); 935 iter != entries.end(); ++iter) 936 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id(); 937 return true; 938 } 939 940 void InMemoryURLIndex::SaveCharWordMap(InMemoryURLIndexCacheItem* cache) const { 941 if (char_word_map_.empty()) 942 return; 943 CharWordMapItem* map_item = cache->mutable_char_word_map(); 944 map_item->set_item_count(char_word_map_.size()); 945 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); 946 iter != char_word_map_.end(); ++iter) { 947 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); 948 map_entry->set_char_16(iter->first); 949 const WordIDSet& word_id_set(iter->second); 950 map_entry->set_item_count(word_id_set.size()); 951 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); 952 set_iter != word_id_set.end(); ++set_iter) 953 map_entry->add_word_id(*set_iter); 954 } 955 } 956 957 bool InMemoryURLIndex::RestoreCharWordMap( 958 const InMemoryURLIndexCacheItem& cache) { 959 if (!cache.has_char_word_map()) 960 return false; 961 const CharWordMapItem& list_item(cache.char_word_map()); 962 uint32 expected_item_count = list_item.item_count(); 963 uint32 actual_item_count = list_item.char_word_map_entry_size(); 964 if (actual_item_count == 0 || actual_item_count != expected_item_count) 965 return false; 966 const RepeatedPtrField<CharWordMapEntry>& 967 entries(list_item.char_word_map_entry()); 968 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = 969 entries.begin(); iter != entries.end(); ++iter) { 970 expected_item_count = iter->item_count(); 971 actual_item_count = iter->word_id_size(); 972 if (actual_item_count == 0 || actual_item_count != expected_item_count) 973 return false; 974 char16 uni_char = static_cast<char16>(iter->char_16()); 975 WordIDSet word_id_set; 976 const RepeatedField<int32>& word_ids(iter->word_id()); 977 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); 978 jiter != word_ids.end(); ++jiter) 979 word_id_set.insert(*jiter); 980 char_word_map_[uni_char] = word_id_set; 981 } 982 return true; 983 } 984 985 void InMemoryURLIndex::SaveWordIDHistoryMap(InMemoryURLIndexCacheItem* cache) 986 const { 987 if (word_id_history_map_.empty()) 988 return; 989 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); 990 map_item->set_item_count(word_id_history_map_.size()); 991 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); 992 iter != word_id_history_map_.end(); ++iter) { 993 WordIDHistoryMapEntry* map_entry = 994 map_item->add_word_id_history_map_entry(); 995 map_entry->set_word_id(iter->first); 996 const HistoryIDSet& history_id_set(iter->second); 997 map_entry->set_item_count(history_id_set.size()); 998 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); 999 set_iter != history_id_set.end(); ++set_iter) 1000 map_entry->add_history_id(*set_iter); 1001 } 1002 } 1003 1004 bool InMemoryURLIndex::RestoreWordIDHistoryMap( 1005 const InMemoryURLIndexCacheItem& cache) { 1006 if (!cache.has_word_id_history_map()) 1007 return false; 1008 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); 1009 uint32 expected_item_count = list_item.item_count(); 1010 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); 1011 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1012 return false; 1013 const RepeatedPtrField<WordIDHistoryMapEntry>& 1014 entries(list_item.word_id_history_map_entry()); 1015 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = 1016 entries.begin(); iter != entries.end(); ++iter) { 1017 expected_item_count = iter->item_count(); 1018 actual_item_count = iter->history_id_size(); 1019 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1020 return false; 1021 WordID word_id = iter->word_id(); 1022 HistoryIDSet history_id_set; 1023 const RepeatedField<int64>& history_ids(iter->history_id()); 1024 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); 1025 jiter != history_ids.end(); ++jiter) 1026 history_id_set.insert(*jiter); 1027 word_id_history_map_[word_id] = history_id_set; 1028 } 1029 return true; 1030 } 1031 1032 void InMemoryURLIndex::SaveHistoryInfoMap( 1033 InMemoryURLIndexCacheItem* cache) const { 1034 if (history_info_map_.empty()) 1035 return; 1036 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); 1037 map_item->set_item_count(history_info_map_.size()); 1038 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); 1039 iter != history_info_map_.end(); ++iter) { 1040 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); 1041 map_entry->set_history_id(iter->first); 1042 const URLRow& url_row(iter->second); 1043 // Note: We only save information that contributes to the index so there 1044 // is no need to save term_char_word_set_cache_ (not persistent), 1045 // languages_, etc. 1046 map_entry->set_visit_count(url_row.visit_count()); 1047 map_entry->set_typed_count(url_row.typed_count()); 1048 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); 1049 map_entry->set_url(url_row.url().spec()); 1050 map_entry->set_title(UTF16ToUTF8(url_row.title())); 1051 } 1052 } 1053 1054 bool InMemoryURLIndex::RestoreHistoryInfoMap( 1055 const InMemoryURLIndexCacheItem& cache) { 1056 if (!cache.has_history_info_map()) 1057 return false; 1058 const HistoryInfoMapItem& list_item(cache.history_info_map()); 1059 uint32 expected_item_count = list_item.item_count(); 1060 uint32 actual_item_count = list_item.history_info_map_entry_size(); 1061 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1062 return false; 1063 const RepeatedPtrField<HistoryInfoMapEntry>& 1064 entries(list_item.history_info_map_entry()); 1065 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = 1066 entries.begin(); iter != entries.end(); ++iter) { 1067 HistoryID history_id = iter->history_id(); 1068 GURL url(iter->url()); 1069 URLRow url_row(url, history_id); 1070 url_row.set_visit_count(iter->visit_count()); 1071 url_row.set_typed_count(iter->typed_count()); 1072 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); 1073 if (iter->has_title()) { 1074 string16 title(UTF8ToUTF16(iter->title())); 1075 url_row.set_title(title); 1076 } 1077 history_info_map_[history_id] = url_row; 1078 } 1079 return true; 1080 } 1081 1082 } // namespace history 1083