1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/url_index_private_data.h" 6 7 #include <algorithm> 8 #include <functional> 9 #include <iterator> 10 #include <limits> 11 #include <numeric> 12 #include <string> 13 #include <vector> 14 15 #include "base/basictypes.h" 16 #include "base/file_util.h" 17 #include "base/i18n/case_conversion.h" 18 #include "base/metrics/histogram.h" 19 #include "base/strings/string_util.h" 20 #include "base/strings/utf_string_conversions.h" 21 #include "base/time/time.h" 22 #include "chrome/browser/autocomplete/autocomplete_provider.h" 23 #include "chrome/browser/autocomplete/url_prefix.h" 24 #include "chrome/browser/bookmarks/bookmark_service.h" 25 #include "chrome/browser/history/history_database.h" 26 #include "chrome/browser/history/history_db_task.h" 27 #include "chrome/browser/history/history_service.h" 28 #include "chrome/browser/history/in_memory_url_index.h" 29 #include "content/public/browser/browser_thread.h" 30 #include "content/public/browser/notification_details.h" 31 #include "content/public/browser/notification_service.h" 32 #include "content/public/browser/notification_source.h" 33 #include "net/base/net_util.h" 34 35 #if defined(USE_SYSTEM_PROTOBUF) 36 #include <google/protobuf/repeated_field.h> 37 #else 38 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" 39 #endif 40 41 using google::protobuf::RepeatedField; 42 using google::protobuf::RepeatedPtrField; 43 using in_memory_url_index::InMemoryURLIndexCacheItem; 44 45 namespace { 46 static const size_t kMaxVisitsToStoreInCache = 10u; 47 } // anonymous namespace 48 49 namespace history { 50 51 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; 52 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; 53 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; 54 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; 55 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry 56 CharWordMapEntry; 57 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem 58 WordIDHistoryMapItem; 59 typedef imui:: 60 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry 61 WordIDHistoryMapEntry; 62 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; 63 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry 64 HistoryInfoMapEntry; 65 typedef imui:: 66 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo 67 HistoryInfoMapEntry_VisitInfo; 68 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; 69 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry 70 WordStartsMapEntry; 71 72 73 // Algorithm Functions --------------------------------------------------------- 74 75 // Comparison function for sorting search terms by descending length. 76 bool LengthGreater(const string16& string_a, const string16& string_b) { 77 return string_a.length() > string_b.length(); 78 } 79 80 81 // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- 82 83 // HistoryDBTask used to update the recent visit data for a particular 84 // row from the history database. 85 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask { 86 public: 87 explicit UpdateRecentVisitsFromHistoryDBTask( 88 URLIndexPrivateData* private_data, 89 URLID url_id); 90 91 virtual bool RunOnDBThread(HistoryBackend* backend, 92 history::HistoryDatabase* db) OVERRIDE; 93 virtual void DoneRunOnMainThread() OVERRIDE; 94 95 private: 96 virtual ~UpdateRecentVisitsFromHistoryDBTask(); 97 98 // The URLIndexPrivateData that gets updated after the historyDB 99 // task returns. 100 URLIndexPrivateData* private_data_; 101 // The ID of the URL to get visits for and then update. 102 URLID url_id_; 103 // Whether fetching the recent visits for the URL succeeded. 104 bool succeeded_; 105 // The awaited data that's shown to private_data_ for it to copy and 106 // store. 107 VisitVector recent_visits_; 108 109 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask); 110 }; 111 112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask( 113 URLIndexPrivateData* private_data, 114 URLID url_id) 115 : private_data_(private_data), 116 url_id_(url_id), 117 succeeded_(false) { 118 } 119 120 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread( 121 HistoryBackend* backend, 122 HistoryDatabase* db) { 123 // Make sure the private data is going to get as many recent visits as 124 // ScoredHistoryMatch::GetFrecency() hopes to use. 125 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); 126 succeeded_ = db->GetMostRecentVisitsForURL(url_id_, 127 kMaxVisitsToStoreInCache, 128 &recent_visits_); 129 if (!succeeded_) 130 recent_visits_.clear(); 131 return true; // Always claim to be done; do not retry failures. 132 } 133 134 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { 135 if (succeeded_) 136 private_data_->UpdateRecentVisits(url_id_, recent_visits_); 137 } 138 139 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() { 140 } 141 142 143 // URLIndexPrivateData --------------------------------------------------------- 144 145 URLIndexPrivateData::URLIndexPrivateData() 146 : restored_cache_version_(0), 147 saved_cache_version_(kCurrentCacheFileVersion), 148 pre_filter_item_count_(0), 149 post_filter_item_count_(0), 150 post_scoring_item_count_(0) { 151 } 152 153 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( 154 string16 search_string, 155 size_t cursor_position, 156 const std::string& languages, 157 BookmarkService* bookmark_service) { 158 // If cursor position is set and useful (not at either end of the 159 // string), allow the search string to be broken at cursor position. 160 // We do this by pretending there's a space where the cursor is. 161 if ((cursor_position != string16::npos) && 162 (cursor_position < search_string.length()) && 163 (cursor_position > 0)) { 164 search_string.insert(cursor_position, ASCIIToUTF16(" ")); 165 } 166 pre_filter_item_count_ = 0; 167 post_filter_item_count_ = 0; 168 post_scoring_item_count_ = 0; 169 // The search string we receive may contain escaped characters. For reducing 170 // the index we need individual, lower-cased words, ignoring escapings. For 171 // the final filtering we need whitespace separated substrings possibly 172 // containing escaped characters. 173 string16 lower_raw_string(base::i18n::ToLower(search_string)); 174 string16 lower_unescaped_string = 175 net::UnescapeURLComponent(lower_raw_string, 176 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); 177 // Extract individual 'words' (as opposed to 'terms'; see below) from the 178 // search string. When the user types "colspec=ID%20Mstone Release" we get 179 // four 'words': "colspec", "id", "mstone" and "release". 180 String16Vector lower_words( 181 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); 182 ScoredHistoryMatches scored_items; 183 184 // Do nothing if we have indexed no words (probably because we've not been 185 // initialized yet) or the search string has no words. 186 if (word_list_.empty() || lower_words.empty()) { 187 search_term_cache_.clear(); // Invalidate the term cache. 188 return scored_items; 189 } 190 191 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 192 // approach. 193 ResetSearchTermCache(); 194 195 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); 196 197 // Trim the candidate pool if it is large. Note that we do not filter out 198 // items that do not contain the search terms as proper substrings -- doing 199 // so is the performance-costly operation we are trying to avoid in order 200 // to maintain omnibox responsiveness. 201 const size_t kItemsToScoreLimit = 500; 202 pre_filter_item_count_ = history_id_set.size(); 203 // If we trim the results set we do not want to cache the results for next 204 // time as the user's ultimately desired result could easily be eliminated 205 // in this early rough filter. 206 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); 207 if (was_trimmed) { 208 HistoryIDVector history_ids; 209 std::copy(history_id_set.begin(), history_id_set.end(), 210 std::back_inserter(history_ids)); 211 // Trim down the set by sorting by typed-count, visit-count, and last 212 // visit. 213 HistoryItemFactorGreater 214 item_factor_functor(history_info_map_); 215 std::partial_sort(history_ids.begin(), 216 history_ids.begin() + kItemsToScoreLimit, 217 history_ids.end(), 218 item_factor_functor); 219 history_id_set.clear(); 220 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, 221 std::inserter(history_id_set, history_id_set.end())); 222 post_filter_item_count_ = history_id_set.size(); 223 } 224 225 // Pass over all of the candidates filtering out any without a proper 226 // substring match, inserting those which pass in order by score. Note that 227 // in this step we are using the raw search string complete with escaped 228 // URL elements. When the user has specifically typed something akin to 229 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that 230 // specific substring appears in the URL or page title. 231 232 // We call these 'terms' (as opposed to 'words'; see above) as in this case 233 // we only want to break up the search string on 'true' whitespace rather than 234 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we 235 // get two 'terms': "colspec=id%20mstone" and "release". 236 history::String16Vector lower_raw_terms; 237 if (Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms) == 0) { 238 // Don't score matches when there are no terms to score against. (It's 239 // possible that the word break iterater that extracts words to search 240 // for in the database allows some whitespace "words" whereas Tokenize 241 // excludes a long list of whitespace.) One could write a scoring 242 // function that gives a reasonable order to matches when there 243 // are no terms (i.e., all the words are some form of whitespace), 244 // but this is such a rare edge case that it's not worth the time. 245 return scored_items; 246 } 247 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 248 AddHistoryMatch(*this, languages, bookmark_service, lower_raw_string, 249 lower_raw_terms, base::Time::Now())).ScoredMatches(); 250 251 // Select and sort only the top kMaxMatches results. 252 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { 253 std::partial_sort(scored_items.begin(), 254 scored_items.begin() + 255 AutocompleteProvider::kMaxMatches, 256 scored_items.end(), 257 ScoredHistoryMatch::MatchScoreGreater); 258 scored_items.resize(AutocompleteProvider::kMaxMatches); 259 } else { 260 std::sort(scored_items.begin(), scored_items.end(), 261 ScoredHistoryMatch::MatchScoreGreater); 262 } 263 post_scoring_item_count_ = scored_items.size(); 264 265 if (was_trimmed) { 266 search_term_cache_.clear(); // Invalidate the term cache. 267 } else { 268 // Remove any stale SearchTermCacheItems. 269 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 270 cache_iter != search_term_cache_.end(); ) { 271 if (!cache_iter->second.used_) 272 search_term_cache_.erase(cache_iter++); 273 else 274 ++cache_iter; 275 } 276 } 277 278 return scored_items; 279 } 280 281 bool URLIndexPrivateData::UpdateURL( 282 HistoryService* history_service, 283 const URLRow& row, 284 const std::string& languages, 285 const std::set<std::string>& scheme_whitelist) { 286 // The row may or may not already be in our index. If it is not already 287 // indexed and it qualifies then it gets indexed. If it is already 288 // indexed and still qualifies then it gets updated, otherwise it 289 // is deleted from the index. 290 bool row_was_updated = false; 291 URLID row_id = row.id(); 292 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); 293 if (row_pos == history_info_map_.end()) { 294 // This new row should be indexed if it qualifies. 295 URLRow new_row(row); 296 new_row.set_id(row_id); 297 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && 298 IndexRow(NULL, history_service, new_row, languages, scheme_whitelist); 299 } else if (RowQualifiesAsSignificant(row, base::Time())) { 300 // This indexed row still qualifies and will be re-indexed. 301 // The url won't have changed but the title, visit count, etc. 302 // might have changed. 303 URLRow& row_to_update = row_pos->second.url_row; 304 bool title_updated = row_to_update.title() != row.title(); 305 if (row_to_update.visit_count() != row.visit_count() || 306 row_to_update.typed_count() != row.typed_count() || 307 row_to_update.last_visit() != row.last_visit() || title_updated) { 308 row_to_update.set_visit_count(row.visit_count()); 309 row_to_update.set_typed_count(row.typed_count()); 310 row_to_update.set_last_visit(row.last_visit()); 311 // If something appears to have changed, update the recent visits 312 // information. 313 ScheduleUpdateRecentVisits(history_service, row_id); 314 // While the URL is guaranteed to remain stable, the title may have 315 // changed. If so, then update the index with the changed words. 316 if (title_updated) { 317 // Clear all words associated with this row and re-index both the 318 // URL and title. 319 RemoveRowWordsFromIndex(row_to_update); 320 row_to_update.set_title(row.title()); 321 RowWordStarts word_starts; 322 AddRowWordsToIndex(row_to_update, &word_starts, languages); 323 word_starts_map_[row_id] = word_starts; 324 } 325 row_was_updated = true; 326 } 327 } else { 328 // This indexed row no longer qualifies and will be de-indexed by 329 // clearing all words associated with this row. 330 RemoveRowFromIndex(row); 331 row_was_updated = true; 332 } 333 if (row_was_updated) 334 search_term_cache_.clear(); // This invalidates the cache. 335 return row_was_updated; 336 } 337 338 void URLIndexPrivateData::UpdateRecentVisits( 339 URLID url_id, 340 const VisitVector& recent_visits) { 341 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id); 342 if (row_pos != history_info_map_.end()) { 343 VisitInfoVector* visits = &row_pos->second.visits; 344 visits->clear(); 345 const size_t size = 346 std::min(recent_visits.size(), kMaxVisitsToStoreInCache); 347 visits->reserve(size); 348 for (size_t i = 0; i < size; i++) { 349 // Copy from the VisitVector the only fields visits needs. 350 visits->push_back(std::make_pair(recent_visits[i].visit_time, 351 recent_visits[i].transition)); 352 } 353 } 354 // Else: Oddly, the URL doesn't seem to exist in the private index. 355 // Ignore this update. This can happen if, for instance, the user 356 // removes the URL from URLIndexPrivateData before the historyDB call 357 // returns. 358 } 359 360 void URLIndexPrivateData::ScheduleUpdateRecentVisits( 361 HistoryService* history_service, 362 URLID url_id) { 363 history_service->ScheduleDBTask( 364 new UpdateRecentVisitsFromHistoryDBTask(this, url_id), 365 &recent_visits_consumer_); 366 } 367 368 // Helper functor for DeleteURL. 369 class HistoryInfoMapItemHasURL { 370 public: 371 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} 372 373 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) { 374 return item.second.url_row.url() == url_; 375 } 376 377 private: 378 const GURL& url_; 379 }; 380 381 bool URLIndexPrivateData::DeleteURL(const GURL& url) { 382 // Find the matching entry in the history_info_map_. 383 HistoryInfoMap::iterator pos = std::find_if( 384 history_info_map_.begin(), 385 history_info_map_.end(), 386 HistoryInfoMapItemHasURL(url)); 387 if (pos == history_info_map_.end()) 388 return false; 389 RemoveRowFromIndex(pos->second.url_row); 390 search_term_cache_.clear(); // This invalidates the cache. 391 return true; 392 } 393 394 // static 395 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile( 396 const base::FilePath& file_path, 397 const std::string& languages) { 398 base::TimeTicks beginning_time = base::TimeTicks::Now(); 399 if (!base::PathExists(file_path)) 400 return NULL; 401 std::string data; 402 // If there is no cache file then simply give up. This will cause us to 403 // attempt to rebuild from the history database. 404 if (!file_util::ReadFileToString(file_path, &data)) 405 return NULL; 406 407 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData); 408 InMemoryURLIndexCacheItem index_cache; 409 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { 410 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " 411 << file_path.value(); 412 return restored_data; 413 } 414 415 if (!restored_data->RestorePrivateData(index_cache, languages)) 416 return NULL; 417 418 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", 419 base::TimeTicks::Now() - beginning_time); 420 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", 421 restored_data->history_id_word_map_.size()); 422 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); 423 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", 424 restored_data->word_map_.size()); 425 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", 426 restored_data->char_word_map_.size()); 427 if (restored_data->Empty()) 428 return NULL; // 'No data' is the same as a failed reload. 429 return restored_data; 430 } 431 432 // static 433 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( 434 HistoryDatabase* history_db, 435 const std::string& languages, 436 const std::set<std::string>& scheme_whitelist) { 437 if (!history_db) 438 return NULL; 439 440 base::TimeTicks beginning_time = base::TimeTicks::Now(); 441 442 scoped_refptr<URLIndexPrivateData> 443 rebuilt_data(new URLIndexPrivateData); 444 URLDatabase::URLEnumerator history_enum; 445 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) 446 return NULL; 447 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now(); 448 for (URLRow row; history_enum.GetNextURL(&row); ) { 449 rebuilt_data->IndexRow(history_db, NULL, row, languages, 450 scheme_whitelist); 451 } 452 453 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", 454 base::TimeTicks::Now() - beginning_time); 455 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", 456 rebuilt_data->history_id_word_map_.size()); 457 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", 458 rebuilt_data->word_map_.size()); 459 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", 460 rebuilt_data->char_word_map_.size()); 461 return rebuilt_data; 462 } 463 464 // static 465 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask( 466 scoped_refptr<URLIndexPrivateData> private_data, 467 const base::FilePath& file_path) { 468 DCHECK(private_data.get()); 469 DCHECK(!file_path.empty()); 470 return private_data->SaveToFile(file_path); 471 } 472 473 void URLIndexPrivateData::CancelPendingUpdates() { 474 recent_visits_consumer_.CancelAllRequests(); 475 } 476 477 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const { 478 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData; 479 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_; 480 data_copy->word_list_ = word_list_; 481 data_copy->available_words_ = available_words_; 482 data_copy->word_map_ = word_map_; 483 data_copy->char_word_map_ = char_word_map_; 484 data_copy->word_id_history_map_ = word_id_history_map_; 485 data_copy->history_id_word_map_ = history_id_word_map_; 486 data_copy->history_info_map_ = history_info_map_; 487 data_copy->word_starts_map_ = word_starts_map_; 488 return data_copy; 489 // Not copied: 490 // search_term_cache_ 491 // pre_filter_item_count_ 492 // post_filter_item_count_ 493 // post_scoring_item_count_ 494 }; 495 496 bool URLIndexPrivateData::Empty() const { 497 return history_info_map_.empty(); 498 } 499 500 void URLIndexPrivateData::Clear() { 501 last_time_rebuilt_from_history_ = base::Time(); 502 word_list_.clear(); 503 available_words_.clear(); 504 word_map_.clear(); 505 char_word_map_.clear(); 506 word_id_history_map_.clear(); 507 history_id_word_map_.clear(); 508 history_info_map_.clear(); 509 word_starts_map_.clear(); 510 } 511 512 URLIndexPrivateData::~URLIndexPrivateData() {} 513 514 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( 515 const String16Vector& unsorted_words) { 516 // Break the terms down into individual terms (words), get the candidate 517 // set for each term, and intersect each to get a final candidate list. 518 // Note that a single 'term' from the user's perspective might be 519 // a string like "http://www.somewebsite.com" which, from our perspective, 520 // is four words: 'http', 'www', 'somewebsite', and 'com'. 521 HistoryIDSet history_id_set; 522 String16Vector words(unsorted_words); 523 // Sort the words into the longest first as such are likely to narrow down 524 // the results quicker. Also, single character words are the most expensive 525 // to process so save them for last. 526 std::sort(words.begin(), words.end(), LengthGreater); 527 for (String16Vector::iterator iter = words.begin(); iter != words.end(); 528 ++iter) { 529 string16 uni_word = *iter; 530 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); 531 if (term_history_set.empty()) { 532 history_id_set.clear(); 533 break; 534 } 535 if (iter == words.begin()) { 536 history_id_set.swap(term_history_set); 537 } else { 538 HistoryIDSet new_history_id_set; 539 std::set_intersection(history_id_set.begin(), history_id_set.end(), 540 term_history_set.begin(), term_history_set.end(), 541 std::inserter(new_history_id_set, 542 new_history_id_set.begin())); 543 history_id_set.swap(new_history_id_set); 544 } 545 } 546 return history_id_set; 547 } 548 549 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( 550 const string16& term) { 551 if (term.empty()) 552 return HistoryIDSet(); 553 554 // TODO(mrossetti): Consider optimizing for very common terms such as 555 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently 556 // occuring words in the user's searches. 557 558 size_t term_length = term.length(); 559 WordIDSet word_id_set; 560 if (term_length > 1) { 561 // See if this term or a prefix thereof is present in the cache. 562 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); 563 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); 564 cache_iter != search_term_cache_.end(); ++cache_iter) { 565 if (StartsWith(term, cache_iter->first, false) && 566 (best_prefix == search_term_cache_.end() || 567 cache_iter->first.length() > best_prefix->first.length())) 568 best_prefix = cache_iter; 569 } 570 571 // If a prefix was found then determine the leftover characters to be used 572 // for further refining the results from that prefix. 573 Char16Set prefix_chars; 574 string16 leftovers(term); 575 if (best_prefix != search_term_cache_.end()) { 576 // If the prefix is an exact match for the term then grab the cached 577 // results and we're done. 578 size_t prefix_length = best_prefix->first.length(); 579 if (prefix_length == term_length) { 580 best_prefix->second.used_ = true; 581 return best_prefix->second.history_id_set_; 582 } 583 584 // Otherwise we have a handy starting point. 585 // If there are no history results for this prefix then we can bail early 586 // as there will be no history results for the full term. 587 if (best_prefix->second.history_id_set_.empty()) { 588 search_term_cache_[term] = SearchTermCacheItem(); 589 return HistoryIDSet(); 590 } 591 word_id_set = best_prefix->second.word_id_set_; 592 prefix_chars = Char16SetFromString16(best_prefix->first); 593 leftovers = term.substr(prefix_length); 594 } 595 596 // Filter for each remaining, unique character in the term. 597 Char16Set leftover_chars = Char16SetFromString16(leftovers); 598 Char16Set unique_chars; 599 std::set_difference(leftover_chars.begin(), leftover_chars.end(), 600 prefix_chars.begin(), prefix_chars.end(), 601 std::inserter(unique_chars, unique_chars.begin())); 602 603 // Reduce the word set with any leftover, unprocessed characters. 604 if (!unique_chars.empty()) { 605 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); 606 // We might come up empty on the leftovers. 607 if (leftover_set.empty()) { 608 search_term_cache_[term] = SearchTermCacheItem(); 609 return HistoryIDSet(); 610 } 611 // Or there may not have been a prefix from which to start. 612 if (prefix_chars.empty()) { 613 word_id_set.swap(leftover_set); 614 } else { 615 WordIDSet new_word_id_set; 616 std::set_intersection(word_id_set.begin(), word_id_set.end(), 617 leftover_set.begin(), leftover_set.end(), 618 std::inserter(new_word_id_set, 619 new_word_id_set.begin())); 620 word_id_set.swap(new_word_id_set); 621 } 622 } 623 624 // We must filter the word list because the resulting word set surely 625 // contains words which do not have the search term as a proper subset. 626 for (WordIDSet::iterator word_set_iter = word_id_set.begin(); 627 word_set_iter != word_id_set.end(); ) { 628 if (word_list_[*word_set_iter].find(term) == string16::npos) 629 word_id_set.erase(word_set_iter++); 630 else 631 ++word_set_iter; 632 } 633 } else { 634 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); 635 } 636 637 // If any words resulted then we can compose a set of history IDs by unioning 638 // the sets from each word. 639 HistoryIDSet history_id_set; 640 if (!word_id_set.empty()) { 641 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 642 word_id_iter != word_id_set.end(); ++word_id_iter) { 643 WordID word_id = *word_id_iter; 644 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 645 if (word_iter != word_id_history_map_.end()) { 646 HistoryIDSet& word_history_id_set(word_iter->second); 647 history_id_set.insert(word_history_id_set.begin(), 648 word_history_id_set.end()); 649 } 650 } 651 } 652 653 // Record a new cache entry for this word if the term is longer than 654 // a single character. 655 if (term_length > 1) 656 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); 657 658 return history_id_set; 659 } 660 661 WordIDSet URLIndexPrivateData::WordIDSetForTermChars( 662 const Char16Set& term_chars) { 663 WordIDSet word_id_set; 664 for (Char16Set::const_iterator c_iter = term_chars.begin(); 665 c_iter != term_chars.end(); ++c_iter) { 666 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); 667 if (char_iter == char_word_map_.end()) { 668 // A character was not found so there are no matching results: bail. 669 word_id_set.clear(); 670 break; 671 } 672 WordIDSet& char_word_id_set(char_iter->second); 673 // It is possible for there to no longer be any words associated with 674 // a particular character. Give up in that case. 675 if (char_word_id_set.empty()) { 676 word_id_set.clear(); 677 break; 678 } 679 680 if (c_iter == term_chars.begin()) { 681 // First character results becomes base set of results. 682 word_id_set = char_word_id_set; 683 } else { 684 // Subsequent character results get intersected in. 685 WordIDSet new_word_id_set; 686 std::set_intersection(word_id_set.begin(), word_id_set.end(), 687 char_word_id_set.begin(), char_word_id_set.end(), 688 std::inserter(new_word_id_set, 689 new_word_id_set.begin())); 690 word_id_set.swap(new_word_id_set); 691 } 692 } 693 return word_id_set; 694 } 695 696 bool URLIndexPrivateData::IndexRow( 697 HistoryDatabase* history_db, 698 HistoryService* history_service, 699 const URLRow& row, 700 const std::string& languages, 701 const std::set<std::string>& scheme_whitelist) { 702 const GURL& gurl(row.url()); 703 704 // Index only URLs with a whitelisted scheme. 705 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist)) 706 return false; 707 708 URLID row_id = row.id(); 709 // Strip out username and password before saving and indexing. 710 string16 url(net::FormatUrl(gurl, languages, 711 net::kFormatUrlOmitUsernamePassword, 712 net::UnescapeRule::NONE, 713 NULL, NULL, NULL)); 714 715 HistoryID history_id = static_cast<HistoryID>(row_id); 716 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); 717 718 // Add the row for quick lookup in the history info store. 719 URLRow new_row(GURL(url), row_id); 720 new_row.set_visit_count(row.visit_count()); 721 new_row.set_typed_count(row.typed_count()); 722 new_row.set_last_visit(row.last_visit()); 723 new_row.set_title(row.title()); 724 history_info_map_[history_id].url_row = new_row; 725 726 // Index the words contained in the URL and title of the row. 727 RowWordStarts word_starts; 728 AddRowWordsToIndex(new_row, &word_starts, languages); 729 word_starts_map_[history_id] = word_starts; 730 731 // Update the recent visits information or schedule the update 732 // as appropriate. 733 if (history_db) { 734 // We'd like to check that we're on the history DB thread. 735 // However, unittest code actually calls this on the UI thread. 736 // So we don't do any thread checks. 737 VisitVector recent_visits; 738 // Make sure the private data is going to get as many recent visits as 739 // ScoredHistoryMatch::GetFrecency() hopes to use. 740 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); 741 if (history_db->GetMostRecentVisitsForURL(row_id, 742 kMaxVisitsToStoreInCache, 743 &recent_visits)) 744 UpdateRecentVisits(row_id, recent_visits); 745 } else { 746 DCHECK(history_service); 747 ScheduleUpdateRecentVisits(history_service, row_id); 748 } 749 750 return true; 751 } 752 753 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, 754 RowWordStarts* word_starts, 755 const std::string& languages) { 756 HistoryID history_id = static_cast<HistoryID>(row.id()); 757 // Split URL into individual, unique words then add in the title words. 758 const GURL& gurl(row.url()); 759 const string16& url = CleanUpUrlForMatching(gurl, languages); 760 String16Set url_words = String16SetFromString16(url, 761 word_starts ? &word_starts->url_word_starts_ : NULL); 762 const string16& title = CleanUpTitleForMatching(row.title()); 763 String16Set title_words = String16SetFromString16(title, 764 word_starts ? &word_starts->title_word_starts_ : NULL); 765 String16Set words; 766 std::set_union(url_words.begin(), url_words.end(), 767 title_words.begin(), title_words.end(), 768 std::insert_iterator<String16Set>(words, words.begin())); 769 for (String16Set::iterator word_iter = words.begin(); 770 word_iter != words.end(); ++word_iter) 771 AddWordToIndex(*word_iter, history_id); 772 773 search_term_cache_.clear(); // Invalidate the term cache. 774 } 775 776 void URLIndexPrivateData::AddWordToIndex(const string16& term, 777 HistoryID history_id) { 778 WordMap::iterator word_pos = word_map_.find(term); 779 if (word_pos != word_map_.end()) 780 UpdateWordHistory(word_pos->second, history_id); 781 else 782 AddWordHistory(term, history_id); 783 } 784 785 void URLIndexPrivateData::AddWordHistory(const string16& term, 786 HistoryID history_id) { 787 WordID word_id = word_list_.size(); 788 if (available_words_.empty()) { 789 word_list_.push_back(term); 790 } else { 791 word_id = *(available_words_.begin()); 792 word_list_[word_id] = term; 793 available_words_.erase(word_id); 794 } 795 word_map_[term] = word_id; 796 797 HistoryIDSet history_id_set; 798 history_id_set.insert(history_id); 799 word_id_history_map_[word_id] = history_id_set; 800 AddToHistoryIDWordMap(history_id, word_id); 801 802 // For each character in the newly added word (i.e. a word that is not 803 // already in the word index), add the word to the character index. 804 Char16Set characters = Char16SetFromString16(term); 805 for (Char16Set::iterator uni_char_iter = characters.begin(); 806 uni_char_iter != characters.end(); ++uni_char_iter) { 807 char16 uni_char = *uni_char_iter; 808 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 809 if (char_iter != char_word_map_.end()) { 810 // Update existing entry in the char/word index. 811 WordIDSet& word_id_set(char_iter->second); 812 word_id_set.insert(word_id); 813 } else { 814 // Create a new entry in the char/word index. 815 WordIDSet word_id_set; 816 word_id_set.insert(word_id); 817 char_word_map_[uni_char] = word_id_set; 818 } 819 } 820 } 821 822 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, 823 HistoryID history_id) { 824 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); 825 DCHECK(history_pos != word_id_history_map_.end()); 826 HistoryIDSet& history_id_set(history_pos->second); 827 history_id_set.insert(history_id); 828 AddToHistoryIDWordMap(history_id, word_id); 829 } 830 831 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, 832 WordID word_id) { 833 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); 834 if (iter != history_id_word_map_.end()) { 835 WordIDSet& word_id_set(iter->second); 836 word_id_set.insert(word_id); 837 } else { 838 WordIDSet word_id_set; 839 word_id_set.insert(word_id); 840 history_id_word_map_[history_id] = word_id_set; 841 } 842 } 843 844 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { 845 RemoveRowWordsFromIndex(row); 846 HistoryID history_id = static_cast<HistoryID>(row.id()); 847 history_info_map_.erase(history_id); 848 word_starts_map_.erase(history_id); 849 } 850 851 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { 852 // Remove the entries in history_id_word_map_ and word_id_history_map_ for 853 // this row. 854 HistoryID history_id = static_cast<HistoryID>(row.id()); 855 WordIDSet word_id_set = history_id_word_map_[history_id]; 856 history_id_word_map_.erase(history_id); 857 858 // Reconcile any changes to word usage. 859 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 860 word_id_iter != word_id_set.end(); ++word_id_iter) { 861 WordID word_id = *word_id_iter; 862 word_id_history_map_[word_id].erase(history_id); 863 if (!word_id_history_map_[word_id].empty()) 864 continue; // The word is still in use. 865 866 // The word is no longer in use. Reconcile any changes to character usage. 867 string16 word = word_list_[word_id]; 868 Char16Set characters = Char16SetFromString16(word); 869 for (Char16Set::iterator uni_char_iter = characters.begin(); 870 uni_char_iter != characters.end(); ++uni_char_iter) { 871 char16 uni_char = *uni_char_iter; 872 char_word_map_[uni_char].erase(word_id); 873 if (char_word_map_[uni_char].empty()) 874 char_word_map_.erase(uni_char); // No longer in use. 875 } 876 877 // Complete the removal of references to the word. 878 word_id_history_map_.erase(word_id); 879 word_map_.erase(word); 880 word_list_[word_id] = string16(); 881 available_words_.insert(word_id); 882 } 883 } 884 885 void URLIndexPrivateData::ResetSearchTermCache() { 886 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); 887 iter != search_term_cache_.end(); ++iter) 888 iter->second.used_ = false; 889 } 890 891 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) { 892 base::TimeTicks beginning_time = base::TimeTicks::Now(); 893 InMemoryURLIndexCacheItem index_cache; 894 SavePrivateData(&index_cache); 895 std::string data; 896 if (!index_cache.SerializeToString(&data)) { 897 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; 898 return false; 899 } 900 901 int size = data.size(); 902 if (file_util::WriteFile(file_path, data.c_str(), size) != size) { 903 LOG(WARNING) << "Failed to write " << file_path.value(); 904 return false; 905 } 906 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", 907 base::TimeTicks::Now() - beginning_time); 908 return true; 909 } 910 911 void URLIndexPrivateData::SavePrivateData( 912 InMemoryURLIndexCacheItem* cache) const { 913 DCHECK(cache); 914 cache->set_last_rebuild_timestamp( 915 last_time_rebuilt_from_history_.ToInternalValue()); 916 cache->set_version(saved_cache_version_); 917 // history_item_count_ is no longer used but rather than change the protobuf 918 // definition use a placeholder. This will go away with the switch to SQLite. 919 cache->set_history_item_count(0); 920 SaveWordList(cache); 921 SaveWordMap(cache); 922 SaveCharWordMap(cache); 923 SaveWordIDHistoryMap(cache); 924 SaveHistoryInfoMap(cache); 925 SaveWordStartsMap(cache); 926 } 927 928 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { 929 if (word_list_.empty()) 930 return; 931 WordListItem* list_item = cache->mutable_word_list(); 932 list_item->set_word_count(word_list_.size()); 933 for (String16Vector::const_iterator iter = word_list_.begin(); 934 iter != word_list_.end(); ++iter) 935 list_item->add_word(UTF16ToUTF8(*iter)); 936 } 937 938 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { 939 if (word_map_.empty()) 940 return; 941 WordMapItem* map_item = cache->mutable_word_map(); 942 map_item->set_item_count(word_map_.size()); 943 for (WordMap::const_iterator iter = word_map_.begin(); 944 iter != word_map_.end(); ++iter) { 945 WordMapEntry* map_entry = map_item->add_word_map_entry(); 946 map_entry->set_word(UTF16ToUTF8(iter->first)); 947 map_entry->set_word_id(iter->second); 948 } 949 } 950 951 void URLIndexPrivateData::SaveCharWordMap( 952 InMemoryURLIndexCacheItem* cache) const { 953 if (char_word_map_.empty()) 954 return; 955 CharWordMapItem* map_item = cache->mutable_char_word_map(); 956 map_item->set_item_count(char_word_map_.size()); 957 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); 958 iter != char_word_map_.end(); ++iter) { 959 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); 960 map_entry->set_char_16(iter->first); 961 const WordIDSet& word_id_set(iter->second); 962 map_entry->set_item_count(word_id_set.size()); 963 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); 964 set_iter != word_id_set.end(); ++set_iter) 965 map_entry->add_word_id(*set_iter); 966 } 967 } 968 969 void URLIndexPrivateData::SaveWordIDHistoryMap( 970 InMemoryURLIndexCacheItem* cache) const { 971 if (word_id_history_map_.empty()) 972 return; 973 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); 974 map_item->set_item_count(word_id_history_map_.size()); 975 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); 976 iter != word_id_history_map_.end(); ++iter) { 977 WordIDHistoryMapEntry* map_entry = 978 map_item->add_word_id_history_map_entry(); 979 map_entry->set_word_id(iter->first); 980 const HistoryIDSet& history_id_set(iter->second); 981 map_entry->set_item_count(history_id_set.size()); 982 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); 983 set_iter != history_id_set.end(); ++set_iter) 984 map_entry->add_history_id(*set_iter); 985 } 986 } 987 988 void URLIndexPrivateData::SaveHistoryInfoMap( 989 InMemoryURLIndexCacheItem* cache) const { 990 if (history_info_map_.empty()) 991 return; 992 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); 993 map_item->set_item_count(history_info_map_.size()); 994 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); 995 iter != history_info_map_.end(); ++iter) { 996 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); 997 map_entry->set_history_id(iter->first); 998 const URLRow& url_row(iter->second.url_row); 999 // Note: We only save information that contributes to the index so there 1000 // is no need to save search_term_cache_ (not persistent). 1001 map_entry->set_visit_count(url_row.visit_count()); 1002 map_entry->set_typed_count(url_row.typed_count()); 1003 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); 1004 map_entry->set_url(url_row.url().spec()); 1005 map_entry->set_title(UTF16ToUTF8(url_row.title())); 1006 const VisitInfoVector& visits(iter->second.visits); 1007 for (VisitInfoVector::const_iterator visit_iter = visits.begin(); 1008 visit_iter != visits.end(); ++visit_iter) { 1009 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits(); 1010 visit_info->set_visit_time(visit_iter->first.ToInternalValue()); 1011 visit_info->set_transition_type(visit_iter->second); 1012 } 1013 } 1014 } 1015 1016 void URLIndexPrivateData::SaveWordStartsMap( 1017 InMemoryURLIndexCacheItem* cache) const { 1018 if (word_starts_map_.empty()) 1019 return; 1020 // For unit testing: Enable saving of the cache as an earlier version to 1021 // allow testing of cache file upgrading in ReadFromFile(). 1022 // TODO(mrossetti): Instead of intruding on production code with this kind of 1023 // test harness, save a copy of an older version cache with known results. 1024 // Implement this when switching the caching over to SQLite. 1025 if (saved_cache_version_ < 1) 1026 return; 1027 1028 WordStartsMapItem* map_item = cache->mutable_word_starts_map(); 1029 map_item->set_item_count(word_starts_map_.size()); 1030 for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); 1031 iter != word_starts_map_.end(); ++iter) { 1032 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); 1033 map_entry->set_history_id(iter->first); 1034 const RowWordStarts& word_starts(iter->second); 1035 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); 1036 i != word_starts.url_word_starts_.end(); ++i) 1037 map_entry->add_url_word_starts(*i); 1038 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); 1039 i != word_starts.title_word_starts_.end(); ++i) 1040 map_entry->add_title_word_starts(*i); 1041 } 1042 } 1043 1044 bool URLIndexPrivateData::RestorePrivateData( 1045 const InMemoryURLIndexCacheItem& cache, 1046 const std::string& languages) { 1047 last_time_rebuilt_from_history_ = 1048 base::Time::FromInternalValue(cache.last_rebuild_timestamp()); 1049 const base::TimeDelta rebuilt_ago = 1050 base::Time::Now() - last_time_rebuilt_from_history_; 1051 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) || 1052 (rebuilt_ago < base::TimeDelta::FromDays(-1))) { 1053 // Cache is more than a week old or, somehow, from some time in the future. 1054 // It's probably a good time to rebuild the index from history to 1055 // allow synced entries to now appear, expired entries to disappear, etc. 1056 // Allow one day in the future to make the cache not rebuild on simple 1057 // system clock changes such as time zone changes. 1058 return false; 1059 } 1060 if (cache.has_version()) { 1061 if (cache.version() < kCurrentCacheFileVersion) { 1062 // Don't try to restore an old format cache file. (This will cause 1063 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData 1064 // from history.) 1065 return false; 1066 } 1067 restored_cache_version_ = cache.version(); 1068 } 1069 return RestoreWordList(cache) && RestoreWordMap(cache) && 1070 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && 1071 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); 1072 } 1073 1074 bool URLIndexPrivateData::RestoreWordList( 1075 const InMemoryURLIndexCacheItem& cache) { 1076 if (!cache.has_word_list()) 1077 return false; 1078 const WordListItem& list_item(cache.word_list()); 1079 uint32 expected_item_count = list_item.word_count(); 1080 uint32 actual_item_count = list_item.word_size(); 1081 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1082 return false; 1083 const RepeatedPtrField<std::string>& words(list_item.word()); 1084 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); 1085 iter != words.end(); ++iter) 1086 word_list_.push_back(UTF8ToUTF16(*iter)); 1087 return true; 1088 } 1089 1090 bool URLIndexPrivateData::RestoreWordMap( 1091 const InMemoryURLIndexCacheItem& cache) { 1092 if (!cache.has_word_map()) 1093 return false; 1094 const WordMapItem& list_item(cache.word_map()); 1095 uint32 expected_item_count = list_item.item_count(); 1096 uint32 actual_item_count = list_item.word_map_entry_size(); 1097 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1098 return false; 1099 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); 1100 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); 1101 iter != entries.end(); ++iter) 1102 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id(); 1103 return true; 1104 } 1105 1106 bool URLIndexPrivateData::RestoreCharWordMap( 1107 const InMemoryURLIndexCacheItem& cache) { 1108 if (!cache.has_char_word_map()) 1109 return false; 1110 const CharWordMapItem& list_item(cache.char_word_map()); 1111 uint32 expected_item_count = list_item.item_count(); 1112 uint32 actual_item_count = list_item.char_word_map_entry_size(); 1113 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1114 return false; 1115 const RepeatedPtrField<CharWordMapEntry>& 1116 entries(list_item.char_word_map_entry()); 1117 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = 1118 entries.begin(); iter != entries.end(); ++iter) { 1119 expected_item_count = iter->item_count(); 1120 actual_item_count = iter->word_id_size(); 1121 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1122 return false; 1123 char16 uni_char = static_cast<char16>(iter->char_16()); 1124 WordIDSet word_id_set; 1125 const RepeatedField<int32>& word_ids(iter->word_id()); 1126 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); 1127 jiter != word_ids.end(); ++jiter) 1128 word_id_set.insert(*jiter); 1129 char_word_map_[uni_char] = word_id_set; 1130 } 1131 return true; 1132 } 1133 1134 bool URLIndexPrivateData::RestoreWordIDHistoryMap( 1135 const InMemoryURLIndexCacheItem& cache) { 1136 if (!cache.has_word_id_history_map()) 1137 return false; 1138 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); 1139 uint32 expected_item_count = list_item.item_count(); 1140 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); 1141 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1142 return false; 1143 const RepeatedPtrField<WordIDHistoryMapEntry>& 1144 entries(list_item.word_id_history_map_entry()); 1145 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = 1146 entries.begin(); iter != entries.end(); ++iter) { 1147 expected_item_count = iter->item_count(); 1148 actual_item_count = iter->history_id_size(); 1149 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1150 return false; 1151 WordID word_id = iter->word_id(); 1152 HistoryIDSet history_id_set; 1153 const RepeatedField<int64>& history_ids(iter->history_id()); 1154 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); 1155 jiter != history_ids.end(); ++jiter) { 1156 history_id_set.insert(*jiter); 1157 AddToHistoryIDWordMap(*jiter, word_id); 1158 } 1159 word_id_history_map_[word_id] = history_id_set; 1160 } 1161 return true; 1162 } 1163 1164 bool URLIndexPrivateData::RestoreHistoryInfoMap( 1165 const InMemoryURLIndexCacheItem& cache) { 1166 if (!cache.has_history_info_map()) 1167 return false; 1168 const HistoryInfoMapItem& list_item(cache.history_info_map()); 1169 uint32 expected_item_count = list_item.item_count(); 1170 uint32 actual_item_count = list_item.history_info_map_entry_size(); 1171 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1172 return false; 1173 const RepeatedPtrField<HistoryInfoMapEntry>& 1174 entries(list_item.history_info_map_entry()); 1175 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = 1176 entries.begin(); iter != entries.end(); ++iter) { 1177 HistoryID history_id = iter->history_id(); 1178 GURL url(iter->url()); 1179 URLRow url_row(url, history_id); 1180 url_row.set_visit_count(iter->visit_count()); 1181 url_row.set_typed_count(iter->typed_count()); 1182 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); 1183 if (iter->has_title()) { 1184 string16 title(UTF8ToUTF16(iter->title())); 1185 url_row.set_title(title); 1186 } 1187 history_info_map_[history_id].url_row = url_row; 1188 1189 // Restore visits list. 1190 VisitInfoVector visits; 1191 visits.reserve(iter->visits_size()); 1192 for (int i = 0; i < iter->visits_size(); ++i) { 1193 visits.push_back(std::make_pair( 1194 base::Time::FromInternalValue(iter->visits(i).visit_time()), 1195 static_cast<content::PageTransition>(iter->visits(i). 1196 transition_type()))); 1197 } 1198 history_info_map_[history_id].visits = visits; 1199 } 1200 return true; 1201 } 1202 1203 bool URLIndexPrivateData::RestoreWordStartsMap( 1204 const InMemoryURLIndexCacheItem& cache, 1205 const std::string& languages) { 1206 // Note that this function must be called after RestoreHistoryInfoMap() has 1207 // been run as the word starts may have to be recalculated from the urls and 1208 // page titles. 1209 if (cache.has_word_starts_map()) { 1210 const WordStartsMapItem& list_item(cache.word_starts_map()); 1211 uint32 expected_item_count = list_item.item_count(); 1212 uint32 actual_item_count = list_item.word_starts_map_entry_size(); 1213 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1214 return false; 1215 const RepeatedPtrField<WordStartsMapEntry>& 1216 entries(list_item.word_starts_map_entry()); 1217 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter = 1218 entries.begin(); iter != entries.end(); ++iter) { 1219 HistoryID history_id = iter->history_id(); 1220 RowWordStarts word_starts; 1221 // Restore the URL word starts. 1222 const RepeatedField<int32>& url_starts(iter->url_word_starts()); 1223 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin(); 1224 jiter != url_starts.end(); ++jiter) 1225 word_starts.url_word_starts_.push_back(*jiter); 1226 // Restore the page title word starts. 1227 const RepeatedField<int32>& title_starts(iter->title_word_starts()); 1228 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin(); 1229 jiter != title_starts.end(); ++jiter) 1230 word_starts.title_word_starts_.push_back(*jiter); 1231 word_starts_map_[history_id] = word_starts; 1232 } 1233 } else { 1234 // Since the cache did not contain any word starts we must rebuild then from 1235 // the URL and page titles. 1236 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); 1237 iter != history_info_map_.end(); ++iter) { 1238 RowWordStarts word_starts; 1239 const URLRow& row(iter->second.url_row); 1240 const string16& url = CleanUpUrlForMatching(row.url(), languages); 1241 String16VectorFromString16(url, false, &word_starts.url_word_starts_); 1242 const string16& title = CleanUpTitleForMatching(row.title()); 1243 String16VectorFromString16(title, false, &word_starts.title_word_starts_); 1244 word_starts_map_[iter->first] = word_starts; 1245 } 1246 } 1247 return true; 1248 } 1249 1250 // static 1251 bool URLIndexPrivateData::URLSchemeIsWhitelisted( 1252 const GURL& gurl, 1253 const std::set<std::string>& whitelist) { 1254 return whitelist.find(gurl.scheme()) != whitelist.end(); 1255 } 1256 1257 1258 // SearchTermCacheItem --------------------------------------------------------- 1259 1260 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( 1261 const WordIDSet& word_id_set, 1262 const HistoryIDSet& history_id_set) 1263 : word_id_set_(word_id_set), 1264 history_id_set_(history_id_set), 1265 used_(true) {} 1266 1267 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() 1268 : used_(true) {} 1269 1270 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} 1271 1272 1273 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- 1274 1275 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( 1276 const URLIndexPrivateData& private_data, 1277 const std::string& languages, 1278 BookmarkService* bookmark_service, 1279 const string16& lower_string, 1280 const String16Vector& lower_terms, 1281 const base::Time now) 1282 : private_data_(private_data), 1283 languages_(languages), 1284 bookmark_service_(bookmark_service), 1285 lower_string_(lower_string), 1286 lower_terms_(lower_terms), 1287 now_(now) {} 1288 1289 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} 1290 1291 void URLIndexPrivateData::AddHistoryMatch::operator()( 1292 const HistoryID history_id) { 1293 HistoryInfoMap::const_iterator hist_pos = 1294 private_data_.history_info_map_.find(history_id); 1295 if (hist_pos != private_data_.history_info_map_.end()) { 1296 const URLRow& hist_item = hist_pos->second.url_row; 1297 const VisitInfoVector& visits = hist_pos->second.visits; 1298 WordStartsMap::const_iterator starts_pos = 1299 private_data_.word_starts_map_.find(history_id); 1300 DCHECK(starts_pos != private_data_.word_starts_map_.end()); 1301 ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_, 1302 lower_terms_, starts_pos->second, now_, 1303 bookmark_service_); 1304 if (match.raw_score > 0) 1305 scored_matches_.push_back(match); 1306 } 1307 } 1308 1309 1310 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- 1311 1312 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( 1313 const HistoryInfoMap& history_info_map) 1314 : history_info_map_(history_info_map) { 1315 } 1316 1317 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} 1318 1319 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( 1320 const HistoryID h1, 1321 const HistoryID h2) { 1322 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); 1323 if (entry1 == history_info_map_.end()) 1324 return false; 1325 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); 1326 if (entry2 == history_info_map_.end()) 1327 return true; 1328 const URLRow& r1(entry1->second.url_row); 1329 const URLRow& r2(entry2->second.url_row); 1330 // First cut: typed count, visit count, recency. 1331 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1332 // recently visited (within the last 12/24 hours) as highly important. Get 1333 // input from mpearson. 1334 if (r1.typed_count() != r2.typed_count()) 1335 return (r1.typed_count() > r2.typed_count()); 1336 if (r1.visit_count() != r2.visit_count()) 1337 return (r1.visit_count() > r2.visit_count()); 1338 return (r1.last_visit() > r2.last_visit()); 1339 } 1340 1341 } // namespace history 1342