// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/history/url_index_private_data.h"

#include <functional>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

#include "base/basictypes.h"
#include "base/files/file_util.h"
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/browser/history/history_database.h"
#include "chrome/browser/history/history_db_task.h"
#include "chrome/browser/history/history_service.h"
#include "chrome/browser/history/in_memory_url_index.h"
#include "components/bookmarks/browser/bookmark_utils.h"
#include "components/history/core/browser/history_client.h"
#include "net/base/net_util.h"

#if defined(USE_SYSTEM_PROTOBUF)
#include <google/protobuf/repeated_field.h>
#else
#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
#endif

using google::protobuf::RepeatedField;
using google::protobuf::RepeatedPtrField;
using in_memory_url_index::InMemoryURLIndexCacheItem;

namespace {
static const size_t kMaxVisitsToStoreInCache = 10u;
}  // anonymous namespace

namespace history {

typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
    CharWordMapEntry;
typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
    WordIDHistoryMapItem;
typedef imui::
    InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
    WordIDHistoryMapEntry;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
    HistoryInfoMapEntry;
typedef imui::
    InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
    HistoryInfoMapEntry_VisitInfo;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
    WordStartsMapEntry;


// Algorithm Functions ---------------------------------------------------------

// Comparison function for sorting search terms by descending length.
bool LengthGreater(const base::string16& string_a,
                   const base::string16& string_b) {
  return string_a.length() > string_b.length();
}


// UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

// HistoryDBTask used to update the recent visit data for a particular
// row from the history database.
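// The task fetches up to kMaxVisitsToStoreInCache recent visits on the
// history DB thread and, on success, hands them to the URLIndexPrivateData
// on the main thread.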
class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      URLIndexPrivateData* private_data,
      URLID url_id);

  virtual bool RunOnDBThread(HistoryBackend* backend,
                             history::HistoryDatabase* db) OVERRIDE;
  virtual void DoneRunOnMainThread() OVERRIDE;

 private:
  virtual ~UpdateRecentVisitsFromHistoryDBTask();

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns.
  URLIndexPrivateData* private_data_;
  // The ID of the URL to get visits for and then update.
  URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The fetched visit data that is handed to private_data_ for it to copy
  // and store.
  VisitVector recent_visits_;

  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
};

UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    URLIndexPrivateData* private_data,
    URLID url_id)
    : private_data_(private_data),
      url_id_(url_id),
      succeeded_(false) {
}

bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
    HistoryBackend* backend,
    HistoryDatabase* db) {
  // Make sure the private data is going to get as many recent visits as
  // ScoredHistoryMatch::GetFrequency() hopes to use.
  DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
  succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
                                             kMaxVisitsToStoreInCache,
                                             &recent_visits_);
  if (!succeeded_)
    recent_visits_.clear();
  return true;  // Always claim to be done; do not retry failures.
}

void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
  if (succeeded_)
    private_data_->UpdateRecentVisits(url_id_, recent_visits_);
}

UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
}


// URLIndexPrivateData ---------------------------------------------------------

URLIndexPrivateData::URLIndexPrivateData()
    : restored_cache_version_(0),
      saved_cache_version_(kCurrentCacheFileVersion),
      pre_filter_item_count_(0),
      post_filter_item_count_(0),
      post_scoring_item_count_(0) {
}

ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    size_t max_matches,
    const std::string& languages,
    HistoryClient* history_client) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  }
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. For reducing
  // the index we need individual, lower-cased words, ignoring escapings. For
  // the final filtering we need whitespace-separated substrings possibly
  // containing escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.
    return scored_items;
  }

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach.
  ResetSearchTermCache();

  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  if (was_trimmed) {
    HistoryIDVector history_ids;
    std::copy(history_id_set.begin(), history_id_set.end(),
              std::back_inserter(history_ids));
    // Trim down the set by sorting by typed-count, visit-count, and last
    // visit.
    HistoryItemFactorGreater item_factor_functor(history_info_map_);
    std::partial_sort(history_ids.begin(),
                      history_ids.begin() + kItemsToScoreLimit,
                      history_ids.end(),
                      item_factor_functor);
    history_id_set.clear();
    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
              std::inserter(history_id_set, history_id_set.end()));
    post_filter_item_count_ = history_id_set.size();
  }

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather
  // than escaped whitespace. When the user types "colspec=ID%20Mstone Release"
  // we get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against.
    // (It's possible that the word break iterator that extracts words to
    // search for in the database allows some whitespace "words" whereas
    // Tokenize excludes a long list of whitespace.)  One could write a
    // scoring function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace), but
    // this is such a rare edge case that it's not worth the time.
    return scored_items;
  }
  scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
      AddHistoryMatch(*this, languages, history_client, lower_raw_string,
                      lower_raw_terms, base::Time::Now())).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() + max_matches,
                      scored_items.end(),
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  post_scoring_item_count_ = scored_items.size();

  if (was_trimmed) {
    search_term_cache_.clear();  // Invalidate the term cache.
  } else {
    // Remove any stale SearchTermCacheItems.
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ) {
      if (!cache_iter->second.used_)
        search_term_cache_.erase(cache_iter++);
      else
        ++cache_iter;
    }
  }

  return scored_items;
}

bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    URLRow new_row(row);
    new_row.set_id(row_id);
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
                      IndexRow(NULL,
                               history_service,
                               new_row,
                               languages,
                               scheme_whitelist,
                               tracker);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information.
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts, languages);
        word_starts_map_[row_id] = word_starts;
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}

void URLIndexPrivateData::UpdateRecentVisits(
    URLID url_id,
    const VisitVector& recent_visits) {
  HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
  if (row_pos != history_info_map_.end()) {
    VisitInfoVector* visits = &row_pos->second.visits;
    visits->clear();
    const size_t size =
        std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
    visits->reserve(size);
    for (size_t i = 0; i < size; i++) {
      // Copy from the VisitVector only the fields |visits| needs.
      visits->push_back(std::make_pair(recent_visits[i].visit_time,
                                       recent_visits[i].transition));
    }
  }
  // Else: Oddly, the URL doesn't seem to exist in the private index.
  // Ignore this update. This can happen if, for instance, the user
  // removes the URL from URLIndexPrivateData before the historyDB call
  // returns.
}

void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    HistoryService* history_service,
    URLID url_id,
    base::CancelableTaskTracker* tracker) {
  history_service->ScheduleDBTask(
      scoped_ptr<history::HistoryDBTask>(
          new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
}

// Helper functor for DeleteURL.
class HistoryInfoMapItemHasURL {
 public:
  explicit HistoryInfoMapItemHasURL(const GURL& url) : url_(url) {}

  bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    return item.second.url_row.url() == url_;
  }

 private:
  const GURL& url_;
};

bool URLIndexPrivateData::DeleteURL(const GURL& url) {
  // Find the matching entry in the history_info_map_.
  HistoryInfoMap::iterator pos = std::find_if(
      history_info_map_.begin(),
      history_info_map_.end(),
      HistoryInfoMapItemHasURL(url));
  if (pos == history_info_map_.end())
    return false;
  RemoveRowFromIndex(pos->second.url_row);
  search_term_cache_.clear();  // This invalidates the cache.
  return true;
}

// static
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    const base::FilePath& file_path,
    const std::string& languages) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  if (!base::PathExists(file_path))
    return NULL;
  std::string data;
  // If the cache file cannot be read then simply give up. This will cause us
  // to attempt to rebuild from the history database.
  if (!base::ReadFileToString(file_path, &data))
    return NULL;

  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
  InMemoryURLIndexCacheItem index_cache;
  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
                 << file_path.value();
    return restored_data;
  }

  if (!restored_data->RestorePrivateData(index_cache, languages))
    return NULL;

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       restored_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             restored_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             restored_data->char_word_map_.size());
  if (restored_data->Empty())
    return NULL;  // 'No data' is the same as a failed reload.
  return restored_data;
}

// static
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    HistoryDatabase* history_db,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  if (!history_db)
    return NULL;

  base::TimeTicks beginning_time = base::TimeTicks::Now();

  scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData);
  URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return NULL;
  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
  for (URLRow row; history_enum.GetNextURL(&row); ) {
    rebuilt_data->IndexRow(
        history_db, NULL, row, languages, scheme_whitelist, NULL);
  }

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       rebuilt_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             rebuilt_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             rebuilt_data->char_word_map_.size());
  return rebuilt_data;
}

// static
bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    scoped_refptr<URLIndexPrivateData> private_data,
    const base::FilePath& file_path) {
  DCHECK(private_data.get());
  DCHECK(!file_path.empty());
  return private_data->SaveToFile(file_path);
}

scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //    search_term_cache_
  //    pre_filter_item_count_
  //    post_filter_item_count_
  //    post_scoring_item_count_
}

bool URLIndexPrivateData::Empty() const {
  return history_info_map_.empty();
}
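
// Discards the rebuild timestamp and empties the word, character, and history
// maps that make up the index.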
void URLIndexPrivateData::Clear() {
  last_time_rebuilt_from_history_ = base::Time();
  word_list_.clear();
  available_words_.clear();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}

URLIndexPrivateData::~URLIndexPrivateData() {}

HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
    const String16Vector& unsorted_words) {
  // Break the terms down into individual terms (words), get the candidate
  // set for each term, and intersect each to get a final candidate list.
  // Note that a single 'term' from the user's perspective might be
  // a string like "http://www.somewebsite.com" which, from our perspective,
  // is four words: 'http', 'www', 'somewebsite', and 'com'.
  HistoryIDSet history_id_set;
  String16Vector words(unsorted_words);
  // Sort the words longest first, as longer words are likely to narrow down
  // the results more quickly. Also, single-character words are the most
  // expensive to process, so save them for last.
  std::sort(words.begin(), words.end(), LengthGreater);
  for (String16Vector::iterator iter = words.begin(); iter != words.end();
       ++iter) {
    base::string16 uni_word = *iter;
    HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
    if (term_history_set.empty()) {
      history_id_set.clear();
      break;
    }
    if (iter == words.begin()) {
      history_id_set.swap(term_history_set);
    } else {
      HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
          history_id_set, term_history_set);
      history_id_set.swap(new_history_id_set);
    }
  }
  return history_id_set;
}

HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the top 100 most frequently
  // occurring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
      } else {
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
      else
        ++word_set_iter;
    }
  } else {
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());
      }
    }
  }

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}

WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    const Char16Set& term_chars) {
  WordIDSet word_id_set;
  for (Char16Set::const_iterator c_iter = term_chars.begin();
       c_iter != term_chars.end(); ++c_iter) {
    CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
    if (char_iter == char_word_map_.end()) {
      // A character was not found so there are no matching results: bail.
      word_id_set.clear();
      break;
    }
    WordIDSet& char_word_id_set(char_iter->second);
    // It is possible for there to no longer be any words associated with
    // a particular character. Give up in that case.
    if (char_word_id_set.empty()) {
      word_id_set.clear();
      break;
    }

    if (c_iter == term_chars.begin()) {
      // The first character's results become the base set of results.
      word_id_set = char_word_id_set;
    } else {
      // Subsequent character results get intersected in.
      WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
          word_id_set, char_word_id_set);
      word_id_set.swap(new_word_id_set);
    }
  }
  return word_id_set;
}

bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    return false;

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
                                    net::kFormatUrlOmitUsernamePassword,
                                    net::UnescapeRule::NONE,
                                    NULL, NULL, NULL));

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    VisitVector recent_visits;
    // Make sure the private data is going to get as many recent visits as
    // ScoredHistoryMatch::GetFrequency() hopes to use.
    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    if (history_db->GetMostRecentVisitsForURL(row_id,
                                              kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else {
    DCHECK(tracker);
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
  }

  return true;
}

void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
                                             RowWordStarts* word_starts,
                                             const std::string& languages) {
  HistoryID history_id = static_cast<HistoryID>(row.id());
  // Split URL into individual, unique words then add in the title words.
  const GURL& gurl(row.url());
  const base::string16& url =
      bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
  String16Set url_words = String16SetFromString16(url,
      word_starts ? &word_starts->url_word_starts_ : NULL);
  const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
  String16Set title_words = String16SetFromString16(title,
      word_starts ? &word_starts->title_word_starts_ : NULL);
  String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
  for (String16Set::iterator word_iter = words.begin();
       word_iter != words.end(); ++word_iter)
    AddWordToIndex(*word_iter, history_id);

  search_term_cache_.clear();  // Invalidate the term cache.
}
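
// Associates |term| with |history_id|, adding the term to the word list and
// the char/word index when it has not been indexed before.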
void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
                                         HistoryID history_id) {
  WordMap::iterator word_pos = word_map_.find(term);
  if (word_pos != word_map_.end())
    UpdateWordHistory(word_pos->second, history_id);
  else
    AddWordHistory(term, history_id);
}

void URLIndexPrivateData::AddWordHistory(const base::string16& term,
                                         HistoryID history_id) {
  WordID word_id = word_list_.size();
  if (available_words_.empty()) {
    word_list_.push_back(term);
  } else {
    word_id = *(available_words_.begin());
    word_list_[word_id] = term;
    available_words_.erase(word_id);
  }
  word_map_[term] = word_id;

  HistoryIDSet history_id_set;
  history_id_set.insert(history_id);
  word_id_history_map_[word_id] = history_id_set;
  AddToHistoryIDWordMap(history_id, word_id);

  // For each character in the newly added word (i.e. a word that is not
  // already in the word index), add the word to the character index.
  Char16Set characters = Char16SetFromString16(term);
  for (Char16Set::iterator uni_char_iter = characters.begin();
       uni_char_iter != characters.end(); ++uni_char_iter) {
    base::char16 uni_char = *uni_char_iter;
    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
    if (char_iter != char_word_map_.end()) {
      // Update existing entry in the char/word index.
      WordIDSet& word_id_set(char_iter->second);
      word_id_set.insert(word_id);
    } else {
      // Create a new entry in the char/word index.
      WordIDSet word_id_set;
      word_id_set.insert(word_id);
      char_word_map_[uni_char] = word_id_set;
    }
  }
}

void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
                                            HistoryID history_id) {
  WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
  DCHECK(history_pos != word_id_history_map_.end());
  HistoryIDSet& history_id_set(history_pos->second);
  history_id_set.insert(history_id);
  AddToHistoryIDWordMap(history_id, word_id);
}

void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
                                                WordID word_id) {
  HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
  if (iter != history_id_word_map_.end()) {
    WordIDSet& word_id_set(iter->second);
    word_id_set.insert(word_id);
  } else {
    WordIDSet word_id_set;
    word_id_set.insert(word_id);
    history_id_word_map_[history_id] = word_id_set;
  }
}

void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
  RemoveRowWordsFromIndex(row);
  HistoryID history_id = static_cast<HistoryID>(row.id());
  history_info_map_.erase(history_id);
  word_starts_map_.erase(history_id);
}

void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
  // Remove the entries in history_id_word_map_ and word_id_history_map_ for
  // this row.
  HistoryID history_id = static_cast<HistoryID>(row.id());
  WordIDSet word_id_set = history_id_word_map_[history_id];
  history_id_word_map_.erase(history_id);

  // Reconcile any changes to word usage.
  for (WordIDSet::iterator word_id_iter = word_id_set.begin();
       word_id_iter != word_id_set.end(); ++word_id_iter) {
    WordID word_id = *word_id_iter;
    word_id_history_map_[word_id].erase(history_id);
    if (!word_id_history_map_[word_id].empty())
      continue;  // The word is still in use.

    // The word is no longer in use. Reconcile any changes to character usage.
    base::string16 word = word_list_[word_id];
    Char16Set characters = Char16SetFromString16(word);
    for (Char16Set::iterator uni_char_iter = characters.begin();
         uni_char_iter != characters.end(); ++uni_char_iter) {
      base::char16 uni_char = *uni_char_iter;
      char_word_map_[uni_char].erase(word_id);
      if (char_word_map_[uni_char].empty())
        char_word_map_.erase(uni_char);  // No longer in use.
    }

    // Complete the removal of references to the word.
    word_id_history_map_.erase(word_id);
    word_map_.erase(word);
    word_list_[word_id] = base::string16();
    available_words_.insert(word_id);
  }
}

void URLIndexPrivateData::ResetSearchTermCache() {
  for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
       iter != search_term_cache_.end(); ++iter)
    iter->second.used_ = false;
}

bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  InMemoryURLIndexCacheItem index_cache;
  SavePrivateData(&index_cache);
  std::string data;
  if (!index_cache.SerializeToString(&data)) {
    LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
    return false;
  }

  int size = data.size();
  if (base::WriteFile(file_path, data.c_str(), size) != size) {
    LOG(WARNING) << "Failed to write " << file_path.value();
    return false;
  }
  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  return true;
}

void URLIndexPrivateData::SavePrivateData(
    InMemoryURLIndexCacheItem* cache) const {
  DCHECK(cache);
  cache->set_last_rebuild_timestamp(
      last_time_rebuilt_from_history_.ToInternalValue());
  cache->set_version(saved_cache_version_);
  // history_item_count_ is no longer used but rather than change the protobuf
  // definition use a placeholder. This will go away with the switch to SQLite.
  cache->set_history_item_count(0);
  SaveWordList(cache);
  SaveWordMap(cache);
  SaveCharWordMap(cache);
  SaveWordIDHistoryMap(cache);
  SaveHistoryInfoMap(cache);
  SaveWordStartsMap(cache);
}

void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
  if (word_list_.empty())
    return;
  WordListItem* list_item = cache->mutable_word_list();
  list_item->set_word_count(word_list_.size());
  for (String16Vector::const_iterator iter = word_list_.begin();
       iter != word_list_.end(); ++iter)
    list_item->add_word(base::UTF16ToUTF8(*iter));
}

void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
  if (word_map_.empty())
    return;
  WordMapItem* map_item = cache->mutable_word_map();
  map_item->set_item_count(word_map_.size());
  for (WordMap::const_iterator iter = word_map_.begin();
       iter != word_map_.end(); ++iter) {
    WordMapEntry* map_entry = map_item->add_word_map_entry();
    map_entry->set_word(base::UTF16ToUTF8(iter->first));
    map_entry->set_word_id(iter->second);
  }
}

void URLIndexPrivateData::SaveCharWordMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (char_word_map_.empty())
    return;
  CharWordMapItem* map_item = cache->mutable_char_word_map();
  map_item->set_item_count(char_word_map_.size());
  for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
       iter != char_word_map_.end(); ++iter) {
    CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
    map_entry->set_char_16(iter->first);
    const WordIDSet& word_id_set(iter->second);
    map_entry->set_item_count(word_id_set.size());
    for (WordIDSet::const_iterator set_iter = word_id_set.begin();
         set_iter != word_id_set.end(); ++set_iter)
      map_entry->add_word_id(*set_iter);
  }
}

void URLIndexPrivateData::SaveWordIDHistoryMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (word_id_history_map_.empty())
    return;
  WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
  map_item->set_item_count(word_id_history_map_.size());
  for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
       iter != word_id_history_map_.end(); ++iter) {
    WordIDHistoryMapEntry* map_entry =
        map_item->add_word_id_history_map_entry();
    map_entry->set_word_id(iter->first);
    const HistoryIDSet& history_id_set(iter->second);
    map_entry->set_item_count(history_id_set.size());
    for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
         set_iter != history_id_set.end(); ++set_iter)
      map_entry->add_history_id(*set_iter);
  }
}

void URLIndexPrivateData::SaveHistoryInfoMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (history_info_map_.empty())
    return;
  HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
  map_item->set_item_count(history_info_map_.size());
  for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
       iter != history_info_map_.end(); ++iter) {
    HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
    map_entry->set_history_id(iter->first);
    const URLRow& url_row(iter->second.url_row);
    // Note: We only save information that contributes to the index so there
    // is no need to save search_term_cache_ (not persistent).
    map_entry->set_visit_count(url_row.visit_count());
    map_entry->set_typed_count(url_row.typed_count());
    map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
    map_entry->set_url(url_row.url().spec());
    map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
    const VisitInfoVector& visits(iter->second.visits);
    for (VisitInfoVector::const_iterator visit_iter = visits.begin();
         visit_iter != visits.end(); ++visit_iter) {
      HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
      visit_info->set_visit_time(visit_iter->first.ToInternalValue());
      visit_info->set_transition_type(visit_iter->second);
    }
  }
}

void URLIndexPrivateData::SaveWordStartsMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (word_starts_map_.empty())
    return;
  // For unit testing: Enable saving of the cache as an earlier version to
  // allow testing of cache file upgrading in ReadFromFile().
  // TODO(mrossetti): Instead of intruding on production code with this kind of
  // test harness, save a copy of an older version cache with known results.
  // Implement this when switching the caching over to SQLite.
  if (saved_cache_version_ < 1)
    return;

  WordStartsMapItem* map_item = cache->mutable_word_starts_map();
  map_item->set_item_count(word_starts_map_.size());
  for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
       iter != word_starts_map_.end(); ++iter) {
    WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
    map_entry->set_history_id(iter->first);
    const RowWordStarts& word_starts(iter->second);
    for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
         i != word_starts.url_word_starts_.end(); ++i)
      map_entry->add_url_word_starts(*i);
    for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
         i != word_starts.title_word_starts_.end(); ++i)
      map_entry->add_title_word_starts(*i);
  }
}

bool URLIndexPrivateData::RestorePrivateData(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  last_time_rebuilt_from_history_ =
      base::Time::FromInternalValue(cache.last_rebuild_timestamp());
  const base::TimeDelta rebuilt_ago =
      base::Time::Now() - last_time_rebuilt_from_history_;
  if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
      (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
    // Cache is more than a week old or, somehow, from some time in the future.
    // It's probably a good time to rebuild the index from history to
    // allow synced entries to now appear, expired entries to disappear, etc.
    // Allow one day in the future to make the cache not rebuild on simple
    // system clock changes such as time zone changes.
    return false;
  }
  if (cache.has_version()) {
    if (cache.version() < kCurrentCacheFileVersion) {
      // Don't try to restore an old format cache file. (This will cause
      // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
      // from history.)
      return false;
    }
    restored_cache_version_ = cache.version();
  }
  return RestoreWordList(cache) && RestoreWordMap(cache) &&
      RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
      RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
}

bool URLIndexPrivateData::RestoreWordList(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_word_list())
    return false;
  const WordListItem& list_item(cache.word_list());
  uint32 expected_item_count = list_item.word_count();
  uint32 actual_item_count = list_item.word_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<std::string>& words(list_item.word());
  for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
       iter != words.end(); ++iter)
    word_list_.push_back(base::UTF8ToUTF16(*iter));
  return true;
}

bool URLIndexPrivateData::RestoreWordMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_word_map())
    return false;
  const WordMapItem& list_item(cache.word_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.word_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
  for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
       iter != entries.end(); ++iter)
    word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
  return true;
}

bool URLIndexPrivateData::RestoreCharWordMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_char_word_map())
    return false;
  const CharWordMapItem& list_item(cache.char_word_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.char_word_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<CharWordMapEntry>&
      entries(list_item.char_word_map_entry());
  for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    expected_item_count = iter->item_count();
    actual_item_count = iter->word_id_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    base::char16 uni_char = static_cast<base::char16>(iter->char_16());
    WordIDSet word_id_set;
    const RepeatedField<int32>& word_ids(iter->word_id());
    for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
         jiter != word_ids.end(); ++jiter)
      word_id_set.insert(*jiter);
    char_word_map_[uni_char] = word_id_set;
  }
  return true;
}

bool URLIndexPrivateData::RestoreWordIDHistoryMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_word_id_history_map())
    return false;
  const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.word_id_history_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<WordIDHistoryMapEntry>&
      entries(list_item.word_id_history_map_entry());
  for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    expected_item_count = iter->item_count();
    actual_item_count = iter->history_id_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    WordID word_id = iter->word_id();
    HistoryIDSet history_id_set;
    const RepeatedField<int64>& history_ids(iter->history_id());
    for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
         jiter != history_ids.end(); ++jiter) {
      history_id_set.insert(*jiter);
      AddToHistoryIDWordMap(*jiter, word_id);
    }
    word_id_history_map_[word_id] = history_id_set;
  }
  return true;
}

bool URLIndexPrivateData::RestoreHistoryInfoMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_history_info_map())
    return false;
  const HistoryInfoMapItem& list_item(cache.history_info_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.history_info_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<HistoryInfoMapEntry>&
      entries(list_item.history_info_map_entry());
  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    HistoryID history_id = iter->history_id();
    GURL url(iter->url());
    URLRow url_row(url, history_id);
    url_row.set_visit_count(iter->visit_count());
    url_row.set_typed_count(iter->typed_count());
    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
    if (iter->has_title()) {
      base::string16 title(base::UTF8ToUTF16(iter->title()));
      url_row.set_title(title);
    }
    history_info_map_[history_id].url_row = url_row;

    // Restore visits list.
    VisitInfoVector visits;
    visits.reserve(iter->visits_size());
    for (int i = 0; i < iter->visits_size(); ++i) {
      visits.push_back(std::make_pair(
          base::Time::FromInternalValue(iter->visits(i).visit_time()),
          ui::PageTransitionFromInt(iter->visits(i).transition_type())));
    }
    history_info_map_[history_id].visits = visits;
  }
  return true;
}

bool URLIndexPrivateData::RestoreWordStartsMap(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  // Note that this function must be called after RestoreHistoryInfoMap() has
  // been run as the word starts may have to be recalculated from the urls and
  // page titles.
  if (cache.has_word_starts_map()) {
    const WordStartsMapItem& list_item(cache.word_starts_map());
    uint32 expected_item_count = list_item.item_count();
    uint32 actual_item_count = list_item.word_starts_map_entry_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    const RepeatedPtrField<WordStartsMapEntry>&
        entries(list_item.word_starts_map_entry());
    for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
         entries.begin(); iter != entries.end(); ++iter) {
      HistoryID history_id = iter->history_id();
      RowWordStarts word_starts;
      // Restore the URL word starts.
      const RepeatedField<int32>& url_starts(iter->url_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
           jiter != url_starts.end(); ++jiter)
        word_starts.url_word_starts_.push_back(*jiter);
      // Restore the page title word starts.
      const RepeatedField<int32>& title_starts(iter->title_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
           jiter != title_starts.end(); ++jiter)
        word_starts.title_word_starts_.push_back(*jiter);
      word_starts_map_[history_id] = word_starts;
    }
  } else {
    // Since the cache did not contain any word starts we must rebuild them
    // from the URLs and page titles.
    for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
         iter != history_info_map_.end(); ++iter) {
      RowWordStarts word_starts;
      const URLRow& row(iter->second.url_row);
      const base::string16& url =
          bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
      String16VectorFromString16(url, false, &word_starts.url_word_starts_);
      const base::string16& title =
          bookmarks::CleanUpTitleForMatching(row.title());
      String16VectorFromString16(title, false, &word_starts.title_word_starts_);
      word_starts_map_[iter->first] = word_starts;
    }
  }
  return true;
}

// static
bool URLIndexPrivateData::URLSchemeIsWhitelisted(
    const GURL& gurl,
    const std::set<std::string>& whitelist) {
  return whitelist.find(gurl.scheme()) != whitelist.end();
}


// SearchTermCacheItem ---------------------------------------------------------

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const WordIDSet& word_id_set,
    const HistoryIDSet& history_id_set)
    : word_id_set_(word_id_set),
      history_id_set_(history_id_set),
      used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
    : used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}


// URLIndexPrivateData::AddHistoryMatch ----------------------------------------

URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
    const URLIndexPrivateData& private_data,
    const std::string& languages,
    HistoryClient* history_client,
    const base::string16& lower_string,
    const String16Vector& lower_terms,
    const base::Time now)
    : private_data_(private_data),
      languages_(languages),
      history_client_(history_client),
      lower_string_(lower_string),
      lower_terms_(lower_terms),
      now_(now) {
  // Calculate offsets for each term. For instance, the offset for
  // ".net" should be 1, indicating that the actual word-part of the term
  // starts at offset 1.
  lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
  for (size_t i = 0; i < lower_terms_.size(); ++i) {
    base::i18n::BreakIterator iter(lower_terms_[i],
                                   base::i18n::BreakIterator::BREAK_WORD);
    // If the iterator doesn't work, assume an offset of 0.
    if (!iter.Init())
      continue;
    // Find the first word start.
    while (iter.Advance() && !iter.IsWord()) {}
    if (iter.IsWord())
      lower_terms_to_word_starts_offsets_[i] = iter.prev();
    // Else: the iterator didn't find a word break. Assume an offset of 0.
  }
}

URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}

void URLIndexPrivateData::AddHistoryMatch::operator()(
    const HistoryID history_id) {
  HistoryInfoMap::const_iterator hist_pos =
      private_data_.history_info_map_.find(history_id);
  if (hist_pos != private_data_.history_info_map_.end()) {
    const URLRow& hist_item = hist_pos->second.url_row;
    const VisitInfoVector& visits = hist_pos->second.visits;
    WordStartsMap::const_iterator starts_pos =
        private_data_.word_starts_map_.find(history_id);
    DCHECK(starts_pos != private_data_.word_starts_map_.end());
    ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
                             lower_terms_, lower_terms_to_word_starts_offsets_,
                             starts_pos->second, now_, history_client_);
    if (match.raw_score() > 0)
      scored_matches_.push_back(match);
  }
}


// URLIndexPrivateData::HistoryItemFactorGreater -------------------------------

URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
    const HistoryInfoMap& history_info_map)
    : history_info_map_(history_info_map) {
}

URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
    const HistoryID h1,
    const HistoryID h2) {
  HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
  if (entry1 == history_info_map_.end())
    return false;
  HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
  if (entry2 == history_info_map_.end())
    return true;
  const URLRow& r1(entry1->second.url_row);
  const URLRow& r2(entry2->second.url_row);
  // First cut: typed count, visit count, recency.
  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
  // recently visited (within the last 12/24 hours) as highly important. Get
  // input from mpearson.
  if (r1.typed_count() != r2.typed_count())
    return (r1.typed_count() > r2.typed_count());
  if (r1.visit_count() != r2.visit_count())
    return (r1.visit_count() > r2.visit_count());
  return (r1.last_visit() > r2.last_visit());
}

}  // namespace history