1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/url_index_private_data.h" 6 7 #include <functional> 8 #include <iterator> 9 #include <limits> 10 #include <numeric> 11 #include <string> 12 #include <vector> 13 14 #include "base/basictypes.h" 15 #include "base/file_util.h" 16 #include "base/i18n/break_iterator.h" 17 #include "base/i18n/case_conversion.h" 18 #include "base/metrics/histogram.h" 19 #include "base/strings/string_util.h" 20 #include "base/strings/utf_string_conversions.h" 21 #include "base/time/time.h" 22 #include "chrome/browser/history/history_database.h" 23 #include "chrome/browser/history/history_db_task.h" 24 #include "chrome/browser/history/history_service.h" 25 #include "chrome/browser/history/in_memory_url_index.h" 26 #include "components/bookmarks/browser/bookmark_utils.h" 27 #include "components/history/core/browser/history_client.h" 28 #include "net/base/net_util.h" 29 30 #if defined(USE_SYSTEM_PROTOBUF) 31 #include <google/protobuf/repeated_field.h> 32 #else 33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" 34 #endif 35 36 using google::protobuf::RepeatedField; 37 using google::protobuf::RepeatedPtrField; 38 using in_memory_url_index::InMemoryURLIndexCacheItem; 39 40 namespace { 41 static const size_t kMaxVisitsToStoreInCache = 10u; 42 } // anonymous namespace 43 44 namespace history { 45 46 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; 47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; 48 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; 49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; 50 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry 51 CharWordMapEntry; 52 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem 53 WordIDHistoryMapItem; 
typedef imui::
    InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
    WordIDHistoryMapEntry;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
    HistoryInfoMapEntry;
typedef imui::
    InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
    HistoryInfoMapEntry_VisitInfo;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
    WordStartsMapEntry;


// Algorithm Functions ---------------------------------------------------------

// Comparison function for sorting search terms by descending length.
// Used to process the longest (most selective) terms first; see
// HistoryIDSetFromWords.
bool LengthGreater(const base::string16& string_a,
                   const base::string16& string_b) {
  return string_a.length() > string_b.length();
}


// UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

// HistoryDBTask used to update the recent visit data for a particular
// row from the history database. The fetch happens on the history DB
// thread (RunOnDBThread); the result is pushed into the owning
// URLIndexPrivateData on the main thread (DoneRunOnMainThread).
class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      URLIndexPrivateData* private_data,
      URLID url_id);

  virtual bool RunOnDBThread(HistoryBackend* backend,
                             history::HistoryDatabase* db) OVERRIDE;
  virtual void DoneRunOnMainThread() OVERRIDE;

 private:
  virtual ~UpdateRecentVisitsFromHistoryDBTask();

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns.
  // NOTE(review): raw pointer — presumably the private data outlives any
  // in-flight task (cancellation via recent_visits_consumer_); confirm.
  URLIndexPrivateData* private_data_;
  // The ID of the URL to get visits for and then update.
  URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The awaited data that's shown to private_data_ for it to copy and
  // store.
  VisitVector recent_visits_;

  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
};

UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    URLIndexPrivateData* private_data,
    URLID url_id)
    : private_data_(private_data),
      url_id_(url_id),
      succeeded_(false) {
}

// Fetches up to kMaxVisitsToStoreInCache recent visits for |url_id_| on the
// history DB thread. Always reports completion (returns true) so failed
// fetches are not retried.
bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
    HistoryBackend* backend,
    HistoryDatabase* db) {
  // Make sure the private data is going to get as many recent visits as
  // ScoredHistoryMatch::GetFrequency() hopes to use.
  DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
  succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
                                             kMaxVisitsToStoreInCache,
                                             &recent_visits_);
  if (!succeeded_)
    recent_visits_.clear();
  return true;  // Always claim to be done; do not retry failures.
}

// Runs on the main thread once the DB task has finished; only propagates
// the fetched visits on success.
void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
  if (succeeded_)
    private_data_->UpdateRecentVisits(url_id_, recent_visits_);
}

UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
}


// URLIndexPrivateData ---------------------------------------------------------

// All diagnostic counters start at zero; the saved cache version is pinned
// to the current format so a subsequent SaveToFile writes the latest schema.
URLIndexPrivateData::URLIndexPrivateData()
    : restored_cache_version_(0),
      saved_cache_version_(kCurrentCacheFileVersion),
      pre_filter_item_count_(0),
      post_filter_item_count_(0),
      post_scoring_item_count_(0) {
}

// Returns scored history matches for |search_string|, at most |max_matches|,
// sorted by descending score. |cursor_position| (if set and interior) splits
// the search string as if a space were typed at the cursor. Maintains the
// mark-and-sweep search term cache as a side effect.
ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    size_t max_matches,
    const std::string& languages,
    HistoryClient* history_client) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  }
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. For reducing
  // the index we need individual, lower-cased words, ignoring escapings. For
  // the final filtering we need whitespace separated substrings possibly
  // containing escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.
    return scored_items;
  }

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach.
  ResetSearchTermCache();

  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  if (was_trimmed) {
    HistoryIDVector history_ids;
    std::copy(history_id_set.begin(), history_id_set.end(),
              std::back_inserter(history_ids));
    // Trim down the set by sorting by typed-count, visit-count, and last
    // visit.
    HistoryItemFactorGreater
        item_factor_functor(history_info_map_);
    // partial_sort is safe here: was_trimmed guarantees
    // history_ids.size() > kItemsToScoreLimit.
    std::partial_sort(history_ids.begin(),
                      history_ids.begin() + kItemsToScoreLimit,
                      history_ids.end(),
                      item_factor_functor);
    history_id_set.clear();
    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
              std::inserter(history_id_set, history_id_set.end()));
    post_filter_item_count_ = history_id_set.size();
  }

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather than
  // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
  // get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against. (It's
    // possible that the word break iterator that extracts words to search
    // for in the database allows some whitespace "words" whereas Tokenize
    // excludes a long list of whitespace.) One could write a scoring
    // function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace),
    // but this is such a rare edge case that it's not worth the time.
    return scored_items;
  }
  scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
      AddHistoryMatch(*this, languages, history_client, lower_raw_string,
                      lower_raw_terms, base::Time::Now())).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() +
                          max_matches,
                      scored_items.end(),
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  post_scoring_item_count_ = scored_items.size();

  if (was_trimmed) {
    search_term_cache_.clear();  // Invalidate the term cache.
  } else {
    // Remove any stale SearchTermCacheItems. The post-increment erase is the
    // standard safe idiom for erasing from an associative container while
    // iterating.
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ) {
      if (!cache_iter->second.used_)
        search_term_cache_.erase(cache_iter++);
      else
        ++cache_iter;
    }
  }

  return scored_items;
}

// Adds, refreshes, or removes |row| in the index depending on whether it is
// already indexed and whether it (still) qualifies as significant. Returns
// true if the index was modified in any way.
bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    URLRow new_row(row);
    new_row.set_id(row_id);
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
        IndexRow(NULL, history_service, new_row, languages, scheme_whitelist);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information.
      ScheduleUpdateRecentVisits(history_service, row_id);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts, languages);
        word_starts_map_[row_id] = word_starts;
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}

// Replaces the cached (visit time, transition) pairs for |url_id| with the
// newest kMaxVisitsToStoreInCache entries of |recent_visits|. Silently
// ignores IDs no longer present in the index (see trailing comment).
void URLIndexPrivateData::UpdateRecentVisits(
    URLID url_id,
    const VisitVector& recent_visits) {
  HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
  if (row_pos != history_info_map_.end()) {
    VisitInfoVector* visits = &row_pos->second.visits;
    visits->clear();
    const size_t size =
        std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
    visits->reserve(size);
    for (size_t i = 0; i < size; i++) {
      // Copy from the VisitVector the only fields visits needs.
      visits->push_back(std::make_pair(recent_visits[i].visit_time,
                                       recent_visits[i].transition));
    }
  }
  // Else: Oddly, the URL doesn't seem to exist in the private index.
  // Ignore this update.  This can happen if, for instance, the user
  // removes the URL from URLIndexPrivateData before the historyDB call
  // returns.
}

// Posts an UpdateRecentVisitsFromHistoryDBTask to the history thread; the
// result arrives asynchronously via UpdateRecentVisits().
void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    HistoryService* history_service,
    URLID url_id) {
  history_service->ScheduleDBTask(
      new UpdateRecentVisitsFromHistoryDBTask(this, url_id),
      &recent_visits_consumer_);
}

// Helper functor for DeleteURL.
// Predicate matching a history_info_map_ entry whose row's URL equals the
// URL supplied at construction. Holds the URL by reference, so the functor
// must not outlive its argument (fine for the transient find_if below).
class HistoryInfoMapItemHasURL {
 public:
  explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}

  bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    return item.second.url_row.url() == url_;
  }

 private:
  const GURL& url_;
};

// Removes the row matching |url| from all index structures. Returns false if
// the URL was not indexed. Invalidates the search term cache on success.
bool URLIndexPrivateData::DeleteURL(const GURL& url) {
  // Find the matching entry in the history_info_map_.
  HistoryInfoMap::iterator pos = std::find_if(
      history_info_map_.begin(),
      history_info_map_.end(),
      HistoryInfoMapItemHasURL(url));
  if (pos == history_info_map_.end())
    return false;
  RemoveRowFromIndex(pos->second.url_row);
  search_term_cache_.clear();  // This invalidates the cache.
  return true;
}

// static
// Restores a previously cached index from |file_path|. Returns NULL on
// missing/unreadable file, failed data restoration, or an empty result
// (all of which trigger a rebuild from the history database by the caller).
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    const base::FilePath& file_path,
    const std::string& languages) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  if (!base::PathExists(file_path))
    return NULL;
  std::string data;
  // If there is no cache file then simply give up. This will cause us to
  // attempt to rebuild from the history database.
  if (!base::ReadFileToString(file_path, &data))
    return NULL;

  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
  InMemoryURLIndexCacheItem index_cache;
  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
                 << file_path.value();
    // NOTE(review): parse failure returns a non-NULL *empty* object while
    // every other failure path returns NULL — presumably the caller treats
    // Empty() as failure too; confirm before relying on NULL alone.
    return restored_data;
  }

  if (!restored_data->RestorePrivateData(index_cache, languages))
    return NULL;

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       restored_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             restored_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             restored_data->char_word_map_.size());
  if (restored_data->Empty())
    return NULL;  // 'No data' is the same as a failed reload.
  return restored_data;
}

// static
// Builds a fresh index by enumerating all significant URLs in |history_db|.
// Returns NULL if the database is unavailable or enumeration cannot start.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    HistoryDatabase* history_db,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  if (!history_db)
    return NULL;

  base::TimeTicks beginning_time = base::TimeTicks::Now();

  scoped_refptr<URLIndexPrivateData>
      rebuilt_data(new URLIndexPrivateData);
  URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return NULL;
  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
  for (URLRow row; history_enum.GetNextURL(&row); ) {
    rebuilt_data->IndexRow(history_db, NULL, row, languages,
                           scheme_whitelist);
  }

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       rebuilt_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             rebuilt_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             rebuilt_data->char_word_map_.size());
  return rebuilt_data;
}

// static
// Task-thread entry point that serializes |private_data| to |file_path|.
bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    scoped_refptr<URLIndexPrivateData> private_data,
    const base::FilePath& file_path) {
  DCHECK(private_data.get());
  DCHECK(!file_path.empty());
  return private_data->SaveToFile(file_path);
}

// Cancels any outstanding recent-visit update requests (see
// ScheduleUpdateRecentVisits).
void URLIndexPrivateData::CancelPendingUpdates() {
  recent_visits_consumer_.CancelAllRequests();
}

// Deep-copies the persistent index structures. Transient members (the search
// term cache and diagnostic counters) are deliberately not copied.
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //  search_term_cache_
  //  pre_filter_item_count_
  //  post_filter_item_count_
  //  post_scoring_item_count_
}

// True when nothing is indexed; history_info_map_ is the authoritative store.
bool URLIndexPrivateData::Empty() const {
  return history_info_map_.empty();
}

// Resets the index to its pristine, unpopulated state.
void URLIndexPrivateData::Clear() {
  last_time_rebuilt_from_history_ = base::Time();
  word_list_.clear();
  available_words_.clear();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}

URLIndexPrivateData::~URLIndexPrivateData() {}

// Intersects the candidate HistoryID sets of every word in |unsorted_words|.
// Returns the empty set as soon as any single word yields no candidates.
HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
    const String16Vector& unsorted_words) {
  // Break the terms down into individual terms (words), get the candidate
  // set for each term, and intersect each to get a final candidate list.
  // Note that a single 'term' from the user's perspective might be
  // a string like "http://www.somewebsite.com" which, from our perspective,
  // is four words: 'http', 'www', 'somewebsite', and 'com'.
  HistoryIDSet history_id_set;
  String16Vector words(unsorted_words);
  // Sort the words into the longest first as such are likely to narrow down
  // the results quicker. Also, single character words are the most expensive
  // to process so save them for last.
  std::sort(words.begin(), words.end(), LengthGreater);
  for (String16Vector::iterator iter = words.begin(); iter != words.end();
       ++iter) {
    base::string16 uni_word = *iter;
    HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
    if (term_history_set.empty()) {
      history_id_set.clear();
      break;
    }
    if (iter == words.begin()) {
      history_id_set.swap(term_history_set);
    } else {
      HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
          history_id_set, term_history_set);
      history_id_set.swap(new_history_id_set);
    }
  }
  return history_id_set;
}

// Returns the set of HistoryIDs whose indexed words contain |term| as a
// substring. Multi-character terms consult and update search_term_cache_,
// reusing the longest previously cached prefix of |term| as a starting point.
HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
  // occurring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;  // Mark for the sweep phase.
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
      } else {
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    // (The character index only guarantees the word contains each character,
    // not the characters in order.)
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
      else
        ++word_set_iter;
    }
  } else {
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());
      }
    }
  }

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}

// Returns the set of WordIDs for words containing every character in
// |term_chars|, computed by intersecting the per-character posting sets.
// Returns the empty set if any character has no words.
WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    const Char16Set& term_chars) {
  WordIDSet word_id_set;
  for (Char16Set::const_iterator c_iter = term_chars.begin();
       c_iter != term_chars.end(); ++c_iter) {
    CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
    if (char_iter == char_word_map_.end()) {
      // A character was not found so there are no matching results: bail.
      word_id_set.clear();
      break;
    }
    WordIDSet& char_word_id_set(char_iter->second);
    // It is possible for there to no longer be any words associated with
    // a particular character. Give up in that case.
    if (char_word_id_set.empty()) {
      word_id_set.clear();
      break;
    }

    if (c_iter == term_chars.begin()) {
      // First character results becomes base set of results.
      word_id_set = char_word_id_set;
    } else {
      // Subsequent character results get intersected in.
      WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
          word_id_set, char_word_id_set);
      word_id_set.swap(new_word_id_set);
    }
  }
  return word_id_set;
}

// Indexes |row| (URL + title words, info map entry, recent visits). Exactly
// one of |history_db| (synchronous fetch) or |history_service| (asynchronous
// fetch) must be non-NULL. Returns false if the scheme is not whitelisted.
bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    return false;

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
      net::kFormatUrlOmitUsernamePassword,
      net::UnescapeRule::NONE,
      NULL, NULL, NULL));

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    VisitVector recent_visits;
    // Make sure the private data is going to get as many recent visits as
    // ScoredHistoryMatch::GetFrequency() hopes to use.
    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    if (history_db->GetMostRecentVisitsForURL(row_id,
                                              kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else {
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id);
  }

  return true;
}

// Adds every unique word from |row|'s cleaned-up URL and title to the word
// index, recording word-start offsets into |word_starts| (may be NULL).
// Invalidates the search term cache.
void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
                                             RowWordStarts* word_starts,
                                             const std::string& languages) {
  HistoryID history_id = static_cast<HistoryID>(row.id());
  // Split URL into individual, unique words then add in the title words.
  const GURL& gurl(row.url());
  const base::string16& url =
      bookmark_utils::CleanUpUrlForMatching(gurl, languages, NULL);
  String16Set url_words = String16SetFromString16(url,
      word_starts ? &word_starts->url_word_starts_ : NULL);
  const base::string16& title =
      bookmark_utils::CleanUpTitleForMatching(row.title());
  String16Set title_words = String16SetFromString16(title,
      word_starts ? &word_starts->title_word_starts_ : NULL);
  String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
  for (String16Set::iterator word_iter = words.begin();
       word_iter != words.end(); ++word_iter)
    AddWordToIndex(*word_iter, history_id);

  search_term_cache_.clear();  // Invalidate the term cache.
}

// Associates |history_id| with |term|, creating the word entry if |term| is
// not yet indexed.
void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
                                         HistoryID history_id) {
  WordMap::iterator word_pos = word_map_.find(term);
  if (word_pos != word_map_.end())
    UpdateWordHistory(word_pos->second, history_id);
  else
    AddWordHistory(term, history_id);
}

// Creates a brand-new word entry for |term| (reusing a recycled WordID slot
// from available_words_ when possible) and registers it in the word, history,
// and per-character indexes.
void URLIndexPrivateData::AddWordHistory(const base::string16& term,
                                         HistoryID history_id) {
  WordID word_id = word_list_.size();
  if (available_words_.empty()) {
    word_list_.push_back(term);
  } else {
    // Recycle a slot vacated by RemoveRowWordsFromIndex.
    word_id = *(available_words_.begin());
    word_list_[word_id] = term;
    available_words_.erase(word_id);
  }
  word_map_[term] = word_id;

  HistoryIDSet history_id_set;
  history_id_set.insert(history_id);
  word_id_history_map_[word_id] = history_id_set;
  AddToHistoryIDWordMap(history_id, word_id);

  // For each character in the newly added word (i.e. a word that is not
  // already in the word index), add the word to the character index.
  Char16Set characters = Char16SetFromString16(term);
  for (Char16Set::iterator uni_char_iter = characters.begin();
       uni_char_iter != characters.end(); ++uni_char_iter) {
    base::char16 uni_char = *uni_char_iter;
    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
    if (char_iter != char_word_map_.end()) {
      // Update existing entry in the char/word index.
      WordIDSet& word_id_set(char_iter->second);
      word_id_set.insert(word_id);
    } else {
      // Create a new entry in the char/word index.
      WordIDSet word_id_set;
      word_id_set.insert(word_id);
      char_word_map_[uni_char] = word_id_set;
    }
  }
}

// Adds |history_id| to an existing word's posting set (both directions).
void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
                                            HistoryID history_id) {
  WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
  DCHECK(history_pos != word_id_history_map_.end());
  HistoryIDSet& history_id_set(history_pos->second);
  history_id_set.insert(history_id);
  AddToHistoryIDWordMap(history_id, word_id);
}

// Records that |history_id|'s row contains the word |word_id| (the reverse
// mapping of word_id_history_map_).
void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
                                                WordID word_id) {
  HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
  if (iter != history_id_word_map_.end()) {
    WordIDSet& word_id_set(iter->second);
    word_id_set.insert(word_id);
  } else {
    WordIDSet word_id_set;
    word_id_set.insert(word_id);
    history_id_word_map_[history_id] = word_id_set;
  }
}

// Fully de-indexes |row|: word references, info map entry, and word starts.
void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
  RemoveRowWordsFromIndex(row);
  HistoryID history_id = static_cast<HistoryID>(row.id());
  history_info_map_.erase(history_id);
  word_starts_map_.erase(history_id);
}

// Detaches |row| from the word indexes, retiring any word (and its character
// postings) that is no longer referenced by any row. Retired WordID slots are
// parked in available_words_ for reuse by AddWordHistory.
void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
  // Remove the entries in history_id_word_map_ and word_id_history_map_ for
  // this row.
  HistoryID history_id = static_cast<HistoryID>(row.id());
  // NOTE(review): operator[] default-inserts an empty set if |history_id| is
  // absent; harmless here since it is erased on the next line, but find()
  // would avoid the churn.
  WordIDSet word_id_set = history_id_word_map_[history_id];
  history_id_word_map_.erase(history_id);

  // Reconcile any changes to word usage.
  for (WordIDSet::iterator word_id_iter = word_id_set.begin();
       word_id_iter != word_id_set.end(); ++word_id_iter) {
    WordID word_id = *word_id_iter;
    word_id_history_map_[word_id].erase(history_id);
    if (!word_id_history_map_[word_id].empty())
      continue;  // The word is still in use.

    // The word is no longer in use. Reconcile any changes to character usage.
    base::string16 word = word_list_[word_id];
    Char16Set characters = Char16SetFromString16(word);
    for (Char16Set::iterator uni_char_iter = characters.begin();
         uni_char_iter != characters.end(); ++uni_char_iter) {
      base::char16 uni_char = *uni_char_iter;
      char_word_map_[uni_char].erase(word_id);
      if (char_word_map_[uni_char].empty())
        char_word_map_.erase(uni_char);  // No longer in use.
    }

    // Complete the removal of references to the word.
    word_id_history_map_.erase(word_id);
    word_map_.erase(word);
    word_list_[word_id] = base::string16();
    available_words_.insert(word_id);
  }
}

// Mark phase of the cache's mark-and-sweep: clear every used_ flag before a
// search so the sweep in HistoryItemsForTerms can evict untouched entries.
void URLIndexPrivateData::ResetSearchTermCache() {
  for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
       iter != search_term_cache_.end(); ++iter)
    iter->second.used_ = false;
}

// Serializes the index to |file_path| via the protobuf cache format.
// Returns false on serialization or write failure.
bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  InMemoryURLIndexCacheItem index_cache;
  SavePrivateData(&index_cache);
  std::string data;
  if (!index_cache.SerializeToString(&data)) {
    LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
    return false;
  }

  // NOTE(review): narrowing size_t -> int; base::WriteFile takes an int, so
  // a >2GB cache would misbehave — practically unreachable for this data.
  int size = data.size();
  if (base::WriteFile(file_path, data.c_str(), size) != size) {
    LOG(WARNING) << "Failed to write " << file_path.value();
    return false;
  }
  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  return true;
}

// Populates |cache| with every persistent index structure.
void URLIndexPrivateData::SavePrivateData(
    InMemoryURLIndexCacheItem* cache) const {
  DCHECK(cache);
  cache->set_last_rebuild_timestamp(
      last_time_rebuilt_from_history_.ToInternalValue());
  cache->set_version(saved_cache_version_);
  // history_item_count_ is no longer used but rather than change the protobuf
  // definition use a placeholder. This will go away with the switch to SQLite.
  cache->set_history_item_count(0);
  SaveWordList(cache);
  SaveWordMap(cache);
  SaveCharWordMap(cache);
  SaveWordIDHistoryMap(cache);
  SaveHistoryInfoMap(cache);
  SaveWordStartsMap(cache);
}

// Writes word_list_ (including recycled empty slots, which keep WordIDs
// positionally stable) into the cache.
void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
  if (word_list_.empty())
    return;
  WordListItem* list_item = cache->mutable_word_list();
  list_item->set_word_count(word_list_.size());
  for (String16Vector::const_iterator iter = word_list_.begin();
       iter != word_list_.end(); ++iter)
    list_item->add_word(base::UTF16ToUTF8(*iter));
}

// Writes the word -> WordID map into the cache.
void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
  if (word_map_.empty())
    return;
  WordMapItem* map_item = cache->mutable_word_map();
  map_item->set_item_count(word_map_.size());
  for (WordMap::const_iterator iter = word_map_.begin();
       iter != word_map_.end(); ++iter) {
    WordMapEntry* map_entry = map_item->add_word_map_entry();
    map_entry->set_word(base::UTF16ToUTF8(iter->first));
    map_entry->set_word_id(iter->second);
  }
}

// Writes the character -> WordIDSet postings into the cache.
void URLIndexPrivateData::SaveCharWordMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (char_word_map_.empty())
    return;
  CharWordMapItem* map_item = cache->mutable_char_word_map();
  map_item->set_item_count(char_word_map_.size());
  for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
       iter != char_word_map_.end(); ++iter) {
    CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
    map_entry->set_char_16(iter->first);
    const WordIDSet& word_id_set(iter->second);
    map_entry->set_item_count(word_id_set.size());
    for (WordIDSet::const_iterator set_iter = word_id_set.begin();
         set_iter != word_id_set.end(); ++set_iter)
      map_entry->add_word_id(*set_iter);
  }
}

void URLIndexPrivateData::SaveWordIDHistoryMap(
    InMemoryURLIndexCacheItem* cache) const {
  if
(word_id_history_map_.empty()) 958 return; 959 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); 960 map_item->set_item_count(word_id_history_map_.size()); 961 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); 962 iter != word_id_history_map_.end(); ++iter) { 963 WordIDHistoryMapEntry* map_entry = 964 map_item->add_word_id_history_map_entry(); 965 map_entry->set_word_id(iter->first); 966 const HistoryIDSet& history_id_set(iter->second); 967 map_entry->set_item_count(history_id_set.size()); 968 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); 969 set_iter != history_id_set.end(); ++set_iter) 970 map_entry->add_history_id(*set_iter); 971 } 972 } 973 974 void URLIndexPrivateData::SaveHistoryInfoMap( 975 InMemoryURLIndexCacheItem* cache) const { 976 if (history_info_map_.empty()) 977 return; 978 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); 979 map_item->set_item_count(history_info_map_.size()); 980 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); 981 iter != history_info_map_.end(); ++iter) { 982 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); 983 map_entry->set_history_id(iter->first); 984 const URLRow& url_row(iter->second.url_row); 985 // Note: We only save information that contributes to the index so there 986 // is no need to save search_term_cache_ (not persistent). 
987 map_entry->set_visit_count(url_row.visit_count()); 988 map_entry->set_typed_count(url_row.typed_count()); 989 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); 990 map_entry->set_url(url_row.url().spec()); 991 map_entry->set_title(base::UTF16ToUTF8(url_row.title())); 992 const VisitInfoVector& visits(iter->second.visits); 993 for (VisitInfoVector::const_iterator visit_iter = visits.begin(); 994 visit_iter != visits.end(); ++visit_iter) { 995 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits(); 996 visit_info->set_visit_time(visit_iter->first.ToInternalValue()); 997 visit_info->set_transition_type(visit_iter->second); 998 } 999 } 1000 } 1001 1002 void URLIndexPrivateData::SaveWordStartsMap( 1003 InMemoryURLIndexCacheItem* cache) const { 1004 if (word_starts_map_.empty()) 1005 return; 1006 // For unit testing: Enable saving of the cache as an earlier version to 1007 // allow testing of cache file upgrading in ReadFromFile(). 1008 // TODO(mrossetti): Instead of intruding on production code with this kind of 1009 // test harness, save a copy of an older version cache with known results. 1010 // Implement this when switching the caching over to SQLite. 
1011 if (saved_cache_version_ < 1) 1012 return; 1013 1014 WordStartsMapItem* map_item = cache->mutable_word_starts_map(); 1015 map_item->set_item_count(word_starts_map_.size()); 1016 for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); 1017 iter != word_starts_map_.end(); ++iter) { 1018 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); 1019 map_entry->set_history_id(iter->first); 1020 const RowWordStarts& word_starts(iter->second); 1021 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); 1022 i != word_starts.url_word_starts_.end(); ++i) 1023 map_entry->add_url_word_starts(*i); 1024 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); 1025 i != word_starts.title_word_starts_.end(); ++i) 1026 map_entry->add_title_word_starts(*i); 1027 } 1028 } 1029 1030 bool URLIndexPrivateData::RestorePrivateData( 1031 const InMemoryURLIndexCacheItem& cache, 1032 const std::string& languages) { 1033 last_time_rebuilt_from_history_ = 1034 base::Time::FromInternalValue(cache.last_rebuild_timestamp()); 1035 const base::TimeDelta rebuilt_ago = 1036 base::Time::Now() - last_time_rebuilt_from_history_; 1037 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) || 1038 (rebuilt_ago < base::TimeDelta::FromDays(-1))) { 1039 // Cache is more than a week old or, somehow, from some time in the future. 1040 // It's probably a good time to rebuild the index from history to 1041 // allow synced entries to now appear, expired entries to disappear, etc. 1042 // Allow one day in the future to make the cache not rebuild on simple 1043 // system clock changes such as time zone changes. 1044 return false; 1045 } 1046 if (cache.has_version()) { 1047 if (cache.version() < kCurrentCacheFileVersion) { 1048 // Don't try to restore an old format cache file. (This will cause 1049 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData 1050 // from history.) 
1051 return false; 1052 } 1053 restored_cache_version_ = cache.version(); 1054 } 1055 return RestoreWordList(cache) && RestoreWordMap(cache) && 1056 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && 1057 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); 1058 } 1059 1060 bool URLIndexPrivateData::RestoreWordList( 1061 const InMemoryURLIndexCacheItem& cache) { 1062 if (!cache.has_word_list()) 1063 return false; 1064 const WordListItem& list_item(cache.word_list()); 1065 uint32 expected_item_count = list_item.word_count(); 1066 uint32 actual_item_count = list_item.word_size(); 1067 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1068 return false; 1069 const RepeatedPtrField<std::string>& words(list_item.word()); 1070 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); 1071 iter != words.end(); ++iter) 1072 word_list_.push_back(base::UTF8ToUTF16(*iter)); 1073 return true; 1074 } 1075 1076 bool URLIndexPrivateData::RestoreWordMap( 1077 const InMemoryURLIndexCacheItem& cache) { 1078 if (!cache.has_word_map()) 1079 return false; 1080 const WordMapItem& list_item(cache.word_map()); 1081 uint32 expected_item_count = list_item.item_count(); 1082 uint32 actual_item_count = list_item.word_map_entry_size(); 1083 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1084 return false; 1085 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); 1086 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); 1087 iter != entries.end(); ++iter) 1088 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id(); 1089 return true; 1090 } 1091 1092 bool URLIndexPrivateData::RestoreCharWordMap( 1093 const InMemoryURLIndexCacheItem& cache) { 1094 if (!cache.has_char_word_map()) 1095 return false; 1096 const CharWordMapItem& list_item(cache.char_word_map()); 1097 uint32 expected_item_count = list_item.item_count(); 1098 uint32 actual_item_count = 
list_item.char_word_map_entry_size(); 1099 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1100 return false; 1101 const RepeatedPtrField<CharWordMapEntry>& 1102 entries(list_item.char_word_map_entry()); 1103 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = 1104 entries.begin(); iter != entries.end(); ++iter) { 1105 expected_item_count = iter->item_count(); 1106 actual_item_count = iter->word_id_size(); 1107 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1108 return false; 1109 base::char16 uni_char = static_cast<base::char16>(iter->char_16()); 1110 WordIDSet word_id_set; 1111 const RepeatedField<int32>& word_ids(iter->word_id()); 1112 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); 1113 jiter != word_ids.end(); ++jiter) 1114 word_id_set.insert(*jiter); 1115 char_word_map_[uni_char] = word_id_set; 1116 } 1117 return true; 1118 } 1119 1120 bool URLIndexPrivateData::RestoreWordIDHistoryMap( 1121 const InMemoryURLIndexCacheItem& cache) { 1122 if (!cache.has_word_id_history_map()) 1123 return false; 1124 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); 1125 uint32 expected_item_count = list_item.item_count(); 1126 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); 1127 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1128 return false; 1129 const RepeatedPtrField<WordIDHistoryMapEntry>& 1130 entries(list_item.word_id_history_map_entry()); 1131 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = 1132 entries.begin(); iter != entries.end(); ++iter) { 1133 expected_item_count = iter->item_count(); 1134 actual_item_count = iter->history_id_size(); 1135 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1136 return false; 1137 WordID word_id = iter->word_id(); 1138 HistoryIDSet history_id_set; 1139 const RepeatedField<int64>& history_ids(iter->history_id()); 1140 for 
(RepeatedField<int64>::const_iterator jiter = history_ids.begin(); 1141 jiter != history_ids.end(); ++jiter) { 1142 history_id_set.insert(*jiter); 1143 AddToHistoryIDWordMap(*jiter, word_id); 1144 } 1145 word_id_history_map_[word_id] = history_id_set; 1146 } 1147 return true; 1148 } 1149 1150 bool URLIndexPrivateData::RestoreHistoryInfoMap( 1151 const InMemoryURLIndexCacheItem& cache) { 1152 if (!cache.has_history_info_map()) 1153 return false; 1154 const HistoryInfoMapItem& list_item(cache.history_info_map()); 1155 uint32 expected_item_count = list_item.item_count(); 1156 uint32 actual_item_count = list_item.history_info_map_entry_size(); 1157 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1158 return false; 1159 const RepeatedPtrField<HistoryInfoMapEntry>& 1160 entries(list_item.history_info_map_entry()); 1161 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = 1162 entries.begin(); iter != entries.end(); ++iter) { 1163 HistoryID history_id = iter->history_id(); 1164 GURL url(iter->url()); 1165 URLRow url_row(url, history_id); 1166 url_row.set_visit_count(iter->visit_count()); 1167 url_row.set_typed_count(iter->typed_count()); 1168 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); 1169 if (iter->has_title()) { 1170 base::string16 title(base::UTF8ToUTF16(iter->title())); 1171 url_row.set_title(title); 1172 } 1173 history_info_map_[history_id].url_row = url_row; 1174 1175 // Restore visits list. 1176 VisitInfoVector visits; 1177 visits.reserve(iter->visits_size()); 1178 for (int i = 0; i < iter->visits_size(); ++i) { 1179 visits.push_back(std::make_pair( 1180 base::Time::FromInternalValue(iter->visits(i).visit_time()), 1181 static_cast<content::PageTransition>(iter->visits(i). 
1182 transition_type()))); 1183 } 1184 history_info_map_[history_id].visits = visits; 1185 } 1186 return true; 1187 } 1188 1189 bool URLIndexPrivateData::RestoreWordStartsMap( 1190 const InMemoryURLIndexCacheItem& cache, 1191 const std::string& languages) { 1192 // Note that this function must be called after RestoreHistoryInfoMap() has 1193 // been run as the word starts may have to be recalculated from the urls and 1194 // page titles. 1195 if (cache.has_word_starts_map()) { 1196 const WordStartsMapItem& list_item(cache.word_starts_map()); 1197 uint32 expected_item_count = list_item.item_count(); 1198 uint32 actual_item_count = list_item.word_starts_map_entry_size(); 1199 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1200 return false; 1201 const RepeatedPtrField<WordStartsMapEntry>& 1202 entries(list_item.word_starts_map_entry()); 1203 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter = 1204 entries.begin(); iter != entries.end(); ++iter) { 1205 HistoryID history_id = iter->history_id(); 1206 RowWordStarts word_starts; 1207 // Restore the URL word starts. 1208 const RepeatedField<int32>& url_starts(iter->url_word_starts()); 1209 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin(); 1210 jiter != url_starts.end(); ++jiter) 1211 word_starts.url_word_starts_.push_back(*jiter); 1212 // Restore the page title word starts. 1213 const RepeatedField<int32>& title_starts(iter->title_word_starts()); 1214 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin(); 1215 jiter != title_starts.end(); ++jiter) 1216 word_starts.title_word_starts_.push_back(*jiter); 1217 word_starts_map_[history_id] = word_starts; 1218 } 1219 } else { 1220 // Since the cache did not contain any word starts we must rebuild then from 1221 // the URL and page titles. 
1222 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); 1223 iter != history_info_map_.end(); ++iter) { 1224 RowWordStarts word_starts; 1225 const URLRow& row(iter->second.url_row); 1226 const base::string16& url = bookmark_utils::CleanUpUrlForMatching( 1227 row.url(), languages, NULL); 1228 String16VectorFromString16(url, false, &word_starts.url_word_starts_); 1229 const base::string16& title = 1230 bookmark_utils::CleanUpTitleForMatching(row.title()); 1231 String16VectorFromString16(title, false, &word_starts.title_word_starts_); 1232 word_starts_map_[iter->first] = word_starts; 1233 } 1234 } 1235 return true; 1236 } 1237 1238 // static 1239 bool URLIndexPrivateData::URLSchemeIsWhitelisted( 1240 const GURL& gurl, 1241 const std::set<std::string>& whitelist) { 1242 return whitelist.find(gurl.scheme()) != whitelist.end(); 1243 } 1244 1245 1246 // SearchTermCacheItem --------------------------------------------------------- 1247 1248 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( 1249 const WordIDSet& word_id_set, 1250 const HistoryIDSet& history_id_set) 1251 : word_id_set_(word_id_set), 1252 history_id_set_(history_id_set), 1253 used_(true) {} 1254 1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() 1256 : used_(true) {} 1257 1258 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} 1259 1260 1261 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- 1262 1263 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( 1264 const URLIndexPrivateData& private_data, 1265 const std::string& languages, 1266 HistoryClient* history_client, 1267 const base::string16& lower_string, 1268 const String16Vector& lower_terms, 1269 const base::Time now) 1270 : private_data_(private_data), 1271 languages_(languages), 1272 history_client_(history_client), 1273 lower_string_(lower_string), 1274 lower_terms_(lower_terms), 1275 now_(now) { 1276 // Calculate offsets for each term. 
For instance, the offset for 1277 // ".net" should be 1, indicating that the actual word-part of the term 1278 // starts at offset 1. 1279 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u); 1280 for (size_t i = 0; i < lower_terms_.size(); ++i) { 1281 base::i18n::BreakIterator iter(lower_terms_[i], 1282 base::i18n::BreakIterator::BREAK_WORD); 1283 // If the iterator doesn't work, assume an offset of 0. 1284 if (!iter.Init()) 1285 continue; 1286 // Find the first word start. 1287 while (iter.Advance() && !iter.IsWord()) {} 1288 if (iter.IsWord()) 1289 lower_terms_to_word_starts_offsets_[i] = iter.prev(); 1290 // Else: the iterator didn't find a word break. Assume an offset of 0. 1291 } 1292 } 1293 1294 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} 1295 1296 void URLIndexPrivateData::AddHistoryMatch::operator()( 1297 const HistoryID history_id) { 1298 HistoryInfoMap::const_iterator hist_pos = 1299 private_data_.history_info_map_.find(history_id); 1300 if (hist_pos != private_data_.history_info_map_.end()) { 1301 const URLRow& hist_item = hist_pos->second.url_row; 1302 const VisitInfoVector& visits = hist_pos->second.visits; 1303 WordStartsMap::const_iterator starts_pos = 1304 private_data_.word_starts_map_.find(history_id); 1305 DCHECK(starts_pos != private_data_.word_starts_map_.end()); 1306 ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_, 1307 lower_terms_, lower_terms_to_word_starts_offsets_, 1308 starts_pos->second, now_, history_client_); 1309 if (match.raw_score() > 0) 1310 scored_matches_.push_back(match); 1311 } 1312 } 1313 1314 1315 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- 1316 1317 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( 1318 const HistoryInfoMap& history_info_map) 1319 : history_info_map_(history_info_map) { 1320 } 1321 1322 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} 1323 1324 bool 
URLIndexPrivateData::HistoryItemFactorGreater::operator()( 1325 const HistoryID h1, 1326 const HistoryID h2) { 1327 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); 1328 if (entry1 == history_info_map_.end()) 1329 return false; 1330 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); 1331 if (entry2 == history_info_map_.end()) 1332 return true; 1333 const URLRow& r1(entry1->second.url_row); 1334 const URLRow& r2(entry2->second.url_row); 1335 // First cut: typed count, visit count, recency. 1336 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks 1337 // recently visited (within the last 12/24 hours) as highly important. Get 1338 // input from mpearson. 1339 if (r1.typed_count() != r2.typed_count()) 1340 return (r1.typed_count() > r2.typed_count()); 1341 if (r1.visit_count() != r2.visit_count()) 1342 return (r1.visit_count() > r2.visit_count()); 1343 return (r1.last_visit() > r2.last_visit()); 1344 } 1345 1346 } // namespace history 1347