Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/url_index_private_data.h"
      6 
      7 #include <algorithm>
      8 #include <functional>
      9 #include <iterator>
     10 #include <limits>
     11 #include <numeric>
     12 #include <string>
     13 #include <vector>
     14 
     15 #include "base/basictypes.h"
     16 #include "base/file_util.h"
     17 #include "base/i18n/case_conversion.h"
     18 #include "base/metrics/histogram.h"
     19 #include "base/strings/string_util.h"
     20 #include "base/strings/utf_string_conversions.h"
     21 #include "base/time/time.h"
     22 #include "chrome/browser/autocomplete/autocomplete_provider.h"
     23 #include "chrome/browser/autocomplete/url_prefix.h"
     24 #include "chrome/browser/bookmarks/bookmark_service.h"
     25 #include "chrome/browser/history/history_database.h"
     26 #include "chrome/browser/history/history_db_task.h"
     27 #include "chrome/browser/history/history_service.h"
     28 #include "chrome/browser/history/in_memory_url_index.h"
     29 #include "content/public/browser/browser_thread.h"
     30 #include "content/public/browser/notification_details.h"
     31 #include "content/public/browser/notification_service.h"
     32 #include "content/public/browser/notification_source.h"
     33 #include "net/base/net_util.h"
     34 
     35 #if defined(USE_SYSTEM_PROTOBUF)
     36 #include <google/protobuf/repeated_field.h>
     37 #else
     38 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
     39 #endif
     40 
     41 using google::protobuf::RepeatedField;
     42 using google::protobuf::RepeatedPtrField;
     43 using in_memory_url_index::InMemoryURLIndexCacheItem;
     44 
     45 namespace {
     46 static const size_t kMaxVisitsToStoreInCache = 10u;
     47 }  // anonymous namespace
     48 
     49 namespace history {
     50 
     51 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
     52 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
     53 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
     54 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
     55 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
     56     CharWordMapEntry;
     57 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
     58     WordIDHistoryMapItem;
     59 typedef imui::
     60     InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
     61     WordIDHistoryMapEntry;
     62 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
     63 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
     64     HistoryInfoMapEntry;
     65 typedef imui::
     66     InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
     67     HistoryInfoMapEntry_VisitInfo;
     68 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
     69 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
     70     WordStartsMapEntry;
     71 
     72 
     73 // Algorithm Functions ---------------------------------------------------------
     74 
     75 // Comparison function for sorting search terms by descending length.
     76 bool LengthGreater(const string16& string_a, const string16& string_b) {
     77   return string_a.length() > string_b.length();
     78 }
     79 
     80 
     81 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
     82 
     83 // HistoryDBTask used to update the recent visit data for a particular
     84 // row from the history database.
     85 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
     86  public:
     87   explicit UpdateRecentVisitsFromHistoryDBTask(
     88       URLIndexPrivateData* private_data,
     89       URLID url_id);
     90 
     91   virtual bool RunOnDBThread(HistoryBackend* backend,
     92                              history::HistoryDatabase* db) OVERRIDE;
     93   virtual void DoneRunOnMainThread() OVERRIDE;
     94 
     95  private:
     96   virtual ~UpdateRecentVisitsFromHistoryDBTask();
     97 
     98   // The URLIndexPrivateData that gets updated after the historyDB
     99   // task returns.
    100   URLIndexPrivateData* private_data_;
    101   // The ID of the URL to get visits for and then update.
    102   URLID url_id_;
    103   // Whether fetching the recent visits for the URL succeeded.
    104   bool succeeded_;
    105   // The awaited data that's shown to private_data_ for it to copy and
    106   // store.
    107   VisitVector recent_visits_;
    108 
    109   DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
    110 };
    111 
    112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    113     URLIndexPrivateData* private_data,
    114     URLID url_id)
    115     : private_data_(private_data),
    116       url_id_(url_id),
    117       succeeded_(false) {
    118 }
    119 
    120 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
    121     HistoryBackend* backend,
    122     HistoryDatabase* db) {
    123   // Make sure the private data is going to get as many recent visits as
    124   // ScoredHistoryMatch::GetFrecency() hopes to use.
    125   DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    126   succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
    127                                              kMaxVisitsToStoreInCache,
    128                                              &recent_visits_);
    129   if (!succeeded_)
    130     recent_visits_.clear();
    131   return true;  // Always claim to be done; do not retry failures.
    132 }
    133 
    134 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
    135   if (succeeded_)
    136     private_data_->UpdateRecentVisits(url_id_, recent_visits_);
    137 }
    138 
    139 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
    140 }
    141 
    142 
    143 // URLIndexPrivateData ---------------------------------------------------------
    144 
    145 URLIndexPrivateData::URLIndexPrivateData()
    146     : restored_cache_version_(0),
    147       saved_cache_version_(kCurrentCacheFileVersion),
    148       pre_filter_item_count_(0),
    149       post_filter_item_count_(0),
    150       post_scoring_item_count_(0) {
    151 }
    152 
    153 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    154     string16 search_string,
    155     size_t cursor_position,
    156     const std::string& languages,
    157     BookmarkService* bookmark_service) {
    158   // If cursor position is set and useful (not at either end of the
    159   // string), allow the search string to be broken at cursor position.
    160   // We do this by pretending there's a space where the cursor is.
    161   if ((cursor_position != string16::npos) &&
    162       (cursor_position < search_string.length()) &&
    163       (cursor_position > 0)) {
    164     search_string.insert(cursor_position, ASCIIToUTF16(" "));
    165   }
    166   pre_filter_item_count_ = 0;
    167   post_filter_item_count_ = 0;
    168   post_scoring_item_count_ = 0;
    169   // The search string we receive may contain escaped characters. For reducing
    170   // the index we need individual, lower-cased words, ignoring escapings. For
    171   // the final filtering we need whitespace separated substrings possibly
    172   // containing escaped characters.
    173   string16 lower_raw_string(base::i18n::ToLower(search_string));
    174   string16 lower_unescaped_string =
    175       net::UnescapeURLComponent(lower_raw_string,
    176           net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
    177   // Extract individual 'words' (as opposed to 'terms'; see below) from the
    178   // search string. When the user types "colspec=ID%20Mstone Release" we get
    179   // four 'words': "colspec", "id", "mstone" and "release".
    180   String16Vector lower_words(
    181       history::String16VectorFromString16(lower_unescaped_string, false, NULL));
    182   ScoredHistoryMatches scored_items;
    183 
    184   // Do nothing if we have indexed no words (probably because we've not been
    185   // initialized yet) or the search string has no words.
    186   if (word_list_.empty() || lower_words.empty()) {
    187     search_term_cache_.clear();  // Invalidate the term cache.
    188     return scored_items;
    189   }
    190 
    191   // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
    192   // approach.
    193   ResetSearchTermCache();
    194 
    195   HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
    196 
    197   // Trim the candidate pool if it is large. Note that we do not filter out
    198   // items that do not contain the search terms as proper substrings -- doing
    199   // so is the performance-costly operation we are trying to avoid in order
    200   // to maintain omnibox responsiveness.
    201   const size_t kItemsToScoreLimit = 500;
    202   pre_filter_item_count_ = history_id_set.size();
    203   // If we trim the results set we do not want to cache the results for next
    204   // time as the user's ultimately desired result could easily be eliminated
    205   // in this early rough filter.
    206   bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
    207   if (was_trimmed) {
    208     HistoryIDVector history_ids;
    209     std::copy(history_id_set.begin(), history_id_set.end(),
    210               std::back_inserter(history_ids));
    211     // Trim down the set by sorting by typed-count, visit-count, and last
    212     // visit.
    213     HistoryItemFactorGreater
    214         item_factor_functor(history_info_map_);
    215     std::partial_sort(history_ids.begin(),
    216                       history_ids.begin() + kItemsToScoreLimit,
    217                       history_ids.end(),
    218                       item_factor_functor);
    219     history_id_set.clear();
    220     std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
    221               std::inserter(history_id_set, history_id_set.end()));
    222     post_filter_item_count_ = history_id_set.size();
    223   }
    224 
    225   // Pass over all of the candidates filtering out any without a proper
    226   // substring match, inserting those which pass in order by score. Note that
    227   // in this step we are using the raw search string complete with escaped
    228   // URL elements. When the user has specifically typed something akin to
    229   // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
    230   // specific substring appears in the URL or page title.
    231 
    232   // We call these 'terms' (as opposed to 'words'; see above) as in this case
    233   // we only want to break up the search string on 'true' whitespace rather than
    234   // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
    235   // get two 'terms': "colspec=id%20mstone" and "release".
    236   history::String16Vector lower_raw_terms;
    237   if (Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms) == 0) {
    238     // Don't score matches when there are no terms to score against.  (It's
    239     // possible that the word break iterater that extracts words to search
    240     // for in the database allows some whitespace "words" whereas Tokenize
    241     // excludes a long list of whitespace.)  One could write a scoring
    242     // function that gives a reasonable order to matches when there
    243     // are no terms (i.e., all the words are some form of whitespace),
    244     // but this is such a rare edge case that it's not worth the time.
    245     return scored_items;
    246   }
    247   scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
    248       AddHistoryMatch(*this, languages, bookmark_service, lower_raw_string,
    249                       lower_raw_terms, base::Time::Now())).ScoredMatches();
    250 
    251   // Select and sort only the top kMaxMatches results.
    252   if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
    253     std::partial_sort(scored_items.begin(),
    254                       scored_items.begin() +
    255                           AutocompleteProvider::kMaxMatches,
    256                       scored_items.end(),
    257                       ScoredHistoryMatch::MatchScoreGreater);
    258       scored_items.resize(AutocompleteProvider::kMaxMatches);
    259   } else {
    260     std::sort(scored_items.begin(), scored_items.end(),
    261               ScoredHistoryMatch::MatchScoreGreater);
    262   }
    263   post_scoring_item_count_ = scored_items.size();
    264 
    265   if (was_trimmed) {
    266     search_term_cache_.clear();  // Invalidate the term cache.
    267   } else {
    268     // Remove any stale SearchTermCacheItems.
    269     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
    270          cache_iter != search_term_cache_.end(); ) {
    271       if (!cache_iter->second.used_)
    272         search_term_cache_.erase(cache_iter++);
    273       else
    274         ++cache_iter;
    275     }
    276   }
    277 
    278   return scored_items;
    279 }
    280 
    281 bool URLIndexPrivateData::UpdateURL(
    282     HistoryService* history_service,
    283     const URLRow& row,
    284     const std::string& languages,
    285     const std::set<std::string>& scheme_whitelist) {
    286   // The row may or may not already be in our index. If it is not already
    287   // indexed and it qualifies then it gets indexed. If it is already
    288   // indexed and still qualifies then it gets updated, otherwise it
    289   // is deleted from the index.
    290   bool row_was_updated = false;
    291   URLID row_id = row.id();
    292   HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
    293   if (row_pos == history_info_map_.end()) {
    294     // This new row should be indexed if it qualifies.
    295     URLRow new_row(row);
    296     new_row.set_id(row_id);
    297     row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
    298         IndexRow(NULL, history_service, new_row, languages, scheme_whitelist);
    299   } else if (RowQualifiesAsSignificant(row, base::Time())) {
    300     // This indexed row still qualifies and will be re-indexed.
    301     // The url won't have changed but the title, visit count, etc.
    302     // might have changed.
    303     URLRow& row_to_update = row_pos->second.url_row;
    304     bool title_updated = row_to_update.title() != row.title();
    305     if (row_to_update.visit_count() != row.visit_count() ||
    306         row_to_update.typed_count() != row.typed_count() ||
    307         row_to_update.last_visit() != row.last_visit() || title_updated) {
    308       row_to_update.set_visit_count(row.visit_count());
    309       row_to_update.set_typed_count(row.typed_count());
    310       row_to_update.set_last_visit(row.last_visit());
    311       // If something appears to have changed, update the recent visits
    312       // information.
    313       ScheduleUpdateRecentVisits(history_service, row_id);
    314       // While the URL is guaranteed to remain stable, the title may have
    315       // changed. If so, then update the index with the changed words.
    316       if (title_updated) {
    317         // Clear all words associated with this row and re-index both the
    318         // URL and title.
    319         RemoveRowWordsFromIndex(row_to_update);
    320         row_to_update.set_title(row.title());
    321         RowWordStarts word_starts;
    322         AddRowWordsToIndex(row_to_update, &word_starts, languages);
    323         word_starts_map_[row_id] = word_starts;
    324       }
    325       row_was_updated = true;
    326     }
    327   } else {
    328     // This indexed row no longer qualifies and will be de-indexed by
    329     // clearing all words associated with this row.
    330     RemoveRowFromIndex(row);
    331     row_was_updated = true;
    332   }
    333   if (row_was_updated)
    334     search_term_cache_.clear();  // This invalidates the cache.
    335   return row_was_updated;
    336 }
    337 
    338 void URLIndexPrivateData::UpdateRecentVisits(
    339     URLID url_id,
    340     const VisitVector& recent_visits) {
    341   HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
    342   if (row_pos != history_info_map_.end()) {
    343     VisitInfoVector* visits = &row_pos->second.visits;
    344     visits->clear();
    345     const size_t size =
    346         std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
    347     visits->reserve(size);
    348     for (size_t i = 0; i < size; i++) {
    349       // Copy from the VisitVector the only fields visits needs.
    350       visits->push_back(std::make_pair(recent_visits[i].visit_time,
    351                                        recent_visits[i].transition));
    352     }
    353   }
    354   // Else: Oddly, the URL doesn't seem to exist in the private index.
    355   // Ignore this update.  This can happen if, for instance, the user
    356   // removes the URL from URLIndexPrivateData before the historyDB call
    357   // returns.
    358 }
    359 
    360 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    361     HistoryService* history_service,
    362     URLID url_id) {
    363   history_service->ScheduleDBTask(
    364       new UpdateRecentVisitsFromHistoryDBTask(this, url_id),
    365       &recent_visits_consumer_);
    366 }
    367 
    368 // Helper functor for DeleteURL.
    369 class HistoryInfoMapItemHasURL {
    370  public:
    371   explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
    372 
    373   bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    374     return item.second.url_row.url() == url_;
    375   }
    376 
    377  private:
    378   const GURL& url_;
    379 };
    380 
    381 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
    382   // Find the matching entry in the history_info_map_.
    383   HistoryInfoMap::iterator pos = std::find_if(
    384       history_info_map_.begin(),
    385       history_info_map_.end(),
    386       HistoryInfoMapItemHasURL(url));
    387   if (pos == history_info_map_.end())
    388     return false;
    389   RemoveRowFromIndex(pos->second.url_row);
    390   search_term_cache_.clear();  // This invalidates the cache.
    391   return true;
    392 }
    393 
    394 // static
    395 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    396     const base::FilePath& file_path,
    397     const std::string& languages) {
    398   base::TimeTicks beginning_time = base::TimeTicks::Now();
    399   if (!base::PathExists(file_path))
    400     return NULL;
    401   std::string data;
    402   // If there is no cache file then simply give up. This will cause us to
    403   // attempt to rebuild from the history database.
    404   if (!file_util::ReadFileToString(file_path, &data))
    405     return NULL;
    406 
    407   scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
    408   InMemoryURLIndexCacheItem index_cache;
    409   if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    410     LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
    411                  << file_path.value();
    412     return restored_data;
    413   }
    414 
    415   if (!restored_data->RestorePrivateData(index_cache, languages))
    416     return NULL;
    417 
    418   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
    419                       base::TimeTicks::Now() - beginning_time);
    420   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
    421                        restored_data->history_id_word_map_.size());
    422   UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
    423   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
    424                              restored_data->word_map_.size());
    425   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
    426                              restored_data->char_word_map_.size());
    427   if (restored_data->Empty())
    428     return NULL;  // 'No data' is the same as a failed reload.
    429   return restored_data;
    430 }
    431 
    432 // static
    433 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    434     HistoryDatabase* history_db,
    435     const std::string& languages,
    436     const std::set<std::string>& scheme_whitelist) {
    437   if (!history_db)
    438     return NULL;
    439 
    440   base::TimeTicks beginning_time = base::TimeTicks::Now();
    441 
    442   scoped_refptr<URLIndexPrivateData>
    443       rebuilt_data(new URLIndexPrivateData);
    444   URLDatabase::URLEnumerator history_enum;
    445   if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    446     return NULL;
    447   rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
    448   for (URLRow row; history_enum.GetNextURL(&row); ) {
    449     rebuilt_data->IndexRow(history_db, NULL, row, languages,
    450                            scheme_whitelist);
    451   }
    452 
    453   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
    454                       base::TimeTicks::Now() - beginning_time);
    455   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
    456                        rebuilt_data->history_id_word_map_.size());
    457   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
    458                              rebuilt_data->word_map_.size());
    459   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
    460                              rebuilt_data->char_word_map_.size());
    461   return rebuilt_data;
    462 }
    463 
    464 // static
    465 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    466     scoped_refptr<URLIndexPrivateData> private_data,
    467     const base::FilePath& file_path) {
    468   DCHECK(private_data.get());
    469   DCHECK(!file_path.empty());
    470   return private_data->SaveToFile(file_path);
    471 }
    472 
    473 void URLIndexPrivateData::CancelPendingUpdates() {
    474   recent_visits_consumer_.CancelAllRequests();
    475 }
    476 
    477 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
    478   scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
    479   data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
    480   data_copy->word_list_ = word_list_;
    481   data_copy->available_words_ = available_words_;
    482   data_copy->word_map_ = word_map_;
    483   data_copy->char_word_map_ = char_word_map_;
    484   data_copy->word_id_history_map_ = word_id_history_map_;
    485   data_copy->history_id_word_map_ = history_id_word_map_;
    486   data_copy->history_info_map_ = history_info_map_;
    487   data_copy->word_starts_map_ = word_starts_map_;
    488   return data_copy;
    489   // Not copied:
    490   //    search_term_cache_
    491   //    pre_filter_item_count_
    492   //    post_filter_item_count_
    493   //    post_scoring_item_count_
    494 };
    495 
    496 bool URLIndexPrivateData::Empty() const {
    497   return history_info_map_.empty();
    498 }
    499 
    500 void URLIndexPrivateData::Clear() {
    501   last_time_rebuilt_from_history_ = base::Time();
    502   word_list_.clear();
    503   available_words_.clear();
    504   word_map_.clear();
    505   char_word_map_.clear();
    506   word_id_history_map_.clear();
    507   history_id_word_map_.clear();
    508   history_info_map_.clear();
    509   word_starts_map_.clear();
    510 }
    511 
    512 URLIndexPrivateData::~URLIndexPrivateData() {}
    513 
    514 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
    515     const String16Vector& unsorted_words) {
    516   // Break the terms down into individual terms (words), get the candidate
    517   // set for each term, and intersect each to get a final candidate list.
    518   // Note that a single 'term' from the user's perspective might be
    519   // a string like "http://www.somewebsite.com" which, from our perspective,
    520   // is four words: 'http', 'www', 'somewebsite', and 'com'.
    521   HistoryIDSet history_id_set;
    522   String16Vector words(unsorted_words);
    523   // Sort the words into the longest first as such are likely to narrow down
    524   // the results quicker. Also, single character words are the most expensive
    525   // to process so save them for last.
    526   std::sort(words.begin(), words.end(), LengthGreater);
    527   for (String16Vector::iterator iter = words.begin(); iter != words.end();
    528        ++iter) {
    529     string16 uni_word = *iter;
    530     HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
    531     if (term_history_set.empty()) {
    532       history_id_set.clear();
    533       break;
    534     }
    535     if (iter == words.begin()) {
    536       history_id_set.swap(term_history_set);
    537     } else {
    538       HistoryIDSet new_history_id_set;
    539       std::set_intersection(history_id_set.begin(), history_id_set.end(),
    540                             term_history_set.begin(), term_history_set.end(),
    541                             std::inserter(new_history_id_set,
    542                                           new_history_id_set.begin()));
    543       history_id_set.swap(new_history_id_set);
    544     }
    545   }
    546   return history_id_set;
    547 }
    548 
    549 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    550     const string16& term) {
    551   if (term.empty())
    552     return HistoryIDSet();
    553 
    554   // TODO(mrossetti): Consider optimizing for very common terms such as
    555   // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
    556   // occuring words in the user's searches.
    557 
    558   size_t term_length = term.length();
    559   WordIDSet word_id_set;
    560   if (term_length > 1) {
    561     // See if this term or a prefix thereof is present in the cache.
    562     SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    563     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
    564          cache_iter != search_term_cache_.end(); ++cache_iter) {
    565       if (StartsWith(term, cache_iter->first, false) &&
    566           (best_prefix == search_term_cache_.end() ||
    567            cache_iter->first.length() > best_prefix->first.length()))
    568         best_prefix = cache_iter;
    569     }
    570 
    571     // If a prefix was found then determine the leftover characters to be used
    572     // for further refining the results from that prefix.
    573     Char16Set prefix_chars;
    574     string16 leftovers(term);
    575     if (best_prefix != search_term_cache_.end()) {
    576       // If the prefix is an exact match for the term then grab the cached
    577       // results and we're done.
    578       size_t prefix_length = best_prefix->first.length();
    579       if (prefix_length == term_length) {
    580         best_prefix->second.used_ = true;
    581         return best_prefix->second.history_id_set_;
    582       }
    583 
    584       // Otherwise we have a handy starting point.
    585       // If there are no history results for this prefix then we can bail early
    586       // as there will be no history results for the full term.
    587       if (best_prefix->second.history_id_set_.empty()) {
    588         search_term_cache_[term] = SearchTermCacheItem();
    589         return HistoryIDSet();
    590       }
    591       word_id_set = best_prefix->second.word_id_set_;
    592       prefix_chars = Char16SetFromString16(best_prefix->first);
    593       leftovers = term.substr(prefix_length);
    594     }
    595 
    596     // Filter for each remaining, unique character in the term.
    597     Char16Set leftover_chars = Char16SetFromString16(leftovers);
    598     Char16Set unique_chars;
    599     std::set_difference(leftover_chars.begin(), leftover_chars.end(),
    600                         prefix_chars.begin(), prefix_chars.end(),
    601                         std::inserter(unique_chars, unique_chars.begin()));
    602 
    603     // Reduce the word set with any leftover, unprocessed characters.
    604     if (!unique_chars.empty()) {
    605       WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
    606       // We might come up empty on the leftovers.
    607       if (leftover_set.empty()) {
    608         search_term_cache_[term] = SearchTermCacheItem();
    609         return HistoryIDSet();
    610       }
    611       // Or there may not have been a prefix from which to start.
    612       if (prefix_chars.empty()) {
    613         word_id_set.swap(leftover_set);
    614       } else {
    615         WordIDSet new_word_id_set;
    616         std::set_intersection(word_id_set.begin(), word_id_set.end(),
    617                               leftover_set.begin(), leftover_set.end(),
    618                               std::inserter(new_word_id_set,
    619                                             new_word_id_set.begin()));
    620         word_id_set.swap(new_word_id_set);
    621       }
    622     }
    623 
    624     // We must filter the word list because the resulting word set surely
    625     // contains words which do not have the search term as a proper subset.
    626     for (WordIDSet::iterator word_set_iter = word_id_set.begin();
    627          word_set_iter != word_id_set.end(); ) {
    628       if (word_list_[*word_set_iter].find(term) == string16::npos)
    629         word_id_set.erase(word_set_iter++);
    630       else
    631         ++word_set_iter;
    632     }
    633   } else {
    634     word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
    635   }
    636 
    637   // If any words resulted then we can compose a set of history IDs by unioning
    638   // the sets from each word.
    639   HistoryIDSet history_id_set;
    640   if (!word_id_set.empty()) {
    641     for (WordIDSet::iterator word_id_iter = word_id_set.begin();
    642          word_id_iter != word_id_set.end(); ++word_id_iter) {
    643       WordID word_id = *word_id_iter;
    644       WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
    645       if (word_iter != word_id_history_map_.end()) {
    646         HistoryIDSet& word_history_id_set(word_iter->second);
    647         history_id_set.insert(word_history_id_set.begin(),
    648                               word_history_id_set.end());
    649       }
    650     }
    651   }
    652 
    653   // Record a new cache entry for this word if the term is longer than
    654   // a single character.
    655   if (term_length > 1)
    656     search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
    657 
    658   return history_id_set;
    659 }
    660 
    661 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    662     const Char16Set& term_chars) {
    663   WordIDSet word_id_set;
    664   for (Char16Set::const_iterator c_iter = term_chars.begin();
    665        c_iter != term_chars.end(); ++c_iter) {
    666     CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
    667     if (char_iter == char_word_map_.end()) {
    668       // A character was not found so there are no matching results: bail.
    669       word_id_set.clear();
    670       break;
    671     }
    672     WordIDSet& char_word_id_set(char_iter->second);
    673     // It is possible for there to no longer be any words associated with
    674     // a particular character. Give up in that case.
    675     if (char_word_id_set.empty()) {
    676       word_id_set.clear();
    677       break;
    678     }
    679 
    680     if (c_iter == term_chars.begin()) {
    681       // First character results becomes base set of results.
    682       word_id_set = char_word_id_set;
    683     } else {
    684       // Subsequent character results get intersected in.
    685       WordIDSet new_word_id_set;
    686       std::set_intersection(word_id_set.begin(), word_id_set.end(),
    687                             char_word_id_set.begin(), char_word_id_set.end(),
    688                             std::inserter(new_word_id_set,
    689                                           new_word_id_set.begin()));
    690       word_id_set.swap(new_word_id_set);
    691     }
    692   }
    693   return word_id_set;
    694 }
    695 
    696 bool URLIndexPrivateData::IndexRow(
    697     HistoryDatabase* history_db,
    698     HistoryService* history_service,
    699     const URLRow& row,
    700     const std::string& languages,
    701     const std::set<std::string>& scheme_whitelist) {
    702   const GURL& gurl(row.url());
    703 
    704   // Index only URLs with a whitelisted scheme.
    705   if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    706     return false;
    707 
    708   URLID row_id = row.id();
    709   // Strip out username and password before saving and indexing.
    710   string16 url(net::FormatUrl(gurl, languages,
    711       net::kFormatUrlOmitUsernamePassword,
    712       net::UnescapeRule::NONE,
    713       NULL, NULL, NULL));
    714 
    715   HistoryID history_id = static_cast<HistoryID>(row_id);
    716   DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
    717 
    718   // Add the row for quick lookup in the history info store.
    719   URLRow new_row(GURL(url), row_id);
    720   new_row.set_visit_count(row.visit_count());
    721   new_row.set_typed_count(row.typed_count());
    722   new_row.set_last_visit(row.last_visit());
    723   new_row.set_title(row.title());
    724   history_info_map_[history_id].url_row = new_row;
    725 
    726   // Index the words contained in the URL and title of the row.
    727   RowWordStarts word_starts;
    728   AddRowWordsToIndex(new_row, &word_starts, languages);
    729   word_starts_map_[history_id] = word_starts;
    730 
    731   // Update the recent visits information or schedule the update
    732   // as appropriate.
    733   if (history_db) {
    734     // We'd like to check that we're on the history DB thread.
    735     // However, unittest code actually calls this on the UI thread.
    736     // So we don't do any thread checks.
    737     VisitVector recent_visits;
    738     // Make sure the private data is going to get as many recent visits as
    739     // ScoredHistoryMatch::GetFrecency() hopes to use.
    740     DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    741     if (history_db->GetMostRecentVisitsForURL(row_id,
    742                                               kMaxVisitsToStoreInCache,
    743                                               &recent_visits))
    744       UpdateRecentVisits(row_id, recent_visits);
    745   } else {
    746     DCHECK(history_service);
    747     ScheduleUpdateRecentVisits(history_service, row_id);
    748   }
    749 
    750   return true;
    751 }
    752 
    753 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
    754                                              RowWordStarts* word_starts,
    755                                              const std::string& languages) {
    756   HistoryID history_id = static_cast<HistoryID>(row.id());
    757   // Split URL into individual, unique words then add in the title words.
    758   const GURL& gurl(row.url());
    759   const string16& url = CleanUpUrlForMatching(gurl, languages);
    760   String16Set url_words = String16SetFromString16(url,
    761       word_starts ? &word_starts->url_word_starts_ : NULL);
    762   const string16& title = CleanUpTitleForMatching(row.title());
    763   String16Set title_words = String16SetFromString16(title,
    764       word_starts ? &word_starts->title_word_starts_ : NULL);
    765   String16Set words;
    766   std::set_union(url_words.begin(), url_words.end(),
    767                  title_words.begin(), title_words.end(),
    768                  std::insert_iterator<String16Set>(words, words.begin()));
    769   for (String16Set::iterator word_iter = words.begin();
    770        word_iter != words.end(); ++word_iter)
    771     AddWordToIndex(*word_iter, history_id);
    772 
    773   search_term_cache_.clear();  // Invalidate the term cache.
    774 }
    775 
    776 void URLIndexPrivateData::AddWordToIndex(const string16& term,
    777                                          HistoryID history_id) {
    778   WordMap::iterator word_pos = word_map_.find(term);
    779   if (word_pos != word_map_.end())
    780     UpdateWordHistory(word_pos->second, history_id);
    781   else
    782     AddWordHistory(term, history_id);
    783 }
    784 
    785 void URLIndexPrivateData::AddWordHistory(const string16& term,
    786                                          HistoryID history_id) {
    787   WordID word_id = word_list_.size();
    788   if (available_words_.empty()) {
    789     word_list_.push_back(term);
    790   } else {
    791     word_id = *(available_words_.begin());
    792     word_list_[word_id] = term;
    793     available_words_.erase(word_id);
    794   }
    795   word_map_[term] = word_id;
    796 
    797   HistoryIDSet history_id_set;
    798   history_id_set.insert(history_id);
    799   word_id_history_map_[word_id] = history_id_set;
    800   AddToHistoryIDWordMap(history_id, word_id);
    801 
    802   // For each character in the newly added word (i.e. a word that is not
    803   // already in the word index), add the word to the character index.
    804   Char16Set characters = Char16SetFromString16(term);
    805   for (Char16Set::iterator uni_char_iter = characters.begin();
    806        uni_char_iter != characters.end(); ++uni_char_iter) {
    807     char16 uni_char = *uni_char_iter;
    808     CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
    809     if (char_iter != char_word_map_.end()) {
    810       // Update existing entry in the char/word index.
    811       WordIDSet& word_id_set(char_iter->second);
    812       word_id_set.insert(word_id);
    813     } else {
    814       // Create a new entry in the char/word index.
    815       WordIDSet word_id_set;
    816       word_id_set.insert(word_id);
    817       char_word_map_[uni_char] = word_id_set;
    818     }
    819   }
    820 }
    821 
    822 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
    823                                             HistoryID history_id) {
    824   WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
    825   DCHECK(history_pos != word_id_history_map_.end());
    826   HistoryIDSet& history_id_set(history_pos->second);
    827   history_id_set.insert(history_id);
    828   AddToHistoryIDWordMap(history_id, word_id);
    829 }
    830 
    831 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
    832                                                 WordID word_id) {
    833   HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
    834   if (iter != history_id_word_map_.end()) {
    835     WordIDSet& word_id_set(iter->second);
    836     word_id_set.insert(word_id);
    837   } else {
    838     WordIDSet word_id_set;
    839     word_id_set.insert(word_id);
    840     history_id_word_map_[history_id] = word_id_set;
    841   }
    842 }
    843 
    844 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
    845   RemoveRowWordsFromIndex(row);
    846   HistoryID history_id = static_cast<HistoryID>(row.id());
    847   history_info_map_.erase(history_id);
    848   word_starts_map_.erase(history_id);
    849 }
    850 
    851 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
    852   // Remove the entries in history_id_word_map_ and word_id_history_map_ for
    853   // this row.
    854   HistoryID history_id = static_cast<HistoryID>(row.id());
    855   WordIDSet word_id_set = history_id_word_map_[history_id];
    856   history_id_word_map_.erase(history_id);
    857 
    858   // Reconcile any changes to word usage.
    859   for (WordIDSet::iterator word_id_iter = word_id_set.begin();
    860        word_id_iter != word_id_set.end(); ++word_id_iter) {
    861     WordID word_id = *word_id_iter;
    862     word_id_history_map_[word_id].erase(history_id);
    863     if (!word_id_history_map_[word_id].empty())
    864       continue;  // The word is still in use.
    865 
    866     // The word is no longer in use. Reconcile any changes to character usage.
    867     string16 word = word_list_[word_id];
    868     Char16Set characters = Char16SetFromString16(word);
    869     for (Char16Set::iterator uni_char_iter = characters.begin();
    870          uni_char_iter != characters.end(); ++uni_char_iter) {
    871       char16 uni_char = *uni_char_iter;
    872       char_word_map_[uni_char].erase(word_id);
    873       if (char_word_map_[uni_char].empty())
    874         char_word_map_.erase(uni_char);  // No longer in use.
    875     }
    876 
    877     // Complete the removal of references to the word.
    878     word_id_history_map_.erase(word_id);
    879     word_map_.erase(word);
    880     word_list_[word_id] = string16();
    881     available_words_.insert(word_id);
    882   }
    883 }
    884 
    885 void URLIndexPrivateData::ResetSearchTermCache() {
    886   for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
    887        iter != search_term_cache_.end(); ++iter)
    888     iter->second.used_ = false;
    889 }
    890 
    891 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
    892   base::TimeTicks beginning_time = base::TimeTicks::Now();
    893   InMemoryURLIndexCacheItem index_cache;
    894   SavePrivateData(&index_cache);
    895   std::string data;
    896   if (!index_cache.SerializeToString(&data)) {
    897     LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
    898     return false;
    899   }
    900 
    901   int size = data.size();
    902   if (file_util::WriteFile(file_path, data.c_str(), size) != size) {
    903     LOG(WARNING) << "Failed to write " << file_path.value();
    904     return false;
    905   }
    906   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
    907                       base::TimeTicks::Now() - beginning_time);
    908   return true;
    909 }
    910 
    911 void URLIndexPrivateData::SavePrivateData(
    912     InMemoryURLIndexCacheItem* cache) const {
    913   DCHECK(cache);
    914   cache->set_last_rebuild_timestamp(
    915       last_time_rebuilt_from_history_.ToInternalValue());
    916   cache->set_version(saved_cache_version_);
    917   // history_item_count_ is no longer used but rather than change the protobuf
    918   // definition use a placeholder. This will go away with the switch to SQLite.
    919   cache->set_history_item_count(0);
    920   SaveWordList(cache);
    921   SaveWordMap(cache);
    922   SaveCharWordMap(cache);
    923   SaveWordIDHistoryMap(cache);
    924   SaveHistoryInfoMap(cache);
    925   SaveWordStartsMap(cache);
    926 }
    927 
    928 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
    929   if (word_list_.empty())
    930     return;
    931   WordListItem* list_item = cache->mutable_word_list();
    932   list_item->set_word_count(word_list_.size());
    933   for (String16Vector::const_iterator iter = word_list_.begin();
    934        iter != word_list_.end(); ++iter)
    935     list_item->add_word(UTF16ToUTF8(*iter));
    936 }
    937 
    938 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
    939   if (word_map_.empty())
    940     return;
    941   WordMapItem* map_item = cache->mutable_word_map();
    942   map_item->set_item_count(word_map_.size());
    943   for (WordMap::const_iterator iter = word_map_.begin();
    944        iter != word_map_.end(); ++iter) {
    945     WordMapEntry* map_entry = map_item->add_word_map_entry();
    946     map_entry->set_word(UTF16ToUTF8(iter->first));
    947     map_entry->set_word_id(iter->second);
    948   }
    949 }
    950 
    951 void URLIndexPrivateData::SaveCharWordMap(
    952     InMemoryURLIndexCacheItem* cache) const {
    953   if (char_word_map_.empty())
    954     return;
    955   CharWordMapItem* map_item = cache->mutable_char_word_map();
    956   map_item->set_item_count(char_word_map_.size());
    957   for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
    958        iter != char_word_map_.end(); ++iter) {
    959     CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
    960     map_entry->set_char_16(iter->first);
    961     const WordIDSet& word_id_set(iter->second);
    962     map_entry->set_item_count(word_id_set.size());
    963     for (WordIDSet::const_iterator set_iter = word_id_set.begin();
    964          set_iter != word_id_set.end(); ++set_iter)
    965       map_entry->add_word_id(*set_iter);
    966   }
    967 }
    968 
    969 void URLIndexPrivateData::SaveWordIDHistoryMap(
    970     InMemoryURLIndexCacheItem* cache) const {
    971   if (word_id_history_map_.empty())
    972     return;
    973   WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
    974   map_item->set_item_count(word_id_history_map_.size());
    975   for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
    976        iter != word_id_history_map_.end(); ++iter) {
    977     WordIDHistoryMapEntry* map_entry =
    978         map_item->add_word_id_history_map_entry();
    979     map_entry->set_word_id(iter->first);
    980     const HistoryIDSet& history_id_set(iter->second);
    981     map_entry->set_item_count(history_id_set.size());
    982     for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
    983          set_iter != history_id_set.end(); ++set_iter)
    984       map_entry->add_history_id(*set_iter);
    985   }
    986 }
    987 
    988 void URLIndexPrivateData::SaveHistoryInfoMap(
    989     InMemoryURLIndexCacheItem* cache) const {
    990   if (history_info_map_.empty())
    991     return;
    992   HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
    993   map_item->set_item_count(history_info_map_.size());
    994   for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
    995        iter != history_info_map_.end(); ++iter) {
    996     HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
    997     map_entry->set_history_id(iter->first);
    998     const URLRow& url_row(iter->second.url_row);
    999     // Note: We only save information that contributes to the index so there
   1000     // is no need to save search_term_cache_ (not persistent).
   1001     map_entry->set_visit_count(url_row.visit_count());
   1002     map_entry->set_typed_count(url_row.typed_count());
   1003     map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
   1004     map_entry->set_url(url_row.url().spec());
   1005     map_entry->set_title(UTF16ToUTF8(url_row.title()));
   1006     const VisitInfoVector& visits(iter->second.visits);
   1007     for (VisitInfoVector::const_iterator visit_iter = visits.begin();
   1008          visit_iter != visits.end(); ++visit_iter) {
   1009       HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
   1010       visit_info->set_visit_time(visit_iter->first.ToInternalValue());
   1011       visit_info->set_transition_type(visit_iter->second);
   1012     }
   1013   }
   1014 }
   1015 
   1016 void URLIndexPrivateData::SaveWordStartsMap(
   1017     InMemoryURLIndexCacheItem* cache) const {
   1018   if (word_starts_map_.empty())
   1019     return;
   1020   // For unit testing: Enable saving of the cache as an earlier version to
   1021   // allow testing of cache file upgrading in ReadFromFile().
   1022   // TODO(mrossetti): Instead of intruding on production code with this kind of
   1023   // test harness, save a copy of an older version cache with known results.
   1024   // Implement this when switching the caching over to SQLite.
   1025   if (saved_cache_version_ < 1)
   1026     return;
   1027 
   1028   WordStartsMapItem* map_item = cache->mutable_word_starts_map();
   1029   map_item->set_item_count(word_starts_map_.size());
   1030   for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
   1031        iter != word_starts_map_.end(); ++iter) {
   1032     WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
   1033     map_entry->set_history_id(iter->first);
   1034     const RowWordStarts& word_starts(iter->second);
   1035     for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
   1036          i != word_starts.url_word_starts_.end(); ++i)
   1037       map_entry->add_url_word_starts(*i);
   1038     for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
   1039          i != word_starts.title_word_starts_.end(); ++i)
   1040       map_entry->add_title_word_starts(*i);
   1041   }
   1042 }
   1043 
   1044 bool URLIndexPrivateData::RestorePrivateData(
   1045     const InMemoryURLIndexCacheItem& cache,
   1046     const std::string& languages) {
   1047   last_time_rebuilt_from_history_ =
   1048       base::Time::FromInternalValue(cache.last_rebuild_timestamp());
   1049   const base::TimeDelta rebuilt_ago =
   1050       base::Time::Now() - last_time_rebuilt_from_history_;
   1051   if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
   1052       (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
   1053     // Cache is more than a week old or, somehow, from some time in the future.
   1054     // It's probably a good time to rebuild the index from history to
   1055     // allow synced entries to now appear, expired entries to disappear, etc.
   1056     // Allow one day in the future to make the cache not rebuild on simple
   1057     // system clock changes such as time zone changes.
   1058     return false;
   1059   }
   1060   if (cache.has_version()) {
   1061     if (cache.version() < kCurrentCacheFileVersion) {
   1062       // Don't try to restore an old format cache file.  (This will cause
   1063       // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
   1064       // from history.)
   1065       return false;
   1066     }
   1067     restored_cache_version_ = cache.version();
   1068   }
   1069   return RestoreWordList(cache) && RestoreWordMap(cache) &&
   1070       RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
   1071       RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
   1072 }
   1073 
   1074 bool URLIndexPrivateData::RestoreWordList(
   1075     const InMemoryURLIndexCacheItem& cache) {
   1076   if (!cache.has_word_list())
   1077     return false;
   1078   const WordListItem& list_item(cache.word_list());
   1079   uint32 expected_item_count = list_item.word_count();
   1080   uint32 actual_item_count = list_item.word_size();
   1081   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1082     return false;
   1083   const RepeatedPtrField<std::string>& words(list_item.word());
   1084   for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
   1085        iter != words.end(); ++iter)
   1086     word_list_.push_back(UTF8ToUTF16(*iter));
   1087   return true;
   1088 }
   1089 
   1090 bool URLIndexPrivateData::RestoreWordMap(
   1091     const InMemoryURLIndexCacheItem& cache) {
   1092   if (!cache.has_word_map())
   1093     return false;
   1094   const WordMapItem& list_item(cache.word_map());
   1095   uint32 expected_item_count = list_item.item_count();
   1096   uint32 actual_item_count = list_item.word_map_entry_size();
   1097   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1098     return false;
   1099   const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
   1100   for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
   1101        iter != entries.end(); ++iter)
   1102     word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();
   1103   return true;
   1104 }
   1105 
   1106 bool URLIndexPrivateData::RestoreCharWordMap(
   1107     const InMemoryURLIndexCacheItem& cache) {
   1108   if (!cache.has_char_word_map())
   1109     return false;
   1110   const CharWordMapItem& list_item(cache.char_word_map());
   1111   uint32 expected_item_count = list_item.item_count();
   1112   uint32 actual_item_count = list_item.char_word_map_entry_size();
   1113   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1114     return false;
   1115   const RepeatedPtrField<CharWordMapEntry>&
   1116       entries(list_item.char_word_map_entry());
   1117   for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
   1118        entries.begin(); iter != entries.end(); ++iter) {
   1119     expected_item_count = iter->item_count();
   1120     actual_item_count = iter->word_id_size();
   1121     if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1122       return false;
   1123     char16 uni_char = static_cast<char16>(iter->char_16());
   1124     WordIDSet word_id_set;
   1125     const RepeatedField<int32>& word_ids(iter->word_id());
   1126     for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
   1127          jiter != word_ids.end(); ++jiter)
   1128       word_id_set.insert(*jiter);
   1129     char_word_map_[uni_char] = word_id_set;
   1130   }
   1131   return true;
   1132 }
   1133 
   1134 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
   1135     const InMemoryURLIndexCacheItem& cache) {
   1136   if (!cache.has_word_id_history_map())
   1137     return false;
   1138   const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
   1139   uint32 expected_item_count = list_item.item_count();
   1140   uint32 actual_item_count = list_item.word_id_history_map_entry_size();
   1141   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1142     return false;
   1143   const RepeatedPtrField<WordIDHistoryMapEntry>&
   1144       entries(list_item.word_id_history_map_entry());
   1145   for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
   1146        entries.begin(); iter != entries.end(); ++iter) {
   1147     expected_item_count = iter->item_count();
   1148     actual_item_count = iter->history_id_size();
   1149     if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1150       return false;
   1151     WordID word_id = iter->word_id();
   1152     HistoryIDSet history_id_set;
   1153     const RepeatedField<int64>& history_ids(iter->history_id());
   1154     for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
   1155          jiter != history_ids.end(); ++jiter) {
   1156       history_id_set.insert(*jiter);
   1157       AddToHistoryIDWordMap(*jiter, word_id);
   1158     }
   1159     word_id_history_map_[word_id] = history_id_set;
   1160   }
   1161   return true;
   1162 }
   1163 
   1164 bool URLIndexPrivateData::RestoreHistoryInfoMap(
   1165     const InMemoryURLIndexCacheItem& cache) {
   1166   if (!cache.has_history_info_map())
   1167     return false;
   1168   const HistoryInfoMapItem& list_item(cache.history_info_map());
   1169   uint32 expected_item_count = list_item.item_count();
   1170   uint32 actual_item_count = list_item.history_info_map_entry_size();
   1171   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1172     return false;
   1173   const RepeatedPtrField<HistoryInfoMapEntry>&
   1174       entries(list_item.history_info_map_entry());
   1175   for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
   1176        entries.begin(); iter != entries.end(); ++iter) {
   1177     HistoryID history_id = iter->history_id();
   1178     GURL url(iter->url());
   1179     URLRow url_row(url, history_id);
   1180     url_row.set_visit_count(iter->visit_count());
   1181     url_row.set_typed_count(iter->typed_count());
   1182     url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
   1183     if (iter->has_title()) {
   1184       string16 title(UTF8ToUTF16(iter->title()));
   1185       url_row.set_title(title);
   1186     }
   1187     history_info_map_[history_id].url_row = url_row;
   1188 
   1189     // Restore visits list.
   1190     VisitInfoVector visits;
   1191     visits.reserve(iter->visits_size());
   1192     for (int i = 0; i < iter->visits_size(); ++i) {
   1193       visits.push_back(std::make_pair(
   1194           base::Time::FromInternalValue(iter->visits(i).visit_time()),
   1195           static_cast<content::PageTransition>(iter->visits(i).
   1196                                                transition_type())));
   1197     }
   1198     history_info_map_[history_id].visits = visits;
   1199   }
   1200   return true;
   1201 }
   1202 
   1203 bool URLIndexPrivateData::RestoreWordStartsMap(
   1204     const InMemoryURLIndexCacheItem& cache,
   1205     const std::string& languages) {
   1206   // Note that this function must be called after RestoreHistoryInfoMap() has
   1207   // been run as the word starts may have to be recalculated from the urls and
   1208   // page titles.
   1209   if (cache.has_word_starts_map()) {
   1210     const WordStartsMapItem& list_item(cache.word_starts_map());
   1211     uint32 expected_item_count = list_item.item_count();
   1212     uint32 actual_item_count = list_item.word_starts_map_entry_size();
   1213     if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1214       return false;
   1215     const RepeatedPtrField<WordStartsMapEntry>&
   1216         entries(list_item.word_starts_map_entry());
   1217     for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
   1218          entries.begin(); iter != entries.end(); ++iter) {
   1219       HistoryID history_id = iter->history_id();
   1220       RowWordStarts word_starts;
   1221       // Restore the URL word starts.
   1222       const RepeatedField<int32>& url_starts(iter->url_word_starts());
   1223       for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
   1224            jiter != url_starts.end(); ++jiter)
   1225         word_starts.url_word_starts_.push_back(*jiter);
   1226       // Restore the page title word starts.
   1227       const RepeatedField<int32>& title_starts(iter->title_word_starts());
   1228       for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
   1229            jiter != title_starts.end(); ++jiter)
   1230         word_starts.title_word_starts_.push_back(*jiter);
   1231       word_starts_map_[history_id] = word_starts;
   1232     }
   1233   } else {
   1234     // Since the cache did not contain any word starts we must rebuild then from
   1235     // the URL and page titles.
   1236     for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
   1237          iter != history_info_map_.end(); ++iter) {
   1238       RowWordStarts word_starts;
   1239       const URLRow& row(iter->second.url_row);
   1240       const string16& url = CleanUpUrlForMatching(row.url(), languages);
   1241       String16VectorFromString16(url, false, &word_starts.url_word_starts_);
   1242       const string16& title = CleanUpTitleForMatching(row.title());
   1243       String16VectorFromString16(title, false, &word_starts.title_word_starts_);
   1244       word_starts_map_[iter->first] = word_starts;
   1245     }
   1246   }
   1247   return true;
   1248 }
   1249 
   1250 // static
   1251 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
   1252     const GURL& gurl,
   1253     const std::set<std::string>& whitelist) {
   1254   return whitelist.find(gurl.scheme()) != whitelist.end();
   1255 }
   1256 
   1257 
   1258 // SearchTermCacheItem ---------------------------------------------------------
   1259 
   1260 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
   1261     const WordIDSet& word_id_set,
   1262     const HistoryIDSet& history_id_set)
   1263     : word_id_set_(word_id_set),
   1264       history_id_set_(history_id_set),
   1265       used_(true) {}
   1266 
   1267 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
   1268     : used_(true) {}
   1269 
   1270 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
   1271 
   1272 
   1273 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
   1274 
   1275 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
   1276     const URLIndexPrivateData& private_data,
   1277     const std::string& languages,
   1278     BookmarkService* bookmark_service,
   1279     const string16& lower_string,
   1280     const String16Vector& lower_terms,
   1281     const base::Time now)
   1282   : private_data_(private_data),
   1283     languages_(languages),
   1284     bookmark_service_(bookmark_service),
   1285     lower_string_(lower_string),
   1286     lower_terms_(lower_terms),
   1287     now_(now) {}
   1288 
   1289 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
   1290 
   1291 void URLIndexPrivateData::AddHistoryMatch::operator()(
   1292     const HistoryID history_id) {
   1293   HistoryInfoMap::const_iterator hist_pos =
   1294       private_data_.history_info_map_.find(history_id);
   1295   if (hist_pos != private_data_.history_info_map_.end()) {
   1296     const URLRow& hist_item = hist_pos->second.url_row;
   1297     const VisitInfoVector& visits = hist_pos->second.visits;
   1298     WordStartsMap::const_iterator starts_pos =
   1299         private_data_.word_starts_map_.find(history_id);
   1300     DCHECK(starts_pos != private_data_.word_starts_map_.end());
   1301     ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
   1302                              lower_terms_, starts_pos->second, now_,
   1303                              bookmark_service_);
   1304     if (match.raw_score > 0)
   1305       scored_matches_.push_back(match);
   1306   }
   1307 }
   1308 
   1309 
   1310 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
   1311 
   1312 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
   1313     const HistoryInfoMap& history_info_map)
   1314     : history_info_map_(history_info_map) {
   1315 }
   1316 
   1317 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
   1318 
   1319 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
   1320     const HistoryID h1,
   1321     const HistoryID h2) {
   1322   HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
   1323   if (entry1 == history_info_map_.end())
   1324     return false;
   1325   HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
   1326   if (entry2 == history_info_map_.end())
   1327     return true;
   1328   const URLRow& r1(entry1->second.url_row);
   1329   const URLRow& r2(entry2->second.url_row);
   1330   // First cut: typed count, visit count, recency.
   1331   // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
   1332   // recently visited (within the last 12/24 hours) as highly important. Get
   1333   // input from mpearson.
   1334   if (r1.typed_count() != r2.typed_count())
   1335     return (r1.typed_count() > r2.typed_count());
   1336   if (r1.visit_count() != r2.visit_count())
   1337     return (r1.visit_count() > r2.visit_count());
   1338   return (r1.last_visit() > r2.last_visit());
   1339 }
   1340 
   1341 }  // namespace history
   1342