Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/url_index_private_data.h"
      6 
      7 #include <functional>
      8 #include <iterator>
      9 #include <limits>
     10 #include <numeric>
     11 #include <string>
     12 #include <vector>
     13 
     14 #include "base/basictypes.h"
     15 #include "base/file_util.h"
     16 #include "base/i18n/break_iterator.h"
     17 #include "base/i18n/case_conversion.h"
     18 #include "base/metrics/histogram.h"
     19 #include "base/strings/string_util.h"
     20 #include "base/strings/utf_string_conversions.h"
     21 #include "base/time/time.h"
     22 #include "chrome/browser/history/history_database.h"
     23 #include "chrome/browser/history/history_db_task.h"
     24 #include "chrome/browser/history/history_service.h"
     25 #include "chrome/browser/history/in_memory_url_index.h"
     26 #include "components/bookmarks/browser/bookmark_utils.h"
     27 #include "components/history/core/browser/history_client.h"
     28 #include "net/base/net_util.h"
     29 
     30 #if defined(USE_SYSTEM_PROTOBUF)
     31 #include <google/protobuf/repeated_field.h>
     32 #else
     33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
     34 #endif
     35 
     36 using google::protobuf::RepeatedField;
     37 using google::protobuf::RepeatedPtrField;
     38 using in_memory_url_index::InMemoryURLIndexCacheItem;
     39 
     40 namespace {
     41 static const size_t kMaxVisitsToStoreInCache = 10u;
     42 }  // anonymous namespace
     43 
     44 namespace history {
     45 
     46 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
     47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
     48 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
     49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
     50 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
     51     CharWordMapEntry;
     52 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
     53     WordIDHistoryMapItem;
     54 typedef imui::
     55     InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
     56     WordIDHistoryMapEntry;
     57 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
     58 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
     59     HistoryInfoMapEntry;
     60 typedef imui::
     61     InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
     62     HistoryInfoMapEntry_VisitInfo;
     63 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
     64 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
     65     WordStartsMapEntry;
     66 
     67 
     68 // Algorithm Functions ---------------------------------------------------------
     69 
     70 // Comparison function for sorting search terms by descending length.
     71 bool LengthGreater(const base::string16& string_a,
     72                    const base::string16& string_b) {
     73   return string_a.length() > string_b.length();
     74 }
     75 
     76 
     77 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
     78 
     79 // HistoryDBTask used to update the recent visit data for a particular
     80 // row from the history database.
     81 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
     82  public:
     83   explicit UpdateRecentVisitsFromHistoryDBTask(
     84       URLIndexPrivateData* private_data,
     85       URLID url_id);
     86 
     87   virtual bool RunOnDBThread(HistoryBackend* backend,
     88                              history::HistoryDatabase* db) OVERRIDE;
     89   virtual void DoneRunOnMainThread() OVERRIDE;
     90 
     91  private:
     92   virtual ~UpdateRecentVisitsFromHistoryDBTask();
     93 
     94   // The URLIndexPrivateData that gets updated after the historyDB
     95   // task returns.
     96   URLIndexPrivateData* private_data_;
     97   // The ID of the URL to get visits for and then update.
     98   URLID url_id_;
     99   // Whether fetching the recent visits for the URL succeeded.
    100   bool succeeded_;
    101   // The awaited data that's shown to private_data_ for it to copy and
    102   // store.
    103   VisitVector recent_visits_;
    104 
    105   DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
    106 };
    107 
    108 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    109     URLIndexPrivateData* private_data,
    110     URLID url_id)
    111     : private_data_(private_data),
    112       url_id_(url_id),
    113       succeeded_(false) {
    114 }
    115 
    116 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
    117     HistoryBackend* backend,
    118     HistoryDatabase* db) {
    119   // Make sure the private data is going to get as many recent visits as
    120   // ScoredHistoryMatch::GetFrequency() hopes to use.
    121   DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    122   succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
    123                                              kMaxVisitsToStoreInCache,
    124                                              &recent_visits_);
    125   if (!succeeded_)
    126     recent_visits_.clear();
    127   return true;  // Always claim to be done; do not retry failures.
    128 }
    129 
    130 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
    131   if (succeeded_)
    132     private_data_->UpdateRecentVisits(url_id_, recent_visits_);
    133 }
    134 
    135 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
    136 }
    137 
    138 
    139 // URLIndexPrivateData ---------------------------------------------------------
    140 
    141 URLIndexPrivateData::URLIndexPrivateData()
    142     : restored_cache_version_(0),
    143       saved_cache_version_(kCurrentCacheFileVersion),
    144       pre_filter_item_count_(0),
    145       post_filter_item_count_(0),
    146       post_scoring_item_count_(0) {
    147 }
    148 
    149 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    150     base::string16 search_string,
    151     size_t cursor_position,
    152     size_t max_matches,
    153     const std::string& languages,
    154     HistoryClient* history_client) {
    155   // If cursor position is set and useful (not at either end of the
    156   // string), allow the search string to be broken at cursor position.
    157   // We do this by pretending there's a space where the cursor is.
    158   if ((cursor_position != base::string16::npos) &&
    159       (cursor_position < search_string.length()) &&
    160       (cursor_position > 0)) {
    161     search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
    162   }
    163   pre_filter_item_count_ = 0;
    164   post_filter_item_count_ = 0;
    165   post_scoring_item_count_ = 0;
    166   // The search string we receive may contain escaped characters. For reducing
    167   // the index we need individual, lower-cased words, ignoring escapings. For
    168   // the final filtering we need whitespace separated substrings possibly
    169   // containing escaped characters.
    170   base::string16 lower_raw_string(base::i18n::ToLower(search_string));
    171   base::string16 lower_unescaped_string =
    172       net::UnescapeURLComponent(lower_raw_string,
    173           net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
    174   // Extract individual 'words' (as opposed to 'terms'; see below) from the
    175   // search string. When the user types "colspec=ID%20Mstone Release" we get
    176   // four 'words': "colspec", "id", "mstone" and "release".
    177   String16Vector lower_words(
    178       history::String16VectorFromString16(lower_unescaped_string, false, NULL));
    179   ScoredHistoryMatches scored_items;
    180 
    181   // Do nothing if we have indexed no words (probably because we've not been
    182   // initialized yet) or the search string has no words.
    183   if (word_list_.empty() || lower_words.empty()) {
    184     search_term_cache_.clear();  // Invalidate the term cache.
    185     return scored_items;
    186   }
    187 
    188   // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
    189   // approach.
    190   ResetSearchTermCache();
    191 
    192   HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
    193 
    194   // Trim the candidate pool if it is large. Note that we do not filter out
    195   // items that do not contain the search terms as proper substrings -- doing
    196   // so is the performance-costly operation we are trying to avoid in order
    197   // to maintain omnibox responsiveness.
    198   const size_t kItemsToScoreLimit = 500;
    199   pre_filter_item_count_ = history_id_set.size();
    200   // If we trim the results set we do not want to cache the results for next
    201   // time as the user's ultimately desired result could easily be eliminated
    202   // in this early rough filter.
    203   bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
    204   if (was_trimmed) {
    205     HistoryIDVector history_ids;
    206     std::copy(history_id_set.begin(), history_id_set.end(),
    207               std::back_inserter(history_ids));
    208     // Trim down the set by sorting by typed-count, visit-count, and last
    209     // visit.
    210     HistoryItemFactorGreater
    211         item_factor_functor(history_info_map_);
    212     std::partial_sort(history_ids.begin(),
    213                       history_ids.begin() + kItemsToScoreLimit,
    214                       history_ids.end(),
    215                       item_factor_functor);
    216     history_id_set.clear();
    217     std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
    218               std::inserter(history_id_set, history_id_set.end()));
    219     post_filter_item_count_ = history_id_set.size();
    220   }
    221 
    222   // Pass over all of the candidates filtering out any without a proper
    223   // substring match, inserting those which pass in order by score. Note that
    224   // in this step we are using the raw search string complete with escaped
    225   // URL elements. When the user has specifically typed something akin to
    226   // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
    227   // specific substring appears in the URL or page title.
    228 
    229   // We call these 'terms' (as opposed to 'words'; see above) as in this case
    230   // we only want to break up the search string on 'true' whitespace rather than
    231   // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
    232   // get two 'terms': "colspec=id%20mstone" and "release".
    233   history::String16Vector lower_raw_terms;
    234   if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
    235                &lower_raw_terms) == 0) {
    236     // Don't score matches when there are no terms to score against.  (It's
    237     // possible that the word break iterater that extracts words to search
    238     // for in the database allows some whitespace "words" whereas Tokenize
    239     // excludes a long list of whitespace.)  One could write a scoring
    240     // function that gives a reasonable order to matches when there
    241     // are no terms (i.e., all the words are some form of whitespace),
    242     // but this is such a rare edge case that it's not worth the time.
    243     return scored_items;
    244   }
    245   scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
    246       AddHistoryMatch(*this, languages, history_client, lower_raw_string,
    247                       lower_raw_terms, base::Time::Now())).ScoredMatches();
    248 
    249   // Select and sort only the top |max_matches| results.
    250   if (scored_items.size() > max_matches) {
    251     std::partial_sort(scored_items.begin(),
    252                       scored_items.begin() +
    253                           max_matches,
    254                       scored_items.end(),
    255                       ScoredHistoryMatch::MatchScoreGreater);
    256       scored_items.resize(max_matches);
    257   } else {
    258     std::sort(scored_items.begin(), scored_items.end(),
    259               ScoredHistoryMatch::MatchScoreGreater);
    260   }
    261   post_scoring_item_count_ = scored_items.size();
    262 
    263   if (was_trimmed) {
    264     search_term_cache_.clear();  // Invalidate the term cache.
    265   } else {
    266     // Remove any stale SearchTermCacheItems.
    267     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
    268          cache_iter != search_term_cache_.end(); ) {
    269       if (!cache_iter->second.used_)
    270         search_term_cache_.erase(cache_iter++);
    271       else
    272         ++cache_iter;
    273     }
    274   }
    275 
    276   return scored_items;
    277 }
    278 
    279 bool URLIndexPrivateData::UpdateURL(
    280     HistoryService* history_service,
    281     const URLRow& row,
    282     const std::string& languages,
    283     const std::set<std::string>& scheme_whitelist) {
    284   // The row may or may not already be in our index. If it is not already
    285   // indexed and it qualifies then it gets indexed. If it is already
    286   // indexed and still qualifies then it gets updated, otherwise it
    287   // is deleted from the index.
    288   bool row_was_updated = false;
    289   URLID row_id = row.id();
    290   HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
    291   if (row_pos == history_info_map_.end()) {
    292     // This new row should be indexed if it qualifies.
    293     URLRow new_row(row);
    294     new_row.set_id(row_id);
    295     row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
    296         IndexRow(NULL, history_service, new_row, languages, scheme_whitelist);
    297   } else if (RowQualifiesAsSignificant(row, base::Time())) {
    298     // This indexed row still qualifies and will be re-indexed.
    299     // The url won't have changed but the title, visit count, etc.
    300     // might have changed.
    301     URLRow& row_to_update = row_pos->second.url_row;
    302     bool title_updated = row_to_update.title() != row.title();
    303     if (row_to_update.visit_count() != row.visit_count() ||
    304         row_to_update.typed_count() != row.typed_count() ||
    305         row_to_update.last_visit() != row.last_visit() || title_updated) {
    306       row_to_update.set_visit_count(row.visit_count());
    307       row_to_update.set_typed_count(row.typed_count());
    308       row_to_update.set_last_visit(row.last_visit());
    309       // If something appears to have changed, update the recent visits
    310       // information.
    311       ScheduleUpdateRecentVisits(history_service, row_id);
    312       // While the URL is guaranteed to remain stable, the title may have
    313       // changed. If so, then update the index with the changed words.
    314       if (title_updated) {
    315         // Clear all words associated with this row and re-index both the
    316         // URL and title.
    317         RemoveRowWordsFromIndex(row_to_update);
    318         row_to_update.set_title(row.title());
    319         RowWordStarts word_starts;
    320         AddRowWordsToIndex(row_to_update, &word_starts, languages);
    321         word_starts_map_[row_id] = word_starts;
    322       }
    323       row_was_updated = true;
    324     }
    325   } else {
    326     // This indexed row no longer qualifies and will be de-indexed by
    327     // clearing all words associated with this row.
    328     RemoveRowFromIndex(row);
    329     row_was_updated = true;
    330   }
    331   if (row_was_updated)
    332     search_term_cache_.clear();  // This invalidates the cache.
    333   return row_was_updated;
    334 }
    335 
    336 void URLIndexPrivateData::UpdateRecentVisits(
    337     URLID url_id,
    338     const VisitVector& recent_visits) {
    339   HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
    340   if (row_pos != history_info_map_.end()) {
    341     VisitInfoVector* visits = &row_pos->second.visits;
    342     visits->clear();
    343     const size_t size =
    344         std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
    345     visits->reserve(size);
    346     for (size_t i = 0; i < size; i++) {
    347       // Copy from the VisitVector the only fields visits needs.
    348       visits->push_back(std::make_pair(recent_visits[i].visit_time,
    349                                        recent_visits[i].transition));
    350     }
    351   }
    352   // Else: Oddly, the URL doesn't seem to exist in the private index.
    353   // Ignore this update.  This can happen if, for instance, the user
    354   // removes the URL from URLIndexPrivateData before the historyDB call
    355   // returns.
    356 }
    357 
    358 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    359     HistoryService* history_service,
    360     URLID url_id) {
    361   history_service->ScheduleDBTask(
    362       new UpdateRecentVisitsFromHistoryDBTask(this, url_id),
    363       &recent_visits_consumer_);
    364 }
    365 
    366 // Helper functor for DeleteURL.
    367 class HistoryInfoMapItemHasURL {
    368  public:
    369   explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
    370 
    371   bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    372     return item.second.url_row.url() == url_;
    373   }
    374 
    375  private:
    376   const GURL& url_;
    377 };
    378 
    379 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
    380   // Find the matching entry in the history_info_map_.
    381   HistoryInfoMap::iterator pos = std::find_if(
    382       history_info_map_.begin(),
    383       history_info_map_.end(),
    384       HistoryInfoMapItemHasURL(url));
    385   if (pos == history_info_map_.end())
    386     return false;
    387   RemoveRowFromIndex(pos->second.url_row);
    388   search_term_cache_.clear();  // This invalidates the cache.
    389   return true;
    390 }
    391 
    392 // static
    393 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    394     const base::FilePath& file_path,
    395     const std::string& languages) {
    396   base::TimeTicks beginning_time = base::TimeTicks::Now();
    397   if (!base::PathExists(file_path))
    398     return NULL;
    399   std::string data;
    400   // If there is no cache file then simply give up. This will cause us to
    401   // attempt to rebuild from the history database.
    402   if (!base::ReadFileToString(file_path, &data))
    403     return NULL;
    404 
    405   scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
    406   InMemoryURLIndexCacheItem index_cache;
    407   if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    408     LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
    409                  << file_path.value();
    410     return restored_data;
    411   }
    412 
    413   if (!restored_data->RestorePrivateData(index_cache, languages))
    414     return NULL;
    415 
    416   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
    417                       base::TimeTicks::Now() - beginning_time);
    418   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
    419                        restored_data->history_id_word_map_.size());
    420   UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
    421   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
    422                              restored_data->word_map_.size());
    423   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
    424                              restored_data->char_word_map_.size());
    425   if (restored_data->Empty())
    426     return NULL;  // 'No data' is the same as a failed reload.
    427   return restored_data;
    428 }
    429 
    430 // static
    431 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    432     HistoryDatabase* history_db,
    433     const std::string& languages,
    434     const std::set<std::string>& scheme_whitelist) {
    435   if (!history_db)
    436     return NULL;
    437 
    438   base::TimeTicks beginning_time = base::TimeTicks::Now();
    439 
    440   scoped_refptr<URLIndexPrivateData>
    441       rebuilt_data(new URLIndexPrivateData);
    442   URLDatabase::URLEnumerator history_enum;
    443   if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    444     return NULL;
    445   rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
    446   for (URLRow row; history_enum.GetNextURL(&row); ) {
    447     rebuilt_data->IndexRow(history_db, NULL, row, languages,
    448                            scheme_whitelist);
    449   }
    450 
    451   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
    452                       base::TimeTicks::Now() - beginning_time);
    453   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
    454                        rebuilt_data->history_id_word_map_.size());
    455   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
    456                              rebuilt_data->word_map_.size());
    457   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
    458                              rebuilt_data->char_word_map_.size());
    459   return rebuilt_data;
    460 }
    461 
    462 // static
    463 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    464     scoped_refptr<URLIndexPrivateData> private_data,
    465     const base::FilePath& file_path) {
    466   DCHECK(private_data.get());
    467   DCHECK(!file_path.empty());
    468   return private_data->SaveToFile(file_path);
    469 }
    470 
    471 void URLIndexPrivateData::CancelPendingUpdates() {
    472   recent_visits_consumer_.CancelAllRequests();
    473 }
    474 
    475 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
    476   scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
    477   data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
    478   data_copy->word_list_ = word_list_;
    479   data_copy->available_words_ = available_words_;
    480   data_copy->word_map_ = word_map_;
    481   data_copy->char_word_map_ = char_word_map_;
    482   data_copy->word_id_history_map_ = word_id_history_map_;
    483   data_copy->history_id_word_map_ = history_id_word_map_;
    484   data_copy->history_info_map_ = history_info_map_;
    485   data_copy->word_starts_map_ = word_starts_map_;
    486   return data_copy;
    487   // Not copied:
    488   //    search_term_cache_
    489   //    pre_filter_item_count_
    490   //    post_filter_item_count_
    491   //    post_scoring_item_count_
    492 }
    493 
    494 bool URLIndexPrivateData::Empty() const {
    495   return history_info_map_.empty();
    496 }
    497 
    498 void URLIndexPrivateData::Clear() {
    499   last_time_rebuilt_from_history_ = base::Time();
    500   word_list_.clear();
    501   available_words_.clear();
    502   word_map_.clear();
    503   char_word_map_.clear();
    504   word_id_history_map_.clear();
    505   history_id_word_map_.clear();
    506   history_info_map_.clear();
    507   word_starts_map_.clear();
    508 }
    509 
    510 URLIndexPrivateData::~URLIndexPrivateData() {}
    511 
    512 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
    513     const String16Vector& unsorted_words) {
    514   // Break the terms down into individual terms (words), get the candidate
    515   // set for each term, and intersect each to get a final candidate list.
    516   // Note that a single 'term' from the user's perspective might be
    517   // a string like "http://www.somewebsite.com" which, from our perspective,
    518   // is four words: 'http', 'www', 'somewebsite', and 'com'.
    519   HistoryIDSet history_id_set;
    520   String16Vector words(unsorted_words);
    521   // Sort the words into the longest first as such are likely to narrow down
    522   // the results quicker. Also, single character words are the most expensive
    523   // to process so save them for last.
    524   std::sort(words.begin(), words.end(), LengthGreater);
    525   for (String16Vector::iterator iter = words.begin(); iter != words.end();
    526        ++iter) {
    527     base::string16 uni_word = *iter;
    528     HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
    529     if (term_history_set.empty()) {
    530       history_id_set.clear();
    531       break;
    532     }
    533     if (iter == words.begin()) {
    534       history_id_set.swap(term_history_set);
    535     } else {
    536       HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
    537           history_id_set, term_history_set);
    538       history_id_set.swap(new_history_id_set);
    539     }
    540   }
    541   return history_id_set;
    542 }
    543 
    544 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    545     const base::string16& term) {
    546   if (term.empty())
    547     return HistoryIDSet();
    548 
    549   // TODO(mrossetti): Consider optimizing for very common terms such as
    550   // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
    551   // occuring words in the user's searches.
    552 
    553   size_t term_length = term.length();
    554   WordIDSet word_id_set;
    555   if (term_length > 1) {
    556     // See if this term or a prefix thereof is present in the cache.
    557     SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    558     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
    559          cache_iter != search_term_cache_.end(); ++cache_iter) {
    560       if (StartsWith(term, cache_iter->first, false) &&
    561           (best_prefix == search_term_cache_.end() ||
    562            cache_iter->first.length() > best_prefix->first.length()))
    563         best_prefix = cache_iter;
    564     }
    565 
    566     // If a prefix was found then determine the leftover characters to be used
    567     // for further refining the results from that prefix.
    568     Char16Set prefix_chars;
    569     base::string16 leftovers(term);
    570     if (best_prefix != search_term_cache_.end()) {
    571       // If the prefix is an exact match for the term then grab the cached
    572       // results and we're done.
    573       size_t prefix_length = best_prefix->first.length();
    574       if (prefix_length == term_length) {
    575         best_prefix->second.used_ = true;
    576         return best_prefix->second.history_id_set_;
    577       }
    578 
    579       // Otherwise we have a handy starting point.
    580       // If there are no history results for this prefix then we can bail early
    581       // as there will be no history results for the full term.
    582       if (best_prefix->second.history_id_set_.empty()) {
    583         search_term_cache_[term] = SearchTermCacheItem();
    584         return HistoryIDSet();
    585       }
    586       word_id_set = best_prefix->second.word_id_set_;
    587       prefix_chars = Char16SetFromString16(best_prefix->first);
    588       leftovers = term.substr(prefix_length);
    589     }
    590 
    591     // Filter for each remaining, unique character in the term.
    592     Char16Set leftover_chars = Char16SetFromString16(leftovers);
    593     Char16Set unique_chars =
    594         base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
    595 
    596     // Reduce the word set with any leftover, unprocessed characters.
    597     if (!unique_chars.empty()) {
    598       WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
    599       // We might come up empty on the leftovers.
    600       if (leftover_set.empty()) {
    601         search_term_cache_[term] = SearchTermCacheItem();
    602         return HistoryIDSet();
    603       }
    604       // Or there may not have been a prefix from which to start.
    605       if (prefix_chars.empty()) {
    606         word_id_set.swap(leftover_set);
    607       } else {
    608         WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
    609             word_id_set, leftover_set);
    610         word_id_set.swap(new_word_id_set);
    611       }
    612     }
    613 
    614     // We must filter the word list because the resulting word set surely
    615     // contains words which do not have the search term as a proper subset.
    616     for (WordIDSet::iterator word_set_iter = word_id_set.begin();
    617          word_set_iter != word_id_set.end(); ) {
    618       if (word_list_[*word_set_iter].find(term) == base::string16::npos)
    619         word_id_set.erase(word_set_iter++);
    620       else
    621         ++word_set_iter;
    622     }
    623   } else {
    624     word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
    625   }
    626 
    627   // If any words resulted then we can compose a set of history IDs by unioning
    628   // the sets from each word.
    629   HistoryIDSet history_id_set;
    630   if (!word_id_set.empty()) {
    631     for (WordIDSet::iterator word_id_iter = word_id_set.begin();
    632          word_id_iter != word_id_set.end(); ++word_id_iter) {
    633       WordID word_id = *word_id_iter;
    634       WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
    635       if (word_iter != word_id_history_map_.end()) {
    636         HistoryIDSet& word_history_id_set(word_iter->second);
    637         history_id_set.insert(word_history_id_set.begin(),
    638                               word_history_id_set.end());
    639       }
    640     }
    641   }
    642 
    643   // Record a new cache entry for this word if the term is longer than
    644   // a single character.
    645   if (term_length > 1)
    646     search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
    647 
    648   return history_id_set;
    649 }
    650 
    651 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    652     const Char16Set& term_chars) {
    653   WordIDSet word_id_set;
    654   for (Char16Set::const_iterator c_iter = term_chars.begin();
    655        c_iter != term_chars.end(); ++c_iter) {
    656     CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
    657     if (char_iter == char_word_map_.end()) {
    658       // A character was not found so there are no matching results: bail.
    659       word_id_set.clear();
    660       break;
    661     }
    662     WordIDSet& char_word_id_set(char_iter->second);
    663     // It is possible for there to no longer be any words associated with
    664     // a particular character. Give up in that case.
    665     if (char_word_id_set.empty()) {
    666       word_id_set.clear();
    667       break;
    668     }
    669 
    670     if (c_iter == term_chars.begin()) {
    671       // First character results becomes base set of results.
    672       word_id_set = char_word_id_set;
    673     } else {
    674       // Subsequent character results get intersected in.
    675       WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
    676           word_id_set, char_word_id_set);
    677       word_id_set.swap(new_word_id_set);
    678     }
    679   }
    680   return word_id_set;
    681 }
    682 
    683 bool URLIndexPrivateData::IndexRow(
    684     HistoryDatabase* history_db,
    685     HistoryService* history_service,
    686     const URLRow& row,
    687     const std::string& languages,
    688     const std::set<std::string>& scheme_whitelist) {
    689   const GURL& gurl(row.url());
    690 
    691   // Index only URLs with a whitelisted scheme.
    692   if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    693     return false;
    694 
    695   URLID row_id = row.id();
    696   // Strip out username and password before saving and indexing.
    697   base::string16 url(net::FormatUrl(gurl, languages,
    698       net::kFormatUrlOmitUsernamePassword,
    699       net::UnescapeRule::NONE,
    700       NULL, NULL, NULL));
    701 
    702   HistoryID history_id = static_cast<HistoryID>(row_id);
    703   DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
    704 
    705   // Add the row for quick lookup in the history info store.
    706   URLRow new_row(GURL(url), row_id);
    707   new_row.set_visit_count(row.visit_count());
    708   new_row.set_typed_count(row.typed_count());
    709   new_row.set_last_visit(row.last_visit());
    710   new_row.set_title(row.title());
    711   history_info_map_[history_id].url_row = new_row;
    712 
    713   // Index the words contained in the URL and title of the row.
    714   RowWordStarts word_starts;
    715   AddRowWordsToIndex(new_row, &word_starts, languages);
    716   word_starts_map_[history_id] = word_starts;
    717 
    718   // Update the recent visits information or schedule the update
    719   // as appropriate.
    720   if (history_db) {
    721     // We'd like to check that we're on the history DB thread.
    722     // However, unittest code actually calls this on the UI thread.
    723     // So we don't do any thread checks.
    724     VisitVector recent_visits;
    725     // Make sure the private data is going to get as many recent visits as
    726     // ScoredHistoryMatch::GetFrequency() hopes to use.
    727     DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    728     if (history_db->GetMostRecentVisitsForURL(row_id,
    729                                               kMaxVisitsToStoreInCache,
    730                                               &recent_visits))
    731       UpdateRecentVisits(row_id, recent_visits);
    732   } else {
    733     DCHECK(history_service);
    734     ScheduleUpdateRecentVisits(history_service, row_id);
    735   }
    736 
    737   return true;
    738 }
    739 
    740 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
    741                                              RowWordStarts* word_starts,
    742                                              const std::string& languages) {
    743   HistoryID history_id = static_cast<HistoryID>(row.id());
    744   // Split URL into individual, unique words then add in the title words.
    745   const GURL& gurl(row.url());
    746   const base::string16& url =
    747       bookmark_utils::CleanUpUrlForMatching(gurl, languages, NULL);
    748   String16Set url_words = String16SetFromString16(url,
    749       word_starts ? &word_starts->url_word_starts_ : NULL);
    750   const base::string16& title =
    751       bookmark_utils::CleanUpTitleForMatching(row.title());
    752   String16Set title_words = String16SetFromString16(title,
    753       word_starts ? &word_starts->title_word_starts_ : NULL);
    754   String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
    755   for (String16Set::iterator word_iter = words.begin();
    756        word_iter != words.end(); ++word_iter)
    757     AddWordToIndex(*word_iter, history_id);
    758 
    759   search_term_cache_.clear();  // Invalidate the term cache.
    760 }
    761 
    762 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
    763                                          HistoryID history_id) {
    764   WordMap::iterator word_pos = word_map_.find(term);
    765   if (word_pos != word_map_.end())
    766     UpdateWordHistory(word_pos->second, history_id);
    767   else
    768     AddWordHistory(term, history_id);
    769 }
    770 
    771 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
    772                                          HistoryID history_id) {
    773   WordID word_id = word_list_.size();
    774   if (available_words_.empty()) {
    775     word_list_.push_back(term);
    776   } else {
    777     word_id = *(available_words_.begin());
    778     word_list_[word_id] = term;
    779     available_words_.erase(word_id);
    780   }
    781   word_map_[term] = word_id;
    782 
    783   HistoryIDSet history_id_set;
    784   history_id_set.insert(history_id);
    785   word_id_history_map_[word_id] = history_id_set;
    786   AddToHistoryIDWordMap(history_id, word_id);
    787 
    788   // For each character in the newly added word (i.e. a word that is not
    789   // already in the word index), add the word to the character index.
    790   Char16Set characters = Char16SetFromString16(term);
    791   for (Char16Set::iterator uni_char_iter = characters.begin();
    792        uni_char_iter != characters.end(); ++uni_char_iter) {
    793     base::char16 uni_char = *uni_char_iter;
    794     CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
    795     if (char_iter != char_word_map_.end()) {
    796       // Update existing entry in the char/word index.
    797       WordIDSet& word_id_set(char_iter->second);
    798       word_id_set.insert(word_id);
    799     } else {
    800       // Create a new entry in the char/word index.
    801       WordIDSet word_id_set;
    802       word_id_set.insert(word_id);
    803       char_word_map_[uni_char] = word_id_set;
    804     }
    805   }
    806 }
    807 
    808 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
    809                                             HistoryID history_id) {
    810   WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
    811   DCHECK(history_pos != word_id_history_map_.end());
    812   HistoryIDSet& history_id_set(history_pos->second);
    813   history_id_set.insert(history_id);
    814   AddToHistoryIDWordMap(history_id, word_id);
    815 }
    816 
    817 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
    818                                                 WordID word_id) {
    819   HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
    820   if (iter != history_id_word_map_.end()) {
    821     WordIDSet& word_id_set(iter->second);
    822     word_id_set.insert(word_id);
    823   } else {
    824     WordIDSet word_id_set;
    825     word_id_set.insert(word_id);
    826     history_id_word_map_[history_id] = word_id_set;
    827   }
    828 }
    829 
    830 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
    831   RemoveRowWordsFromIndex(row);
    832   HistoryID history_id = static_cast<HistoryID>(row.id());
    833   history_info_map_.erase(history_id);
    834   word_starts_map_.erase(history_id);
    835 }
    836 
    837 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
    838   // Remove the entries in history_id_word_map_ and word_id_history_map_ for
    839   // this row.
    840   HistoryID history_id = static_cast<HistoryID>(row.id());
    841   WordIDSet word_id_set = history_id_word_map_[history_id];
    842   history_id_word_map_.erase(history_id);
    843 
    844   // Reconcile any changes to word usage.
    845   for (WordIDSet::iterator word_id_iter = word_id_set.begin();
    846        word_id_iter != word_id_set.end(); ++word_id_iter) {
    847     WordID word_id = *word_id_iter;
    848     word_id_history_map_[word_id].erase(history_id);
    849     if (!word_id_history_map_[word_id].empty())
    850       continue;  // The word is still in use.
    851 
    852     // The word is no longer in use. Reconcile any changes to character usage.
    853     base::string16 word = word_list_[word_id];
    854     Char16Set characters = Char16SetFromString16(word);
    855     for (Char16Set::iterator uni_char_iter = characters.begin();
    856          uni_char_iter != characters.end(); ++uni_char_iter) {
    857       base::char16 uni_char = *uni_char_iter;
    858       char_word_map_[uni_char].erase(word_id);
    859       if (char_word_map_[uni_char].empty())
    860         char_word_map_.erase(uni_char);  // No longer in use.
    861     }
    862 
    863     // Complete the removal of references to the word.
    864     word_id_history_map_.erase(word_id);
    865     word_map_.erase(word);
    866     word_list_[word_id] = base::string16();
    867     available_words_.insert(word_id);
    868   }
    869 }
    870 
    871 void URLIndexPrivateData::ResetSearchTermCache() {
    872   for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
    873        iter != search_term_cache_.end(); ++iter)
    874     iter->second.used_ = false;
    875 }
    876 
    877 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
    878   base::TimeTicks beginning_time = base::TimeTicks::Now();
    879   InMemoryURLIndexCacheItem index_cache;
    880   SavePrivateData(&index_cache);
    881   std::string data;
    882   if (!index_cache.SerializeToString(&data)) {
    883     LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
    884     return false;
    885   }
    886 
    887   int size = data.size();
    888   if (base::WriteFile(file_path, data.c_str(), size) != size) {
    889     LOG(WARNING) << "Failed to write " << file_path.value();
    890     return false;
    891   }
    892   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
    893                       base::TimeTicks::Now() - beginning_time);
    894   return true;
    895 }
    896 
    897 void URLIndexPrivateData::SavePrivateData(
    898     InMemoryURLIndexCacheItem* cache) const {
    899   DCHECK(cache);
    900   cache->set_last_rebuild_timestamp(
    901       last_time_rebuilt_from_history_.ToInternalValue());
    902   cache->set_version(saved_cache_version_);
    903   // history_item_count_ is no longer used but rather than change the protobuf
    904   // definition use a placeholder. This will go away with the switch to SQLite.
    905   cache->set_history_item_count(0);
    906   SaveWordList(cache);
    907   SaveWordMap(cache);
    908   SaveCharWordMap(cache);
    909   SaveWordIDHistoryMap(cache);
    910   SaveHistoryInfoMap(cache);
    911   SaveWordStartsMap(cache);
    912 }
    913 
    914 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
    915   if (word_list_.empty())
    916     return;
    917   WordListItem* list_item = cache->mutable_word_list();
    918   list_item->set_word_count(word_list_.size());
    919   for (String16Vector::const_iterator iter = word_list_.begin();
    920        iter != word_list_.end(); ++iter)
    921     list_item->add_word(base::UTF16ToUTF8(*iter));
    922 }
    923 
    924 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
    925   if (word_map_.empty())
    926     return;
    927   WordMapItem* map_item = cache->mutable_word_map();
    928   map_item->set_item_count(word_map_.size());
    929   for (WordMap::const_iterator iter = word_map_.begin();
    930        iter != word_map_.end(); ++iter) {
    931     WordMapEntry* map_entry = map_item->add_word_map_entry();
    932     map_entry->set_word(base::UTF16ToUTF8(iter->first));
    933     map_entry->set_word_id(iter->second);
    934   }
    935 }
    936 
    937 void URLIndexPrivateData::SaveCharWordMap(
    938     InMemoryURLIndexCacheItem* cache) const {
    939   if (char_word_map_.empty())
    940     return;
    941   CharWordMapItem* map_item = cache->mutable_char_word_map();
    942   map_item->set_item_count(char_word_map_.size());
    943   for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
    944        iter != char_word_map_.end(); ++iter) {
    945     CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
    946     map_entry->set_char_16(iter->first);
    947     const WordIDSet& word_id_set(iter->second);
    948     map_entry->set_item_count(word_id_set.size());
    949     for (WordIDSet::const_iterator set_iter = word_id_set.begin();
    950          set_iter != word_id_set.end(); ++set_iter)
    951       map_entry->add_word_id(*set_iter);
    952   }
    953 }
    954 
    955 void URLIndexPrivateData::SaveWordIDHistoryMap(
    956     InMemoryURLIndexCacheItem* cache) const {
    957   if (word_id_history_map_.empty())
    958     return;
    959   WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
    960   map_item->set_item_count(word_id_history_map_.size());
    961   for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
    962        iter != word_id_history_map_.end(); ++iter) {
    963     WordIDHistoryMapEntry* map_entry =
    964         map_item->add_word_id_history_map_entry();
    965     map_entry->set_word_id(iter->first);
    966     const HistoryIDSet& history_id_set(iter->second);
    967     map_entry->set_item_count(history_id_set.size());
    968     for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
    969          set_iter != history_id_set.end(); ++set_iter)
    970       map_entry->add_history_id(*set_iter);
    971   }
    972 }
    973 
    974 void URLIndexPrivateData::SaveHistoryInfoMap(
    975     InMemoryURLIndexCacheItem* cache) const {
    976   if (history_info_map_.empty())
    977     return;
    978   HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
    979   map_item->set_item_count(history_info_map_.size());
    980   for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
    981        iter != history_info_map_.end(); ++iter) {
    982     HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
    983     map_entry->set_history_id(iter->first);
    984     const URLRow& url_row(iter->second.url_row);
    985     // Note: We only save information that contributes to the index so there
    986     // is no need to save search_term_cache_ (not persistent).
    987     map_entry->set_visit_count(url_row.visit_count());
    988     map_entry->set_typed_count(url_row.typed_count());
    989     map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
    990     map_entry->set_url(url_row.url().spec());
    991     map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
    992     const VisitInfoVector& visits(iter->second.visits);
    993     for (VisitInfoVector::const_iterator visit_iter = visits.begin();
    994          visit_iter != visits.end(); ++visit_iter) {
    995       HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
    996       visit_info->set_visit_time(visit_iter->first.ToInternalValue());
    997       visit_info->set_transition_type(visit_iter->second);
    998     }
    999   }
   1000 }
   1001 
   1002 void URLIndexPrivateData::SaveWordStartsMap(
   1003     InMemoryURLIndexCacheItem* cache) const {
   1004   if (word_starts_map_.empty())
   1005     return;
   1006   // For unit testing: Enable saving of the cache as an earlier version to
   1007   // allow testing of cache file upgrading in ReadFromFile().
   1008   // TODO(mrossetti): Instead of intruding on production code with this kind of
   1009   // test harness, save a copy of an older version cache with known results.
   1010   // Implement this when switching the caching over to SQLite.
   1011   if (saved_cache_version_ < 1)
   1012     return;
   1013 
   1014   WordStartsMapItem* map_item = cache->mutable_word_starts_map();
   1015   map_item->set_item_count(word_starts_map_.size());
   1016   for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
   1017        iter != word_starts_map_.end(); ++iter) {
   1018     WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
   1019     map_entry->set_history_id(iter->first);
   1020     const RowWordStarts& word_starts(iter->second);
   1021     for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
   1022          i != word_starts.url_word_starts_.end(); ++i)
   1023       map_entry->add_url_word_starts(*i);
   1024     for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
   1025          i != word_starts.title_word_starts_.end(); ++i)
   1026       map_entry->add_title_word_starts(*i);
   1027   }
   1028 }
   1029 
   1030 bool URLIndexPrivateData::RestorePrivateData(
   1031     const InMemoryURLIndexCacheItem& cache,
   1032     const std::string& languages) {
   1033   last_time_rebuilt_from_history_ =
   1034       base::Time::FromInternalValue(cache.last_rebuild_timestamp());
   1035   const base::TimeDelta rebuilt_ago =
   1036       base::Time::Now() - last_time_rebuilt_from_history_;
   1037   if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
   1038       (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
   1039     // Cache is more than a week old or, somehow, from some time in the future.
   1040     // It's probably a good time to rebuild the index from history to
   1041     // allow synced entries to now appear, expired entries to disappear, etc.
   1042     // Allow one day in the future to make the cache not rebuild on simple
   1043     // system clock changes such as time zone changes.
   1044     return false;
   1045   }
   1046   if (cache.has_version()) {
   1047     if (cache.version() < kCurrentCacheFileVersion) {
   1048       // Don't try to restore an old format cache file.  (This will cause
   1049       // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
   1050       // from history.)
   1051       return false;
   1052     }
   1053     restored_cache_version_ = cache.version();
   1054   }
   1055   return RestoreWordList(cache) && RestoreWordMap(cache) &&
   1056       RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
   1057       RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
   1058 }
   1059 
   1060 bool URLIndexPrivateData::RestoreWordList(
   1061     const InMemoryURLIndexCacheItem& cache) {
   1062   if (!cache.has_word_list())
   1063     return false;
   1064   const WordListItem& list_item(cache.word_list());
   1065   uint32 expected_item_count = list_item.word_count();
   1066   uint32 actual_item_count = list_item.word_size();
   1067   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1068     return false;
   1069   const RepeatedPtrField<std::string>& words(list_item.word());
   1070   for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
   1071        iter != words.end(); ++iter)
   1072     word_list_.push_back(base::UTF8ToUTF16(*iter));
   1073   return true;
   1074 }
   1075 
   1076 bool URLIndexPrivateData::RestoreWordMap(
   1077     const InMemoryURLIndexCacheItem& cache) {
   1078   if (!cache.has_word_map())
   1079     return false;
   1080   const WordMapItem& list_item(cache.word_map());
   1081   uint32 expected_item_count = list_item.item_count();
   1082   uint32 actual_item_count = list_item.word_map_entry_size();
   1083   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1084     return false;
   1085   const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
   1086   for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
   1087        iter != entries.end(); ++iter)
   1088     word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
   1089   return true;
   1090 }
   1091 
   1092 bool URLIndexPrivateData::RestoreCharWordMap(
   1093     const InMemoryURLIndexCacheItem& cache) {
   1094   if (!cache.has_char_word_map())
   1095     return false;
   1096   const CharWordMapItem& list_item(cache.char_word_map());
   1097   uint32 expected_item_count = list_item.item_count();
   1098   uint32 actual_item_count = list_item.char_word_map_entry_size();
   1099   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1100     return false;
   1101   const RepeatedPtrField<CharWordMapEntry>&
   1102       entries(list_item.char_word_map_entry());
   1103   for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
   1104        entries.begin(); iter != entries.end(); ++iter) {
   1105     expected_item_count = iter->item_count();
   1106     actual_item_count = iter->word_id_size();
   1107     if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1108       return false;
   1109     base::char16 uni_char = static_cast<base::char16>(iter->char_16());
   1110     WordIDSet word_id_set;
   1111     const RepeatedField<int32>& word_ids(iter->word_id());
   1112     for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
   1113          jiter != word_ids.end(); ++jiter)
   1114       word_id_set.insert(*jiter);
   1115     char_word_map_[uni_char] = word_id_set;
   1116   }
   1117   return true;
   1118 }
   1119 
   1120 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
   1121     const InMemoryURLIndexCacheItem& cache) {
   1122   if (!cache.has_word_id_history_map())
   1123     return false;
   1124   const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
   1125   uint32 expected_item_count = list_item.item_count();
   1126   uint32 actual_item_count = list_item.word_id_history_map_entry_size();
   1127   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1128     return false;
   1129   const RepeatedPtrField<WordIDHistoryMapEntry>&
   1130       entries(list_item.word_id_history_map_entry());
   1131   for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
   1132        entries.begin(); iter != entries.end(); ++iter) {
   1133     expected_item_count = iter->item_count();
   1134     actual_item_count = iter->history_id_size();
   1135     if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1136       return false;
   1137     WordID word_id = iter->word_id();
   1138     HistoryIDSet history_id_set;
   1139     const RepeatedField<int64>& history_ids(iter->history_id());
   1140     for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
   1141          jiter != history_ids.end(); ++jiter) {
   1142       history_id_set.insert(*jiter);
   1143       AddToHistoryIDWordMap(*jiter, word_id);
   1144     }
   1145     word_id_history_map_[word_id] = history_id_set;
   1146   }
   1147   return true;
   1148 }
   1149 
   1150 bool URLIndexPrivateData::RestoreHistoryInfoMap(
   1151     const InMemoryURLIndexCacheItem& cache) {
   1152   if (!cache.has_history_info_map())
   1153     return false;
   1154   const HistoryInfoMapItem& list_item(cache.history_info_map());
   1155   uint32 expected_item_count = list_item.item_count();
   1156   uint32 actual_item_count = list_item.history_info_map_entry_size();
   1157   if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1158     return false;
   1159   const RepeatedPtrField<HistoryInfoMapEntry>&
   1160       entries(list_item.history_info_map_entry());
   1161   for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
   1162        entries.begin(); iter != entries.end(); ++iter) {
   1163     HistoryID history_id = iter->history_id();
   1164     GURL url(iter->url());
   1165     URLRow url_row(url, history_id);
   1166     url_row.set_visit_count(iter->visit_count());
   1167     url_row.set_typed_count(iter->typed_count());
   1168     url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
   1169     if (iter->has_title()) {
   1170       base::string16 title(base::UTF8ToUTF16(iter->title()));
   1171       url_row.set_title(title);
   1172     }
   1173     history_info_map_[history_id].url_row = url_row;
   1174 
   1175     // Restore visits list.
   1176     VisitInfoVector visits;
   1177     visits.reserve(iter->visits_size());
   1178     for (int i = 0; i < iter->visits_size(); ++i) {
   1179       visits.push_back(std::make_pair(
   1180           base::Time::FromInternalValue(iter->visits(i).visit_time()),
   1181           static_cast<content::PageTransition>(iter->visits(i).
   1182                                                transition_type())));
   1183     }
   1184     history_info_map_[history_id].visits = visits;
   1185   }
   1186   return true;
   1187 }
   1188 
   1189 bool URLIndexPrivateData::RestoreWordStartsMap(
   1190     const InMemoryURLIndexCacheItem& cache,
   1191     const std::string& languages) {
   1192   // Note that this function must be called after RestoreHistoryInfoMap() has
   1193   // been run as the word starts may have to be recalculated from the urls and
   1194   // page titles.
   1195   if (cache.has_word_starts_map()) {
   1196     const WordStartsMapItem& list_item(cache.word_starts_map());
   1197     uint32 expected_item_count = list_item.item_count();
   1198     uint32 actual_item_count = list_item.word_starts_map_entry_size();
   1199     if (actual_item_count == 0 || actual_item_count != expected_item_count)
   1200       return false;
   1201     const RepeatedPtrField<WordStartsMapEntry>&
   1202         entries(list_item.word_starts_map_entry());
   1203     for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
   1204          entries.begin(); iter != entries.end(); ++iter) {
   1205       HistoryID history_id = iter->history_id();
   1206       RowWordStarts word_starts;
   1207       // Restore the URL word starts.
   1208       const RepeatedField<int32>& url_starts(iter->url_word_starts());
   1209       for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
   1210            jiter != url_starts.end(); ++jiter)
   1211         word_starts.url_word_starts_.push_back(*jiter);
   1212       // Restore the page title word starts.
   1213       const RepeatedField<int32>& title_starts(iter->title_word_starts());
   1214       for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
   1215            jiter != title_starts.end(); ++jiter)
   1216         word_starts.title_word_starts_.push_back(*jiter);
   1217       word_starts_map_[history_id] = word_starts;
   1218     }
   1219   } else {
   1220     // Since the cache did not contain any word starts we must rebuild then from
   1221     // the URL and page titles.
   1222     for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
   1223          iter != history_info_map_.end(); ++iter) {
   1224       RowWordStarts word_starts;
   1225       const URLRow& row(iter->second.url_row);
   1226       const base::string16& url = bookmark_utils::CleanUpUrlForMatching(
   1227           row.url(), languages, NULL);
   1228       String16VectorFromString16(url, false, &word_starts.url_word_starts_);
   1229       const base::string16& title =
   1230           bookmark_utils::CleanUpTitleForMatching(row.title());
   1231       String16VectorFromString16(title, false, &word_starts.title_word_starts_);
   1232       word_starts_map_[iter->first] = word_starts;
   1233     }
   1234   }
   1235   return true;
   1236 }
   1237 
   1238 // static
   1239 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
   1240     const GURL& gurl,
   1241     const std::set<std::string>& whitelist) {
   1242   return whitelist.find(gurl.scheme()) != whitelist.end();
   1243 }
   1244 
   1245 
   1246 // SearchTermCacheItem ---------------------------------------------------------
   1247 
   1248 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
   1249     const WordIDSet& word_id_set,
   1250     const HistoryIDSet& history_id_set)
   1251     : word_id_set_(word_id_set),
   1252       history_id_set_(history_id_set),
   1253       used_(true) {}
   1254 
   1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
   1256     : used_(true) {}
   1257 
   1258 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
   1259 
   1260 
   1261 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
   1262 
   1263 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
   1264     const URLIndexPrivateData& private_data,
   1265     const std::string& languages,
   1266     HistoryClient* history_client,
   1267     const base::string16& lower_string,
   1268     const String16Vector& lower_terms,
   1269     const base::Time now)
   1270     : private_data_(private_data),
   1271       languages_(languages),
   1272       history_client_(history_client),
   1273       lower_string_(lower_string),
   1274       lower_terms_(lower_terms),
   1275       now_(now) {
   1276   // Calculate offsets for each term.  For instance, the offset for
   1277   // ".net" should be 1, indicating that the actual word-part of the term
   1278   // starts at offset 1.
   1279   lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
   1280   for (size_t i = 0; i < lower_terms_.size(); ++i) {
   1281     base::i18n::BreakIterator iter(lower_terms_[i],
   1282                                    base::i18n::BreakIterator::BREAK_WORD);
   1283     // If the iterator doesn't work, assume an offset of 0.
   1284     if (!iter.Init())
   1285       continue;
   1286     // Find the first word start.
   1287     while (iter.Advance() && !iter.IsWord()) {}
   1288     if (iter.IsWord())
   1289       lower_terms_to_word_starts_offsets_[i] = iter.prev();
   1290     // Else: the iterator didn't find a word break.  Assume an offset of 0.
   1291   }
   1292 }
   1293 
   1294 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
   1295 
   1296 void URLIndexPrivateData::AddHistoryMatch::operator()(
   1297     const HistoryID history_id) {
   1298   HistoryInfoMap::const_iterator hist_pos =
   1299       private_data_.history_info_map_.find(history_id);
   1300   if (hist_pos != private_data_.history_info_map_.end()) {
   1301     const URLRow& hist_item = hist_pos->second.url_row;
   1302     const VisitInfoVector& visits = hist_pos->second.visits;
   1303     WordStartsMap::const_iterator starts_pos =
   1304         private_data_.word_starts_map_.find(history_id);
   1305     DCHECK(starts_pos != private_data_.word_starts_map_.end());
   1306     ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
   1307                              lower_terms_, lower_terms_to_word_starts_offsets_,
   1308                              starts_pos->second, now_, history_client_);
   1309     if (match.raw_score() > 0)
   1310       scored_matches_.push_back(match);
   1311   }
   1312 }
   1313 
   1314 
   1315 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
   1316 
   1317 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
   1318     const HistoryInfoMap& history_info_map)
   1319     : history_info_map_(history_info_map) {
   1320 }
   1321 
   1322 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
   1323 
   1324 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
   1325     const HistoryID h1,
   1326     const HistoryID h2) {
   1327   HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
   1328   if (entry1 == history_info_map_.end())
   1329     return false;
   1330   HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
   1331   if (entry2 == history_info_map_.end())
   1332     return true;
   1333   const URLRow& r1(entry1->second.url_row);
   1334   const URLRow& r2(entry2->second.url_row);
   1335   // First cut: typed count, visit count, recency.
   1336   // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
   1337   // recently visited (within the last 12/24 hours) as highly important. Get
   1338   // input from mpearson.
   1339   if (r1.typed_count() != r2.typed_count())
   1340     return (r1.typed_count() > r2.typed_count());
   1341   if (r1.visit_count() != r2.visit_count())
   1342     return (r1.visit_count() > r2.visit_count());
   1343   return (r1.last_visit() > r2.last_visit());
   1344 }
   1345 
   1346 }  // namespace history
   1347