Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_
      6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_
      7 
      8 #include <map>
      9 #include <set>
     10 #include <vector>
     11 
     12 #include "base/strings/string16.h"
     13 #include "chrome/browser/autocomplete/history_provider_util.h"
     14 #include "chrome/browser/history/history_types.h"
     15 #include "url/gurl.h"
     16 
     17 namespace history {
     18 
     19 // The maximum number of characters to consider from an URL and page title
     20 // while matching user-typed terms.
     21 const size_t kMaxSignificantChars = 50;
     22 
     23 // Matches within URL and Title Strings ----------------------------------------
     24 
     25 // Specifies where an omnibox term occurs within a string. Used for specifying
     26 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
     27 // scoring a result.
     28 struct TermMatch {
     29   TermMatch() : term_num(0), offset(0), length(0) {}
     30   TermMatch(int term_num, size_t offset, size_t length)
     31       : term_num(term_num),
     32         offset(offset),
     33         length(length) {}
     34 
     35   int term_num;  // The index of the term in the original search string.
     36   size_t offset;  // The starting offset of the substring match.
     37   size_t length;  // The length of the substring match.
     38 };
     39 typedef std::vector<TermMatch> TermMatches;
     40 
     41 // Unescapes the URL and lower-cases it, returning the result.  This
     42 // unescaping makes it possible to match substrings that were
     43 // originally escaped for navigation; for example, if the user
     44 // searched for "a&p", the query would be escaped as "a%26p", so
     45 // without unescaping, an input string of "a&p" would no longer match
     46 // this URL.  Note that the resulting unescaped URL may not be
     47 // directly navigable (which is why we escaped it to begin with).
     48 // |languages| is passed to net::FormatUrl().
     49 string16 CleanUpUrlForMatching(const GURL& gurl,
     50                                const std::string& languages);
     51 
     52 // Returns the lower-cased title.
     53 string16 CleanUpTitleForMatching(const string16& title);
     54 
     55 // Returns a TermMatches which has an entry for each occurrence of the
     56 // string |term| found in the string |cleaned_string|. Use
     57 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
     58 // |cleaned_string| to this function. The function marks each match
     59 // with |term_num| so that the resulting TermMatches can be merged
     60 // with other TermMatches for other terms. Note that only the first
     61 // 2,048 characters of |string| are considered during the match
     62 // operation.
     63 TermMatches MatchTermInString(const string16& term,
     64                               const string16& cleaned_string,
     65                               int term_num);
     66 
     67 // Sorts and removes overlapping substring matches from |matches| and
     68 // returns the cleaned up matches.
     69 TermMatches SortAndDeoverlapMatches(const TermMatches& matches);
     70 
     71 // Extracts and returns the offsets from |matches|.
     72 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);
     73 
     74 // Replaces the offsets in |matches| with those given in |offsets|, deleting
     75 // any which are npos, and returns the updated list of matches.
     76 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
     77                                         const std::vector<size_t>& offsets);
     78 
     79 // Convenience Types -----------------------------------------------------------
     80 
     81 typedef std::vector<string16> String16Vector;
     82 typedef std::set<string16> String16Set;
     83 typedef std::set<char16> Char16Set;
     84 typedef std::vector<char16> Char16Vector;
     85 
     86 // A vector that contains the offsets at which each word starts within a string.
     87 typedef std::vector<size_t> WordStarts;
     88 
     89 // Utility Functions -----------------------------------------------------------
     90 
     91 // Breaks the string |cleaned_uni_string| down into individual words.
     92 // Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before
     93 // passing |cleaned_uni_string| to this function. If |word_starts| is
     94 // not NULL then clears and pushes the offsets within
     95 // |cleaned_uni_string| at which each word starts onto
     96 // |word_starts|. These offsets are collected only up to the first
     97 // kMaxSignificantChars of |cleaned_uni_string|.
     98 String16Set String16SetFromString16(const string16& cleaned_uni_string,
     99                                     WordStarts* word_starts);
    100 
    101 // Breaks the |cleaned_uni_string| string down into individual words
    102 // and return a vector with the individual words in their original
    103 // order.  Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching()
    104 // before passing |cleaned_uni_string| to this function.  If
    105 // |break_on_space| is false then the resulting list will contain only
    106 // words containing alpha-numeric characters. If |break_on_space| is
    107 // true then the resulting list will contain strings broken at
    108 // whitespace. (|break_on_space| indicates that the
    109 // BreakIterator::BREAK_SPACE (equivalent to BREAK_LINE) approach is
    110 // to be used. For a complete description of this algorithm refer to
    111 // the comments in base/i18n/break_iterator.h.) If |word_starts| is
    112 // not NULL then clears and pushes the word starts onto |word_starts|.
    113 //
    114 // Example:
    115 //   Given: |cleaned_uni_string|: "http://www.google.com/ harry the rabbit."
    116 //   With |break_on_space| false the returned list will contain:
    117 //    "http", "www", "google", "com", "harry", "the", "rabbit"
    118 //   With |break_on_space| true the returned list will contain:
    119 //    "http://", "www.google.com/", "harry", "the", "rabbit."
    120 String16Vector String16VectorFromString16(const string16& cleaned_uni_string,
    121                                           bool break_on_space,
    122                                           WordStarts* word_starts);
    123 
    124 // Breaks the |uni_word| string down into its individual characters.
    125 // Note that this is temporarily intended to work on a single word, but
    126 // _will_ work on a string of words, perhaps with unexpected results.
    127 // TODO(mrossetti): Lots of optimizations possible here for not restarting
    128 // a search if the user is just typing along. Also, change this to uniString
    129 // and properly handle substring matches, scoring and sorting the results
    130 // by score. Also, provide the metrics for where the matches occur so that
    131 // the UI can highlight the matched sections.
    132 Char16Set Char16SetFromString16(const string16& uni_word);
    133 
    134 // Support for InMemoryURLIndex Private Data -----------------------------------
    135 
    136 // An index into a list of all of the words we have indexed.
    137 typedef size_t WordID;
    138 
    139 // A map allowing a WordID to be determined given a word.
    140 typedef std::map<string16, WordID> WordMap;
    141 
    142 // A map from character to the word_ids of words containing that character.
    143 typedef std::set<WordID> WordIDSet;  // An index into the WordList.
    144 typedef std::map<char16, WordIDSet> CharWordIDMap;
    145 
    146 // A map from word (by word_id) to history items containing that word.
    147 typedef history::URLID HistoryID;
    148 typedef std::set<HistoryID> HistoryIDSet;
    149 typedef std::vector<HistoryID> HistoryIDVector;
    150 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
    151 typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap;
    152 
    153 
    154 // Information used in scoring a particular URL.
    155 typedef std::vector<VisitInfo> VisitInfoVector;
    156 struct HistoryInfoMapValue {
    157   HistoryInfoMapValue();
    158   ~HistoryInfoMapValue();
    159 
    160   // This field is always populated.
    161   URLRow url_row;
    162 
    163   // This field gets filled in asynchronously after a visit.  As such,
    164   // it's almost always correct.  If it's wrong, it's likely to either
    165   // be empty (if this URL was recently added to the index) or
    166   // slightly out-of-date (one visit behind).
    167   VisitInfoVector visits;
    168 };
    169 
    170 // A map from history_id to the history's URL and title.
    171 typedef std::map<HistoryID, HistoryInfoMapValue> HistoryInfoMap;
    172 
    173 // A map from history_id to URL and page title word start metrics.
    174 struct RowWordStarts {
    175   RowWordStarts();
    176   ~RowWordStarts();
    177 
    178   // Clears both url_word_starts_ and title_word_starts_.
    179   void Clear();
    180 
    181   WordStarts url_word_starts_;
    182   WordStarts title_word_starts_;
    183 };
    184 typedef std::map<HistoryID, RowWordStarts> WordStartsMap;
    185 
    186 }  // namespace history
    187 
    188 #endif  // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_
    189