Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_
      6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_
      7 
      8 #include <map>
      9 #include <set>
     10 #include <vector>
     11 
     12 #include "base/strings/string16.h"
     13 #include "components/history/core/browser/history_types.h"
     14 #include "url/gurl.h"
     15 
     16 namespace history {
     17 
     18 // The maximum number of characters to consider from an URL and page title
     19 // while matching user-typed terms.
     20 const size_t kMaxSignificantChars = 200;
     21 
     22 // Matches within URL and Title Strings ----------------------------------------
     23 
     24 // Specifies where an omnibox term occurs within a string. Used for specifying
     25 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
     26 // scoring a result.
     27 struct TermMatch {
     28   TermMatch() : term_num(0), offset(0), length(0) {}
     29   TermMatch(int term_num, size_t offset, size_t length)
     30       : term_num(term_num),
     31         offset(offset),
     32         length(length) {}
     33 
     34   int term_num;  // The index of the term in the original search string.
     35   size_t offset;  // The starting offset of the substring match.
     36   size_t length;  // The length of the substring match.
     37 };
     38 typedef std::vector<TermMatch> TermMatches;
     39 
     40 // Returns a TermMatches which has an entry for each occurrence of the
     41 // string |term| found in the string |cleaned_string|. Use
     42 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
     43 // |cleaned_string| to this function. The function marks each match
     44 // with |term_num| so that the resulting TermMatches can be merged
     45 // with other TermMatches for other terms. Note that only the first
     46 // 2,048 characters of |string| are considered during the match
     47 // operation.
     48 TermMatches MatchTermInString(const base::string16& term,
     49                               const base::string16& cleaned_string,
     50                               int term_num);
     51 
     52 // Sorts and removes overlapping substring matches from |matches| and
     53 // returns the cleaned up matches.
     54 TermMatches SortAndDeoverlapMatches(const TermMatches& matches);
     55 
     56 // Extracts and returns the offsets from |matches|.  This includes both
     57 // the offsets corresponding to the beginning of a match and the offsets
     58 // corresponding to the end of a match (i.e., offset+length for that match).
     59 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);
     60 
     61 // Replaces the offsets and lengths in |matches| with those given in |offsets|.
     62 // |offsets| gives beginning and ending offsets for each match; this function
     63 // translates (beginning, ending) offset into (beginning offset, length of
     64 // match).  It deletes any matches for which an endpoint is npos and returns
     65 // the updated list of matches.
     66 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
     67                                         const std::vector<size_t>& offsets);
     68 
     69 // Convenience Types -----------------------------------------------------------
     70 
     71 typedef std::vector<base::string16> String16Vector;
     72 typedef std::set<base::string16> String16Set;
     73 typedef std::set<base::char16> Char16Set;
     74 typedef std::vector<base::char16> Char16Vector;
     75 
     76 // A vector that contains the offsets at which each word starts within a string.
     77 typedef std::vector<size_t> WordStarts;
     78 
     79 // Utility Functions -----------------------------------------------------------
     80 
     81 // Breaks the string |cleaned_uni_string| down into individual words.
     82 // Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before
     83 // passing |cleaned_uni_string| to this function. If |word_starts| is
     84 // not NULL then clears and pushes the offsets within
     85 // |cleaned_uni_string| at which each word starts onto
     86 // |word_starts|. These offsets are collected only up to the first
     87 // kMaxSignificantChars of |cleaned_uni_string|.
     88 String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
     89                                     WordStarts* word_starts);
     90 
     91 // Breaks the |cleaned_uni_string| string down into individual words
     92 // and return a vector with the individual words in their original
     93 // order.  Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching()
     94 // before passing |cleaned_uni_string| to this function.  If
     95 // |break_on_space| is false then the resulting list will contain only
     96 // words containing alpha-numeric characters. If |break_on_space| is
     97 // true then the resulting list will contain strings broken at
     98 // whitespace. (|break_on_space| indicates that the
     99 // BreakIterator::BREAK_SPACE (equivalent to BREAK_LINE) approach is
    100 // to be used. For a complete description of this algorithm refer to
    101 // the comments in base/i18n/break_iterator.h.) If |word_starts| is
    102 // not NULL then clears and pushes the word starts onto |word_starts|.
    103 //
    104 // Example:
    105 //   Given: |cleaned_uni_string|: "http://www.google.com/ harry the rabbit."
    106 //   With |break_on_space| false the returned list will contain:
    107 //    "http", "www", "google", "com", "harry", "the", "rabbit"
    108 //   With |break_on_space| true the returned list will contain:
    109 //    "http://", "www.google.com/", "harry", "the", "rabbit."
    110 String16Vector String16VectorFromString16(
    111     const base::string16& cleaned_uni_string,
    112     bool break_on_space,
    113     WordStarts* word_starts);
    114 
    115 // Breaks the |uni_word| string down into its individual characters.
    116 // Note that this is temporarily intended to work on a single word, but
    117 // _will_ work on a string of words, perhaps with unexpected results.
    118 // TODO(mrossetti): Lots of optimizations possible here for not restarting
    119 // a search if the user is just typing along. Also, change this to uniString
    120 // and properly handle substring matches, scoring and sorting the results
    121 // by score. Also, provide the metrics for where the matches occur so that
    122 // the UI can highlight the matched sections.
    123 Char16Set Char16SetFromString16(const base::string16& uni_word);
    124 
    125 // Support for InMemoryURLIndex Private Data -----------------------------------
    126 
    127 // An index into a list of all of the words we have indexed.
    128 typedef size_t WordID;
    129 
    130 // A map allowing a WordID to be determined given a word.
    131 typedef std::map<base::string16, WordID> WordMap;
    132 
    133 // A map from character to the word_ids of words containing that character.
    134 typedef std::set<WordID> WordIDSet;  // An index into the WordList.
    135 typedef std::map<base::char16, WordIDSet> CharWordIDMap;
    136 
    137 // A map from word (by word_id) to history items containing that word.
    138 typedef history::URLID HistoryID;
    139 typedef std::set<HistoryID> HistoryIDSet;
    140 typedef std::vector<HistoryID> HistoryIDVector;
    141 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
    142 typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap;
    143 
    144 
    145 // Information used in scoring a particular URL.
    146 typedef std::vector<VisitInfo> VisitInfoVector;
    147 struct HistoryInfoMapValue {
    148   HistoryInfoMapValue();
    149   ~HistoryInfoMapValue();
    150 
    151   // This field is always populated.
    152   URLRow url_row;
    153 
    154   // This field gets filled in asynchronously after a visit.  As such,
    155   // it's almost always correct.  If it's wrong, it's likely to either
    156   // be empty (if this URL was recently added to the index) or
    157   // slightly out-of-date (one visit behind).
    158   VisitInfoVector visits;
    159 };
    160 
    161 // A map from history_id to the history's URL and title.
    162 typedef std::map<HistoryID, HistoryInfoMapValue> HistoryInfoMap;
    163 
    164 // A map from history_id to URL and page title word start metrics.
    165 struct RowWordStarts {
    166   RowWordStarts();
    167   ~RowWordStarts();
    168 
    169   // Clears both url_word_starts_ and title_word_starts_.
    170   void Clear();
    171 
    172   WordStarts url_word_starts_;
    173   WordStarts title_word_starts_;
    174 };
    175 typedef std::map<HistoryID, RowWordStarts> WordStartsMap;
    176 
    177 }  // namespace history
    178 
    179 #endif  // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_
    180