1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_ 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_ 7 8 #include <map> 9 #include <set> 10 #include <vector> 11 12 #include "base/strings/string16.h" 13 #include "chrome/browser/history/history_types.h" 14 #include "url/gurl.h" 15 16 namespace history { 17 18 // The maximum number of characters to consider from an URL and page title 19 // while matching user-typed terms. 20 const size_t kMaxSignificantChars = 200; 21 22 // Matches within URL and Title Strings ---------------------------------------- 23 24 // Specifies where an omnibox term occurs within a string. Used for specifying 25 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in 26 // scoring a result. 27 struct TermMatch { 28 TermMatch() : term_num(0), offset(0), length(0) {} 29 TermMatch(int term_num, size_t offset, size_t length) 30 : term_num(term_num), 31 offset(offset), 32 length(length) {} 33 34 int term_num; // The index of the term in the original search string. 35 size_t offset; // The starting offset of the substring match. 36 size_t length; // The length of the substring match. 37 }; 38 typedef std::vector<TermMatch> TermMatches; 39 40 // Returns a TermMatches which has an entry for each occurrence of the 41 // string |term| found in the string |cleaned_string|. Use 42 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing 43 // |cleaned_string| to this function. The function marks each match 44 // with |term_num| so that the resulting TermMatches can be merged 45 // with other TermMatches for other terms. Note that only the first 46 // 2,048 characters of |string| are considered during the match 47 // operation. 48 TermMatches MatchTermInString(const base::string16& term, 49 const base::string16& cleaned_string, 50 int term_num); 51 52 // Sorts and removes overlapping substring matches from |matches| and 53 // returns the cleaned up matches. 54 TermMatches SortAndDeoverlapMatches(const TermMatches& matches); 55 56 // Extracts and returns the offsets from |matches|. This includes both 57 // the offsets corresponding to the beginning of a match and the offsets 58 // corresponding to the end of a match (i.e., offset+length for that match). 59 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches); 60 61 // Replaces the offsets and lengths in |matches| with those given in |offsets|. 62 // |offsets| gives beginning and ending offsets for each match; this function 63 // translates (beginning, ending) offset into (beginning offset, length of 64 // match). It deletes any matches for which an endpoint is npos and returns 65 // the updated list of matches. 66 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches, 67 const std::vector<size_t>& offsets); 68 69 // Convenience Types ----------------------------------------------------------- 70 71 typedef std::vector<base::string16> String16Vector; 72 typedef std::set<base::string16> String16Set; 73 typedef std::set<base::char16> Char16Set; 74 typedef std::vector<base::char16> Char16Vector; 75 76 // A vector that contains the offsets at which each word starts within a string. 77 typedef std::vector<size_t> WordStarts; 78 79 // Utility Functions ----------------------------------------------------------- 80 81 // Breaks the string |cleaned_uni_string| down into individual words. 82 // Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before 83 // passing |cleaned_uni_string| to this function. If |word_starts| is 84 // not NULL then clears and pushes the offsets within 85 // |cleaned_uni_string| at which each word starts onto 86 // |word_starts|. These offsets are collected only up to the first 87 // kMaxSignificantChars of |cleaned_uni_string|. 88 String16Set String16SetFromString16(const base::string16& cleaned_uni_string, 89 WordStarts* word_starts); 90 91 // Breaks the |cleaned_uni_string| string down into individual words 92 // and return a vector with the individual words in their original 93 // order. Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() 94 // before passing |cleaned_uni_string| to this function. If 95 // |break_on_space| is false then the resulting list will contain only 96 // words containing alpha-numeric characters. If |break_on_space| is 97 // true then the resulting list will contain strings broken at 98 // whitespace. (|break_on_space| indicates that the 99 // BreakIterator::BREAK_SPACE (equivalent to BREAK_LINE) approach is 100 // to be used. For a complete description of this algorithm refer to 101 // the comments in base/i18n/break_iterator.h.) If |word_starts| is 102 // not NULL then clears and pushes the word starts onto |word_starts|. 103 // 104 // Example: 105 // Given: |cleaned_uni_string|: "http://www.google.com/ harry the rabbit." 106 // With |break_on_space| false the returned list will contain: 107 // "http", "www", "google", "com", "harry", "the", "rabbit" 108 // With |break_on_space| true the returned list will contain: 109 // "http://", "www.google.com/", "harry", "the", "rabbit." 110 String16Vector String16VectorFromString16( 111 const base::string16& cleaned_uni_string, 112 bool break_on_space, 113 WordStarts* word_starts); 114 115 // Breaks the |uni_word| string down into its individual characters. 116 // Note that this is temporarily intended to work on a single word, but 117 // _will_ work on a string of words, perhaps with unexpected results. 118 // TODO(mrossetti): Lots of optimizations possible here for not restarting 119 // a search if the user is just typing along. Also, change this to uniString 120 // and properly handle substring matches, scoring and sorting the results 121 // by score. Also, provide the metrics for where the matches occur so that 122 // the UI can highlight the matched sections. 123 Char16Set Char16SetFromString16(const base::string16& uni_word); 124 125 // Support for InMemoryURLIndex Private Data ----------------------------------- 126 127 // An index into a list of all of the words we have indexed. 128 typedef size_t WordID; 129 130 // A map allowing a WordID to be determined given a word. 131 typedef std::map<base::string16, WordID> WordMap; 132 133 // A map from character to the word_ids of words containing that character. 134 typedef std::set<WordID> WordIDSet; // An index into the WordList. 135 typedef std::map<base::char16, WordIDSet> CharWordIDMap; 136 137 // A map from word (by word_id) to history items containing that word. 138 typedef history::URLID HistoryID; 139 typedef std::set<HistoryID> HistoryIDSet; 140 typedef std::vector<HistoryID> HistoryIDVector; 141 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; 142 typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap; 143 144 145 // Information used in scoring a particular URL. 146 typedef std::vector<VisitInfo> VisitInfoVector; 147 struct HistoryInfoMapValue { 148 HistoryInfoMapValue(); 149 ~HistoryInfoMapValue(); 150 151 // This field is always populated. 152 URLRow url_row; 153 154 // This field gets filled in asynchronously after a visit. As such, 155 // it's almost always correct. If it's wrong, it's likely to either 156 // be empty (if this URL was recently added to the index) or 157 // slightly out-of-date (one visit behind). 158 VisitInfoVector visits; 159 }; 160 161 // A map from history_id to the history's URL and title. 162 typedef std::map<HistoryID, HistoryInfoMapValue> HistoryInfoMap; 163 164 // A map from history_id to URL and page title word start metrics. 165 struct RowWordStarts { 166 RowWordStarts(); 167 ~RowWordStarts(); 168 169 // Clears both url_word_starts_ and title_word_starts_. 170 void Clear(); 171 172 WordStarts url_word_starts_; 173 WordStarts title_word_starts_; 174 }; 175 typedef std::map<HistoryID, RowWordStarts> WordStartsMap; 176 177 } // namespace history 178 179 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_ 180