1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_ 6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_ 7 8 #include <map> 9 #include <set> 10 #include <vector> 11 12 #include "base/strings/string16.h" 13 #include "chrome/browser/autocomplete/history_provider_util.h" 14 #include "chrome/browser/history/history_types.h" 15 #include "url/gurl.h" 16 17 namespace history { 18 19 // The maximum number of characters to consider from an URL and page title 20 // while matching user-typed terms. 21 const size_t kMaxSignificantChars = 50; 22 23 // Matches within URL and Title Strings ---------------------------------------- 24 25 // Specifies where an omnibox term occurs within a string. Used for specifying 26 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in 27 // scoring a result. 28 struct TermMatch { 29 TermMatch() : term_num(0), offset(0), length(0) {} 30 TermMatch(int term_num, size_t offset, size_t length) 31 : term_num(term_num), 32 offset(offset), 33 length(length) {} 34 35 int term_num; // The index of the term in the original search string. 36 size_t offset; // The starting offset of the substring match. 37 size_t length; // The length of the substring match. 38 }; 39 typedef std::vector<TermMatch> TermMatches; 40 41 // Unescapes the URL and lower-cases it, returning the result. This 42 // unescaping makes it possible to match substrings that were 43 // originally escaped for navigation; for example, if the user 44 // searched for "a&p", the query would be escaped as "a%26p", so 45 // without unescaping, an input string of "a&p" would no longer match 46 // this URL. Note that the resulting unescaped URL may not be 47 // directly navigable (which is why we escaped it to begin with). 48 // |languages| is passed to net::FormatUrl(). 49 string16 CleanUpUrlForMatching(const GURL& gurl, 50 const std::string& languages); 51 52 // Returns the lower-cased title. 53 string16 CleanUpTitleForMatching(const string16& title); 54 55 // Returns a TermMatches which has an entry for each occurrence of the 56 // string |term| found in the string |cleaned_string|. Use 57 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing 58 // |cleaned_string| to this function. The function marks each match 59 // with |term_num| so that the resulting TermMatches can be merged 60 // with other TermMatches for other terms. Note that only the first 61 // 2,048 characters of |string| are considered during the match 62 // operation. 63 TermMatches MatchTermInString(const string16& term, 64 const string16& cleaned_string, 65 int term_num); 66 67 // Sorts and removes overlapping substring matches from |matches| and 68 // returns the cleaned up matches. 69 TermMatches SortAndDeoverlapMatches(const TermMatches& matches); 70 71 // Extracts and returns the offsets from |matches|. 72 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches); 73 74 // Replaces the offsets in |matches| with those given in |offsets|, deleting 75 // any which are npos, and returns the updated list of matches. 76 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches, 77 const std::vector<size_t>& offsets); 78 79 // Convenience Types ----------------------------------------------------------- 80 81 typedef std::vector<string16> String16Vector; 82 typedef std::set<string16> String16Set; 83 typedef std::set<char16> Char16Set; 84 typedef std::vector<char16> Char16Vector; 85 86 // A vector that contains the offsets at which each word starts within a string. 87 typedef std::vector<size_t> WordStarts; 88 89 // Utility Functions ----------------------------------------------------------- 90 91 // Breaks the string |cleaned_uni_string| down into individual words. 92 // Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before 93 // passing |cleaned_uni_string| to this function. If |word_starts| is 94 // not NULL then clears and pushes the offsets within 95 // |cleaned_uni_string| at which each word starts onto 96 // |word_starts|. These offsets are collected only up to the first 97 // kMaxSignificantChars of |cleaned_uni_string|. 98 String16Set String16SetFromString16(const string16& cleaned_uni_string, 99 WordStarts* word_starts); 100 101 // Breaks the |cleaned_uni_string| string down into individual words 102 // and return a vector with the individual words in their original 103 // order. Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() 104 // before passing |cleaned_uni_string| to this function. If 105 // |break_on_space| is false then the resulting list will contain only 106 // words containing alpha-numeric characters. If |break_on_space| is 107 // true then the resulting list will contain strings broken at 108 // whitespace. (|break_on_space| indicates that the 109 // BreakIterator::BREAK_SPACE (equivalent to BREAK_LINE) approach is 110 // to be used. For a complete description of this algorithm refer to 111 // the comments in base/i18n/break_iterator.h.) If |word_starts| is 112 // not NULL then clears and pushes the word starts onto |word_starts|. 113 // 114 // Example: 115 // Given: |cleaned_uni_string|: "http://www.google.com/ harry the rabbit." 116 // With |break_on_space| false the returned list will contain: 117 // "http", "www", "google", "com", "harry", "the", "rabbit" 118 // With |break_on_space| true the returned list will contain: 119 // "http://", "www.google.com/", "harry", "the", "rabbit." 120 String16Vector String16VectorFromString16(const string16& cleaned_uni_string, 121 bool break_on_space, 122 WordStarts* word_starts); 123 124 // Breaks the |uni_word| string down into its individual characters. 125 // Note that this is temporarily intended to work on a single word, but 126 // _will_ work on a string of words, perhaps with unexpected results. 127 // TODO(mrossetti): Lots of optimizations possible here for not restarting 128 // a search if the user is just typing along. Also, change this to uniString 129 // and properly handle substring matches, scoring and sorting the results 130 // by score. Also, provide the metrics for where the matches occur so that 131 // the UI can highlight the matched sections. 132 Char16Set Char16SetFromString16(const string16& uni_word); 133 134 // Support for InMemoryURLIndex Private Data ----------------------------------- 135 136 // An index into a list of all of the words we have indexed. 137 typedef size_t WordID; 138 139 // A map allowing a WordID to be determined given a word. 140 typedef std::map<string16, WordID> WordMap; 141 142 // A map from character to the word_ids of words containing that character. 143 typedef std::set<WordID> WordIDSet; // An index into the WordList. 144 typedef std::map<char16, WordIDSet> CharWordIDMap; 145 146 // A map from word (by word_id) to history items containing that word. 147 typedef history::URLID HistoryID; 148 typedef std::set<HistoryID> HistoryIDSet; 149 typedef std::vector<HistoryID> HistoryIDVector; 150 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; 151 typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap; 152 153 154 // Information used in scoring a particular URL. 155 typedef std::vector<VisitInfo> VisitInfoVector; 156 struct HistoryInfoMapValue { 157 HistoryInfoMapValue(); 158 ~HistoryInfoMapValue(); 159 160 // This field is always populated. 161 URLRow url_row; 162 163 // This field gets filled in asynchronously after a visit. As such, 164 // it's almost always correct. If it's wrong, it's likely to either 165 // be empty (if this URL was recently added to the index) or 166 // slightly out-of-date (one visit behind). 167 VisitInfoVector visits; 168 }; 169 170 // A map from history_id to the history's URL and title. 171 typedef std::map<HistoryID, HistoryInfoMapValue> HistoryInfoMap; 172 173 // A map from history_id to URL and page title word start metrics. 174 struct RowWordStarts { 175 RowWordStarts(); 176 ~RowWordStarts(); 177 178 // Clears both url_word_starts_ and title_word_starts_. 179 void Clear(); 180 181 WordStarts url_word_starts_; 182 WordStarts title_word_starts_; 183 }; 184 typedef std::map<HistoryID, RowWordStarts> WordStartsMap; 185 186 } // namespace history 187 188 #endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_TYPES_H_ 189