1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/in_memory_url_index_types.h" 6 7 #include <algorithm> 8 #include <functional> 9 #include <iterator> 10 #include <numeric> 11 #include <set> 12 13 #include "base/i18n/break_iterator.h" 14 #include "base/i18n/case_conversion.h" 15 #include "base/strings/string_util.h" 16 #include "net/base/escape.h" 17 #include "net/base/net_util.h" 18 19 namespace history { 20 21 // Matches within URL and Title Strings ---------------------------------------- 22 23 TermMatches MatchTermInString(const base::string16& term, 24 const base::string16& cleaned_string, 25 int term_num) { 26 const size_t kMaxCompareLength = 2048; 27 const base::string16& short_string = 28 (cleaned_string.length() > kMaxCompareLength) ? 29 cleaned_string.substr(0, kMaxCompareLength) : cleaned_string; 30 TermMatches matches; 31 for (size_t location = short_string.find(term); 32 location != base::string16::npos; 33 location = short_string.find(term, location + 1)) 34 matches.push_back(TermMatch(term_num, location, term.length())); 35 return matches; 36 } 37 38 // Comparison function for sorting TermMatches by their offsets. 39 bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) { 40 return m1.offset < m2.offset; 41 } 42 43 TermMatches SortAndDeoverlapMatches(const TermMatches& matches) { 44 if (matches.empty()) 45 return matches; 46 TermMatches sorted_matches = matches; 47 std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess); 48 TermMatches clean_matches; 49 TermMatch last_match; 50 for (TermMatches::const_iterator iter = sorted_matches.begin(); 51 iter != sorted_matches.end(); ++iter) { 52 if (iter->offset >= last_match.offset + last_match.length) { 53 last_match = *iter; 54 clean_matches.push_back(last_match); 55 } 56 } 57 return clean_matches; 58 } 59 60 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches) { 61 std::vector<size_t> offsets; 62 for (TermMatches::const_iterator i = matches.begin(); i != matches.end(); 63 ++i) { 64 offsets.push_back(i->offset); 65 offsets.push_back(i->offset + i->length); 66 } 67 return offsets; 68 } 69 70 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches, 71 const std::vector<size_t>& offsets) { 72 DCHECK_EQ(2 * matches.size(), offsets.size()); 73 TermMatches new_matches; 74 std::vector<size_t>::const_iterator offset_iter = offsets.begin(); 75 for (TermMatches::const_iterator term_iter = matches.begin(); 76 term_iter != matches.end(); ++term_iter, ++offset_iter) { 77 const size_t starting_offset = *offset_iter; 78 ++offset_iter; 79 const size_t ending_offset = *offset_iter; 80 if ((starting_offset != base::string16::npos) && 81 (ending_offset != base::string16::npos) && 82 (starting_offset != ending_offset)) { 83 TermMatch new_match(*term_iter); 84 new_match.offset = starting_offset; 85 new_match.length = ending_offset - starting_offset; 86 new_matches.push_back(new_match); 87 } 88 } 89 return new_matches; 90 } 91 92 // Utility Functions ----------------------------------------------------------- 93 94 String16Set String16SetFromString16(const base::string16& cleaned_uni_string, 95 WordStarts* word_starts) { 96 String16Vector words = 97 String16VectorFromString16(cleaned_uni_string, false, word_starts); 98 String16Set word_set; 99 for (String16Vector::const_iterator iter = words.begin(); iter != words.end(); 100 ++iter) 101 word_set.insert(base::i18n::ToLower(*iter).substr(0, kMaxSignificantChars)); 102 return word_set; 103 } 104 105 String16Vector String16VectorFromString16( 106 const base::string16& cleaned_uni_string, 107 bool break_on_space, 108 WordStarts* word_starts) { 109 if (word_starts) 110 word_starts->clear(); 111 base::i18n::BreakIterator iter(cleaned_uni_string, 112 break_on_space ? base::i18n::BreakIterator::BREAK_SPACE : 113 base::i18n::BreakIterator::BREAK_WORD); 114 String16Vector words; 115 if (!iter.Init()) 116 return words; 117 while (iter.Advance()) { 118 if (break_on_space || iter.IsWord()) { 119 base::string16 word(iter.GetString()); 120 size_t initial_whitespace = 0; 121 if (break_on_space) { 122 base::string16 trimmed_word; 123 base::TrimWhitespace(word, base::TRIM_LEADING, &trimmed_word); 124 initial_whitespace = word.length() - trimmed_word.length(); 125 base::TrimWhitespace(trimmed_word, base::TRIM_TRAILING, &word); 126 } 127 if (word.empty()) 128 continue; 129 words.push_back(word); 130 if (!word_starts) 131 continue; 132 size_t word_start = iter.prev() + initial_whitespace; 133 if (word_start < kMaxSignificantChars) 134 word_starts->push_back(word_start); 135 } 136 } 137 return words; 138 } 139 140 Char16Set Char16SetFromString16(const base::string16& term) { 141 Char16Set characters; 142 for (base::string16::const_iterator iter = term.begin(); iter != term.end(); 143 ++iter) 144 characters.insert(*iter); 145 return characters; 146 } 147 148 // HistoryInfoMapValue --------------------------------------------------------- 149 150 HistoryInfoMapValue::HistoryInfoMapValue() {} 151 HistoryInfoMapValue::~HistoryInfoMapValue() {} 152 153 // RowWordStarts --------------------------------------------------------------- 154 155 RowWordStarts::RowWordStarts() {} 156 RowWordStarts::~RowWordStarts() {} 157 158 void RowWordStarts::Clear() { 159 url_word_starts_.clear(); 160 title_word_starts_.clear(); 161 } 162 163 } // namespace history 164