1 // Copyright 2014 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/query_parser/snippet.h" 6 7 #include <algorithm> 8 9 #include "base/logging.h" 10 #include "base/memory/scoped_ptr.h" 11 #include "base/strings/string_split.h" 12 #include "base/strings/string_util.h" 13 #include "base/strings/utf_string_conversions.h" 14 #include "third_party/icu/source/common/unicode/brkiter.h" 15 #include "third_party/icu/source/common/unicode/utext.h" 16 #include "third_party/icu/source/common/unicode/utf8.h" 17 18 namespace query_parser { 19 namespace { 20 21 bool PairFirstLessThan(const Snippet::MatchPosition& a, 22 const Snippet::MatchPosition& b) { 23 return a.first < b.first; 24 } 25 26 // Combines all pairs after offset in match_positions that are contained 27 // or touch the pair at offset. 28 void CoalescePositionsFrom(size_t offset, 29 Snippet::MatchPositions* match_positions) { 30 DCHECK(offset < match_positions->size()); 31 Snippet::MatchPosition& pair((*match_positions)[offset]); 32 ++offset; 33 while (offset < match_positions->size() && 34 pair.second >= (*match_positions)[offset].first) { 35 pair.second = std::max(pair.second, (*match_positions)[offset].second); 36 match_positions->erase(match_positions->begin() + offset); 37 } 38 } 39 40 // Makes sure there is a pair in match_positions that contains the specified 41 // range. This keeps the pairs ordered in match_positions by first, and makes 42 // sure none of the pairs in match_positions touch each other. 43 void AddMatch(size_t start, 44 size_t end, 45 Snippet::MatchPositions* match_positions) { 46 DCHECK(start < end); 47 DCHECK(match_positions); 48 Snippet::MatchPosition pair(start, end); 49 if (match_positions->empty()) { 50 match_positions->push_back(pair); 51 return; 52 } 53 // There's at least one match. Find the position of the new match, 54 // potentially extending pairs around it. 55 Snippet::MatchPositions::iterator i = 56 std::lower_bound(match_positions->begin(), match_positions->end(), 57 pair, &PairFirstLessThan); 58 if (i != match_positions->end() && i->first == start) { 59 // Match not at the end and there is already a pair with the same 60 // start. 61 if (end > i->second) { 62 // New pair extends beyond existing pair. Extend existing pair and 63 // coalesce matches after it. 64 i->second = end; 65 CoalescePositionsFrom(i - match_positions->begin(), match_positions); 66 } // else case, new pair completely contained in existing pair, nothing 67 // to do. 68 } else if (i == match_positions->begin()) { 69 // Match at the beginning and the first pair doesn't have the same 70 // start. Insert new pair and coalesce matches after it. 71 match_positions->insert(i, pair); 72 CoalescePositionsFrom(0, match_positions); 73 } else { 74 // Not at the beginning (but may be at the end). 75 --i; 76 if (start <= i->second && end > i->second) { 77 // Previous element contains match. Extend it and coalesce. 78 i->second = end; 79 CoalescePositionsFrom(i - match_positions->begin(), match_positions); 80 } else if (end > i->second) { 81 // Region doesn't touch previous element. See if region touches current 82 // element. 83 ++i; 84 if (i == match_positions->end() || end < i->first) { 85 match_positions->insert(i, pair); 86 } else { 87 i->first = start; 88 i->second = end; 89 CoalescePositionsFrom(i - match_positions->begin(), match_positions); 90 } 91 } 92 } 93 } 94 95 // Converts an index in a utf8 string into the index in the corresponding utf16 96 // string and returns the utf16 index. This is intended to be called in a loop 97 // iterating through a utf8 string. 98 // 99 // utf8_string: the utf8 string. 100 // utf8_length: length of the utf8 string. 101 // offset: the utf8 offset to convert. 102 // utf8_pos: current offset in the utf8 string. This is modified and on return 103 // matches offset. 104 // wide_pos: current index in the wide string. This is the same as the return 105 // value. 106 size_t AdvanceAndReturnUTF16Pos(const char* utf8_string, 107 int32_t utf8_length, 108 int32_t offset, 109 int32_t* utf8_pos, 110 size_t* utf16_pos) { 111 DCHECK(offset >= *utf8_pos && offset <= utf8_length); 112 113 UChar32 wide_char; 114 while (*utf8_pos < offset) { 115 U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char); 116 *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2; 117 } 118 return *utf16_pos; 119 } 120 121 // Given a character break iterator over a UTF-8 string, set the iterator 122 // position to |*utf8_pos| and move by |count| characters. |count| can 123 // be either positive or negative. 124 void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) { 125 // Ignore the return value. A side effect of the current position 126 // being set at or following |*utf8_pos| is exploited here. 127 // It's simpler than calling following(n) and then previous(). 128 // isBoundary() is not very fast, but should be good enough for the 129 // snippet generation. If not, revisit the way we scan in ComputeSnippet. 130 bi->isBoundary(static_cast<int32_t>(*utf8_pos)); 131 bi->next(count); 132 *utf8_pos = static_cast<size_t>(bi->current()); 133 } 134 135 // The amount of context to include for a given hit. Note that it's counted 136 // in terms of graphemes rather than bytes. 137 const int kSnippetContext = 50; 138 139 // Returns true if next match falls within a snippet window 140 // from the previous match. The window size is counted in terms 141 // of graphemes rather than bytes in UTF-8. 142 bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi, 143 size_t previous_match_end, 144 size_t next_match_start) { 145 // If it's within a window in terms of bytes, it's certain 146 // that it's within a window in terms of graphemes as well. 147 if (next_match_start < previous_match_end + kSnippetContext) 148 return true; 149 bi->isBoundary(static_cast<int32_t>(previous_match_end)); 150 // An alternative to this is to call |bi->next()| at most 151 // kSnippetContext times, compare |bi->current()| with |next_match_start| 152 // after each call and return early if possible. There are other 153 // heuristics to speed things up if necessary, but it's not likely that 154 // we need to bother. 155 bi->next(kSnippetContext); 156 int64 current = bi->current(); 157 return (next_match_start < static_cast<uint64>(current) || 158 current == icu::BreakIterator::DONE); 159 } 160 161 } // namespace 162 163 // static 164 void Snippet::ExtractMatchPositions(const std::string& offsets_str, 165 const std::string& column_num, 166 MatchPositions* match_positions) { 167 DCHECK(match_positions); 168 if (offsets_str.empty()) 169 return; 170 std::vector<std::string> offsets; 171 base::SplitString(offsets_str, ' ', &offsets); 172 // SQLite offsets are sets of four integers: 173 // column, query term, match offset, match length 174 // Matches within a string are marked by (start, end) pairs. 175 for (size_t i = 0; i < offsets.size() - 3; i += 4) { 176 if (offsets[i] != column_num) 177 continue; 178 const size_t start = atoi(offsets[i + 2].c_str()); 179 const size_t end = start + atoi(offsets[i + 3].c_str()); 180 // Switch to DCHECK after debugging http://crbug.com/15261. 181 CHECK(end >= start); 182 AddMatch(start, end, match_positions); 183 } 184 } 185 186 // static 187 void Snippet::ConvertMatchPositionsToWide( 188 const std::string& utf8_string, 189 Snippet::MatchPositions* match_positions) { 190 DCHECK(match_positions); 191 int32_t utf8_pos = 0; 192 size_t utf16_pos = 0; 193 const char* utf8_cstring = utf8_string.c_str(); 194 const int32_t utf8_length = static_cast<int32_t>(utf8_string.size()); 195 for (Snippet::MatchPositions::iterator i = match_positions->begin(); 196 i != match_positions->end(); ++i) { 197 i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, 198 static_cast<int32_t>(i->first), 199 &utf8_pos, &utf16_pos); 200 i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, 201 static_cast<int32_t>(i->second), 202 &utf8_pos, &utf16_pos); 203 } 204 } 205 206 Snippet::Snippet() { 207 } 208 209 Snippet::~Snippet() { 210 } 211 212 void Snippet::ComputeSnippet(const MatchPositions& match_positions, 213 const std::string& document) { 214 // The length of snippets we try to produce. 215 // We can generate longer snippets but stop once we cross kSnippetMaxLength. 216 const size_t kSnippetMaxLength = 200; 217 const base::string16 kEllipsis = base::ASCIIToUTF16(" ... "); 218 219 UText* document_utext = NULL; 220 UErrorCode status = U_ZERO_ERROR; 221 document_utext = utext_openUTF8(document_utext, document.data(), 222 document.size(), &status); 223 // Locale does not matter because there's no per-locale customization 224 // for character iterator. 225 scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance( 226 icu::Locale::getDefault(), status)); 227 bi->setText(document_utext, status); 228 DCHECK(U_SUCCESS(status)); 229 230 // We build the snippet by iterating through the matches and then grabbing 231 // context around each match. If matches are near enough each other (within 232 // kSnippetContext), we skip the "..." between them. 233 base::string16 snippet; 234 size_t start = 0; 235 for (size_t i = 0; i < match_positions.size(); ++i) { 236 // Some shorter names for the current match. 237 const size_t match_start = match_positions[i].first; 238 const size_t match_end = match_positions[i].second; 239 240 // Switch to DCHECK after debugging http://crbug.com/15261. 241 CHECK(match_end > match_start); 242 CHECK(match_end <= document.size()); 243 244 // Add the context, if any, to show before the match. 245 size_t context_start = match_start; 246 MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start); 247 start = std::max(start, context_start); 248 if (start < match_start) { 249 if (start > 0) 250 snippet += kEllipsis; 251 // Switch to DCHECK after debugging http://crbug.com/15261. 252 CHECK(start < document.size()); 253 snippet += base::UTF8ToUTF16(document.substr(start, match_start - start)); 254 } 255 256 // Add the match. 257 const size_t first = snippet.size(); 258 snippet += base::UTF8ToUTF16(document.substr(match_start, 259 match_end - match_start)); 260 matches_.push_back(std::make_pair(first, snippet.size())); 261 262 // Compute the context, if any, to show after the match. 263 size_t end; 264 // Check if the next match falls within our snippet window. 265 if (i + 1 < match_positions.size() && 266 IsNextMatchWithinSnippetWindow(bi.get(), match_end, 267 match_positions[i + 1].first)) { 268 // Yes, it's within the window. Make the end context extend just up 269 // to the next match. 270 end = match_positions[i + 1].first; 271 // Switch to DCHECK after debugging http://crbug.com/15261. 272 CHECK(end >= match_end); 273 CHECK(end <= document.size()); 274 snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end)); 275 } else { 276 // No, there's either no next match or the next match is too far away. 277 end = match_end; 278 MoveByNGraphemes(bi.get(), kSnippetContext, &end); 279 // Switch to DCHECK after debugging http://crbug.com/15261. 280 CHECK(end >= match_end); 281 CHECK(end <= document.size()); 282 snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end)); 283 if (end < document.size()) 284 snippet += kEllipsis; 285 } 286 start = end; 287 288 // Stop here if we have enough snippet computed. 289 if (snippet.size() >= kSnippetMaxLength) 290 break; 291 } 292 293 utext_close(document_utext); 294 swap(text_, snippet); 295 } 296 297 void Snippet::Swap(Snippet* other) { 298 text_.swap(other->text_); 299 matches_.swap(other->matches_); 300 } 301 302 } // namespace query_parser 303