Home | History | Annotate | Download | only in query_parser
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/query_parser/snippet.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/logging.h"
     10 #include "base/memory/scoped_ptr.h"
     11 #include "base/strings/string_split.h"
     12 #include "base/strings/string_util.h"
     13 #include "base/strings/utf_string_conversions.h"
     14 #include "third_party/icu/source/common/unicode/brkiter.h"
     15 #include "third_party/icu/source/common/unicode/utext.h"
     16 #include "third_party/icu/source/common/unicode/utf8.h"
     17 
     18 namespace query_parser {
     19 namespace {
     20 
     21 bool PairFirstLessThan(const Snippet::MatchPosition& a,
     22                        const Snippet::MatchPosition& b) {
     23   return a.first < b.first;
     24 }
     25 
     26 // Combines all pairs after offset in match_positions that are contained
     27 // or touch the pair at offset.
     28 void CoalescePositionsFrom(size_t offset,
     29                            Snippet::MatchPositions* match_positions) {
     30   DCHECK(offset < match_positions->size());
     31   Snippet::MatchPosition& pair((*match_positions)[offset]);
     32   ++offset;
     33   while (offset < match_positions->size() &&
     34          pair.second >= (*match_positions)[offset].first) {
     35     pair.second = std::max(pair.second, (*match_positions)[offset].second);
     36     match_positions->erase(match_positions->begin() + offset);
     37   }
     38 }
     39 
     40 // Makes sure there is a pair in match_positions that contains the specified
     41 // range. This keeps the pairs ordered in match_positions by first, and makes
     42 // sure none of the pairs in match_positions touch each other.
     43 void AddMatch(size_t start,
     44               size_t end,
     45               Snippet::MatchPositions* match_positions) {
     46   DCHECK(start < end);
     47   DCHECK(match_positions);
     48   Snippet::MatchPosition pair(start, end);
     49   if (match_positions->empty()) {
     50     match_positions->push_back(pair);
     51     return;
     52   }
     53   // There's at least one match. Find the position of the new match,
     54   // potentially extending pairs around it.
     55   Snippet::MatchPositions::iterator i =
     56       std::lower_bound(match_positions->begin(), match_positions->end(),
     57                        pair, &PairFirstLessThan);
     58   if (i != match_positions->end() && i->first == start) {
     59     // Match not at the end and there is already a pair with the same
     60     // start.
     61     if (end > i->second) {
     62       // New pair extends beyond existing pair. Extend existing pair and
     63       // coalesce matches after it.
     64       i->second = end;
     65       CoalescePositionsFrom(i - match_positions->begin(), match_positions);
     66     }  // else case, new pair completely contained in existing pair, nothing
     67        // to do.
     68   } else if (i == match_positions->begin()) {
     69     // Match at the beginning and the first pair doesn't have the same
     70     // start. Insert new pair and coalesce matches after it.
     71     match_positions->insert(i, pair);
     72     CoalescePositionsFrom(0, match_positions);
     73   } else {
     74     // Not at the beginning (but may be at the end).
     75     --i;
     76     if (start <= i->second && end > i->second) {
     77       // Previous element contains match. Extend it and coalesce.
     78       i->second = end;
     79       CoalescePositionsFrom(i - match_positions->begin(), match_positions);
     80     } else if (end > i->second) {
     81       // Region doesn't touch previous element. See if region touches current
     82       // element.
     83       ++i;
     84       if (i == match_positions->end() || end < i->first) {
     85         match_positions->insert(i, pair);
     86       } else {
     87         i->first = start;
     88         i->second = end;
     89         CoalescePositionsFrom(i - match_positions->begin(), match_positions);
     90       }
     91     }
     92   }
     93 }
     94 
     95 // Converts an index in a utf8 string into the index in the corresponding utf16
     96 // string and returns the utf16 index. This is intended to be called in a loop
     97 // iterating through a utf8 string.
     98 //
     99 // utf8_string: the utf8 string.
    100 // utf8_length: length of the utf8 string.
    101 // offset: the utf8 offset to convert.
    102 // utf8_pos: current offset in the utf8 string. This is modified and on return
    103 //           matches offset.
    104 // wide_pos: current index in the wide string. This is the same as the return
    105 //           value.
    106 size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
    107                                 int32_t utf8_length,
    108                                 int32_t offset,
    109                                 int32_t* utf8_pos,
    110                                 size_t* utf16_pos) {
    111   DCHECK(offset >= *utf8_pos && offset <= utf8_length);
    112 
    113   UChar32 wide_char;
    114   while (*utf8_pos < offset) {
    115     U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
    116     *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
    117   }
    118   return *utf16_pos;
    119 }
    120 
    121 // Given a character break iterator over a UTF-8 string, set the iterator
    122 // position to |*utf8_pos| and move by |count| characters. |count| can
    123 // be either positive or negative.
    124 void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
    125   // Ignore the return value. A side effect of the current position
    126   // being set at or following |*utf8_pos| is exploited here.
    127   // It's simpler than calling following(n) and then previous().
    128   // isBoundary() is not very fast, but should be good enough for the
    129   // snippet generation. If not, revisit the way we scan in ComputeSnippet.
    130   bi->isBoundary(static_cast<int32_t>(*utf8_pos));
    131   bi->next(count);
    132   *utf8_pos = static_cast<size_t>(bi->current());
    133 }
    134 
    135 // The amount of context to include for a given hit. Note that it's counted
    136 // in terms of graphemes rather than bytes.
    137 const int kSnippetContext = 50;
    138 
    139 // Returns true if next match falls within a snippet window
    140 // from the previous match. The window size is counted in terms
    141 // of graphemes rather than bytes in UTF-8.
    142 bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
    143                                     size_t previous_match_end,
    144                                     size_t next_match_start) {
    145   // If it's within a window in terms of bytes, it's certain
    146   // that it's within a window in terms of graphemes as well.
    147   if (next_match_start < previous_match_end + kSnippetContext)
    148     return true;
    149   bi->isBoundary(static_cast<int32_t>(previous_match_end));
    150   // An alternative to this is to call |bi->next()| at most
    151   // kSnippetContext times, compare |bi->current()| with |next_match_start|
    152   // after each call and return early if possible. There are other
    153   // heuristics to speed things up if necessary, but it's not likely that
    154   // we need to bother.
    155   bi->next(kSnippetContext);
    156   int64 current = bi->current();
    157   return (next_match_start < static_cast<uint64>(current) ||
    158           current == icu::BreakIterator::DONE);
    159 }
    160 
    161 }  // namespace
    162 
    163 // static
    164 void Snippet::ExtractMatchPositions(const std::string& offsets_str,
    165                                     const std::string& column_num,
    166                                     MatchPositions* match_positions) {
    167   DCHECK(match_positions);
    168   if (offsets_str.empty())
    169     return;
    170   std::vector<std::string> offsets;
    171   base::SplitString(offsets_str, ' ', &offsets);
    172   // SQLite offsets are sets of four integers:
    173   //   column, query term, match offset, match length
    174   // Matches within a string are marked by (start, end) pairs.
    175   for (size_t i = 0; i < offsets.size() - 3; i += 4) {
    176     if (offsets[i] != column_num)
    177       continue;
    178     const size_t start = atoi(offsets[i + 2].c_str());
    179     const size_t end = start + atoi(offsets[i + 3].c_str());
    180     // Switch to DCHECK after debugging http://crbug.com/15261.
    181     CHECK(end >= start);
    182     AddMatch(start, end, match_positions);
    183   }
    184 }
    185 
    186 // static
    187 void Snippet::ConvertMatchPositionsToWide(
    188     const std::string& utf8_string,
    189     Snippet::MatchPositions* match_positions) {
    190   DCHECK(match_positions);
    191   int32_t utf8_pos = 0;
    192   size_t utf16_pos = 0;
    193   const char* utf8_cstring = utf8_string.c_str();
    194   const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
    195   for (Snippet::MatchPositions::iterator i = match_positions->begin();
    196        i != match_positions->end(); ++i) {
    197     i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
    198                                         static_cast<int32_t>(i->first),
    199                                         &utf8_pos, &utf16_pos);
    200     i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
    201                                          static_cast<int32_t>(i->second),
    202                                          &utf8_pos, &utf16_pos);
    203   }
    204 }
    205 
    206 Snippet::Snippet() {
    207 }
    208 
    209 Snippet::~Snippet() {
    210 }
    211 
    212 void Snippet::ComputeSnippet(const MatchPositions& match_positions,
    213                              const std::string& document) {
    214   // The length of snippets we try to produce.
    215   // We can generate longer snippets but stop once we cross kSnippetMaxLength.
    216   const size_t kSnippetMaxLength = 200;
    217   const base::string16 kEllipsis = base::ASCIIToUTF16(" ... ");
    218 
    219   UText* document_utext = NULL;
    220   UErrorCode status = U_ZERO_ERROR;
    221   document_utext = utext_openUTF8(document_utext, document.data(),
    222                                   document.size(), &status);
    223   // Locale does not matter because there's no per-locale customization
    224   // for character iterator.
    225   scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
    226       icu::Locale::getDefault(), status));
    227   bi->setText(document_utext, status);
    228   DCHECK(U_SUCCESS(status));
    229 
    230   // We build the snippet by iterating through the matches and then grabbing
    231   // context around each match.  If matches are near enough each other (within
    232   // kSnippetContext), we skip the "..." between them.
    233   base::string16 snippet;
    234   size_t start = 0;
    235   for (size_t i = 0; i < match_positions.size(); ++i) {
    236     // Some shorter names for the current match.
    237     const size_t match_start = match_positions[i].first;
    238     const size_t match_end = match_positions[i].second;
    239 
    240     // Switch to DCHECK after debugging http://crbug.com/15261.
    241     CHECK(match_end > match_start);
    242     CHECK(match_end <= document.size());
    243 
    244     // Add the context, if any, to show before the match.
    245     size_t context_start = match_start;
    246     MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
    247     start = std::max(start, context_start);
    248     if (start < match_start) {
    249       if (start > 0)
    250         snippet += kEllipsis;
    251       // Switch to DCHECK after debugging http://crbug.com/15261.
    252       CHECK(start < document.size());
    253       snippet += base::UTF8ToUTF16(document.substr(start, match_start - start));
    254     }
    255 
    256     // Add the match.
    257     const size_t first = snippet.size();
    258     snippet += base::UTF8ToUTF16(document.substr(match_start,
    259                                                  match_end - match_start));
    260     matches_.push_back(std::make_pair(first, snippet.size()));
    261 
    262     // Compute the context, if any, to show after the match.
    263     size_t end;
    264     // Check if the next match falls within our snippet window.
    265     if (i + 1 < match_positions.size() &&
    266         IsNextMatchWithinSnippetWindow(bi.get(), match_end,
    267             match_positions[i + 1].first)) {
    268       // Yes, it's within the window.  Make the end context extend just up
    269       // to the next match.
    270       end = match_positions[i + 1].first;
    271       // Switch to DCHECK after debugging http://crbug.com/15261.
    272       CHECK(end >= match_end);
    273       CHECK(end <= document.size());
    274       snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
    275     } else {
    276       // No, there's either no next match or the next match is too far away.
    277       end = match_end;
    278       MoveByNGraphemes(bi.get(), kSnippetContext, &end);
    279       // Switch to DCHECK after debugging http://crbug.com/15261.
    280       CHECK(end >= match_end);
    281       CHECK(end <= document.size());
    282       snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
    283       if (end < document.size())
    284         snippet += kEllipsis;
    285     }
    286     start = end;
    287 
    288     // Stop here if we have enough snippet computed.
    289     if (snippet.size() >= kSnippetMaxLength)
    290       break;
    291   }
    292 
    293   utext_close(document_utext);
    294   swap(text_, snippet);
    295 }
    296 
    297 void Snippet::Swap(Snippet* other) {
    298   text_.swap(other->text_);
    299   matches_.swap(other->matches_);
    300 }
    301 
    302 }  // namespace query_parser
    303