Home | History | Annotate | Download | only in query_parser
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/query_parser/query_parser.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/compiler_specific.h"
     10 #include "base/i18n/break_iterator.h"
     11 #include "base/i18n/case_conversion.h"
     12 #include "base/logging.h"
     13 #include "base/stl_util.h"
     14 #include "base/strings/utf_string_conversions.h"
     15 
     16 namespace query_parser {
     17 namespace {
     18 
     19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
     20 // sort match positions.
     21 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
     22                          const Snippet::MatchPosition& mp2) {
     23   return mp1.first < mp2.first;
     24 }
     25 
     26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
     27 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
     28 // function.
     29 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
     30                        const Snippet::MatchPosition& mp2) {
     31   return mp2.first >= mp1.first && mp2.first <= mp1.second;
     32 }
     33 
     34 // Coalesces match positions in |matches| after index that intersect the match
     35 // position at |index|.
     36 void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) {
     37   Snippet::MatchPosition& mp = (*matches)[index];
     38   for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
     39        i != matches->end(); ) {
     40     if (SnippetIntersects(mp, *i)) {
     41       mp.second = std::max(mp.second, i->second);
     42       i = matches->erase(i);
     43     } else {
     44       return;
     45     }
     46   }
     47 }
     48 
     49 // Returns true if the character is considered a quote.
     50 bool IsQueryQuote(wchar_t ch) {
     51   return ch == '"' ||
     52          ch == 0xab ||    // left pointing double angle bracket
     53          ch == 0xbb ||    // right pointing double angle bracket
     54          ch == 0x201c ||  // left double quotation mark
     55          ch == 0x201d ||  // right double quotation mark
     56          ch == 0x201e;    // double low-9 quotation mark
     57 }
     58 
     59 }  // namespace
     60 
     61 // Inheritance structure:
     62 // Queries are represented as trees of QueryNodes.
     63 // QueryNodes are either a collection of subnodes (a QueryNodeList)
     64 // or a single word (a QueryNodeWord).
     65 
     66 // A QueryNodeWord is a single word in the query.
     67 class QueryNodeWord : public QueryNode {
     68  public:
     69   explicit QueryNodeWord(const base::string16& word);
     70   virtual ~QueryNodeWord();
     71 
     72   const base::string16& word() const { return word_; }
     73 
     74   void set_literal(bool literal) { literal_ = literal; }
     75 
     76   // QueryNode:
     77   virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
     78   virtual bool IsWord() const OVERRIDE;
     79   virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
     80   virtual bool HasMatchIn(
     81       const QueryWordVector& words,
     82       Snippet::MatchPositions* match_positions) const OVERRIDE;
     83   virtual bool HasMatchIn(
     84       const QueryWordVector& words) const OVERRIDE;
     85   virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
     86 
     87  private:
     88   base::string16 word_;
     89   bool literal_;
     90 
     91   DISALLOW_COPY_AND_ASSIGN(QueryNodeWord);
     92 };
     93 
     94 QueryNodeWord::QueryNodeWord(const base::string16& word)
     95     : word_(word),
     96       literal_(false) {}
     97 
     98 QueryNodeWord::~QueryNodeWord() {}
     99 
    100 int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const {
    101   query->append(word_);
    102 
    103   // Use prefix search if we're not literal and long enough.
    104   if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
    105     *query += L'*';
    106   return 1;
    107 }
    108 
    109 bool QueryNodeWord::IsWord() const {
    110   return true;
    111 }
    112 
    113 bool QueryNodeWord::Matches(const base::string16& word, bool exact) const {
    114   if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
    115     return word == word_;
    116   return word.size() >= word_.size() &&
    117          (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
    118 }
    119 
    120 bool QueryNodeWord::HasMatchIn(const QueryWordVector& words,
    121                                Snippet::MatchPositions* match_positions) const {
    122   bool matched = false;
    123   for (size_t i = 0; i < words.size(); ++i) {
    124     if (Matches(words[i].word, false)) {
    125       size_t match_start = words[i].position;
    126       match_positions->push_back(
    127           Snippet::MatchPosition(match_start,
    128                                  match_start + static_cast<int>(word_.size())));
    129       matched = true;
    130     }
    131   }
    132   return matched;
    133 }
    134 
    135 bool QueryNodeWord::HasMatchIn(const QueryWordVector& words) const {
    136   for (size_t i = 0; i < words.size(); ++i) {
    137     if (Matches(words[i].word, false))
    138       return true;
    139   }
    140   return false;
    141 }
    142 
    143 void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const {
    144   words->push_back(word_);
    145 }
    146 
    147 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
    148 class QueryNodeList : public QueryNode {
    149  public:
    150   QueryNodeList();
    151   virtual ~QueryNodeList();
    152 
    153   QueryNodeStarVector* children() { return &children_; }
    154 
    155   void AddChild(QueryNode* node);
    156 
    157   // Remove empty subnodes left over from other parsing.
    158   void RemoveEmptySubnodes();
    159 
    160   // QueryNode:
    161   virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
    162   virtual bool IsWord() const OVERRIDE;
    163   virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
    164   virtual bool HasMatchIn(
    165       const QueryWordVector& words,
    166       Snippet::MatchPositions* match_positions) const OVERRIDE;
    167   virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE;
    168   virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
    169 
    170  protected:
    171   int AppendChildrenToString(base::string16* query) const;
    172 
    173   QueryNodeStarVector children_;
    174 
    175  private:
    176   DISALLOW_COPY_AND_ASSIGN(QueryNodeList);
    177 };
    178 
    179 QueryNodeList::QueryNodeList() {}
    180 
    181 QueryNodeList::~QueryNodeList() {
    182   STLDeleteElements(&children_);
    183 }
    184 
    185 void QueryNodeList::AddChild(QueryNode* node) {
    186   children_.push_back(node);
    187 }
    188 
    189 void QueryNodeList::RemoveEmptySubnodes() {
    190   for (size_t i = 0; i < children_.size(); ++i) {
    191     if (children_[i]->IsWord())
    192       continue;
    193 
    194     QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
    195     list_node->RemoveEmptySubnodes();
    196     if (list_node->children()->empty()) {
    197       children_.erase(children_.begin() + i);
    198       --i;
    199       delete list_node;
    200     }
    201   }
    202 }
    203 
    204 int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const {
    205   return AppendChildrenToString(query);
    206 }
    207 
    208 bool QueryNodeList::IsWord() const {
    209   return false;
    210 }
    211 
    212 bool QueryNodeList::Matches(const base::string16& word, bool exact) const {
    213   NOTREACHED();
    214   return false;
    215 }
    216 
    217 bool QueryNodeList::HasMatchIn(const QueryWordVector& words,
    218                                Snippet::MatchPositions* match_positions) const {
    219   NOTREACHED();
    220   return false;
    221 }
    222 
    223 bool QueryNodeList::HasMatchIn(const QueryWordVector& words) const {
    224   NOTREACHED();
    225   return false;
    226 }
    227 
    228 void QueryNodeList::AppendWords(std::vector<base::string16>* words) const {
    229   for (size_t i = 0; i < children_.size(); ++i)
    230     children_[i]->AppendWords(words);
    231 }
    232 
    233 int QueryNodeList::AppendChildrenToString(base::string16* query) const {
    234   int num_words = 0;
    235   for (QueryNodeStarVector::const_iterator node = children_.begin();
    236        node != children_.end(); ++node) {
    237     if (node != children_.begin())
    238       query->push_back(L' ');
    239     num_words += (*node)->AppendToSQLiteQuery(query);
    240   }
    241   return num_words;
    242 }
    243 
    244 // A QueryNodePhrase is a phrase query ("quoted").
    245 class QueryNodePhrase : public QueryNodeList {
    246  public:
    247   QueryNodePhrase();
    248   virtual ~QueryNodePhrase();
    249 
    250   // QueryNodeList:
    251   virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
    252   virtual bool HasMatchIn(
    253       const QueryWordVector& words,
    254       Snippet::MatchPositions* match_positions) const OVERRIDE;
    255   virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE;
    256 
    257  private:
    258   bool MatchesAll(const QueryWordVector& words,
    259                   const QueryWord** first_word,
    260                   const QueryWord** last_word) const;
    261   DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase);
    262 };
    263 
    264 QueryNodePhrase::QueryNodePhrase() {}
    265 
    266 QueryNodePhrase::~QueryNodePhrase() {}
    267 
    268 int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const {
    269   query->push_back(L'"');
    270   int num_words = AppendChildrenToString(query);
    271   query->push_back(L'"');
    272   return num_words;
    273 }
    274 
    275 bool QueryNodePhrase::MatchesAll(const QueryWordVector& words,
    276                                  const QueryWord** first_word,
    277                                  const QueryWord** last_word) const {
    278   if (words.size() < children_.size())
    279     return false;
    280 
    281   for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
    282     bool matched_all = true;
    283     for (size_t j = 0; j < children_.size(); ++j) {
    284       if (!children_[j]->Matches(words[i + j].word, true)) {
    285         matched_all = false;
    286         break;
    287       }
    288     }
    289     if (matched_all) {
    290       *first_word = &words[i];
    291       *last_word = &words[i + children_.size() - 1];
    292       return true;
    293     }
    294   }
    295   return false;
    296 }
    297 
    298 bool QueryNodePhrase::HasMatchIn(
    299     const QueryWordVector& words,
    300     Snippet::MatchPositions* match_positions) const {
    301   const QueryWord* first_word;
    302   const QueryWord* last_word;
    303 
    304   if (MatchesAll(words, &first_word, &last_word)) {
    305     match_positions->push_back(
    306         Snippet::MatchPosition(first_word->position,
    307                                last_word->position + last_word->word.length()));
    308       return true;
    309   }
    310   return false;
    311 }
    312 
    313 bool QueryNodePhrase::HasMatchIn(const QueryWordVector& words) const {
    314   const QueryWord* first_word;
    315   const QueryWord* last_word;
    316   return MatchesAll(words, &first_word, &last_word);
    317 }
    318 
    319 QueryParser::QueryParser() {}
    320 
    321 // static
    322 bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16& word) {
    323   DCHECK(!word.empty());
    324   size_t minimum_length = 3;
    325   // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
    326   // because they 'behave like' Latin letters. Moreover, we should
    327   // normalize the former before reaching here.
    328   if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
    329     minimum_length = 2;
    330   return word.size() >= minimum_length;
    331 }
    332 
    333 int QueryParser::ParseQuery(const base::string16& query,
    334                             base::string16* sqlite_query) {
    335   QueryNodeList root;
    336   if (!ParseQueryImpl(query, &root))
    337     return 0;
    338   return root.AppendToSQLiteQuery(sqlite_query);
    339 }
    340 
    341 void QueryParser::ParseQueryWords(const base::string16& query,
    342                                   std::vector<base::string16>* words) {
    343   QueryNodeList root;
    344   if (!ParseQueryImpl(query, &root))
    345     return;
    346   root.AppendWords(words);
    347 }
    348 
    349 void QueryParser::ParseQueryNodes(const base::string16& query,
    350                                   QueryNodeStarVector* nodes) {
    351   QueryNodeList root;
    352   if (ParseQueryImpl(base::i18n::ToLower(query), &root))
    353     nodes->swap(*root.children());
    354 }
    355 
    356 bool QueryParser::DoesQueryMatch(const base::string16& text,
    357                                  const QueryNodeStarVector& query_nodes,
    358                                  Snippet::MatchPositions* match_positions) {
    359   if (query_nodes.empty())
    360     return false;
    361 
    362   QueryWordVector query_words;
    363   base::string16 lower_text = base::i18n::ToLower(text);
    364   ExtractQueryWords(lower_text, &query_words);
    365 
    366   if (query_words.empty())
    367     return false;
    368 
    369   Snippet::MatchPositions matches;
    370   for (size_t i = 0; i < query_nodes.size(); ++i) {
    371     if (!query_nodes[i]->HasMatchIn(query_words, &matches))
    372       return false;
    373   }
    374   if (lower_text.length() != text.length()) {
    375     // The lower case string differs from the original string. The matches are
    376     // meaningless.
    377     // TODO(sky): we need a better way to align the positions so that we don't
    378     // completely punt here.
    379     match_positions->clear();
    380   } else {
    381     SortAndCoalesceMatchPositions(&matches);
    382     match_positions->swap(matches);
    383   }
    384   return true;
    385 }
    386 
    387 bool QueryParser::DoesQueryMatch(const QueryWordVector& query_words,
    388                                  const QueryNodeStarVector& query_nodes) {
    389   if (query_nodes.empty() || query_words.empty())
    390     return false;
    391 
    392   for (size_t i = 0; i < query_nodes.size(); ++i) {
    393     if (!query_nodes[i]->HasMatchIn(query_words))
    394       return false;
    395   }
    396   return true;
    397 }
    398 
    399 bool QueryParser::ParseQueryImpl(const base::string16& query,
    400                                  QueryNodeList* root) {
    401   base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD);
    402   // TODO(evanm): support a locale here
    403   if (!iter.Init())
    404     return false;
    405 
    406   // To handle nesting, we maintain a stack of QueryNodeLists.
    407   // The last element (back) of the stack contains the current, deepest node.
    408   std::vector<QueryNodeList*> query_stack;
    409   query_stack.push_back(root);
    410 
    411   bool in_quotes = false;  // whether we're currently in a quoted phrase
    412   while (iter.Advance()) {
    413     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
    414     // is not necessarily a word, but could also be a sequence of punctuation
    415     // or whitespace.
    416     if (iter.IsWord()) {
    417       QueryNodeWord* word_node = new QueryNodeWord(iter.GetString());
    418       if (in_quotes)
    419         word_node->set_literal(true);
    420       query_stack.back()->AddChild(word_node);
    421     } else {  // Punctuation.
    422       if (IsQueryQuote(query[iter.prev()])) {
    423         if (!in_quotes) {
    424           QueryNodeList* quotes_node = new QueryNodePhrase;
    425           query_stack.back()->AddChild(quotes_node);
    426           query_stack.push_back(quotes_node);
    427           in_quotes = true;
    428         } else {
    429           query_stack.pop_back();  // Stop adding to the quoted phrase.
    430           in_quotes = false;
    431         }
    432       }
    433     }
    434   }
    435 
    436   root->RemoveEmptySubnodes();
    437   return true;
    438 }
    439 
    440 void QueryParser::ExtractQueryWords(const base::string16& text,
    441                                     QueryWordVector* words) {
    442   base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD);
    443   // TODO(evanm): support a locale here
    444   if (!iter.Init())
    445     return;
    446 
    447   while (iter.Advance()) {
    448     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
    449     // is not necessarily a word, but could also be a sequence of punctuation
    450     // or whitespace.
    451     if (iter.IsWord()) {
    452       base::string16 word = iter.GetString();
    453       if (!word.empty()) {
    454         words->push_back(QueryWord());
    455         words->back().word = word;
    456         words->back().position = iter.prev();
    457      }
    458     }
    459   }
    460 }
    461 
    462 // static
    463 void QueryParser::SortAndCoalesceMatchPositions(
    464     Snippet::MatchPositions* matches) {
    465   std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
    466   // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
    467   // from matches.
    468   for (size_t i = 0; i < matches->size(); ++i)
    469     CoalesceMatchesFrom(i, matches);
    470 }
    471 
    472 }  // namespace query_parser
    473