Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/query_parser.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/compiler_specific.h"
     10 #include "base/i18n/break_iterator.h"
     11 #include "base/i18n/case_conversion.h"
     12 #include "base/logging.h"
     13 #include "base/stl_util.h"
     14 #include "base/strings/utf_string_conversions.h"
     15 
     16 namespace {
     17 
     18 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
     19 // sort match positions.
     20 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
     21                          const Snippet::MatchPosition& mp2) {
     22   return mp1.first < mp2.first;
     23 }
     24 
     25 // Returns true if |mp2| intersects |mp1|. This is intended for use by
     26 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
     27 // function.
     28 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
     29                        const Snippet::MatchPosition& mp2) {
     30   return mp2.first >= mp1.first && mp2.first <= mp1.second;
     31 }
     32 
     33 // Coalesces match positions in |matches| after index that intersect the match
     34 // position at |index|.
     35 void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) {
     36   Snippet::MatchPosition& mp = (*matches)[index];
     37   for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
     38        i != matches->end(); ) {
     39     if (SnippetIntersects(mp, *i)) {
     40       mp.second = std::max(mp.second, i->second);
     41       i = matches->erase(i);
     42     } else {
     43       return;
     44     }
     45   }
     46 }
     47 
     48 // Sorts the match positions in |matches| by their first index, then coalesces
     49 // any match positions that intersect each other.
     50 void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
     51   std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
     52   // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
     53   // from matches.
     54   for (size_t i = 0; i < matches->size(); ++i)
     55     CoalesceMatchesFrom(i, matches);
     56 }
     57 
     58 // Returns true if the character is considered a quote.
     59 bool IsQueryQuote(wchar_t ch) {
     60   return ch == '"' ||
     61          ch == 0xab ||    // left pointing double angle bracket
     62          ch == 0xbb ||    // right pointing double angle bracket
     63          ch == 0x201c ||  // left double quotation mark
     64          ch == 0x201d ||  // right double quotation mark
     65          ch == 0x201e;    // double low-9 quotation mark
     66 }
     67 
     68 }  // namespace
     69 
     70 // Inheritance structure:
     71 // Queries are represented as trees of QueryNodes.
     72 // QueryNodes are either a collection of subnodes (a QueryNodeList)
     73 // or a single word (a QueryNodeWord).
     74 
     75 // A QueryNodeWord is a single word in the query.
     76 class QueryNodeWord : public QueryNode {
     77  public:
     78   explicit QueryNodeWord(const string16& word);
     79   virtual ~QueryNodeWord();
     80 
     81   const string16& word() const { return word_; }
     82 
     83   void set_literal(bool literal) { literal_ = literal; }
     84 
     85   // QueryNode:
     86   virtual int AppendToSQLiteQuery(string16* query) const OVERRIDE;
     87   virtual bool IsWord() const OVERRIDE;
     88   virtual bool Matches(const string16& word, bool exact) const OVERRIDE;
     89   virtual bool HasMatchIn(
     90       const std::vector<QueryWord>& words,
     91       Snippet::MatchPositions* match_positions) const OVERRIDE;
     92   virtual bool HasMatchIn(
     93       const std::vector<QueryWord>& words) const OVERRIDE;
     94   virtual void AppendWords(std::vector<string16>* words) const OVERRIDE;
     95 
     96  private:
     97   string16 word_;
     98   bool literal_;
     99 
    100   DISALLOW_COPY_AND_ASSIGN(QueryNodeWord);
    101 };
    102 
    103 QueryNodeWord::QueryNodeWord(const string16& word)
    104     : word_(word),
    105       literal_(false) {}
    106 
    107 QueryNodeWord::~QueryNodeWord() {}
    108 
    109 int QueryNodeWord::AppendToSQLiteQuery(string16* query) const {
    110   query->append(word_);
    111 
    112   // Use prefix search if we're not literal and long enough.
    113   if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
    114     *query += L'*';
    115   return 1;
    116 }
    117 
    118 bool QueryNodeWord::IsWord() const {
    119   return true;
    120 }
    121 
    122 bool QueryNodeWord::Matches(const string16& word, bool exact) const {
    123   if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
    124     return word == word_;
    125   return word.size() >= word_.size() &&
    126          (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
    127 }
    128 
    129 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words,
    130                                Snippet::MatchPositions* match_positions) const {
    131   bool matched = false;
    132   for (size_t i = 0; i < words.size(); ++i) {
    133     if (Matches(words[i].word, false)) {
    134       size_t match_start = words[i].position;
    135       match_positions->push_back(
    136           Snippet::MatchPosition(match_start,
    137                                  match_start + static_cast<int>(word_.size())));
    138       matched = true;
    139     }
    140   }
    141   return matched;
    142 }
    143 
    144 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words) const {
    145   for (size_t i = 0; i < words.size(); ++i) {
    146     if (Matches(words[i].word, false))
    147       return true;
    148   }
    149   return false;
    150 }
    151 
    152 void QueryNodeWord::AppendWords(std::vector<string16>* words) const {
    153   words->push_back(word_);
    154 }
    155 
    156 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
    157 class QueryNodeList : public QueryNode {
    158  public:
    159   typedef std::vector<QueryNode*> QueryNodeVector;
    160 
    161   QueryNodeList();
    162   virtual ~QueryNodeList();
    163 
    164   QueryNodeVector* children() { return &children_; }
    165 
    166   void AddChild(QueryNode* node);
    167 
    168   // Remove empty subnodes left over from other parsing.
    169   void RemoveEmptySubnodes();
    170 
    171   // QueryNode:
    172   virtual int AppendToSQLiteQuery(string16* query) const OVERRIDE;
    173   virtual bool IsWord() const OVERRIDE;
    174   virtual bool Matches(const string16& word, bool exact) const OVERRIDE;
    175   virtual bool HasMatchIn(
    176       const std::vector<QueryWord>& words,
    177       Snippet::MatchPositions* match_positions) const OVERRIDE;
    178   virtual bool HasMatchIn(
    179       const std::vector<QueryWord>& words) const OVERRIDE;
    180   virtual void AppendWords(std::vector<string16>* words) const OVERRIDE;
    181 
    182  protected:
    183   int AppendChildrenToString(string16* query) const;
    184 
    185   QueryNodeVector children_;
    186 
    187  private:
    188   DISALLOW_COPY_AND_ASSIGN(QueryNodeList);
    189 };
    190 
    191 QueryNodeList::QueryNodeList() {}
    192 
    193 QueryNodeList::~QueryNodeList() {
    194   STLDeleteElements(&children_);
    195 }
    196 
    197 void QueryNodeList::AddChild(QueryNode* node) {
    198   children_.push_back(node);
    199 }
    200 
    201 void QueryNodeList::RemoveEmptySubnodes() {
    202   for (size_t i = 0; i < children_.size(); ++i) {
    203     if (children_[i]->IsWord())
    204       continue;
    205 
    206     QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
    207     list_node->RemoveEmptySubnodes();
    208     if (list_node->children()->empty()) {
    209       children_.erase(children_.begin() + i);
    210       --i;
    211       delete list_node;
    212     }
    213   }
    214 }
    215 
    216 int QueryNodeList::AppendToSQLiteQuery(string16* query) const {
    217   return AppendChildrenToString(query);
    218 }
    219 
    220 bool QueryNodeList::IsWord() const {
    221   return false;
    222 }
    223 
    224 bool QueryNodeList::Matches(const string16& word, bool exact) const {
    225   NOTREACHED();
    226   return false;
    227 }
    228 
    229 bool QueryNodeList::HasMatchIn(const std::vector<QueryWord>& words,
    230                                Snippet::MatchPositions* match_positions) const {
    231   NOTREACHED();
    232   return false;
    233 }
    234 
    235 bool QueryNodeList::HasMatchIn(const std::vector<QueryWord>& words) const {
    236   NOTREACHED();
    237   return false;
    238 }
    239 
    240 void QueryNodeList::AppendWords(std::vector<string16>* words) const {
    241   for (size_t i = 0; i < children_.size(); ++i)
    242     children_[i]->AppendWords(words);
    243 }
    244 
    245 int QueryNodeList::AppendChildrenToString(string16* query) const {
    246   int num_words = 0;
    247   for (QueryNodeVector::const_iterator node = children_.begin();
    248        node != children_.end(); ++node) {
    249     if (node != children_.begin())
    250       query->push_back(L' ');
    251     num_words += (*node)->AppendToSQLiteQuery(query);
    252   }
    253   return num_words;
    254 }
    255 
    256 // A QueryNodePhrase is a phrase query ("quoted").
    257 class QueryNodePhrase : public QueryNodeList {
    258  public:
    259   QueryNodePhrase();
    260   virtual ~QueryNodePhrase();
    261 
    262   // QueryNodeList:
    263   virtual int AppendToSQLiteQuery(string16* query) const OVERRIDE;
    264   virtual bool HasMatchIn(
    265       const std::vector<QueryWord>& words,
    266       Snippet::MatchPositions* match_positions) const OVERRIDE;
    267   virtual bool HasMatchIn(
    268       const std::vector<QueryWord>& words) const OVERRIDE;
    269 
    270  private:
    271   bool MatchesAll(const std::vector<QueryWord>& words,
    272                   const QueryWord** first_word,
    273                   const QueryWord** last_word) const;
    274   DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase);
    275 };
    276 
    277 QueryNodePhrase::QueryNodePhrase() {}
    278 
    279 QueryNodePhrase::~QueryNodePhrase() {}
    280 
    281 int QueryNodePhrase::AppendToSQLiteQuery(string16* query) const {
    282   query->push_back(L'"');
    283   int num_words = AppendChildrenToString(query);
    284   query->push_back(L'"');
    285   return num_words;
    286 }
    287 
    288 bool QueryNodePhrase::MatchesAll(const std::vector<QueryWord>& words,
    289                                  const QueryWord** first_word,
    290                                  const QueryWord** last_word) const {
    291   if (words.size() < children_.size())
    292     return false;
    293 
    294   for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
    295     bool matched_all = true;
    296     for (size_t j = 0; j < children_.size(); ++j) {
    297       if (!children_[j]->Matches(words[i + j].word, true)) {
    298         matched_all = false;
    299         break;
    300       }
    301     }
    302     if (matched_all) {
    303       *first_word = &words[i];
    304       *last_word = &words[i + children_.size() - 1];
    305       return true;
    306     }
    307   }
    308   return false;
    309 }
    310 
    311 bool QueryNodePhrase::HasMatchIn(
    312     const std::vector<QueryWord>& words,
    313     Snippet::MatchPositions* match_positions) const {
    314   const QueryWord* first_word;
    315   const QueryWord* last_word;
    316 
    317   if (MatchesAll(words, &first_word, &last_word)) {
    318     match_positions->push_back(
    319         Snippet::MatchPosition(first_word->position,
    320                                last_word->position + last_word->word.length()));
    321       return true;
    322   }
    323   return false;
    324 }
    325 
    326 bool QueryNodePhrase::HasMatchIn(const std::vector<QueryWord>& words) const {
    327   const QueryWord* first_word;
    328   const QueryWord* last_word;
    329   return MatchesAll(words, &first_word, &last_word);
    330 }
    331 
    332 QueryParser::QueryParser() {}
    333 
    334 // static
    335 bool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) {
    336   DCHECK(!word.empty());
    337   size_t minimum_length = 3;
    338   // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
    339   // because they 'behave like' Latin letters. Moreover, we should
    340   // normalize the former before reaching here.
    341   if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
    342     minimum_length = 2;
    343   return word.size() >= minimum_length;
    344 }
    345 
    346 int QueryParser::ParseQuery(const string16& query, string16* sqlite_query) {
    347   QueryNodeList root;
    348   if (!ParseQueryImpl(query, &root))
    349     return 0;
    350   return root.AppendToSQLiteQuery(sqlite_query);
    351 }
    352 
    353 void QueryParser::ParseQueryWords(const string16& query,
    354                                   std::vector<string16>* words) {
    355   QueryNodeList root;
    356   if (!ParseQueryImpl(query, &root))
    357     return;
    358   root.AppendWords(words);
    359 }
    360 
    361 void QueryParser::ParseQueryNodes(const string16& query,
    362                                   std::vector<QueryNode*>* nodes) {
    363   QueryNodeList root;
    364   if (ParseQueryImpl(base::i18n::ToLower(query), &root))
    365     nodes->swap(*root.children());
    366 }
    367 
    368 bool QueryParser::DoesQueryMatch(const string16& text,
    369                                  const std::vector<QueryNode*>& query_nodes,
    370                                  Snippet::MatchPositions* match_positions) {
    371   if (query_nodes.empty())
    372     return false;
    373 
    374   std::vector<QueryWord> query_words;
    375   string16 lower_text = base::i18n::ToLower(text);
    376   ExtractQueryWords(lower_text, &query_words);
    377 
    378   if (query_words.empty())
    379     return false;
    380 
    381   Snippet::MatchPositions matches;
    382   for (size_t i = 0; i < query_nodes.size(); ++i) {
    383     if (!query_nodes[i]->HasMatchIn(query_words, &matches))
    384       return false;
    385   }
    386   if (lower_text.length() != text.length()) {
    387     // The lower case string differs from the original string. The matches are
    388     // meaningless.
    389     // TODO(sky): we need a better way to align the positions so that we don't
    390     // completely punt here.
    391     match_positions->clear();
    392   } else {
    393     CoalseAndSortMatchPositions(&matches);
    394     match_positions->swap(matches);
    395   }
    396   return true;
    397 }
    398 
    399 bool QueryParser::DoesQueryMatch(const std::vector<QueryWord>& query_words,
    400                                  const std::vector<QueryNode*>& query_nodes) {
    401   if (query_nodes.empty() || query_words.empty())
    402     return false;
    403 
    404   for (size_t i = 0; i < query_nodes.size(); ++i) {
    405     if (!query_nodes[i]->HasMatchIn(query_words))
    406       return false;
    407   }
    408   return true;
    409 }
    410 
    411 bool QueryParser::ParseQueryImpl(const string16& query, QueryNodeList* root) {
    412   base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD);
    413   // TODO(evanm): support a locale here
    414   if (!iter.Init())
    415     return false;
    416 
    417   // To handle nesting, we maintain a stack of QueryNodeLists.
    418   // The last element (back) of the stack contains the current, deepest node.
    419   std::vector<QueryNodeList*> query_stack;
    420   query_stack.push_back(root);
    421 
    422   bool in_quotes = false;  // whether we're currently in a quoted phrase
    423   while (iter.Advance()) {
    424     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
    425     // is not necessarily a word, but could also be a sequence of punctuation
    426     // or whitespace.
    427     if (iter.IsWord()) {
    428       QueryNodeWord* word_node = new QueryNodeWord(iter.GetString());
    429       if (in_quotes)
    430         word_node->set_literal(true);
    431       query_stack.back()->AddChild(word_node);
    432     } else {  // Punctuation.
    433       if (IsQueryQuote(query[iter.prev()])) {
    434         if (!in_quotes) {
    435           QueryNodeList* quotes_node = new QueryNodePhrase;
    436           query_stack.back()->AddChild(quotes_node);
    437           query_stack.push_back(quotes_node);
    438           in_quotes = true;
    439         } else {
    440           query_stack.pop_back();  // Stop adding to the quoted phrase.
    441           in_quotes = false;
    442         }
    443       }
    444     }
    445   }
    446 
    447   root->RemoveEmptySubnodes();
    448   return true;
    449 }
    450 
    451 void QueryParser::ExtractQueryWords(const string16& text,
    452                                     std::vector<QueryWord>* words) {
    453   base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD);
    454   // TODO(evanm): support a locale here
    455   if (!iter.Init())
    456     return;
    457 
    458   while (iter.Advance()) {
    459     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
    460     // is not necessarily a word, but could also be a sequence of punctuation
    461     // or whitespace.
    462     if (iter.IsWord()) {
    463       string16 word = iter.GetString();
    464       if (!word.empty()) {
    465         words->push_back(QueryWord());
    466         words->back().word = word;
    467         words->back().position = iter.prev();
    468       }
    469     }
    470   }
    471 }
    472