Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/query_parser.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/i18n/break_iterator.h"
     10 #include "base/logging.h"
     11 #include "base/memory/scoped_vector.h"
     12 #include "base/string_util.h"
     13 #include "base/utf_string_conversions.h"
     14 #include "ui/base/l10n/l10n_util.h"
     15 #include "unicode/uscript.h"
     16 
     17 namespace {
     18 
     19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
     20 // sort match positions.
     21 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
     22                          const Snippet::MatchPosition& mp2) {
     23   return mp1.first < mp2.first;
     24 }
     25 
     26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
     27 // CoalesceMatchesFrom and isn't meant as a general intersectpion comparison
     28 // function.
     29 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
     30                        const Snippet::MatchPosition& mp2) {
     31   return mp2.first >= mp1.first && mp2.first <= mp1.second;
     32 }
     33 
     34 // Coalesces match positions in |matches| after index that intersect the match
     35 // position at |index|.
     36 void CoalesceMatchesFrom(size_t index,
     37                          Snippet::MatchPositions* matches) {
     38   Snippet::MatchPosition& mp = (*matches)[index];
     39   for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
     40        i != matches->end(); ) {
     41     if (SnippetIntersects(mp, *i)) {
     42       mp.second = i->second;
     43       i = matches->erase(i);
     44     } else {
     45       return;
     46     }
     47   }
     48 }
     49 
     50 // Sorts the match positions in |matches| by their first index, then coalesces
     51 // any match positions that intersect each other.
     52 void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
     53   std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
     54   // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
     55   // from matches.
     56   for (size_t i = 0; i < matches->size(); ++i)
     57     CoalesceMatchesFrom(i, matches);
     58 }
     59 
     60 }  // namespace
     61 
     62 // Inheritance structure:
     63 // Queries are represented as trees of QueryNodes.
     64 // QueryNodes are either a collection of subnodes (a QueryNodeList)
     65 // or a single word (a QueryNodeWord).
     66 
     67 // A QueryNodeWord is a single word in the query.
     68 class QueryNodeWord : public QueryNode {
     69  public:
     70   explicit QueryNodeWord(const string16& word)
     71       : word_(word), literal_(false) {}
     72   virtual ~QueryNodeWord() {}
     73   virtual int AppendToSQLiteQuery(string16* query) const;
     74   virtual bool IsWord() const { return true; }
     75 
     76   const string16& word() const { return word_; }
     77   void set_literal(bool literal) { literal_ = literal; }
     78 
     79   virtual bool HasMatchIn(const std::vector<QueryWord>& words,
     80                           Snippet::MatchPositions* match_positions) const;
     81 
     82   virtual bool Matches(const string16& word, bool exact) const;
     83   virtual void AppendWords(std::vector<string16>* words) const;
     84 
     85  private:
     86   string16 word_;
     87   bool literal_;
     88 };
     89 
     90 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words,
     91                                Snippet::MatchPositions* match_positions) const {
     92   for (size_t i = 0; i < words.size(); ++i) {
     93     if (Matches(words[i].word, false)) {
     94       size_t match_start = words[i].position;
     95       match_positions->push_back(
     96           Snippet::MatchPosition(match_start,
     97                                  match_start + static_cast<int>(word_.size())));
     98       return true;
     99     }
    100   }
    101   return false;
    102 }
    103 
    104 bool QueryNodeWord::Matches(const string16& word, bool exact) const {
    105   if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
    106     return word == word_;
    107   return word.size() >= word_.size() &&
    108          (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
    109 }
    110 
    111 void QueryNodeWord::AppendWords(std::vector<string16>* words) const {
    112   words->push_back(word_);
    113 }
    114 
    115 int QueryNodeWord::AppendToSQLiteQuery(string16* query) const {
    116   query->append(word_);
    117 
    118   // Use prefix search if we're not literal and long enough.
    119   if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
    120     *query += L'*';
    121   return 1;
    122 }
    123 
    124 // A QueryNodeList has a collection of child QueryNodes
    125 // which it cleans up after.
    126 class QueryNodeList : public QueryNode {
    127  public:
    128   virtual ~QueryNodeList();
    129 
    130   virtual int AppendToSQLiteQuery(string16* query) const {
    131     return AppendChildrenToString(query);
    132   }
    133   virtual bool IsWord() const { return false; }
    134 
    135   void AddChild(QueryNode* node) { children_.push_back(node); }
    136 
    137   typedef std::vector<QueryNode*> QueryNodeVector;
    138   QueryNodeVector* children() { return &children_; }
    139 
    140   // Remove empty subnodes left over from other parsing.
    141   void RemoveEmptySubnodes();
    142 
    143   // QueryNodeList is never used with Matches or HasMatchIn.
    144   virtual bool Matches(const string16& word, bool exact) const {
    145     NOTREACHED();
    146     return false;
    147   }
    148   virtual bool HasMatchIn(const std::vector<QueryWord>& words,
    149                           Snippet::MatchPositions* match_positions) const {
    150     NOTREACHED();
    151     return false;
    152   }
    153   virtual void AppendWords(std::vector<string16>* words) const;
    154 
    155  protected:
    156   int AppendChildrenToString(string16* query) const;
    157 
    158   QueryNodeVector children_;
    159 };
    160 
    161 QueryNodeList::~QueryNodeList() {
    162   for (QueryNodeVector::iterator node = children_.begin();
    163        node != children_.end(); ++node)
    164     delete *node;
    165 }
    166 
    167 void QueryNodeList::RemoveEmptySubnodes() {
    168   for (size_t i = 0; i < children_.size(); ++i) {
    169     if (children_[i]->IsWord())
    170       continue;
    171 
    172     QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
    173     list_node->RemoveEmptySubnodes();
    174     if (list_node->children()->empty()) {
    175       children_.erase(children_.begin() + i);
    176       --i;
    177       delete list_node;
    178     }
    179   }
    180 }
    181 
    182 void QueryNodeList::AppendWords(std::vector<string16>* words) const {
    183   for (size_t i = 0; i < children_.size(); ++i)
    184     children_[i]->AppendWords(words);
    185 }
    186 
    187 int QueryNodeList::AppendChildrenToString(string16* query) const {
    188   int num_words = 0;
    189   for (QueryNodeVector::const_iterator node = children_.begin();
    190        node != children_.end(); ++node) {
    191     if (node != children_.begin())
    192       query->push_back(L' ');
    193     num_words += (*node)->AppendToSQLiteQuery(query);
    194   }
    195   return num_words;
    196 }
    197 
    198 // A QueryNodePhrase is a phrase query ("quoted").
    199 class QueryNodePhrase : public QueryNodeList {
    200  public:
    201   virtual int AppendToSQLiteQuery(string16* query) const {
    202     query->push_back(L'"');
    203     int num_words = AppendChildrenToString(query);
    204     query->push_back(L'"');
    205     return num_words;
    206   }
    207 
    208   virtual bool Matches(const string16& word, bool exact) const;
    209   virtual bool HasMatchIn(const std::vector<QueryWord>& words,
    210                           Snippet::MatchPositions* match_positions) const;
    211 };
    212 
    213 bool QueryNodePhrase::Matches(const string16& word, bool exact) const {
    214   NOTREACHED();
    215   return false;
    216 }
    217 
    218 bool QueryNodePhrase::HasMatchIn(
    219     const std::vector<QueryWord>& words,
    220     Snippet::MatchPositions* match_positions) const {
    221   if (words.size() < children_.size())
    222     return false;
    223 
    224   for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
    225     bool matched_all = true;
    226     for (size_t j = 0; j < children_.size(); ++j) {
    227       if (!children_[j]->Matches(words[i + j].word, true)) {
    228         matched_all = false;
    229         break;
    230       }
    231     }
    232     if (matched_all) {
    233       const QueryWord& last_word = words[i + children_.size() - 1];
    234       match_positions->push_back(
    235           Snippet::MatchPosition(words[i].position,
    236                                  last_word.position + last_word.word.length()));
    237       return true;
    238     }
    239   }
    240   return false;
    241 }
    242 
    243 QueryParser::QueryParser() {
    244 }
    245 
    246 // static
    247 bool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) {
    248   DCHECK(!word.empty());
    249   size_t minimum_length = 3;
    250   // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
    251   // because they 'behave like' Latin letters. Moreover, we should
    252   // normalize the former before reaching here.
    253   if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
    254     minimum_length = 2;
    255   return word.size() >= minimum_length;
    256 }
    257 
    258 // Returns true if the character is considered a quote.
    259 static bool IsQueryQuote(wchar_t ch) {
    260   return ch == '"' ||
    261          ch == 0xab ||    // left pointing double angle bracket
    262          ch == 0xbb ||    // right pointing double angle bracket
    263          ch == 0x201c ||  // left double quotation mark
    264          ch == 0x201d ||  // right double quotation mark
    265          ch == 0x201e;    // double low-9 quotation mark
    266 }
    267 
    268 int QueryParser::ParseQuery(const string16& query,
    269                             string16* sqlite_query) {
    270   QueryNodeList root;
    271   if (!ParseQueryImpl(query, &root))
    272     return 0;
    273   return root.AppendToSQLiteQuery(sqlite_query);
    274 }
    275 
    276 void QueryParser::ParseQuery(const string16& query,
    277                              std::vector<QueryNode*>* nodes) {
    278   QueryNodeList root;
    279   if (ParseQueryImpl(l10n_util::ToLower(query), &root))
    280     nodes->swap(*root.children());
    281 }
    282 
    283 
    284 void QueryParser::ExtractQueryWords(const string16& query,
    285                                     std::vector<string16>* words) {
    286   QueryNodeList root;
    287   if (!ParseQueryImpl(query, &root))
    288     return;
    289   root.AppendWords(words);
    290 }
    291 
    292 bool QueryParser::DoesQueryMatch(const string16& text,
    293                                  const std::vector<QueryNode*>& query_nodes,
    294                                  Snippet::MatchPositions* match_positions) {
    295   if (query_nodes.empty())
    296     return false;
    297 
    298   std::vector<QueryWord> query_words;
    299   string16 lower_text = l10n_util::ToLower(text);
    300   ExtractQueryWords(lower_text, &query_words);
    301 
    302   if (query_words.empty())
    303     return false;
    304 
    305   Snippet::MatchPositions matches;
    306   for (size_t i = 0; i < query_nodes.size(); ++i) {
    307     if (!query_nodes[i]->HasMatchIn(query_words, &matches))
    308       return false;
    309   }
    310   if (lower_text.length() != text.length()) {
    311     // The lower case string differs from the original string. The matches are
    312     // meaningless.
    313     // TODO(sky): we need a better way to align the positions so that we don't
    314     // completely punt here.
    315     match_positions->clear();
    316   } else {
    317     CoalseAndSortMatchPositions(&matches);
    318     match_positions->swap(matches);
    319   }
    320   return true;
    321 }
    322 
    323 bool QueryParser::ParseQueryImpl(const string16& query,
    324                                 QueryNodeList* root) {
    325   base::BreakIterator iter(&query, base::BreakIterator::BREAK_WORD);
    326   // TODO(evanm): support a locale here
    327   if (!iter.Init())
    328     return false;
    329 
    330   // To handle nesting, we maintain a stack of QueryNodeLists.
    331   // The last element (back) of the stack contains the current, deepest node.
    332   std::vector<QueryNodeList*> query_stack;
    333   query_stack.push_back(root);
    334 
    335   bool in_quotes = false;  // whether we're currently in a quoted phrase
    336   while (iter.Advance()) {
    337     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
    338     // is not necessarily a word, but could also be a sequence of punctuation
    339     // or whitespace.
    340     if (iter.IsWord()) {
    341       string16 word = iter.GetString();
    342 
    343       QueryNodeWord* word_node = new QueryNodeWord(word);
    344       if (in_quotes)
    345         word_node->set_literal(true);
    346       query_stack.back()->AddChild(word_node);
    347     } else {  // Punctuation.
    348       if (IsQueryQuote(query[iter.prev()])) {
    349         if (!in_quotes) {
    350           QueryNodeList* quotes_node = new QueryNodePhrase;
    351           query_stack.back()->AddChild(quotes_node);
    352           query_stack.push_back(quotes_node);
    353           in_quotes = true;
    354         } else {
    355           query_stack.pop_back();  // Stop adding to the quoted phrase.
    356           in_quotes = false;
    357         }
    358       }
    359     }
    360   }
    361 
    362   root->RemoveEmptySubnodes();
    363   return true;
    364 }
    365 
    366 void QueryParser::ExtractQueryWords(const string16& text,
    367                                     std::vector<QueryWord>* words) {
    368   base::BreakIterator iter(&text, base::BreakIterator::BREAK_WORD);
    369   // TODO(evanm): support a locale here
    370   if (!iter.Init())
    371     return;
    372 
    373   while (iter.Advance()) {
    374     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
    375     // is not necessarily a word, but could also be a sequence of punctuation
    376     // or whitespace.
    377     if (iter.IsWord()) {
    378       string16 word = iter.GetString();
    379       if (!word.empty()) {
    380         words->push_back(QueryWord());
    381         words->back().word = word;
    382         words->back().position = iter.prev();
    383       }
    384     }
    385   }
    386 }
    387