1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/query_parser.h" 6 7 #include <algorithm> 8 9 #include "base/i18n/break_iterator.h" 10 #include "base/logging.h" 11 #include "base/memory/scoped_vector.h" 12 #include "base/string_util.h" 13 #include "base/utf_string_conversions.h" 14 #include "ui/base/l10n/l10n_util.h" 15 #include "unicode/uscript.h" 16 17 namespace { 18 19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to 20 // sort match positions. 21 int CompareMatchPosition(const Snippet::MatchPosition& mp1, 22 const Snippet::MatchPosition& mp2) { 23 return mp1.first < mp2.first; 24 } 25 26 // Returns true if |mp2| intersects |mp1|. This is intended for use by 27 // CoalesceMatchesFrom and isn't meant as a general intersectpion comparison 28 // function. 29 bool SnippetIntersects(const Snippet::MatchPosition& mp1, 30 const Snippet::MatchPosition& mp2) { 31 return mp2.first >= mp1.first && mp2.first <= mp1.second; 32 } 33 34 // Coalesces match positions in |matches| after index that intersect the match 35 // position at |index|. 36 void CoalesceMatchesFrom(size_t index, 37 Snippet::MatchPositions* matches) { 38 Snippet::MatchPosition& mp = (*matches)[index]; 39 for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1; 40 i != matches->end(); ) { 41 if (SnippetIntersects(mp, *i)) { 42 mp.second = i->second; 43 i = matches->erase(i); 44 } else { 45 return; 46 } 47 } 48 } 49 50 // Sorts the match positions in |matches| by their first index, then coalesces 51 // any match positions that intersect each other. 52 void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) { 53 std::sort(matches->begin(), matches->end(), &CompareMatchPosition); 54 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove 55 // from matches. 56 for (size_t i = 0; i < matches->size(); ++i) 57 CoalesceMatchesFrom(i, matches); 58 } 59 60 } // namespace 61 62 // Inheritance structure: 63 // Queries are represented as trees of QueryNodes. 64 // QueryNodes are either a collection of subnodes (a QueryNodeList) 65 // or a single word (a QueryNodeWord). 66 67 // A QueryNodeWord is a single word in the query. 68 class QueryNodeWord : public QueryNode { 69 public: 70 explicit QueryNodeWord(const string16& word) 71 : word_(word), literal_(false) {} 72 virtual ~QueryNodeWord() {} 73 virtual int AppendToSQLiteQuery(string16* query) const; 74 virtual bool IsWord() const { return true; } 75 76 const string16& word() const { return word_; } 77 void set_literal(bool literal) { literal_ = literal; } 78 79 virtual bool HasMatchIn(const std::vector<QueryWord>& words, 80 Snippet::MatchPositions* match_positions) const; 81 82 virtual bool Matches(const string16& word, bool exact) const; 83 virtual void AppendWords(std::vector<string16>* words) const; 84 85 private: 86 string16 word_; 87 bool literal_; 88 }; 89 90 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words, 91 Snippet::MatchPositions* match_positions) const { 92 for (size_t i = 0; i < words.size(); ++i) { 93 if (Matches(words[i].word, false)) { 94 size_t match_start = words[i].position; 95 match_positions->push_back( 96 Snippet::MatchPosition(match_start, 97 match_start + static_cast<int>(word_.size()))); 98 return true; 99 } 100 } 101 return false; 102 } 103 104 bool QueryNodeWord::Matches(const string16& word, bool exact) const { 105 if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 106 return word == word_; 107 return word.size() >= word_.size() && 108 (word_.compare(0, word_.size(), word, 0, word_.size()) == 0); 109 } 110 111 void QueryNodeWord::AppendWords(std::vector<string16>* words) const { 112 words->push_back(word_); 113 } 114 115 int QueryNodeWord::AppendToSQLiteQuery(string16* query) const { 116 query->append(word_); 117 118 // Use prefix search if we're not literal and long enough. 119 if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 120 *query += L'*'; 121 return 1; 122 } 123 124 // A QueryNodeList has a collection of child QueryNodes 125 // which it cleans up after. 126 class QueryNodeList : public QueryNode { 127 public: 128 virtual ~QueryNodeList(); 129 130 virtual int AppendToSQLiteQuery(string16* query) const { 131 return AppendChildrenToString(query); 132 } 133 virtual bool IsWord() const { return false; } 134 135 void AddChild(QueryNode* node) { children_.push_back(node); } 136 137 typedef std::vector<QueryNode*> QueryNodeVector; 138 QueryNodeVector* children() { return &children_; } 139 140 // Remove empty subnodes left over from other parsing. 141 void RemoveEmptySubnodes(); 142 143 // QueryNodeList is never used with Matches or HasMatchIn. 144 virtual bool Matches(const string16& word, bool exact) const { 145 NOTREACHED(); 146 return false; 147 } 148 virtual bool HasMatchIn(const std::vector<QueryWord>& words, 149 Snippet::MatchPositions* match_positions) const { 150 NOTREACHED(); 151 return false; 152 } 153 virtual void AppendWords(std::vector<string16>* words) const; 154 155 protected: 156 int AppendChildrenToString(string16* query) const; 157 158 QueryNodeVector children_; 159 }; 160 161 QueryNodeList::~QueryNodeList() { 162 for (QueryNodeVector::iterator node = children_.begin(); 163 node != children_.end(); ++node) 164 delete *node; 165 } 166 167 void QueryNodeList::RemoveEmptySubnodes() { 168 for (size_t i = 0; i < children_.size(); ++i) { 169 if (children_[i]->IsWord()) 170 continue; 171 172 QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]); 173 list_node->RemoveEmptySubnodes(); 174 if (list_node->children()->empty()) { 175 children_.erase(children_.begin() + i); 176 --i; 177 delete list_node; 178 } 179 } 180 } 181 182 void QueryNodeList::AppendWords(std::vector<string16>* words) const { 183 for (size_t i = 0; i < children_.size(); ++i) 184 children_[i]->AppendWords(words); 185 } 186 187 int QueryNodeList::AppendChildrenToString(string16* query) const { 188 int num_words = 0; 189 for (QueryNodeVector::const_iterator node = children_.begin(); 190 node != children_.end(); ++node) { 191 if (node != children_.begin()) 192 query->push_back(L' '); 193 num_words += (*node)->AppendToSQLiteQuery(query); 194 } 195 return num_words; 196 } 197 198 // A QueryNodePhrase is a phrase query ("quoted"). 199 class QueryNodePhrase : public QueryNodeList { 200 public: 201 virtual int AppendToSQLiteQuery(string16* query) const { 202 query->push_back(L'"'); 203 int num_words = AppendChildrenToString(query); 204 query->push_back(L'"'); 205 return num_words; 206 } 207 208 virtual bool Matches(const string16& word, bool exact) const; 209 virtual bool HasMatchIn(const std::vector<QueryWord>& words, 210 Snippet::MatchPositions* match_positions) const; 211 }; 212 213 bool QueryNodePhrase::Matches(const string16& word, bool exact) const { 214 NOTREACHED(); 215 return false; 216 } 217 218 bool QueryNodePhrase::HasMatchIn( 219 const std::vector<QueryWord>& words, 220 Snippet::MatchPositions* match_positions) const { 221 if (words.size() < children_.size()) 222 return false; 223 224 for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) { 225 bool matched_all = true; 226 for (size_t j = 0; j < children_.size(); ++j) { 227 if (!children_[j]->Matches(words[i + j].word, true)) { 228 matched_all = false; 229 break; 230 } 231 } 232 if (matched_all) { 233 const QueryWord& last_word = words[i + children_.size() - 1]; 234 match_positions->push_back( 235 Snippet::MatchPosition(words[i].position, 236 last_word.position + last_word.word.length())); 237 return true; 238 } 239 } 240 return false; 241 } 242 243 QueryParser::QueryParser() { 244 } 245 246 // static 247 bool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) { 248 DCHECK(!word.empty()); 249 size_t minimum_length = 3; 250 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility) 251 // because they 'behave like' Latin letters. Moreover, we should 252 // normalize the former before reaching here. 253 if (0xAC00 <= word[0] && word[0] <= 0xD7A3) 254 minimum_length = 2; 255 return word.size() >= minimum_length; 256 } 257 258 // Returns true if the character is considered a quote. 259 static bool IsQueryQuote(wchar_t ch) { 260 return ch == '"' || 261 ch == 0xab || // left pointing double angle bracket 262 ch == 0xbb || // right pointing double angle bracket 263 ch == 0x201c || // left double quotation mark 264 ch == 0x201d || // right double quotation mark 265 ch == 0x201e; // double low-9 quotation mark 266 } 267 268 int QueryParser::ParseQuery(const string16& query, 269 string16* sqlite_query) { 270 QueryNodeList root; 271 if (!ParseQueryImpl(query, &root)) 272 return 0; 273 return root.AppendToSQLiteQuery(sqlite_query); 274 } 275 276 void QueryParser::ParseQuery(const string16& query, 277 std::vector<QueryNode*>* nodes) { 278 QueryNodeList root; 279 if (ParseQueryImpl(l10n_util::ToLower(query), &root)) 280 nodes->swap(*root.children()); 281 } 282 283 284 void QueryParser::ExtractQueryWords(const string16& query, 285 std::vector<string16>* words) { 286 QueryNodeList root; 287 if (!ParseQueryImpl(query, &root)) 288 return; 289 root.AppendWords(words); 290 } 291 292 bool QueryParser::DoesQueryMatch(const string16& text, 293 const std::vector<QueryNode*>& query_nodes, 294 Snippet::MatchPositions* match_positions) { 295 if (query_nodes.empty()) 296 return false; 297 298 std::vector<QueryWord> query_words; 299 string16 lower_text = l10n_util::ToLower(text); 300 ExtractQueryWords(lower_text, &query_words); 301 302 if (query_words.empty()) 303 return false; 304 305 Snippet::MatchPositions matches; 306 for (size_t i = 0; i < query_nodes.size(); ++i) { 307 if (!query_nodes[i]->HasMatchIn(query_words, &matches)) 308 return false; 309 } 310 if (lower_text.length() != text.length()) { 311 // The lower case string differs from the original string. The matches are 312 // meaningless. 313 // TODO(sky): we need a better way to align the positions so that we don't 314 // completely punt here. 315 match_positions->clear(); 316 } else { 317 CoalseAndSortMatchPositions(&matches); 318 match_positions->swap(matches); 319 } 320 return true; 321 } 322 323 bool QueryParser::ParseQueryImpl(const string16& query, 324 QueryNodeList* root) { 325 base::BreakIterator iter(&query, base::BreakIterator::BREAK_WORD); 326 // TODO(evanm): support a locale here 327 if (!iter.Init()) 328 return false; 329 330 // To handle nesting, we maintain a stack of QueryNodeLists. 331 // The last element (back) of the stack contains the current, deepest node. 332 std::vector<QueryNodeList*> query_stack; 333 query_stack.push_back(root); 334 335 bool in_quotes = false; // whether we're currently in a quoted phrase 336 while (iter.Advance()) { 337 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 338 // is not necessarily a word, but could also be a sequence of punctuation 339 // or whitespace. 340 if (iter.IsWord()) { 341 string16 word = iter.GetString(); 342 343 QueryNodeWord* word_node = new QueryNodeWord(word); 344 if (in_quotes) 345 word_node->set_literal(true); 346 query_stack.back()->AddChild(word_node); 347 } else { // Punctuation. 348 if (IsQueryQuote(query[iter.prev()])) { 349 if (!in_quotes) { 350 QueryNodeList* quotes_node = new QueryNodePhrase; 351 query_stack.back()->AddChild(quotes_node); 352 query_stack.push_back(quotes_node); 353 in_quotes = true; 354 } else { 355 query_stack.pop_back(); // Stop adding to the quoted phrase. 356 in_quotes = false; 357 } 358 } 359 } 360 } 361 362 root->RemoveEmptySubnodes(); 363 return true; 364 } 365 366 void QueryParser::ExtractQueryWords(const string16& text, 367 std::vector<QueryWord>* words) { 368 base::BreakIterator iter(&text, base::BreakIterator::BREAK_WORD); 369 // TODO(evanm): support a locale here 370 if (!iter.Init()) 371 return; 372 373 while (iter.Advance()) { 374 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 375 // is not necessarily a word, but could also be a sequence of punctuation 376 // or whitespace. 377 if (iter.IsWord()) { 378 string16 word = iter.GetString(); 379 if (!word.empty()) { 380 words->push_back(QueryWord()); 381 words->back().word = word; 382 words->back().position = iter.prev(); 383 } 384 } 385 } 386 } 387