Home | History | Annotate | Download | only in search
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/ui/app_list/search/tokenized_string_match.h"
      6 
      7 #include "base/i18n/string_search.h"
      8 #include "base/logging.h"
      9 #include "chrome/browser/ui/app_list/search/tokenized_string_char_iterator.h"
     10 
     11 namespace app_list {
     12 
     13 namespace {
     14 
     15 // The factors below are applied when the current char of query matches
     16 // the current char of the text to be matched. Different factors are chosen
     17 // based on where the match happens. kIsPrefixMultiplier is used when the
     18 // matched portion is a prefix of both the query and the text, which implies
     19 // that the matched chars are at the same position in query and text. This is
     20 // the most preferred case thus it has the highest score. When the current char
     21 // of the query and the text does not match, the algorithm moves to the next
     22 // token in the text and try to match from there. kIsFrontOfWordMultipler will
     23 // be used if the first char of the token matches the current char of the query.
     24 // Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
     25 // used.
     26 // Examples:
     27 //   Suppose the text to be matched is 'Google Chrome'.
     28 //   Query 'go' would yield kIsPrefixMultiplier for each char.
     29 //   Query 'gc' would use kIsPrefixMultiplier for 'g' and
     30 //       kIsFrontOfWordMultipler for 'c'.
     31 //   Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
     32 //       kIsWeakHitMultiplier for 'h'.
     33 //   Query 'oo' does not match any prefix and would use the substring match
     34 //       fallback, thus kIsSubstringMultiplier is used for each char.
     35 const double kIsPrefixMultiplier = 1.0;
     36 const double kIsFrontOfWordMultipler = 0.8;
     37 const double kIsWeakHitMultiplier = 0.6;
     38 const double kIsSubstringMultiplier = 0.4;
     39 
     40 // A relevance score that represents no match.
     41 const double kNoMatchScore = 0.0;
     42 
     43 // PrefixMatcher matches the chars of a given query as prefix of tokens in
     44 // a given text or as prefix of the acronyms of those text tokens.
     45 class PrefixMatcher {
     46  public:
     47   PrefixMatcher(const TokenizedString& query,
     48                 const TokenizedString& text)
     49       : query_iter_(query),
     50         text_iter_(text),
     51         current_match_(gfx::Range::InvalidRange()),
     52         current_relevance_(kNoMatchScore) {
     53   }
     54 
     55   // Invokes RunMatch to perform prefix match. Use |states_| as a stack to
     56   // perform DFS (depth first search) so that all possible matches are
     57   // attempted. Stops on the first full match and returns true. Otherwise,
     58   // returns false to indicate no match.
     59   bool Match() {
     60     while (!RunMatch()) {
     61       // No match found and no more states to try. Bail out.
     62       if (states_.empty()) {
     63         current_relevance_ = kNoMatchScore;
     64         current_hits_.clear();
     65         return false;
     66       }
     67 
     68       PopState();
     69 
     70       // Skip restored match to try other possibilites.
     71       AdvanceToNextTextToken();
     72     }
     73 
     74     if (current_match_.IsValid())
     75       current_hits_.push_back(current_match_);
     76 
     77     return true;
     78   }
     79 
     80   double relevance() const { return current_relevance_; }
     81   const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
     82 
     83  private:
     84   // Context record of a match.
     85   struct State {
     86     State() : relevance(kNoMatchScore) {}
     87     State(double relevance,
     88           const gfx::Range& current_match,
     89           const TokenizedStringMatch::Hits& hits,
     90           const TokenizedStringCharIterator& query_iter,
     91           const TokenizedStringCharIterator& text_iter)
     92         : relevance(relevance),
     93           current_match(current_match),
     94           hits(hits.begin(), hits.end()),
     95           query_iter_state(query_iter.GetState()),
     96           text_iter_state(text_iter.GetState()) {}
     97 
     98     // The current score of the processed query chars.
     99     double relevance;
    100 
    101     // Current matching range.
    102     gfx::Range current_match;
    103 
    104     // Completed matching ranges of the processed query chars.
    105     TokenizedStringMatch::Hits hits;
    106 
    107     // States of the processed query and text chars.
    108     TokenizedStringCharIterator::State query_iter_state;
    109     TokenizedStringCharIterator::State text_iter_state;
    110   };
    111   typedef std::vector<State> States;
    112 
    113   // Match chars from the query and text one by one. For each matching char,
    114   // calculate relevance and matching ranges. And the current stats is
    115   // recorded so that the match could be skipped later to try other
    116   // possiblities. Repeat until any of the iterators run out. Return true if
    117   // query iterator runs out, i.e. all chars in query are matched.
    118   bool RunMatch() {
    119     while (!query_iter_.end() && !text_iter_.end()) {
    120       if (query_iter_.Get() == text_iter_.Get()) {
    121         PushState();
    122 
    123         if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
    124           current_relevance_ += kIsPrefixMultiplier;
    125         else if (text_iter_.IsFirstCharOfToken())
    126           current_relevance_ += kIsFrontOfWordMultipler;
    127         else
    128           current_relevance_ += kIsWeakHitMultiplier;
    129 
    130         if (!current_match_.IsValid())
    131           current_match_.set_start(text_iter_.GetArrayPos());
    132         current_match_.set_end(text_iter_.GetArrayPos() +
    133                               text_iter_.GetCharSize());
    134 
    135         query_iter_.NextChar();
    136         text_iter_.NextChar();
    137       } else {
    138         AdvanceToNextTextToken();
    139       }
    140     }
    141 
    142     return query_iter_.end();
    143   }
    144 
    145   // Skip to the next text token and close current match. Invoked when a
    146   // mismatch happens or to skip a restored match.
    147   void AdvanceToNextTextToken() {
    148     if (current_match_.IsValid()) {
    149       current_hits_.push_back(current_match_);
    150       current_match_ = gfx::Range::InvalidRange();
    151     }
    152 
    153     text_iter_.NextToken();
    154   }
    155 
    156   void PushState() {
    157     states_.push_back(State(current_relevance_,
    158                             current_match_,
    159                             current_hits_,
    160                             query_iter_,
    161                             text_iter_));
    162   }
    163 
    164   void PopState() {
    165     DCHECK(!states_.empty());
    166 
    167     State& last_match = states_.back();
    168     current_relevance_ = last_match.relevance;
    169     current_match_ = last_match.current_match;
    170     current_hits_.swap(last_match.hits);
    171     query_iter_.SetState(last_match.query_iter_state);
    172     text_iter_.SetState(last_match.text_iter_state);
    173 
    174     states_.pop_back();
    175   }
    176 
    177   TokenizedStringCharIterator query_iter_;
    178   TokenizedStringCharIterator text_iter_;
    179 
    180   States states_;
    181   gfx::Range current_match_;
    182 
    183   double current_relevance_;
    184   TokenizedStringMatch::Hits current_hits_;
    185 
    186   DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
    187 };
    188 
    189 }  // namespace
    190 
    191 TokenizedStringMatch::TokenizedStringMatch()
    192     : relevance_(kNoMatchScore) {}
    193 
    194 TokenizedStringMatch::~TokenizedStringMatch() {}
    195 
    196 bool TokenizedStringMatch::Calculate(const TokenizedString& query,
    197                                      const TokenizedString& text) {
    198   relevance_ = kNoMatchScore;
    199   hits_.clear();
    200 
    201   PrefixMatcher matcher(query, text);
    202   if (matcher.Match()) {
    203     relevance_ = matcher.relevance();
    204     hits_.assign(matcher.hits().begin(), matcher.hits().end());
    205   }
    206 
    207   // Substring match as a fallback.
    208   if (relevance_ == kNoMatchScore) {
    209     size_t substr_match_start = 0;
    210     size_t substr_match_length = 0;
    211     if (base::i18n::StringSearchIgnoringCaseAndAccents(query.text(),
    212                                                        text.text(),
    213                                                        &substr_match_start,
    214                                                        &substr_match_length)) {
    215       relevance_ = kIsSubstringMultiplier * substr_match_length;
    216       hits_.push_back(gfx::Range(substr_match_start,
    217                                  substr_match_start + substr_match_length));
    218     }
    219   }
    220 
    221   // Using length() for normalizing is not 100% correct but should be good
    222   // enough compared with using real char count of the text.
    223   if (text.text().length())
    224     relevance_ /= text.text().length();
    225 
    226   return relevance_ > kNoMatchScore;
    227 }
    228 
    229 bool TokenizedStringMatch::Calculate(const base::string16& query,
    230                                      const base::string16& text) {
    231   const TokenizedString tokenized_query(query);
    232   const TokenizedString tokenized_text(text);
    233   return Calculate(tokenized_query, tokenized_text);
    234 }
    235 
    236 }  // namespace app_list
    237