app_list/search/tokenized_string_match.cc

// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "ui/app_list/search/tokenized_string_match.h"

#include "base/i18n/string_search.h"
#include "base/logging.h"
#include "ui/app_list/search/tokenized_string_char_iterator.h"

namespace app_list {

namespace {

// The factors below are applied when the current char of query matches
// the current char of the text to be matched. Different factors are chosen
// based on where the match happens. kIsPrefixMultiplier is used when the
// matched portion is a prefix of both the query and the text, which implies
// that the matched chars are at the same position in query and text. This is
// the most preferred case thus it has the highest score. When the current char
// of the query and the text does not match, the algorithm moves to the next
// token in the text and try to match from there. kIsFrontOfWordMultipler will
// be used if the first char of the token matches the current char of the query.
// Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
// used.
// Examples:
//   Suppose the text to be matched is 'Google Chrome'.
//   Query 'go' would yield kIsPrefixMultiplier for each char.
//   Query 'gc' would use kIsPrefixMultiplier for 'g' and
//       kIsFrontOfWordMultipler for 'c'.
//   Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
//       kIsWeakHitMultiplier for 'h'.
//   Query 'oo' does not match any prefix and would use the substring match
//       fallback, thus kIsSubstringMultiplier is used for each char.
const double kIsPrefixMultiplier = 1.0;
const double kIsFrontOfWordMultipler = 0.8;
const double kIsWeakHitMultiplier = 0.6;
const double kIsSubstringMultiplier = 0.4;

// A relevance score that represents no match.
const double kNoMatchScore = 0.0;

// PrefixMatcher matches the chars of a given query as prefix of tokens in
// a given text or as prefix of the acronyms of those text tokens.
class PrefixMatcher {
 public:
  PrefixMatcher(const TokenizedString& query,
                const TokenizedString& text)
      : query_iter_(query),
        text_iter_(text),
        current_match_(gfx::Range::InvalidRange()),
        current_relevance_(kNoMatchScore) {
  }

  // Invokes RunMatch to perform prefix match. Use |states_| as a stack to
  // perform DFS (depth first search) so that all possible matches are
  // attempted. Stops on the first full match and returns true. Otherwise,
  // returns false to indicate no match.
  bool Match() {
    while (!RunMatch()) {
      // No match found and no more states to try. Bail out.
      if (states_.empty()) {
        current_relevance_ = kNoMatchScore;
        current_hits_.clear();
        return false;
      }

      PopState();

      // Skip restored match to try other possibilites.
      AdvanceToNextTextToken();
    }

    if (current_match_.IsValid())
      current_hits_.push_back(current_match_);

    return true;
  }

  double relevance() const { return current_relevance_; }
  const TokenizedStringMatch::Hits& hits() const { return current_hits_; }

 private:
  // Context record of a match.
  struct State {
    State() : relevance(kNoMatchScore) {}
    State(double relevance,
          const gfx::Range& current_match,
          const TokenizedStringMatch::Hits& hits,
          const TokenizedStringCharIterator& query_iter,
          const TokenizedStringCharIterator& text_iter)
        : relevance(relevance),
          current_match(current_match),
          hits(hits.begin(), hits.end()),
          query_iter_state(query_iter.GetState()),
          text_iter_state(text_iter.GetState()) {}

    // The current score of the processed query chars.
    double relevance;

    // Current matching range.
    gfx::Range current_match;

    // Completed matching ranges of the processed query chars.
    TokenizedStringMatch::Hits hits;

    // States of the processed query and text chars.
    TokenizedStringCharIterator::State query_iter_state;
    TokenizedStringCharIterator::State text_iter_state;
  };
  typedef std::vector<State> States;

  // Match chars from the query and text one by one. For each matching char,
  // calculate relevance and matching ranges. And the current stats is
  // recorded so that the match could be skipped later to try other
  // possiblities. Repeat until any of the iterators run out. Return true if
  // query iterator runs out, i.e. all chars in query are matched.
  bool RunMatch() {
    while (!query_iter_.end() && !text_iter_.end()) {
      if (query_iter_.Get() == text_iter_.Get()) {
        PushState();

        if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
          current_relevance_ += kIsPrefixMultiplier;
        else if (text_iter_.IsFirstCharOfToken())
          current_relevance_ += kIsFrontOfWordMultipler;
        else
          current_relevance_ += kIsWeakHitMultiplier;

        if (!current_match_.IsValid())
          current_match_.set_start(text_iter_.GetArrayPos());
        current_match_.set_end(text_iter_.GetArrayPos() +
                              text_iter_.GetCharSize());

        query_iter_.NextChar();
        text_iter_.NextChar();
      } else {
        AdvanceToNextTextToken();
      }
    }

    return query_iter_.end();
  }

  // Skip to the next text token and close current match. Invoked when a
  // mismatch happens or to skip a restored match.
  void AdvanceToNextTextToken() {
    if (current_match_.IsValid()) {
      current_hits_.push_back(current_match_);
      current_match_ = gfx::Range::InvalidRange();
    }

    text_iter_.NextToken();
  }

  void PushState() {
    states_.push_back(State(current_relevance_,
                            current_match_,
                            current_hits_,
                            query_iter_,
                            text_iter_));
  }

  void PopState() {
    DCHECK(!states_.empty());

    State& last_match = states_.back();
    current_relevance_ = last_match.relevance;
    current_match_ = last_match.current_match;
    current_hits_.swap(last_match.hits);
    query_iter_.SetState(last_match.query_iter_state);
    text_iter_.SetState(last_match.text_iter_state);

    states_.pop_back();
  }

  TokenizedStringCharIterator query_iter_;
  TokenizedStringCharIterator text_iter_;

  States states_;
  gfx::Range current_match_;

  double current_relevance_;
  TokenizedStringMatch::Hits current_hits_;

  DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
};

}  // namespace

TokenizedStringMatch::TokenizedStringMatch()
    : relevance_(kNoMatchScore) {}

TokenizedStringMatch::~TokenizedStringMatch() {}

bool TokenizedStringMatch::Calculate(const TokenizedString& query,
                                     const TokenizedString& text) {
  relevance_ = kNoMatchScore;
  hits_.clear();

  PrefixMatcher matcher(query, text);
  if (matcher.Match()) {
    relevance_ = matcher.relevance();
    hits_.assign(matcher.hits().begin(), matcher.hits().end());
  }

  // Substring match as a fallback.
  if (relevance_ == kNoMatchScore) {
    size_t substr_match_start = 0;
    size_t substr_match_length = 0;
    if (base::i18n::StringSearchIgnoringCaseAndAccents(query.text(),
                                                       text.text(),
                                                       &substr_match_start,
                                                       &substr_match_length)) {
      relevance_ = kIsSubstringMultiplier * substr_match_length;
      hits_.push_back(gfx::Range(substr_match_start,
                                 substr_match_start + substr_match_length));
    }
  }

  // Using length() for normalizing is not 100% correct but should be good
  // enough compared with using real char count of the text.
  if (text.text().length())
    relevance_ /= text.text().length();

  return relevance_ > kNoMatchScore;
}

bool TokenizedStringMatch::Calculate(const base::string16& query,
                                     const base::string16& text) {
  const TokenizedString tokenized_query(query);
  const TokenizedString tokenized_text(text);
  return Calculate(tokenized_query, tokenized_text);
}

}  // namespace app_list