/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "actions/ngram-model.h"

#include <algorithm>

#include "actions/feature-processor.h"
#include "utils/hash/farmhash.h"
#include "utils/strings/stringpiece.h"

namespace libtextclassifier3 {
namespace {

// An iterator over the first tokens of a model's n-grams.
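// Being a random-access iterator allows std::lower_bound/std::upper_bound to
// binary-search the n-grams by the hash of their first token (see
// GetFirstTokenMatches below), which relies on the model storing its n-grams
// sorted by that hash.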
class FirstTokenIterator
    : public std::iterator<std::random_access_iterator_tag,
                           /*value_type=*/uint32, /*difference_type=*/ptrdiff_t,
                           /*pointer=*/const uint32*,
                           /*reference=*/uint32&> {
 public:
  explicit FirstTokenIterator(const NGramLinearRegressionModel* model,
                              int index)
      : model_(model), index_(index) {}

  FirstTokenIterator& operator++() {
    index_++;
    return *this;
  }
  FirstTokenIterator& operator+=(ptrdiff_t dist) {
    index_ += dist;
    return *this;
  }
  ptrdiff_t operator-(const FirstTokenIterator& other_it) const {
    return index_ - other_it.index_;
  }
  uint32 operator*() const {
    const uint32 token_offset = (*model_->ngram_start_offsets())[index_];
    return (*model_->hashed_ngram_tokens())[token_offset];
  }
  int index() const { return index_; }

 private:
  const NGramLinearRegressionModel* model_;
  int index_;
};

}  // anonymous namespace

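// A minimal caller-side usage sketch (the input names are illustrative only):
//   std::unique_ptr<NGramModel> ngram_model =
//       NGramModel::Create(ngram_model_flatbuffer, /*tokenizer=*/nullptr,
//                          unilib);
//   float score = 0.f;
//   if (ngram_model != nullptr && ngram_model->Eval(text, &score)) {
//     // The text scored above the model's threshold.
//   }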
std::unique_ptr<NGramModel> NGramModel::Create(
    const NGramLinearRegressionModel* model, const Tokenizer* tokenizer,
    const UniLib* unilib) {
  if (model == nullptr) {
    return nullptr;
  }
  if (tokenizer == nullptr && model->tokenizer_options() == nullptr) {
    TC3_LOG(ERROR) << "No tokenizer options specified.";
    return nullptr;
  }
  return std::unique_ptr<NGramModel>(new NGramModel(model, tokenizer, unilib));
}

NGramModel::NGramModel(const NGramLinearRegressionModel* model,
                       const Tokenizer* tokenizer, const UniLib* unilib)
    : model_(model) {
  // Create a new tokenizer if options are specified, otherwise reuse the
  // feature processor tokenizer.
  if (model->tokenizer_options() != nullptr) {
    owned_tokenizer_ = CreateTokenizer(model->tokenizer_options(), unilib);
    tokenizer_ = owned_tokenizer_.get();
  } else {
    tokenizer_ = tokenizer;
  }
}

// Returns whether a given n-gram matches the token stream.
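// For example, with max_skips = 1 the hashed n-gram [A, B] matches the token
// stream [A, X, B] (skipping over X), but not [A, X, Y, B], which would need
// two consecutive skips.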
bool NGramModel::IsNGramMatch(const uint32* tokens, size_t num_tokens,
                              const uint32* ngram_tokens,
                              size_t num_ngram_tokens, int max_skips) const {
  int token_idx = 0, ngram_token_idx = 0, skip_remain = 0;
  for (; token_idx < num_tokens && ngram_token_idx < num_ngram_tokens;) {
    if (tokens[token_idx] == ngram_tokens[ngram_token_idx]) {
      // Token matches. Advance both and reset the skip budget.
      ++token_idx;
      ++ngram_token_idx;
      skip_remain = max_skips;
    } else if (skip_remain > 0) {
      // No match, but we have skips left, so just advance over the token.
      ++token_idx;
      skip_remain--;
    } else {
      // No match and we're out of skips. Reject.
      return false;
    }
  }
  return ngram_token_idx == num_ngram_tokens;
}

// Calculates the total number of skip-grams that can be created for a stream
// with the given number of tokens.
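// For example, num_tokens = 3, max_ngram_length = 2 and max_skips = 1 yields
// 6: the 3 unigrams, the 2 adjacent bigrams, and 1 bigram that skips the
// middle token.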
uint64 NGramModel::GetNumSkipGrams(int num_tokens, int max_ngram_length,
                                   int max_skips) {
  // Start with unigrams.
  uint64 total = num_tokens;
  for (int ngram_len = 2;
       ngram_len <= max_ngram_length && ngram_len <= num_tokens; ++ngram_len) {
    // We could easily compute the expected length of an n-gram with skips, but
    // that would not account for n-grams whose effective span extends past the
    // input and must be pruned. Instead, we iterate over the distribution of
    // effective n-gram lengths and add each length individually.
    const int num_gaps = ngram_len - 1;
    const int len_min = ngram_len;
    const int len_max = ngram_len + num_gaps * max_skips;
    const int len_mid = (len_max + len_min) / 2;
    for (int len_i = len_min; len_i <= len_max; ++len_i) {
      if (len_i > num_tokens) continue;
      const int num_configs_of_len_i =
          len_i <= len_mid ? len_i - len_min + 1 : len_max - len_i + 1;
      const int num_start_offsets = num_tokens - len_i + 1;
      total += num_configs_of_len_i * num_start_offsets;
    }
  }
  return total;
}

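// Returns the [start, end) index range of the model n-grams whose first token
// hash equals `token_hash`.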
std::pair<int, int> NGramModel::GetFirstTokenMatches(uint32 token_hash) const {
  const int num_ngrams = model_->ngram_weights()->size();
  const auto start_it = FirstTokenIterator(model_, 0);
  const auto end_it = FirstTokenIterator(model_, num_ngrams);
  const int start = std::lower_bound(start_it, end_it, token_hash).index();
  const int end = std::upper_bound(start_it, end_it, token_hash).index();
  return std::make_pair(start, end);
}

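// Tokenizes and hashes the input text, matches the model n-grams against the
// hashed tokens, and computes a normalized score; returns whether that score
// exceeds the model threshold.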
bool NGramModel::Eval(const UnicodeText& text, float* score) const {
  const std::vector<Token> raw_tokens = tokenizer_->Tokenize(text);

  // If we have no tokens, then just bail early.
  if (raw_tokens.empty()) {
    if (score != nullptr) {
      *score = model_->default_token_weight();
    }
    return false;
  }

  // Hash the tokens.
  std::vector<uint32> tokens;
  tokens.reserve(raw_tokens.size());
  for (const Token& raw_token : raw_tokens) {
    tokens.push_back(tc3farmhash::Fingerprint32(raw_token.value.data(),
                                                raw_token.value.length()));
  }

  // Calculate the total number of skip-grams that can be generated for the
  // input text.
  const uint64 num_candidates = GetNumSkipGrams(
      tokens.size(), model_->max_denom_ngram_length(), model_->max_skips());

  // For each token, see whether it denotes the start of an n-gram in the
  // model.
  int num_matches = 0;
  float weight_matches = 0.f;
  for (size_t start_i = 0; start_i < tokens.size(); ++start_i) {
    const std::pair<int, int> ngram_range =
        GetFirstTokenMatches(tokens[start_i]);
    for (int ngram_idx = ngram_range.first; ngram_idx < ngram_range.second;
         ++ngram_idx) {
      const uint16 ngram_tokens_begin =
          (*model_->ngram_start_offsets())[ngram_idx];
      const uint16 ngram_tokens_end =
          (*model_->ngram_start_offsets())[ngram_idx + 1];
      if (IsNGramMatch(
              /*tokens=*/tokens.data() + start_i,
              /*num_tokens=*/tokens.size() - start_i,
              /*ngram_tokens=*/model_->hashed_ngram_tokens()->data() +
                  ngram_tokens_begin,
              /*num_ngram_tokens=*/ngram_tokens_end - ngram_tokens_begin,
              /*max_skips=*/model_->max_skips())) {
        ++num_matches;
        weight_matches += (*model_->ngram_weights())[ngram_idx];
      }
    }
  }

  // Calculate the score.
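  // Each matched skip-gram contributes its learned weight; every remaining
  // candidate skip-gram contributes the default token weight. The sum is
  // normalized by the total number of candidates.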
  const int num_misses = num_candidates - num_matches;
  const float internal_score =
      (weight_matches + (model_->default_token_weight() * num_misses)) /
      num_candidates;
  if (score != nullptr) {
    *score = internal_score;
  }
  return internal_score > model_->threshold();
}

}  // namespace libtextclassifier3