1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "actions/ngram-model.h" 18 19 #include <algorithm> 20 21 #include "actions/feature-processor.h" 22 #include "utils/hash/farmhash.h" 23 #include "utils/strings/stringpiece.h" 24 25 namespace libtextclassifier3 { 26 namespace { 27 28 // An iterator to iterate over the initial tokens of the n-grams of a model. 29 class FirstTokenIterator 30 : public std::iterator<std::random_access_iterator_tag, 31 /*value_type=*/uint32, /*difference_type=*/ptrdiff_t, 32 /*pointer=*/const uint32*, 33 /*reference=*/uint32&> { 34 public: 35 explicit FirstTokenIterator(const NGramLinearRegressionModel* model, 36 int index) 37 : model_(model), index_(index) {} 38 39 FirstTokenIterator& operator++() { 40 index_++; 41 return *this; 42 } 43 FirstTokenIterator& operator+=(ptrdiff_t dist) { 44 index_ += dist; 45 return *this; 46 } 47 ptrdiff_t operator-(const FirstTokenIterator& other_it) const { 48 return index_ - other_it.index_; 49 } 50 uint32 operator*() const { 51 const uint32 token_offset = (*model_->ngram_start_offsets())[index_]; 52 return (*model_->hashed_ngram_tokens())[token_offset]; 53 } 54 int index() const { return index_; } 55 56 private: 57 const NGramLinearRegressionModel* model_; 58 int index_; 59 }; 60 61 } // anonymous namespace 62 63 std::unique_ptr<NGramModel> NGramModel::Create( 64 const NGramLinearRegressionModel* model, const Tokenizer* tokenizer, 65 const UniLib* unilib) { 66 if (model == nullptr) { 67 return nullptr; 68 } 69 if (tokenizer == nullptr && model->tokenizer_options() == nullptr) { 70 TC3_LOG(ERROR) << "No tokenizer options specified."; 71 return nullptr; 72 } 73 return std::unique_ptr<NGramModel>(new NGramModel(model, tokenizer, unilib)); 74 } 75 76 NGramModel::NGramModel(const NGramLinearRegressionModel* model, 77 const Tokenizer* tokenizer, const UniLib* unilib) 78 : model_(model) { 79 // Create new tokenizer if options are specified, reuse feature processor 80 // tokenizer otherwise. 81 if (model->tokenizer_options() != nullptr) { 82 owned_tokenizer_ = CreateTokenizer(model->tokenizer_options(), unilib); 83 tokenizer_ = owned_tokenizer_.get(); 84 } else { 85 tokenizer_ = tokenizer; 86 } 87 } 88 89 // Returns whether a given n-gram matches the token stream. 90 bool NGramModel::IsNGramMatch(const uint32* tokens, size_t num_tokens, 91 const uint32* ngram_tokens, 92 size_t num_ngram_tokens, int max_skips) const { 93 int token_idx = 0, ngram_token_idx = 0, skip_remain = 0; 94 for (; token_idx < num_tokens && ngram_token_idx < num_ngram_tokens;) { 95 if (tokens[token_idx] == ngram_tokens[ngram_token_idx]) { 96 // Token matches. Advance both and reset the skip budget. 97 ++token_idx; 98 ++ngram_token_idx; 99 skip_remain = max_skips; 100 } else if (skip_remain > 0) { 101 // No match, but we have skips left, so just advance over the token. 102 ++token_idx; 103 skip_remain--; 104 } else { 105 // No match and we're out of skips. Reject. 106 return false; 107 } 108 } 109 return ngram_token_idx == num_ngram_tokens; 110 } 111 112 // Calculates the total number of skip-grams that can be created for a stream 113 // with the given number of tokens. 114 uint64 NGramModel::GetNumSkipGrams(int num_tokens, int max_ngram_length, 115 int max_skips) { 116 // Start with unigrams. 117 uint64 total = num_tokens; 118 for (int ngram_len = 2; 119 ngram_len <= max_ngram_length && ngram_len <= num_tokens; ++ngram_len) { 120 // We can easily compute the expected length of the n-gram (with skips), 121 // but it doesn't account for the fact that they may be longer than the 122 // input and should be pruned. 123 // Instead, we iterate over the distribution of effective n-gram lengths 124 // and add each length individually. 125 const int num_gaps = ngram_len - 1; 126 const int len_min = ngram_len; 127 const int len_max = ngram_len + num_gaps * max_skips; 128 const int len_mid = (len_max + len_min) / 2; 129 for (int len_i = len_min; len_i <= len_max; ++len_i) { 130 if (len_i > num_tokens) continue; 131 const int num_configs_of_len_i = 132 len_i <= len_mid ? len_i - len_min + 1 : len_max - len_i + 1; 133 const int num_start_offsets = num_tokens - len_i + 1; 134 total += num_configs_of_len_i * num_start_offsets; 135 } 136 } 137 return total; 138 } 139 140 std::pair<int, int> NGramModel::GetFirstTokenMatches(uint32 token_hash) const { 141 const int num_ngrams = model_->ngram_weights()->size(); 142 const auto start_it = FirstTokenIterator(model_, 0); 143 const auto end_it = FirstTokenIterator(model_, num_ngrams); 144 const int start = std::lower_bound(start_it, end_it, token_hash).index(); 145 const int end = std::upper_bound(start_it, end_it, token_hash).index(); 146 return std::make_pair(start, end); 147 } 148 149 bool NGramModel::Eval(const UnicodeText& text, float* score) const { 150 const std::vector<Token> raw_tokens = tokenizer_->Tokenize(text); 151 152 // If we have no tokens, then just bail early. 153 if (raw_tokens.empty()) { 154 if (score != nullptr) { 155 *score = model_->default_token_weight(); 156 } 157 return false; 158 } 159 160 // Hash the tokens. 161 std::vector<uint32> tokens; 162 tokens.reserve(raw_tokens.size()); 163 for (const Token& raw_token : raw_tokens) { 164 tokens.push_back(tc3farmhash::Fingerprint32(raw_token.value.data(), 165 raw_token.value.length())); 166 } 167 168 // Calculate the total number of skip-grams that can be generated for the 169 // input text. 170 const uint64 num_candidates = GetNumSkipGrams( 171 tokens.size(), model_->max_denom_ngram_length(), model_->max_skips()); 172 173 // For each token, see whether it denotes the start of an n-gram in the model. 174 int num_matches = 0; 175 float weight_matches = 0.f; 176 for (size_t start_i = 0; start_i < tokens.size(); ++start_i) { 177 const std::pair<int, int> ngram_range = 178 GetFirstTokenMatches(tokens[start_i]); 179 for (int ngram_idx = ngram_range.first; ngram_idx < ngram_range.second; 180 ++ngram_idx) { 181 const uint16 ngram_tokens_begin = 182 (*model_->ngram_start_offsets())[ngram_idx]; 183 const uint16 ngram_tokens_end = 184 (*model_->ngram_start_offsets())[ngram_idx + 1]; 185 if (IsNGramMatch( 186 /*tokens=*/tokens.data() + start_i, 187 /*num_tokens=*/tokens.size() - start_i, 188 /*ngram_tokens=*/model_->hashed_ngram_tokens()->data() + 189 ngram_tokens_begin, 190 /*num_ngram_tokens=*/ngram_tokens_end - ngram_tokens_begin, 191 /*max_skips=*/model_->max_skips())) { 192 ++num_matches; 193 weight_matches += (*model_->ngram_weights())[ngram_idx]; 194 } 195 } 196 } 197 198 // Calculate the score. 199 const int num_misses = num_candidates - num_matches; 200 const float internal_score = 201 (weight_matches + (model_->default_token_weight() * num_misses)) / 202 num_candidates; 203 if (score != nullptr) { 204 *score = internal_score; 205 } 206 return internal_score > model_->threshold(); 207 } 208 209 } // namespace libtextclassifier3 210