Home | History | Annotate | Download | only in actions
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_
     18 #define LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_
     19 
     20 #include <memory>
     21 
     22 #include "actions/actions_model_generated.h"
     23 #include "utils/tokenizer.h"
     24 #include "utils/utf8/unicodetext.h"
     25 #include "utils/utf8/unilib.h"
     26 
     27 namespace libtextclassifier3 {
     28 
     29 class NGramModel {
     30  public:
     31   static std::unique_ptr<NGramModel> Create(
     32       const NGramLinearRegressionModel* model, const Tokenizer* tokenizer,
     33       const UniLib* unilib);
     34 
     35   // Evaluates an n-gram linear regression model, and tests against the
     36   // threshold. Returns true in case of a positive classification. The caller
     37   // may also optionally query the score.
     38   bool Eval(const UnicodeText& text, float* score = nullptr) const;
     39 
     40   // Exposed for testing only.
     41   static uint64 GetNumSkipGrams(int num_tokens, int max_ngram_length,
     42                                 int max_skips);
     43 
     44  private:
     45   NGramModel(const NGramLinearRegressionModel* model,
     46              const Tokenizer* tokenizer, const UniLib* unilib);
     47 
     48   // Returns the (begin,end] range of n-grams where the first hashed token
     49   // matches the given value.
     50   std::pair<int, int> GetFirstTokenMatches(uint32 token_hash) const;
     51 
     52   // Returns whether a given n-gram matches the token stream.
     53   bool IsNGramMatch(const uint32* tokens, size_t num_tokens,
     54                     const uint32* ngram_tokens, size_t num_ngram_tokens,
     55                     int max_skips) const;
     56 
     57   const NGramLinearRegressionModel* model_;
     58   const Tokenizer* tokenizer_;
     59   std::unique_ptr<Tokenizer> owned_tokenizer_;
     60 };
     61 
     62 }  // namespace libtextclassifier3
     63 
     64 #endif  // LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_
     65