Home | History | Annotate | Download | only in features
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
     18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
     19 
     20 #include <mutex>  // NOLINT: see comments for state_mutex_
     21 #include <string>
     22 
     23 #include "lang_id/common/fel/feature-extractor.h"
     24 #include "lang_id/common/fel/task-context.h"
     25 #include "lang_id/common/fel/workspace.h"
     26 #include "lang_id/features/light-sentence-features.h"
     27 #include "lang_id/light-sentence.h"
     28 
     29 // TODO(abakalov): Add a test.
     30 namespace libtextclassifier3 {
     31 namespace mobile {
     32 namespace lang_id {
     33 
     34 // Class for computing continuous char ngram features.
     35 //
     36 // Feature function descriptor parameters:
     37 //   include_terminators(bool, false):
     38 //     If 'true', then splits the text based on spaces to get tokens, adds "^"
     39 //     to the beginning of each token, and adds "$" to the end of each token.
     40 //     NOTE: currently, we support only include_terminators=true.
     41 //   include_spaces(bool, false):
     42 //     If 'true', then includes char ngrams containing spaces.
     43 //     NOTE: currently, we support only include_spaces=false.
     44 //   use_equal_weight(bool, false):
     45 //     If 'true', then weighs each unique ngram by 1.0 / (number of unique
     46 //     ngrams in the input). Otherwise, weighs each unique ngram by (ngram
     47 //     count) / (total number of ngrams).
     48 //     NOTE: currently, we support only use_equal_weight=false.
     49 //   id_dim(int, 10000):
     50 //     The integer id of each char ngram is computed as follows:
     51 //     Hash32WithDefaultSeed(char ngram) % id_dim.
     52 //   size(int, 3):
     53 //     Only ngrams of this size will be extracted.
     54 //
     55 // NOTE: this class is not thread-safe.  TODO(salcianu): make it thread-safe.
     56 class ContinuousBagOfNgramsFunction : public LightSentenceFeature {
     57  public:
          // Overrides taking the TaskContext.  Their bodies are not in this header;
          // presumably they read the descriptor parameters listed in the class
          // comment (e.g., id_dim, size) into the fields declared below and return
          // false on failure -- TODO confirm against the implementation file.
     58   bool Setup(TaskContext *context) override;
     59   bool Init(TaskContext *context) override;
     60 
     61   // Appends the features computed from the sentence to the feature vector.
          //
          // This method is const; the work data it needs (counts_ and
          // non_zero_count_indices_, filled by ComputeNgramCounts()) is declared
          // 'mutable' below so that it can be updated from here.
          //
          // 'result' is the output feature vector.  How 'workspaces' is used is not
          // visible in this header.
     62   void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence,
     63                 FeatureVector *result) const override;
     64 
          // Presumably makes this class available to the feature registry under the
          // name "continuous-bag-of-ngrams"; the macro is defined elsewhere -- TODO
          // confirm.
     65   SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams",
     66                                    ContinuousBagOfNgramsFunction);
     67 
     68  private:
     69   // Auxiliary for Evaluate().  Fills counts_ and non_zero_count_indices_ (see
     70   // below), and returns the total ngram count.
          // Declared const, but modifies the two mutable work vectors.
     71   int ComputeNgramCounts(const LightSentence &sentence) const;
     72 
     73   // Guards counts_ and non_zero_count_indices_.  NOTE: we use std::* constructs
     74   // (instead of absl::Mutex & co) to simplify porting to Android and to avoid
     75   // pulling in absl (which increases our code size).
          // Declared mutable so that it can be locked from const methods.
          // NOTE(review): the class comment states that this class is not
          // thread-safe even though this mutex exists; whether Evaluate() actually
          // holds it is not visible in this header -- confirm and update whichever
          // comment is stale.
     76   mutable std::mutex state_mutex_;
     77 
     78   // counts_[i] is the count of all ngrams with id i.  Work data for Evaluate().
     79   // NOTE: we declare this vector as a field, such that its underlying capacity
     80   // stays allocated in between calls to Evaluate().
          // NOTE(review): std::vector is used here, but this header does not include
          // <vector> itself; presumably it arrives transitively through one of the
          // project headers -- consider including it directly.
     81   mutable std::vector<int> counts_;
     82 
     83   // Indices of non-zero elements of counts_.  See comments for counts_.
     84   mutable std::vector<int> non_zero_count_indices_;
     85 
     86   // The integer id of each char ngram is computed as follows:
     87   // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
          // NOTE(review): no in-class initializer; presumably assigned from the
          // 'id_dim' descriptor parameter in Setup()/Init() -- confirm it is always
          // set before Evaluate() reads it.
     88   int ngram_id_dimension_;
     89 
     90   // Only ngrams of size ngram_size_ will be extracted.
          // NOTE(review): no in-class initializer; presumably assigned from the
          // 'size' descriptor parameter in Setup()/Init() -- confirm.
     91   int ngram_size_;
     92 };
     93 
     94 }  // namespace lang_id
     95 }  // namespace mobile
     96 }  // namespace libtextclassifier3
     97 
     98 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
     99