Home | History | Annotate | Download | only in lang_id
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_
     18 #define LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_
     19 
#include <string>
#include <vector>

#include "common/feature-extractor.h"
#include "common/task-context.h"
#include "common/workspace.h"
#include "lang_id/light-sentence-features.h"
#include "lang_id/light-sentence.h"
     27 
     28 namespace libtextclassifier {
     29 namespace nlp_core {
     30 namespace lang_id {
     31 
     32 // Class for computing continuous char ngram features.
     33 //
     34 // Feature function descriptor parameters:
     35 //   id_dim(int, 10000):
     36 //     The integer id of each char ngram is computed as follows:
     37 //     Hash32WithDefaultSeed(char ngram) % id_dim.
     38 //   size(int, 3):
     39 //     Only ngrams of this size will be extracted.
     40 //
     41 // NOTE: this class is not thread-safe.  TODO(salcianu): make it thread-safe.
     42 class ContinuousBagOfNgramsFunction : public LightSentenceFeature {
     43  public:
     44   bool Setup(TaskContext *context) override;
     45   bool Init(TaskContext *context) override;
     46 
     47   // Appends the features computed from the sentence to the feature vector.
     48   void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence,
     49                 FeatureVector *result) const override;
     50 
     51   TC_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams",
     52                                 ContinuousBagOfNgramsFunction);
     53 
     54  private:
     55   // Auxiliary for Evaluate().  Fills counts_ and non_zero_count_indices_ (see
     56   // below), and returns the total ngram count.
     57   int ComputeNgramCounts(const LightSentence &sentence) const;
     58 
     59   // counts_[i] is the count of all ngrams with id i.  Work data for Evaluate().
     60   // NOTE: we declare this vector as a field, such that its underlying capacity
     61   // stays allocated in between calls to Evaluate().
     62   mutable std::vector<int> counts_;
     63 
     64   // Indices of non-zero elements of counts_.  See comments for counts_.
     65   mutable std::vector<int> non_zero_count_indices_;
     66 
     67   // The integer id of each char ngram is computed as follows:
     68   // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
     69   int ngram_id_dimension_;
     70 
     71   // Only ngrams of size ngram_size_ will be extracted.
     72   int ngram_size_;
     73 };
     74 
     75 }  // namespace lang_id
     76 }  // namespace nlp_core
     77 }  // namespace libtextclassifier
     78 
     79 #endif  // LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_
     80