Home | History | Annotate | Download | only in features
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "lang_id/features/relevant-script-feature.h"
     18 
     19 #include <string>
     20 
     21 #include "lang_id/common/fel/feature-types.h"
     22 #include "lang_id/common/fel/task-context.h"
     23 #include "lang_id/common/fel/workspace.h"
     24 #include "lang_id/common/lite_base/logging.h"
     25 #include "lang_id/common/utf8.h"
     26 #include "lang_id/script/script-detector.h"
     27 
     28 namespace libtextclassifier3 {
     29 namespace mobile {
     30 namespace lang_id {
     31 
     32 bool RelevantScriptFeature::Setup(TaskContext *context) {
     33   string script_detector_name = GetParameter(
     34       "script_detector_name", /* default_value = */ "tiny-script-detector");
     35 
     36   // We don't use absl::WrapUnique, nor the rest of absl, see http://b/71873194
     37   script_detector_.reset(ScriptDetector::Create(script_detector_name));
     38   if (script_detector_ == nullptr) {
     39     // This means ScriptDetector::Create() could not find the requested
     40     // script_detector_name.  In that case, Create() already logged an error
     41     // message.
     42     return false;
     43   }
     44 
     45   // We use default value 172 because this is the number of scripts supported by
     46   // the first model we trained with this feature.  See http://b/70617713.
     47   // Newer models may support more scripts.
     48   num_supported_scripts_ = GetIntParameter("num_supported_scripts", 172);
     49   return true;
     50 }
     51 
     52 bool RelevantScriptFeature::Init(TaskContext *context) {
     53   set_feature_type(new NumericFeatureType(name(), num_supported_scripts_));
     54   return true;
     55 }
     56 
     57 void RelevantScriptFeature::Evaluate(
     58     const WorkspaceSet &workspaces, const LightSentence &sentence,
     59     FeatureVector *result) const {
     60   // counts[s] is the number of characters with script s.
     61   std::vector<int> counts(num_supported_scripts_);
     62   int total_count = 0;
     63   for (const string &word : sentence) {
     64     const char *const word_end = word.data() + word.size();
     65     const char *curr = word.data();
     66 
     67     // Skip over token start '^'.
     68     SAFTM_DCHECK_EQ(*curr, '^');
     69     curr += utils::OneCharLen(curr);
     70     while (true) {
     71       const int num_bytes = utils::OneCharLen(curr);
     72 
     73       int script = script_detector_->GetScript(curr, num_bytes);
     74 
     75       // We do this update and the if (...) break below *before* incrementing
     76       // counts[script] in order to skip the token end '$'.
     77       curr += num_bytes;
     78       if (curr >= word_end) {
     79         SAFTM_DCHECK_EQ(*(curr - num_bytes), '$');
     80         break;
     81       }
     82       SAFTM_DCHECK_GE(script, 0);
     83 
     84       if (script < num_supported_scripts_) {
     85         counts[script]++;
     86         total_count++;
     87       } else {
     88         // Unsupported script: this usually indicates a script that is
     89         // recognized by newer versions of the code, after the model was
     90         // trained.  E.g., new code running with old model.
     91       }
     92     }
     93   }
     94 
     95   for (int script_id = 0; script_id < num_supported_scripts_; ++script_id) {
     96     int count = counts[script_id];
     97     if (count > 0) {
     98       const float weight = static_cast<float>(count) / total_count;
     99       FloatFeatureValue value(script_id, weight);
    100       result->add(feature_type(), value.discrete_value);
    101     }
    102   }
    103 }
    104 
     // Applies the SAFTM_STATIC_REGISTRATION macro (defined outside this file)
     // to RelevantScriptFeature.  NOTE(review): presumably this is what makes
     // the feature available for lookup by name at runtime -- confirm against
     // the macro's definition.
     105 SAFTM_STATIC_REGISTRATION(RelevantScriptFeature);
    106 
    107 }  // namespace lang_id
    108 }  // namespace mobile
     109 }  // namespace libtextclassifier3
    110