1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_RELEVANT_SCRIPT_FEATURE_H_ 18 #define LIBTEXTCLASSIFIER_LANG_ID_RELEVANT_SCRIPT_FEATURE_H_ 19 20 #include "common/feature-extractor.h" 21 #include "common/task-context.h" 22 #include "common/workspace.h" 23 #include "lang_id/light-sentence-features.h" 24 #include "lang_id/light-sentence.h" 25 26 namespace libtextclassifier { 27 namespace nlp_core { 28 namespace lang_id { 29 30 // Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode 31 // script (see below): each such feature indicates the script and the ratio of 32 // UTF8 characters in that script, in the given sentence. 33 // 34 // What is a relevant script? Recognizing all 100+ Unicode scripts would 35 // require too much code size and runtime. Instead, we focus only on a few 36 // scripts that communicate a lot of language information: e.g., the use of 37 // Hiragana characters almost always indicates Japanese, so Hiragana is a 38 // "relevant" script for us. The Latin script is used by dozens of language, so 39 // Latin is not relevant in this context. 40 class RelevantScriptFeature : public LightSentenceFeature { 41 public: 42 // Idiomatic SAFT Setup() and Init(). 43 bool Setup(TaskContext *context) override; 44 bool Init(TaskContext *context) override; 45 46 // Appends the features computed from the sentence to the feature vector. 47 void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence, 48 FeatureVector *result) const override; 49 50 TC_DEFINE_REGISTRATION_METHOD("continuous-bag-of-relevant-scripts", 51 RelevantScriptFeature); 52 }; 53 54 } // namespace lang_id 55 } // namespace nlp_core 56 } // namespace libtextclassifier 57 58 #endif // LIBTEXTCLASSIFIER_LANG_ID_RELEVANT_SCRIPT_FEATURE_H_ 59