/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_H_
#define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_H_

#include "lang_id/common/utf8.h"
#include "lang_id/script/script-detector.h"

namespace libtextclassifier3 {
namespace mobile {

// Returns the script for the UTF-8 character that starts at address |s| and
// has |num_bytes| bytes.  Note: behavior is unspecified if |s| points to a
// UTF-8 character that has a different number of bytes.  If you don't know
// |num_bytes|, call GetApproxScript(const char *s).
//
// NOTE: to keep BUILD deps small, this function returns an int, but you can
// assume it's an enum UScriptCode (unicode/uscript.h).
//
// If unable to determine the script, this function returns kUnknownUscript,
// the int value of USCRIPT_UNKNOWN from enum UScriptCode.
int GetApproxScript(const unsigned char *s, int num_bytes);

// Value returned by GetApproxScript() when the script cannot be determined
// (the int value of USCRIPT_UNKNOWN); see the comments for GetApproxScript()
// above.
extern const int kUnknownUscript;

// Same as before, but |s| is a const char * pointer (no unsigned).
// Internally, we prefer "unsigned char" (the signed status of char is
// ambiguous), so we cast and call the previous version (with
// const unsigned char *).
44 inline int GetApproxScript(const char *s, int num_bytes) { 45 return GetApproxScript(reinterpret_cast<const unsigned char *>(s), num_bytes); 46 } 47 48 // Returns script for the UTF-8 character that starts at address |s|. NOTE: 49 // UTF-8 is a var-length encoding, taking between 1 and 4 bytes per Unicode 50 // character. We infer the number of bytes based on s[0]. If that number is k, 51 // we expect to be able to read k bytes starting from address |s|. I.e., do not 52 // call this function on broken UTF-8. 53 inline int GetApproxScript(const char *s) { 54 return GetApproxScript(s, utils::OneCharLen(s)); 55 } 56 57 // Returns max value returned by the GetApproxScript() functions. 58 int GetMaxApproxScriptResult(); 59 60 class ApproxScriptDetector : public ScriptDetector { 61 public: 62 ~ApproxScriptDetector() override = default; 63 64 // Note: the int result of this method is actually a UScriptCode enum value. 65 // We return int to match the general case from the base class ScriptDetector 66 // (some script detectors do not use UScriptCode). 67 int GetScript(const char *s, int num_bytes) const override { 68 return GetApproxScript(s, num_bytes); 69 } 70 71 int GetMaxScript() const override { 72 return GetMaxApproxScriptResult(); 73 } 74 75 SAFTM_DEFINE_REGISTRATION_METHOD("approx-unicode-script-detector", 76 ApproxScriptDetector); 77 }; 78 79 } // namespace mobile 80 } // namespace nlp_saft 81 82 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_H_ 83