1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_ 18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_ 19 20 #include "lang_id/script/script-detector.h" 21 22 namespace libtextclassifier3 { 23 namespace mobile { 24 namespace lang_id { 25 26 // Unicode scripts we care about. To get compact and fast code, we detect only 27 // a few Unicode scripts that offer a strong indication about the language of 28 // the text (e.g., Hiragana -> Japanese). 29 enum Script { 30 // Special value to indicate internal errors in the script detection code. 31 kScriptError, 32 33 // Special values for all Unicode scripts that we do not detect. One special 34 // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we 35 // already have that information, we use it). kScriptOtherUtf8OneByte means 36 // ~Latin and kScriptOtherUtf8FourBytes means ~Han. 37 kScriptOtherUtf8OneByte, 38 kScriptOtherUtf8TwoBytes, 39 kScriptOtherUtf8ThreeBytes, 40 kScriptOtherUtf8FourBytes, 41 42 kScriptGreek, 43 kScriptCyrillic, 44 kScriptHebrew, 45 kScriptArabic, 46 kScriptHangulJamo, // Used primarily for Korean. 47 kScriptHiragana, // Used primarily for Japanese. 48 kScriptKatakana, // Used primarily for Japanese. 49 50 // Add new scripts here. 51 52 // Do not add any script after kNumRelevantScripts. This value indicates the 53 // number of elements in this enum Script (except this value) such that we can 54 // easily iterate over the scripts. 55 kNumRelevantScripts, 56 }; 57 58 template<typename IntType> 59 inline bool InRange(IntType value, IntType low, IntType hi) { 60 return (value >= low) && (value <= hi); 61 } 62 63 // Returns Script for the UTF8 character that starts at address p. 64 // Precondition: p points to a valid UTF8 character of num_bytes bytes. 65 inline Script GetScript(const unsigned char *p, int num_bytes) { 66 switch (num_bytes) { 67 case 1: 68 return kScriptOtherUtf8OneByte; 69 70 case 2: { 71 // 2-byte UTF8 characters have 11 bits of information. unsigned int has 72 // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so 73 // it's enough. It's also usually the fastest int type on the current 74 // CPU, so it's better to use than int32. 75 static const unsigned int kGreekStart = 0x370; 76 77 // Commented out (unsued in the code): kGreekEnd = 0x3FF; 78 static const unsigned int kCyrillicStart = 0x400; 79 static const unsigned int kCyrillicEnd = 0x4FF; 80 static const unsigned int kHebrewStart = 0x590; 81 82 // Commented out (unsued in the code): kHebrewEnd = 0x5FF; 83 static const unsigned int kArabicStart = 0x600; 84 static const unsigned int kArabicEnd = 0x6FF; 85 const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F); 86 if (codepoint > kCyrillicEnd) { 87 if (codepoint >= kArabicStart) { 88 if (codepoint <= kArabicEnd) { 89 return kScriptArabic; 90 } 91 } else { 92 // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so 93 // codepoint <= kHebrewEnd. 94 if (codepoint >= kHebrewStart) { 95 return kScriptHebrew; 96 } 97 } 98 } else { 99 if (codepoint >= kCyrillicStart) { 100 return kScriptCyrillic; 101 } else { 102 // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so 103 // codepoint <= kGreekEnd. 104 if (codepoint >= kGreekStart) { 105 return kScriptGreek; 106 } 107 } 108 } 109 return kScriptOtherUtf8TwoBytes; 110 } 111 112 case 3: { 113 // 3-byte UTF8 characters have 16 bits of information. unsigned int has 114 // at least 16 bits. 115 static const unsigned int kHangulJamoStart = 0x1100; 116 static const unsigned int kHangulJamoEnd = 0x11FF; 117 static const unsigned int kHiraganaStart = 0x3041; 118 static const unsigned int kHiraganaEnd = 0x309F; 119 120 // Commented out (unsued in the code): kKatakanaStart = 0x30A0; 121 static const unsigned int kKatakanaEnd = 0x30FF; 122 const unsigned int codepoint = 123 ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F); 124 if (codepoint > kHiraganaEnd) { 125 // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so 126 // codepoint >= kKatakanaStart. 127 if (codepoint <= kKatakanaEnd) { 128 return kScriptKatakana; 129 } 130 } else { 131 if (codepoint >= kHiraganaStart) { 132 return kScriptHiragana; 133 } else { 134 if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) { 135 return kScriptHangulJamo; 136 } 137 } 138 } 139 return kScriptOtherUtf8ThreeBytes; 140 } 141 142 case 4: 143 return kScriptOtherUtf8FourBytes; 144 145 default: 146 return kScriptError; 147 } 148 } 149 150 // Returns Script for the UTF8 character that starts at address p. Similar to 151 // the previous version of GetScript, except for "char" vs "unsigned char". 152 // Most code works with "char *" pointers, ignoring the fact that char is 153 // unsigned (by default) on most platforms, but signed on iOS. This code takes 154 // care of making sure we always treat chars as unsigned. 155 inline Script GetScript(const char *p, int num_bytes) { 156 return GetScript(reinterpret_cast<const unsigned char *>(p), 157 num_bytes); 158 } 159 160 class TinyScriptDetector : public ScriptDetector { 161 public: 162 ~TinyScriptDetector() override = default; 163 164 int GetScript(const char *s, int num_bytes) const override { 165 // Add the namespace in indicate that we want to call the method outside 166 // this class, instead of performing an infinite recursive call. 167 return libtextclassifier3::mobile::lang_id::GetScript(s, num_bytes); 168 } 169 170 int GetMaxScript() const override { 171 return kNumRelevantScripts - 1; 172 } 173 174 SAFTM_DEFINE_REGISTRATION_METHOD("tiny-script-detector", TinyScriptDetector); 175 }; 176 177 } // namespace lang_id 178 } // namespace mobile 179 } // namespace nlp_saft 180 181 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_ 182