/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
#define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

// Unicode scripts we care about.  To get compact and fast code, we detect only
// a few Unicode scripts that offer a strong indication about the language of
// the text (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  kScriptError,

  // Special values for all Unicode scripts that we do not detect.  One special
  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
  // already have that information, we use it).  kScriptOtherUtf8OneByte means
  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Do not add any script after kNumRelevantScripts.  This value indicates the
  // number of elements in this enum Script (except this value) such that we can
  // easily iterate over the scripts.
  kNumRelevantScripts,
};

// Returns true iff value lies in the closed interval [low, hi].
template<typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  return (value >= low) && (value <= hi);
}

// Returns Script for the UTF8 character that starts at address p.
//
// Parameters:
//   p          pointer to the first byte of the character.  Bytes p[0..1] are
//              read when num_bytes == 2 and p[0..2] when num_bytes == 3; p is
//              not dereferenced for any other value of num_bytes.
//   num_bytes  length in bytes of the UTF8 encoding of the character.
//
// Returns one of the specific scripts (Greek, Cyrillic, Hebrew, Arabic, Hangul
// Jamo, Hiragana, Katakana) when the decoded code point falls in the
// corresponding range below, the kScriptOtherUtf8*Byte(s) value matching
// num_bytes otherwise, and kScriptError if num_bytes is not in 1..4.
//
// Precondition: p points to a valid UTF8 character of num_bytes bytes.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  switch (num_bytes) {
    case 1:
      return kScriptOtherUtf8OneByte;

    case 2: {
      // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
      // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
      // it's enough.  It's also usually the fastest int type on the current
      // CPU, so it's better to use than int32.
      static const unsigned int kGreekStart = 0x370;

      // Commented out (unused in the code): kGreekEnd = 0x3FF;
      static const unsigned int kCyrillicStart = 0x400;
      static const unsigned int kCyrillicEnd = 0x4FF;
      static const unsigned int kHebrewStart = 0x590;

      // Commented out (unused in the code): kHebrewEnd = 0x5FF;
      static const unsigned int kArabicStart = 0x600;
      static const unsigned int kArabicEnd = 0x6FF;

      // Convert each byte to unsigned int *before* masking and shifting.
      // Otherwise the unsigned char operands are promoted to (signed) int and
      // the arithmetic would not be carried out in the unsigned type the
      // reasoning above relies on.
      const unsigned int codepoint =
          ((static_cast<unsigned int>(p[0]) & 0x1Fu) << 6) |
          (static_cast<unsigned int>(p[1]) & 0x3Fu);

      // The ranges are tested as a small decision tree (instead of four
      // independent range checks) to keep the number of comparisons low.
      if (codepoint > kCyrillicEnd) {
        if (codepoint >= kArabicStart) {
          if (codepoint <= kArabicEnd) {
            return kScriptArabic;
          }
        } else {
          // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
          // codepoint <= kHebrewEnd.
          if (codepoint >= kHebrewStart) {
            return kScriptHebrew;
          }
        }
      } else {
        if (codepoint >= kCyrillicStart) {
          return kScriptCyrillic;
        } else {
          // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
          // codepoint <= kGreekEnd.
          if (codepoint >= kGreekStart) {
            return kScriptGreek;
          }
        }
      }
      return kScriptOtherUtf8TwoBytes;
    }

    case 3: {
      // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
      // at least 16 bits.
      static const unsigned int kHangulJamoStart = 0x1100;
      static const unsigned int kHangulJamoEnd = 0x11FF;
      static const unsigned int kHiraganaStart = 0x3041;
      static const unsigned int kHiraganaEnd = 0x309F;

      // Commented out (unused in the code): kKatakanaStart = 0x30A0;
      static const unsigned int kKatakanaEnd = 0x30FF;

      // As above, widen to unsigned int first: after integral promotion to
      // signed int, (p[0] & 0x0F) << 12 can reach 0xF000, which does not fit
      // in a 16-bit int.  Doing the shifts on unsigned int keeps the
      // computation well defined wherever unsigned int has >= 16 bits.
      const unsigned int codepoint =
          ((static_cast<unsigned int>(p[0]) & 0x0Fu) << 12) |
          ((static_cast<unsigned int>(p[1]) & 0x3Fu) << 6) |
          (static_cast<unsigned int>(p[2]) & 0x3Fu);

      if (codepoint > kHiraganaEnd) {
        // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
        // codepoint >= kKatakanaStart.
        if (codepoint <= kKatakanaEnd) {
          return kScriptKatakana;
        }
      } else {
        if (codepoint >= kHiraganaStart) {
          return kScriptHiragana;
        } else {
          if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
            return kScriptHangulJamo;
          }
        }
      }
      return kScriptOtherUtf8ThreeBytes;
    }

    case 4:
      return kScriptOtherUtf8FourBytes;

    default:
      return kScriptError;
  }
}

// Returns Script for the UTF8 character that starts at address p.  Similar to
// the previous version of GetScript, except for "char" vs "unsigned char".
// Most code works with "char *" pointers, but whether plain char is signed or
// unsigned is implementation-defined and differs between platforms.  This
// overload reinterprets the bytes as unsigned char and forwards to the
// overload above, so chars are always treated as unsigned.
inline Script GetScript(const char *p, int num_bytes) {
  return GetScript(reinterpret_cast<const unsigned char *>(p),
                   num_bytes);
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier

#endif  // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_