1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #define LOG_TAG "Minikin" 18 19 #include "FontLanguage.h" 20 21 #include <algorithm> 22 #include <hb.h> 23 #include <string.h> 24 #include <unicode/uloc.h> 25 26 namespace minikin { 27 28 #define SCRIPT_TAG(c1, c2, c3, c4) \ 29 (((uint32_t)(c1)) << 24 | ((uint32_t)(c2)) << 16 | ((uint32_t)(c3)) << 8 | \ 30 ((uint32_t)(c4))) 31 32 // Check if a language code supports emoji according to its subtag 33 static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) { 34 if (bufLen < subtagLen) { 35 return false; 36 } 37 if (strncmp(buf, subtag, subtagLen) != 0) { 38 return false; // no match between two strings 39 } 40 return (bufLen == subtagLen || buf[subtagLen] == '\0' || 41 buf[subtagLen] == '-' || buf[subtagLen] == '_'); 42 } 43 44 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0. 45 // For the region code, the letters must be all digits in three letter case, so the number of 46 // possible values are 10. For the language code, the letters must be all small alphabets, so the 47 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the 48 // three letter language code or region code to 15 bits. 49 // 50 // In case of two letter code, use fullbit(0x1f) for the first letter instead. 
static uint16_t packLanguageOrRegion(const char* c, size_t length, uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
    if (length == 2) {
        // All ones in the top 5-bit field (0x1f << 10) flags a two-character code.
        const uint32_t twoLetterMarker = 0x7c00u;
        const uint32_t high = static_cast<uint16_t>(c[0] - twoLetterBase);
        const uint32_t low = static_cast<uint16_t>(c[1] - twoLetterBase);
        return static_cast<uint16_t>(twoLetterMarker | (high << 5) | low);
    }

    // Any other length is treated as a three-character code.
    const uint32_t high = static_cast<uint16_t>(c[0] - threeLetterBase);
    const uint32_t middle = static_cast<uint16_t>(c[1] - threeLetterBase);
    const uint32_t low = static_cast<uint16_t>(c[2] - threeLetterBase);
    return static_cast<uint16_t>((high << 10) | (middle << 5) | low);
}

// Reverses packLanguageOrRegion(): writes the two or three decoded characters to out (no NUL
// terminator is appended) and returns how many characters were written.
static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
    const uint8_t high = static_cast<uint8_t>((in >> 10) & 0x1f);
    const uint8_t middle = static_cast<uint8_t>((in >> 5) & 0x1f);
    const uint8_t low = static_cast<uint8_t>(in & 0x1f);

    if (high != 0x1f) {
        // Three-character code: every field carries one character.
        out[0] = static_cast<char>(high + threeLetterBase);
        out[1] = static_cast<char>(middle + threeLetterBase);
        out[2] = static_cast<char>(low + threeLetterBase);
        return 3;
    }

    // Two-character code: the top field is only the marker.
    out[0] = static_cast<char>(middle + twoLetterBase);
    out[1] = static_cast<char>(low + twoLetterBase);
    return 2;
}

// Returns the index of the first '-' or '_' at or after startOffset, or bufferLength when there
// is no such character.
static size_t nextDelimiterIndex(const char* buffer, size_t bufferLength, size_t startOffset) {
    size_t pos = startOffset;
    while (pos < bufferLength && buffer[pos] != '-' && buffer[pos] != '_') {
        ++pos;
    }
    // pos can only exceed bufferLength when startOffset already did; report "not found" then too.
    return pos < bufferLength ? pos : bufferLength;
}

// Returns true for the ASCII letters 'a'..'z'.
static inline bool isLowercase(char c) {
    return c >= 'a' && c <= 'z';
}

// Returns true for the ASCII letters 'A'..'Z'.
static inline bool isUppercase(char c) {
    return c >= 'A' && c <= 'Z';
}

// Returns true for the ASCII digits '0'..'9'.
static inline bool isDigit(char c) {
    return c >= '0' && c <= '9';
}

// Returns true if the buffer holds a valid language code: two or three lowercase letters.
static inline bool isValidLanguageCode(const char* buffer, size_t length) {
    if (length < 2 || length > 3) {
        return false;
    }
    for (size_t i = 0; i < length; ++i) {
        if (!isLowercase(buffer[i])) {
            return false;
        }
    }
    return true;
}

// Returns true if buffer is valid for script code. The length of buffer must be 4.
114 static inline bool isValidScriptCode(const char* buffer) { 115 return isUppercase(buffer[0]) && isLowercase(buffer[1]) && isLowercase(buffer[2]) && 116 isLowercase(buffer[3]); 117 } 118 119 // Returns true if the buffer is valid for region code. 120 static inline bool isValidRegionCode(const char* buffer, size_t length) { 121 return (length == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) || 122 (length == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2])); 123 } 124 125 // Parse BCP 47 language identifier into internal structure 126 FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() { 127 size_t firstDelimiterPos = nextDelimiterIndex(buf, length, 0); 128 if (isValidLanguageCode(buf, firstDelimiterPos)) { 129 mLanguage = packLanguageOrRegion(buf, firstDelimiterPos, 'a', 'a'); 130 } else { 131 // We don't understand anything other than two-letter or three-letter 132 // language codes, so we skip parsing the rest of the string. 133 return; 134 } 135 136 if (firstDelimiterPos == length) { 137 mHbLanguage = hb_language_from_string(getString().c_str(), -1); 138 return; // Language code only. 139 } 140 141 size_t nextComponentStartPos = firstDelimiterPos + 1; 142 size_t nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos); 143 size_t componentLength = nextDelimiterPos - nextComponentStartPos; 144 145 if (componentLength == 4) { 146 // Possibly script code. 147 const char* p = buf + nextComponentStartPos; 148 if (isValidScriptCode(p)) { 149 mScript = SCRIPT_TAG(p[0], p[1], p[2], p[3]); 150 mSubScriptBits = scriptToSubScriptBits(mScript); 151 } 152 153 if (nextDelimiterPos == length) { 154 mHbLanguage = hb_language_from_string(getString().c_str(), -1); 155 mEmojiStyle = resolveEmojiStyle(buf, length, mScript); 156 return; // No region code. 
157 } 158 159 nextComponentStartPos = nextDelimiterPos + 1; 160 nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos); 161 componentLength = nextDelimiterPos - nextComponentStartPos; 162 } 163 164 if (componentLength == 2 || componentLength == 3) { 165 // Possibly region code. 166 const char* p = buf + nextComponentStartPos; 167 if (isValidRegionCode(p, componentLength)) { 168 mRegion = packLanguageOrRegion(p, componentLength, 'A', '0'); 169 } 170 } 171 172 mHbLanguage = hb_language_from_string(getString().c_str(), -1); 173 mEmojiStyle = resolveEmojiStyle(buf, length, mScript); 174 } 175 176 // static 177 FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf, size_t length, 178 uint32_t script) { 179 // First, lookup emoji subtag. 180 // 10 is the length of "-u-em-text", which is the shortest emoji subtag, 181 // unnecessary comparison can be avoided if total length is smaller than 10. 182 const size_t kMinSubtagLength = 10; 183 if (length >= kMinSubtagLength) { 184 static const char kPrefix[] = "-u-em-"; 185 const char *pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix)); 186 if (pos != buf + length) { // found 187 pos += strlen(kPrefix); 188 const size_t remainingLength = length - (pos - buf); 189 if (isEmojiSubtag(pos, remainingLength, "emoji", 5)){ 190 return EMSTYLE_EMOJI; 191 } else if (isEmojiSubtag(pos, remainingLength, "text", 4)){ 192 return EMSTYLE_TEXT; 193 } else if (isEmojiSubtag(pos, remainingLength, "default", 7)){ 194 return EMSTYLE_DEFAULT; 195 } 196 } 197 } 198 199 // If no emoji subtag was provided, resolve the emoji style from script code. 
200 if (script == SCRIPT_TAG('Z', 's', 'y', 'e')) { 201 return EMSTYLE_EMOJI; 202 } else if (script == SCRIPT_TAG('Z', 's', 'y', 'm')) { 203 return EMSTYLE_TEXT; 204 } 205 206 return EMSTYLE_EMPTY; 207 } 208 209 //static 210 uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) { 211 uint8_t subScriptBits = 0u; 212 switch (script) { 213 case SCRIPT_TAG('B', 'o', 'p', 'o'): 214 subScriptBits = kBopomofoFlag; 215 break; 216 case SCRIPT_TAG('H', 'a', 'n', 'g'): 217 subScriptBits = kHangulFlag; 218 break; 219 case SCRIPT_TAG('H', 'a', 'n', 'b'): 220 // Bopomofo is almost exclusively used in Taiwan. 221 subScriptBits = kHanFlag | kBopomofoFlag; 222 break; 223 case SCRIPT_TAG('H', 'a', 'n', 'i'): 224 subScriptBits = kHanFlag; 225 break; 226 case SCRIPT_TAG('H', 'a', 'n', 's'): 227 subScriptBits = kHanFlag | kSimplifiedChineseFlag; 228 break; 229 case SCRIPT_TAG('H', 'a', 'n', 't'): 230 subScriptBits = kHanFlag | kTraditionalChineseFlag; 231 break; 232 case SCRIPT_TAG('H', 'i', 'r', 'a'): 233 subScriptBits = kHiraganaFlag; 234 break; 235 case SCRIPT_TAG('H', 'r', 'k', 't'): 236 subScriptBits = kKatakanaFlag | kHiraganaFlag; 237 break; 238 case SCRIPT_TAG('J', 'p', 'a', 'n'): 239 subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag; 240 break; 241 case SCRIPT_TAG('K', 'a', 'n', 'a'): 242 subScriptBits = kKatakanaFlag; 243 break; 244 case SCRIPT_TAG('K', 'o', 'r', 'e'): 245 subScriptBits = kHanFlag | kHangulFlag; 246 break; 247 } 248 return subScriptBits; 249 } 250 251 std::string FontLanguage::getString() const { 252 if (isUnsupported()) { 253 return "und"; 254 } 255 char buf[16]; 256 size_t i = unpackLanguageOrRegion(mLanguage, buf, 'a', 'a'); 257 if (mScript != 0) { 258 buf[i++] = '-'; 259 buf[i++] = (mScript >> 24) & 0xFFu; 260 buf[i++] = (mScript >> 16) & 0xFFu; 261 buf[i++] = (mScript >> 8) & 0xFFu; 262 buf[i++] = mScript & 0xFFu; 263 } 264 if (mRegion != INVALID_CODE) { 265 buf[i++] = '-'; 266 i += unpackLanguageOrRegion(mRegion, buf + i, 'A', '0'); 267 } 
268 return std::string(buf, i); 269 } 270 271 bool FontLanguage::isEqualScript(const FontLanguage& other) const { 272 return other.mScript == mScript; 273 } 274 275 // static 276 bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) { 277 return requestedBits != 0 && (providedBits & requestedBits) == requestedBits; 278 } 279 280 bool FontLanguage::supportsHbScript(hb_script_t script) const { 281 static_assert(SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'), 282 "The Minikin script and HarfBuzz hb_script_t have different encodings."); 283 if (script == mScript) return true; 284 return supportsScript(mSubScriptBits, scriptToSubScriptBits(script)); 285 } 286 287 int FontLanguage::calcScoreFor(const FontLanguages& supported) const { 288 bool languageScriptMatch = false; 289 bool subtagMatch = false; 290 bool scriptMatch = false; 291 292 for (size_t i = 0; i < supported.size(); ++i) { 293 if (mEmojiStyle != EMSTYLE_EMPTY && 294 mEmojiStyle == supported[i].mEmojiStyle) { 295 subtagMatch = true; 296 if (mLanguage == supported[i].mLanguage) { 297 return 4; 298 } 299 } 300 if (isEqualScript(supported[i]) || 301 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) { 302 scriptMatch = true; 303 if (mLanguage == supported[i].mLanguage) { 304 languageScriptMatch = true; 305 } 306 } 307 } 308 309 if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) { 310 scriptMatch = true; 311 if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLanguage()) { 312 return 3; 313 } 314 } 315 316 if (languageScriptMatch) { 317 return 3; 318 } else if (subtagMatch) { 319 return 2; 320 } else if (scriptMatch) { 321 return 1; 322 } 323 return 0; 324 } 325 326 FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages) 327 : mLanguages(std::move(languages)) { 328 if (mLanguages.empty()) { 329 return; 330 } 331 332 const FontLanguage& lang = mLanguages[0]; 333 334 mIsAllTheSameLanguage = true; 335 mUnionOfSubScriptBits 
= lang.mSubScriptBits; 336 for (size_t i = 1; i < mLanguages.size(); ++i) { 337 mUnionOfSubScriptBits |= mLanguages[i].mSubScriptBits; 338 if (mIsAllTheSameLanguage && lang.mLanguage != mLanguages[i].mLanguage) { 339 mIsAllTheSameLanguage = false; 340 } 341 } 342 } 343 344 #undef SCRIPT_TAG 345 } // namespace minikin 346