1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "Locale.h" 18 19 #include <algorithm> 20 21 #include <hb.h> 22 23 #include "minikin/LocaleList.h" 24 25 #include "LocaleListCache.h" 26 #include "MinikinInternal.h" 27 #include "StringPiece.h" 28 29 namespace minikin { 30 31 constexpr uint32_t FIVE_BITS = 0x1f; 32 33 uint32_t registerLocaleList(const std::string& locales) { 34 return LocaleListCache::getId(locales); 35 } 36 37 // Check if a language code supports emoji according to its subtag 38 static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) { 39 if (bufLen < subtagLen) { 40 return false; 41 } 42 if (strncmp(buf, subtag, subtagLen) != 0) { 43 return false; // no match between two strings 44 } 45 return (bufLen == subtagLen || buf[subtagLen] == '\0' || buf[subtagLen] == '-' || 46 buf[subtagLen] == '_'); 47 } 48 49 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0. 50 // For the region code, the letters must be all digits in three letter case, so the number of 51 // possible values are 10. For the language code, the letters must be all small alphabets, so the 52 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the 53 // three letter language code or region code to 15 bits. 54 // 55 // In case of two letter code, use fullbit(0x1f) for the first letter instead. 56 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase, 57 uint8_t threeLetterBase) { 58 if (in.length() == 2) { 59 return 0x7c00u | // 0x1fu << 10 60 (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase); 61 } else { 62 return ((uint16_t)(in[0] - threeLetterBase) << 10) | 63 (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase); 64 } 65 } 66 67 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase, 68 uint8_t threeLetterBase) { 69 uint8_t first = (in >> 10) & FIVE_BITS; 70 uint8_t second = (in >> 5) & FIVE_BITS; 71 uint8_t third = in & FIVE_BITS; 72 73 if (first == 0x1f) { 74 out[0] = second + twoLetterBase; 75 out[1] = third + twoLetterBase; 76 return 2; 77 } else { 78 out[0] = first + threeLetterBase; 79 out[1] = second + threeLetterBase; 80 out[2] = third + threeLetterBase; 81 return 3; 82 } 83 } 84 85 static uint16_t packLanguage(const StringPiece& in) { 86 return packLanguageOrRegion(in, 'a', 'a'); 87 } 88 89 static size_t unpackLanguage(uint16_t in, char* out) { 90 return unpackLanguageOrRegion(in, out, 'a', 'a'); 91 } 92 93 constexpr uint32_t packScript(char c1, char c2, char c3, char c4) { 94 constexpr char FIRST_LETTER_BASE = 'A'; 95 constexpr char REST_LETTER_BASE = 'a'; 96 return ((uint32_t)(c1 - FIRST_LETTER_BASE) << 15) | (uint32_t)(c2 - REST_LETTER_BASE) << 10 | 97 ((uint32_t)(c3 - REST_LETTER_BASE) << 5) | (uint32_t)(c4 - REST_LETTER_BASE); 98 } 99 100 constexpr uint32_t packScript(uint32_t script) { 101 return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff); 102 } 103 104 constexpr uint32_t unpackScript(uint32_t packedScript) { 105 constexpr char FIRST_LETTER_BASE = 'A'; 106 constexpr char REST_LETTER_BASE = 'a'; 107 const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE; 108 const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE; 109 const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE; 110 const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE; 111 112 return first << 24 | second << 16 | third << 8 | fourth; 113 } 114 115 static uint16_t packRegion(const StringPiece& in) { 116 return packLanguageOrRegion(in, 'A', '0'); 117 } 118 119 static size_t unpackRegion(uint16_t in, char* out) { 120 return unpackLanguageOrRegion(in, out, 'A', '0'); 121 } 122 123 static inline bool isLowercase(char c) { 124 return 'a' <= c && c <= 'z'; 125 } 126 127 static inline bool isUppercase(char c) { 128 return 'A' <= c && c <= 'Z'; 129 } 130 131 static inline bool isDigit(char c) { 132 return '0' <= c && c <= '9'; 133 } 134 135 // Returns true if the buffer is valid for language code. 136 static inline bool isValidLanguageCode(const StringPiece& buffer) { 137 if (buffer.length() != 2 && buffer.length() != 3) return false; 138 if (!isLowercase(buffer[0])) return false; 139 if (!isLowercase(buffer[1])) return false; 140 if (buffer.length() == 3 && !isLowercase(buffer[2])) return false; 141 return true; 142 } 143 144 // Returns true if buffer is valid for script code. The length of buffer must be 4. 145 static inline bool isValidScriptCode(const StringPiece& buffer) { 146 return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) && 147 isLowercase(buffer[2]) && isLowercase(buffer[3]); 148 } 149 150 // Returns true if the buffer is valid for region code. 151 static inline bool isValidRegionCode(const StringPiece& buffer) { 152 return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) || 153 (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2])); 154 } 155 156 // Parse BCP 47 language identifier into internal structure 157 Locale::Locale(const StringPiece& input) : Locale() { 158 SplitIterator it(input, '-'); 159 160 StringPiece language = it.next(); 161 if (isValidLanguageCode(language)) { 162 mLanguage = packLanguage(language); 163 } else { 164 // We don't understand anything other than two-letter or three-letter 165 // language codes, so we skip parsing the rest of the string. 166 return; 167 } 168 169 if (!it.hasNext()) { 170 return; // Language code only. 171 } 172 StringPiece token = it.next(); 173 174 if (isValidScriptCode(token)) { 175 mScript = packScript(token[0], token[1], token[2], token[3]); 176 mSubScriptBits = scriptToSubScriptBits(mScript); 177 178 if (!it.hasNext()) { 179 goto finalize; // No variant, emoji subtag and region code. 180 } 181 token = it.next(); 182 } 183 184 if (isValidRegionCode(token)) { 185 mRegion = packRegion(token); 186 187 if (!it.hasNext()) { 188 goto finalize; // No variant or emoji subtag. 189 } 190 token = it.next(); 191 } 192 193 if (language == "de") { // We are only interested in German variants. 194 if (token == "1901") { 195 mVariant = Variant::GERMAN_1901_ORTHOGRAPHY; 196 } else if (token == "1996") { 197 mVariant = Variant::GERMAN_1996_ORTHOGRAPHY; 198 } 199 200 if (mVariant != Variant::NO_VARIANT) { 201 if (!it.hasNext()) { 202 goto finalize; // No emoji subtag. 203 } 204 205 token = it.next(); 206 } 207 } 208 209 mEmojiStyle = resolveEmojiStyle(input.data(), input.length()); 210 211 finalize: 212 if (mEmojiStyle == EmojiStyle::EMPTY) { 213 mEmojiStyle = scriptToEmojiStyle(mScript); 214 } 215 } 216 217 // static 218 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) { 219 // First, lookup emoji subtag. 220 // 10 is the length of "-u-em-text", which is the shortest emoji subtag, 221 // unnecessary comparison can be avoided if total length is smaller than 10. 222 const size_t kMinSubtagLength = 10; 223 if (length >= kMinSubtagLength) { 224 static const char kPrefix[] = "-u-em-"; 225 const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix)); 226 if (pos != buf + length) { // found 227 pos += strlen(kPrefix); 228 const size_t remainingLength = length - (pos - buf); 229 if (isEmojiSubtag(pos, remainingLength, "emoji", 5)) { 230 return EmojiStyle::EMOJI; 231 } else if (isEmojiSubtag(pos, remainingLength, "text", 4)) { 232 return EmojiStyle::TEXT; 233 } else if (isEmojiSubtag(pos, remainingLength, "default", 7)) { 234 return EmojiStyle::DEFAULT; 235 } 236 } 237 } 238 return EmojiStyle::EMPTY; 239 } 240 241 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) { 242 // If no emoji subtag was provided, resolve the emoji style from script code. 243 if (script == packScript('Z', 's', 'y', 'e')) { 244 return EmojiStyle::EMOJI; 245 } else if (script == packScript('Z', 's', 'y', 'm')) { 246 return EmojiStyle::TEXT; 247 } 248 return EmojiStyle::EMPTY; 249 } 250 251 // static 252 uint8_t Locale::scriptToSubScriptBits(uint32_t script) { 253 uint8_t subScriptBits = 0u; 254 switch (script) { 255 case packScript('B', 'o', 'p', 'o'): 256 subScriptBits = kBopomofoFlag; 257 break; 258 case packScript('H', 'a', 'n', 'g'): 259 subScriptBits = kHangulFlag; 260 break; 261 case packScript('H', 'a', 'n', 'b'): 262 // Bopomofo is almost exclusively used in Taiwan. 263 subScriptBits = kHanFlag | kBopomofoFlag; 264 break; 265 case packScript('H', 'a', 'n', 'i'): 266 subScriptBits = kHanFlag; 267 break; 268 case packScript('H', 'a', 'n', 's'): 269 subScriptBits = kHanFlag | kSimplifiedChineseFlag; 270 break; 271 case packScript('H', 'a', 'n', 't'): 272 subScriptBits = kHanFlag | kTraditionalChineseFlag; 273 break; 274 case packScript('H', 'i', 'r', 'a'): 275 subScriptBits = kHiraganaFlag; 276 break; 277 case packScript('H', 'r', 'k', 't'): 278 subScriptBits = kKatakanaFlag | kHiraganaFlag; 279 break; 280 case packScript('J', 'p', 'a', 'n'): 281 subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag; 282 break; 283 case packScript('K', 'a', 'n', 'a'): 284 subScriptBits = kKatakanaFlag; 285 break; 286 case packScript('K', 'o', 'r', 'e'): 287 subScriptBits = kHanFlag | kHangulFlag; 288 break; 289 } 290 return subScriptBits; 291 } 292 293 std::string Locale::getString() const { 294 char buf[24]; 295 size_t i; 296 if (mLanguage == NO_LANGUAGE) { 297 buf[0] = 'u'; 298 buf[1] = 'n'; 299 buf[2] = 'd'; 300 i = 3; 301 } else { 302 i = unpackLanguage(mLanguage, buf); 303 } 304 if (mScript != NO_SCRIPT) { 305 uint32_t rawScript = unpackScript(mScript); 306 buf[i++] = '-'; 307 buf[i++] = (rawScript >> 24) & 0xFFu; 308 buf[i++] = (rawScript >> 16) & 0xFFu; 309 buf[i++] = (rawScript >> 8) & 0xFFu; 310 buf[i++] = rawScript & 0xFFu; 311 } 312 if (mRegion != NO_REGION) { 313 buf[i++] = '-'; 314 i += unpackRegion(mRegion, buf + i); 315 } 316 if (mVariant != Variant::NO_VARIANT) { 317 buf[i++] = '-'; 318 buf[i++] = '1'; 319 buf[i++] = '9'; 320 switch (mVariant) { 321 case Variant::GERMAN_1901_ORTHOGRAPHY: 322 buf[i++] = '0'; 323 buf[i++] = '1'; 324 break; 325 case Variant::GERMAN_1996_ORTHOGRAPHY: 326 buf[i++] = '9'; 327 buf[i++] = '6'; 328 break; 329 default: 330 MINIKIN_ASSERT(false, "Must not reached."); 331 } 332 } 333 return std::string(buf, i); 334 } 335 336 Locale Locale::getPartialLocale(SubtagBits bits) const { 337 Locale subLocale; 338 if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) { 339 subLocale.mLanguage = mLanguage; 340 } else { 341 subLocale.mLanguage = packLanguage("und"); 342 } 343 if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) { 344 subLocale.mScript = mScript; 345 subLocale.mSubScriptBits = mSubScriptBits; 346 } 347 if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) { 348 subLocale.mRegion = mRegion; 349 } 350 if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) { 351 subLocale.mVariant = mVariant; 352 } 353 if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) { 354 subLocale.mEmojiStyle = mEmojiStyle; 355 } 356 return subLocale; 357 } 358 359 bool Locale::isEqualScript(const Locale& other) const { 360 return other.mScript == mScript; 361 } 362 363 // static 364 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) { 365 return requestedBits != 0 && (providedBits & requestedBits) == requestedBits; 366 } 367 368 bool Locale::supportsHbScript(hb_script_t script) const { 369 static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'), 370 "The Minikin script and HarfBuzz hb_script_t have different encodings."); 371 uint32_t packedScript = packScript(script); 372 if (packedScript == mScript) return true; 373 return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript)); 374 } 375 376 int Locale::calcScoreFor(const LocaleList& supported) const { 377 bool languageScriptMatch = false; 378 bool subtagMatch = false; 379 bool scriptMatch = false; 380 381 for (size_t i = 0; i < supported.size(); ++i) { 382 if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) { 383 subtagMatch = true; 384 if (mLanguage == supported[i].mLanguage) { 385 return 4; 386 } 387 } 388 if (isEqualScript(supported[i]) || 389 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) { 390 scriptMatch = true; 391 if (mLanguage == supported[i].mLanguage) { 392 languageScriptMatch = true; 393 } 394 } 395 } 396 397 if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) { 398 scriptMatch = true; 399 if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) { 400 return 3; 401 } 402 } 403 404 if (languageScriptMatch) { 405 return 3; 406 } else if (subtagMatch) { 407 return 2; 408 } else if (scriptMatch) { 409 return 1; 410 } 411 return 0; 412 } 413 414 static hb_language_t buildHbLanguage(const Locale& locale) { 415 return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1) 416 : HB_LANGUAGE_INVALID; 417 } 418 419 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) { 420 mIsAllTheSameLocale = true; 421 mUnionOfSubScriptBits = 0u; 422 mHbLangs.reserve(mLocales.size()); 423 mEmojiStyle = EmojiStyle::EMPTY; 424 const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage; 425 for (const Locale& locale : mLocales) { 426 mUnionOfSubScriptBits |= locale.mSubScriptBits; 427 if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) { 428 mIsAllTheSameLocale = false; 429 } 430 mHbLangs.push_back(buildHbLanguage(locale)); 431 if (mEmojiStyle == EmojiStyle::EMPTY) { 432 mEmojiStyle = locale.getEmojiStyle(); 433 } 434 } 435 } 436 437 } // namespace minikin 438