Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "Locale.h"
     18 
     19 #include <algorithm>
     20 
     21 #include <hb.h>
     22 
     23 #include "minikin/LocaleList.h"
     24 
     25 #include "LocaleListCache.h"
     26 #include "MinikinInternal.h"
     27 #include "StringPiece.h"
     28 
     29 namespace minikin {
     30 
     31 constexpr uint32_t FIVE_BITS = 0x1f;
     32 
     33 uint32_t registerLocaleList(const std::string& locales) {
     34     return LocaleListCache::getId(locales);
     35 }
     36 
     37 // Check if a language code supports emoji according to its subtag
     38 static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
     39     if (bufLen < subtagLen) {
     40         return false;
     41     }
     42     if (strncmp(buf, subtag, subtagLen) != 0) {
     43         return false;  // no match between two strings
     44     }
     45     return (bufLen == subtagLen || buf[subtagLen] == '\0' || buf[subtagLen] == '-' ||
     46             buf[subtagLen] == '_');
     47 }
     48 
     49 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
     50 // For the region code, the letters must be all digits in three letter case, so the number of
     51 // possible values are 10. For the language code, the letters must be all small alphabets, so the
     52 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
     53 // three letter language code or region code to 15 bits.
     54 //
     55 // In case of two letter code, use fullbit(0x1f) for the first letter instead.
     56 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
     57                                      uint8_t threeLetterBase) {
     58     if (in.length() == 2) {
     59         return 0x7c00u |  // 0x1fu << 10
     60                (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase);
     61     } else {
     62         return ((uint16_t)(in[0] - threeLetterBase) << 10) |
     63                (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase);
     64     }
     65 }
     66 
     67 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
     68                                      uint8_t threeLetterBase) {
     69     uint8_t first = (in >> 10) & FIVE_BITS;
     70     uint8_t second = (in >> 5) & FIVE_BITS;
     71     uint8_t third = in & FIVE_BITS;
     72 
     73     if (first == 0x1f) {
     74         out[0] = second + twoLetterBase;
     75         out[1] = third + twoLetterBase;
     76         return 2;
     77     } else {
     78         out[0] = first + threeLetterBase;
     79         out[1] = second + threeLetterBase;
     80         out[2] = third + threeLetterBase;
     81         return 3;
     82     }
     83 }
     84 
     85 static uint16_t packLanguage(const StringPiece& in) {
     86     return packLanguageOrRegion(in, 'a', 'a');
     87 }
     88 
     89 static size_t unpackLanguage(uint16_t in, char* out) {
     90     return unpackLanguageOrRegion(in, out, 'a', 'a');
     91 }
     92 
     93 constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
     94     constexpr char FIRST_LETTER_BASE = 'A';
     95     constexpr char REST_LETTER_BASE = 'a';
     96     return ((uint32_t)(c1 - FIRST_LETTER_BASE) << 15) | (uint32_t)(c2 - REST_LETTER_BASE) << 10 |
     97            ((uint32_t)(c3 - REST_LETTER_BASE) << 5) | (uint32_t)(c4 - REST_LETTER_BASE);
     98 }
     99 
    100 constexpr uint32_t packScript(uint32_t script) {
    101     return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff);
    102 }
    103 
    104 constexpr uint32_t unpackScript(uint32_t packedScript) {
    105     constexpr char FIRST_LETTER_BASE = 'A';
    106     constexpr char REST_LETTER_BASE = 'a';
    107     const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
    108     const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
    109     const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
    110     const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;
    111 
    112     return first << 24 | second << 16 | third << 8 | fourth;
    113 }
    114 
    115 static uint16_t packRegion(const StringPiece& in) {
    116     return packLanguageOrRegion(in, 'A', '0');
    117 }
    118 
    119 static size_t unpackRegion(uint16_t in, char* out) {
    120     return unpackLanguageOrRegion(in, out, 'A', '0');
    121 }
    122 
    123 static inline bool isLowercase(char c) {
    124     return 'a' <= c && c <= 'z';
    125 }
    126 
    127 static inline bool isUppercase(char c) {
    128     return 'A' <= c && c <= 'Z';
    129 }
    130 
    131 static inline bool isDigit(char c) {
    132     return '0' <= c && c <= '9';
    133 }
    134 
    135 // Returns true if the buffer is valid for language code.
    136 static inline bool isValidLanguageCode(const StringPiece& buffer) {
    137     if (buffer.length() != 2 && buffer.length() != 3) return false;
    138     if (!isLowercase(buffer[0])) return false;
    139     if (!isLowercase(buffer[1])) return false;
    140     if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
    141     return true;
    142 }
    143 
    144 // Returns true if buffer is valid for script code. The length of buffer must be 4.
    145 static inline bool isValidScriptCode(const StringPiece& buffer) {
    146     return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
    147            isLowercase(buffer[2]) && isLowercase(buffer[3]);
    148 }
    149 
    150 // Returns true if the buffer is valid for region code.
    151 static inline bool isValidRegionCode(const StringPiece& buffer) {
    152     return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
    153            (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
    154 }
    155 
    156 // Parse BCP 47 language identifier into internal structure
    157 Locale::Locale(const StringPiece& input) : Locale() {
    158     SplitIterator it(input, '-');
    159 
    160     StringPiece language = it.next();
    161     if (isValidLanguageCode(language)) {
    162         mLanguage = packLanguage(language);
    163     } else {
    164         // We don't understand anything other than two-letter or three-letter
    165         // language codes, so we skip parsing the rest of the string.
    166         return;
    167     }
    168 
    169     if (!it.hasNext()) {
    170         return;  // Language code only.
    171     }
    172     StringPiece token = it.next();
    173 
    174     if (isValidScriptCode(token)) {
    175         mScript = packScript(token[0], token[1], token[2], token[3]);
    176         mSubScriptBits = scriptToSubScriptBits(mScript);
    177 
    178         if (!it.hasNext()) {
    179             goto finalize;  // No variant, emoji subtag and region code.
    180         }
    181         token = it.next();
    182     }
    183 
    184     if (isValidRegionCode(token)) {
    185         mRegion = packRegion(token);
    186 
    187         if (!it.hasNext()) {
    188             goto finalize;  // No variant or emoji subtag.
    189         }
    190         token = it.next();
    191     }
    192 
    193     if (language == "de") {  // We are only interested in German variants.
    194         if (token == "1901") {
    195             mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
    196         } else if (token == "1996") {
    197             mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
    198         }
    199 
    200         if (mVariant != Variant::NO_VARIANT) {
    201             if (!it.hasNext()) {
    202                 goto finalize;  // No emoji subtag.
    203             }
    204 
    205             token = it.next();
    206         }
    207     }
    208 
    209     mEmojiStyle = resolveEmojiStyle(input.data(), input.length());
    210 
    211 finalize:
    212     if (mEmojiStyle == EmojiStyle::EMPTY) {
    213         mEmojiStyle = scriptToEmojiStyle(mScript);
    214     }
    215 }
    216 
    217 // static
    218 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
    219     // First, lookup emoji subtag.
    220     // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
    221     // unnecessary comparison can be avoided if total length is smaller than 10.
    222     const size_t kMinSubtagLength = 10;
    223     if (length >= kMinSubtagLength) {
    224         static const char kPrefix[] = "-u-em-";
    225         const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
    226         if (pos != buf + length) {  // found
    227             pos += strlen(kPrefix);
    228             const size_t remainingLength = length - (pos - buf);
    229             if (isEmojiSubtag(pos, remainingLength, "emoji", 5)) {
    230                 return EmojiStyle::EMOJI;
    231             } else if (isEmojiSubtag(pos, remainingLength, "text", 4)) {
    232                 return EmojiStyle::TEXT;
    233             } else if (isEmojiSubtag(pos, remainingLength, "default", 7)) {
    234                 return EmojiStyle::DEFAULT;
    235             }
    236         }
    237     }
    238     return EmojiStyle::EMPTY;
    239 }
    240 
    241 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
    242     // If no emoji subtag was provided, resolve the emoji style from script code.
    243     if (script == packScript('Z', 's', 'y', 'e')) {
    244         return EmojiStyle::EMOJI;
    245     } else if (script == packScript('Z', 's', 'y', 'm')) {
    246         return EmojiStyle::TEXT;
    247     }
    248     return EmojiStyle::EMPTY;
    249 }
    250 
    251 // static
    252 uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
    253     uint8_t subScriptBits = 0u;
    254     switch (script) {
    255         case packScript('B', 'o', 'p', 'o'):
    256             subScriptBits = kBopomofoFlag;
    257             break;
    258         case packScript('H', 'a', 'n', 'g'):
    259             subScriptBits = kHangulFlag;
    260             break;
    261         case packScript('H', 'a', 'n', 'b'):
    262             // Bopomofo is almost exclusively used in Taiwan.
    263             subScriptBits = kHanFlag | kBopomofoFlag;
    264             break;
    265         case packScript('H', 'a', 'n', 'i'):
    266             subScriptBits = kHanFlag;
    267             break;
    268         case packScript('H', 'a', 'n', 's'):
    269             subScriptBits = kHanFlag | kSimplifiedChineseFlag;
    270             break;
    271         case packScript('H', 'a', 'n', 't'):
    272             subScriptBits = kHanFlag | kTraditionalChineseFlag;
    273             break;
    274         case packScript('H', 'i', 'r', 'a'):
    275             subScriptBits = kHiraganaFlag;
    276             break;
    277         case packScript('H', 'r', 'k', 't'):
    278             subScriptBits = kKatakanaFlag | kHiraganaFlag;
    279             break;
    280         case packScript('J', 'p', 'a', 'n'):
    281             subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
    282             break;
    283         case packScript('K', 'a', 'n', 'a'):
    284             subScriptBits = kKatakanaFlag;
    285             break;
    286         case packScript('K', 'o', 'r', 'e'):
    287             subScriptBits = kHanFlag | kHangulFlag;
    288             break;
    289     }
    290     return subScriptBits;
    291 }
    292 
    293 std::string Locale::getString() const {
    294     char buf[24];
    295     size_t i;
    296     if (mLanguage == NO_LANGUAGE) {
    297         buf[0] = 'u';
    298         buf[1] = 'n';
    299         buf[2] = 'd';
    300         i = 3;
    301     } else {
    302         i = unpackLanguage(mLanguage, buf);
    303     }
    304     if (mScript != NO_SCRIPT) {
    305         uint32_t rawScript = unpackScript(mScript);
    306         buf[i++] = '-';
    307         buf[i++] = (rawScript >> 24) & 0xFFu;
    308         buf[i++] = (rawScript >> 16) & 0xFFu;
    309         buf[i++] = (rawScript >> 8) & 0xFFu;
    310         buf[i++] = rawScript & 0xFFu;
    311     }
    312     if (mRegion != NO_REGION) {
    313         buf[i++] = '-';
    314         i += unpackRegion(mRegion, buf + i);
    315     }
    316     if (mVariant != Variant::NO_VARIANT) {
    317         buf[i++] = '-';
    318         buf[i++] = '1';
    319         buf[i++] = '9';
    320         switch (mVariant) {
    321             case Variant::GERMAN_1901_ORTHOGRAPHY:
    322                 buf[i++] = '0';
    323                 buf[i++] = '1';
    324                 break;
    325             case Variant::GERMAN_1996_ORTHOGRAPHY:
    326                 buf[i++] = '9';
    327                 buf[i++] = '6';
    328                 break;
    329             default:
    330                 MINIKIN_ASSERT(false, "Must not reached.");
    331         }
    332     }
    333     return std::string(buf, i);
    334 }
    335 
    336 Locale Locale::getPartialLocale(SubtagBits bits) const {
    337     Locale subLocale;
    338     if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
    339         subLocale.mLanguage = mLanguage;
    340     } else {
    341         subLocale.mLanguage = packLanguage("und");
    342     }
    343     if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
    344         subLocale.mScript = mScript;
    345         subLocale.mSubScriptBits = mSubScriptBits;
    346     }
    347     if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
    348         subLocale.mRegion = mRegion;
    349     }
    350     if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
    351         subLocale.mVariant = mVariant;
    352     }
    353     if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
    354         subLocale.mEmojiStyle = mEmojiStyle;
    355     }
    356     return subLocale;
    357 }
    358 
    359 bool Locale::isEqualScript(const Locale& other) const {
    360     return other.mScript == mScript;
    361 }
    362 
    363 // static
    364 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
    365     return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
    366 }
    367 
    368 bool Locale::supportsHbScript(hb_script_t script) const {
    369     static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
    370                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
    371     uint32_t packedScript = packScript(script);
    372     if (packedScript == mScript) return true;
    373     return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
    374 }
    375 
    376 int Locale::calcScoreFor(const LocaleList& supported) const {
    377     bool languageScriptMatch = false;
    378     bool subtagMatch = false;
    379     bool scriptMatch = false;
    380 
    381     for (size_t i = 0; i < supported.size(); ++i) {
    382         if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
    383             subtagMatch = true;
    384             if (mLanguage == supported[i].mLanguage) {
    385                 return 4;
    386             }
    387         }
    388         if (isEqualScript(supported[i]) ||
    389             supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
    390             scriptMatch = true;
    391             if (mLanguage == supported[i].mLanguage) {
    392                 languageScriptMatch = true;
    393             }
    394         }
    395     }
    396 
    397     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
    398         scriptMatch = true;
    399         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
    400             return 3;
    401         }
    402     }
    403 
    404     if (languageScriptMatch) {
    405         return 3;
    406     } else if (subtagMatch) {
    407         return 2;
    408     } else if (scriptMatch) {
    409         return 1;
    410     }
    411     return 0;
    412 }
    413 
    414 static hb_language_t buildHbLanguage(const Locale& locale) {
    415     return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
    416                                 : HB_LANGUAGE_INVALID;
    417 }
    418 
    419 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
    420     mIsAllTheSameLocale = true;
    421     mUnionOfSubScriptBits = 0u;
    422     mHbLangs.reserve(mLocales.size());
    423     mEmojiStyle = EmojiStyle::EMPTY;
    424     const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
    425     for (const Locale& locale : mLocales) {
    426         mUnionOfSubScriptBits |= locale.mSubScriptBits;
    427         if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
    428             mIsAllTheSameLocale = false;
    429         }
    430         mHbLangs.push_back(buildHbLanguage(locale));
    431         if (mEmojiStyle == EmojiStyle::EMPTY) {
    432             mEmojiStyle = locale.getEmojiStyle();
    433         }
    434     }
    435 }
    436 
    437 }  // namespace minikin
    438