Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define LOG_TAG "Minikin"
     18 
     19 #include "FontLanguage.h"
     20 
     21 #include <algorithm>
     22 #include <hb.h>
     23 #include <string.h>
     24 #include <unicode/uloc.h>
     25 
     26 namespace minikin {
     27 
     28 #define SCRIPT_TAG(c1, c2, c3, c4) \
     29         (((uint32_t)(c1)) << 24 | ((uint32_t)(c2)) << 16 | ((uint32_t)(c3)) <<  8 | \
     30          ((uint32_t)(c4)))
     31 
     32 // Check if a language code supports emoji according to its subtag
     33 static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
     34     if (bufLen < subtagLen) {
     35         return false;
     36     }
     37     if (strncmp(buf, subtag, subtagLen) != 0) {
     38         return false;  // no match between two strings
     39     }
     40     return (bufLen == subtagLen || buf[subtagLen] == '\0' ||
     41             buf[subtagLen] == '-' || buf[subtagLen] == '_');
     42 }
     43 
     44 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
     45 // For the region code, the letters must be all digits in three letter case, so the number of
     46 // possible values are 10. For the language code, the letters must be all small alphabets, so the
     47 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
     48 // three letter language code or region code to 15 bits.
     49 //
     50 // In case of two letter code, use fullbit(0x1f) for the first letter instead.
     51 static uint16_t packLanguageOrRegion(const char* c, size_t length, uint8_t twoLetterBase,
     52         uint8_t threeLetterBase) {
     53     if (length == 2) {
     54         return 0x7c00u |  // 0x1fu << 10
     55                 (uint16_t)(c[0] - twoLetterBase) << 5 |
     56                 (uint16_t)(c[1] - twoLetterBase);
     57     } else {
     58         return ((uint16_t)(c[0] - threeLetterBase) << 10) |
     59                 (uint16_t)(c[1] - threeLetterBase) << 5 |
     60                 (uint16_t)(c[2] - threeLetterBase);
     61     }
     62 }
     63 
     64 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
     65         uint8_t threeLetterBase) {
     66     uint8_t first = (in >> 10) & 0x1f;
     67     uint8_t second = (in >> 5) & 0x1f;
     68     uint8_t third = in & 0x1f;
     69 
     70     if (first == 0x1f) {
     71         out[0] = second + twoLetterBase;
     72         out[1] = third + twoLetterBase;
     73         return 2;
     74     } else {
     75         out[0] = first + threeLetterBase;
     76         out[1] = second + threeLetterBase;
     77         out[2] = third + threeLetterBase;
     78         return 3;
     79     }
     80 }
     81 
     82 // Find the next '-' or '_' index from startOffset position. If not found, returns bufferLength.
     83 static size_t nextDelimiterIndex(const char* buffer, size_t bufferLength, size_t startOffset) {
     84     for (size_t i = startOffset; i < bufferLength; ++i) {
     85         if (buffer[i] == '-' || buffer[i] == '_') {
     86             return i;
     87         }
     88     }
     89     return bufferLength;
     90 }
     91 
     92 static inline bool isLowercase(char c) {
     93     return 'a' <= c && c <= 'z';
     94 }
     95 
     96 static inline bool isUppercase(char c) {
     97     return 'A' <= c && c <= 'Z';
     98 }
     99 
    100 static inline bool isDigit(char c) {
    101     return '0' <= c && c <= '9';
    102 }
    103 
    104 // Returns true if the buffer is valid for language code.
    105 static inline bool isValidLanguageCode(const char* buffer, size_t length) {
    106     if (length != 2 && length != 3) return false;
    107     if (!isLowercase(buffer[0])) return false;
    108     if (!isLowercase(buffer[1])) return false;
    109     if (length == 3 && !isLowercase(buffer[2])) return false;
    110     return true;
    111 }
    112 
    113 // Returns true if buffer is valid for script code. The length of buffer must be 4.
    114 static inline bool isValidScriptCode(const char* buffer) {
    115     return isUppercase(buffer[0]) && isLowercase(buffer[1]) && isLowercase(buffer[2]) &&
    116         isLowercase(buffer[3]);
    117 }
    118 
    119 // Returns true if the buffer is valid for region code.
    120 static inline bool isValidRegionCode(const char* buffer, size_t length) {
    121     return (length == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
    122             (length == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
    123 }
    124 
    125 // Parse BCP 47 language identifier into internal structure
    126 FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() {
    127     size_t firstDelimiterPos = nextDelimiterIndex(buf, length, 0);
    128     if (isValidLanguageCode(buf, firstDelimiterPos)) {
    129         mLanguage = packLanguageOrRegion(buf, firstDelimiterPos, 'a', 'a');
    130     } else {
    131         // We don't understand anything other than two-letter or three-letter
    132         // language codes, so we skip parsing the rest of the string.
    133         return;
    134     }
    135 
    136     if (firstDelimiterPos == length) {
    137         mHbLanguage = hb_language_from_string(getString().c_str(), -1);
    138         return;  // Language code only.
    139     }
    140 
    141     size_t nextComponentStartPos = firstDelimiterPos + 1;
    142     size_t nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
    143     size_t componentLength = nextDelimiterPos - nextComponentStartPos;
    144 
    145     if (componentLength == 4) {
    146         // Possibly script code.
    147         const char* p = buf + nextComponentStartPos;
    148         if (isValidScriptCode(p)) {
    149             mScript = SCRIPT_TAG(p[0], p[1], p[2], p[3]);
    150             mSubScriptBits = scriptToSubScriptBits(mScript);
    151         }
    152 
    153         if (nextDelimiterPos == length) {
    154             mHbLanguage = hb_language_from_string(getString().c_str(), -1);
    155             mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
    156             return;  // No region code.
    157         }
    158 
    159         nextComponentStartPos = nextDelimiterPos + 1;
    160         nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
    161         componentLength = nextDelimiterPos - nextComponentStartPos;
    162     }
    163 
    164     if (componentLength == 2 || componentLength == 3) {
    165         // Possibly region code.
    166         const char* p = buf + nextComponentStartPos;
    167         if (isValidRegionCode(p, componentLength)) {
    168             mRegion = packLanguageOrRegion(p, componentLength, 'A', '0');
    169         }
    170     }
    171 
    172     mHbLanguage = hb_language_from_string(getString().c_str(), -1);
    173     mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
    174 }
    175 
    176 // static
    177 FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf, size_t length,
    178         uint32_t script) {
    179     // First, lookup emoji subtag.
    180     // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
    181     // unnecessary comparison can be avoided if total length is smaller than 10.
    182     const size_t kMinSubtagLength = 10;
    183     if (length >= kMinSubtagLength) {
    184         static const char kPrefix[] = "-u-em-";
    185         const char *pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
    186         if (pos != buf + length) {  // found
    187             pos += strlen(kPrefix);
    188             const size_t remainingLength = length - (pos - buf);
    189             if (isEmojiSubtag(pos, remainingLength, "emoji", 5)){
    190                 return EMSTYLE_EMOJI;
    191             } else if (isEmojiSubtag(pos, remainingLength, "text", 4)){
    192                 return EMSTYLE_TEXT;
    193             } else if (isEmojiSubtag(pos, remainingLength, "default", 7)){
    194                 return EMSTYLE_DEFAULT;
    195             }
    196         }
    197     }
    198 
    199     // If no emoji subtag was provided, resolve the emoji style from script code.
    200     if (script == SCRIPT_TAG('Z', 's', 'y', 'e')) {
    201         return EMSTYLE_EMOJI;
    202     } else if (script == SCRIPT_TAG('Z', 's', 'y', 'm')) {
    203         return EMSTYLE_TEXT;
    204     }
    205 
    206     return EMSTYLE_EMPTY;
    207 }
    208 
    209 //static
    210 uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) {
    211     uint8_t subScriptBits = 0u;
    212     switch (script) {
    213         case SCRIPT_TAG('B', 'o', 'p', 'o'):
    214             subScriptBits = kBopomofoFlag;
    215             break;
    216         case SCRIPT_TAG('H', 'a', 'n', 'g'):
    217             subScriptBits = kHangulFlag;
    218             break;
    219         case SCRIPT_TAG('H', 'a', 'n', 'b'):
    220             // Bopomofo is almost exclusively used in Taiwan.
    221             subScriptBits = kHanFlag | kBopomofoFlag;
    222             break;
    223         case SCRIPT_TAG('H', 'a', 'n', 'i'):
    224             subScriptBits = kHanFlag;
    225             break;
    226         case SCRIPT_TAG('H', 'a', 'n', 's'):
    227             subScriptBits = kHanFlag | kSimplifiedChineseFlag;
    228             break;
    229         case SCRIPT_TAG('H', 'a', 'n', 't'):
    230             subScriptBits = kHanFlag | kTraditionalChineseFlag;
    231             break;
    232         case SCRIPT_TAG('H', 'i', 'r', 'a'):
    233             subScriptBits = kHiraganaFlag;
    234             break;
    235         case SCRIPT_TAG('H', 'r', 'k', 't'):
    236             subScriptBits = kKatakanaFlag | kHiraganaFlag;
    237             break;
    238         case SCRIPT_TAG('J', 'p', 'a', 'n'):
    239             subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
    240             break;
    241         case SCRIPT_TAG('K', 'a', 'n', 'a'):
    242             subScriptBits = kKatakanaFlag;
    243             break;
    244         case SCRIPT_TAG('K', 'o', 'r', 'e'):
    245             subScriptBits = kHanFlag | kHangulFlag;
    246             break;
    247     }
    248     return subScriptBits;
    249 }
    250 
    251 std::string FontLanguage::getString() const {
    252     if (isUnsupported()) {
    253         return "und";
    254     }
    255     char buf[16];
    256     size_t i = unpackLanguageOrRegion(mLanguage, buf, 'a', 'a');
    257     if (mScript != 0) {
    258         buf[i++] = '-';
    259         buf[i++] = (mScript >> 24) & 0xFFu;
    260         buf[i++] = (mScript >> 16) & 0xFFu;
    261         buf[i++] = (mScript >> 8) & 0xFFu;
    262         buf[i++] = mScript & 0xFFu;
    263     }
    264     if (mRegion != INVALID_CODE) {
    265         buf[i++] = '-';
    266         i += unpackLanguageOrRegion(mRegion, buf + i, 'A', '0');
    267     }
    268     return std::string(buf, i);
    269 }
    270 
    271 bool FontLanguage::isEqualScript(const FontLanguage& other) const {
    272     return other.mScript == mScript;
    273 }
    274 
    275 // static
    276 bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
    277     return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
    278 }
    279 
    280 bool FontLanguage::supportsHbScript(hb_script_t script) const {
    281     static_assert(SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'),
    282                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
    283     if (script == mScript) return true;
    284     return supportsScript(mSubScriptBits, scriptToSubScriptBits(script));
    285 }
    286 
    287 int FontLanguage::calcScoreFor(const FontLanguages& supported) const {
    288     bool languageScriptMatch = false;
    289     bool subtagMatch = false;
    290     bool scriptMatch = false;
    291 
    292     for (size_t i = 0; i < supported.size(); ++i) {
    293         if (mEmojiStyle != EMSTYLE_EMPTY &&
    294                mEmojiStyle == supported[i].mEmojiStyle) {
    295             subtagMatch = true;
    296             if (mLanguage == supported[i].mLanguage) {
    297                 return 4;
    298             }
    299         }
    300         if (isEqualScript(supported[i]) ||
    301                 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
    302             scriptMatch = true;
    303             if (mLanguage == supported[i].mLanguage) {
    304                 languageScriptMatch = true;
    305             }
    306         }
    307     }
    308 
    309     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
    310         scriptMatch = true;
    311         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLanguage()) {
    312             return 3;
    313         }
    314     }
    315 
    316     if (languageScriptMatch) {
    317         return 3;
    318     } else if (subtagMatch) {
    319         return 2;
    320     } else if (scriptMatch) {
    321         return 1;
    322     }
    323     return 0;
    324 }
    325 
    326 FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages)
    327     : mLanguages(std::move(languages)) {
    328     if (mLanguages.empty()) {
    329         return;
    330     }
    331 
    332     const FontLanguage& lang = mLanguages[0];
    333 
    334     mIsAllTheSameLanguage = true;
    335     mUnionOfSubScriptBits = lang.mSubScriptBits;
    336     for (size_t i = 1; i < mLanguages.size(); ++i) {
    337         mUnionOfSubScriptBits |= mLanguages[i].mSubScriptBits;
    338         if (mIsAllTheSameLanguage && lang.mLanguage != mLanguages[i].mLanguage) {
    339             mIsAllTheSameLanguage = false;
    340         }
    341     }
    342 }
    343 
    344 #undef SCRIPT_TAG
    345 }  // namespace minikin
    346