Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <vector>
     18 #include <memory>
     19 #include <algorithm>
     20 #include <string>
     21 #include <unicode/uchar.h>
     22 #include <unicode/uscript.h>
     23 
     24 // HACK: for reading pattern file
     25 #include <fcntl.h>
     26 
     27 #define LOG_TAG "Minikin"
     28 #include "utils/Log.h"
     29 
     30 #include "minikin/Hyphenator.h"
     31 
     32 using std::vector;
     33 
     34 namespace minikin {
     35 
     36 static const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
     37 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
     38 static const uint16_t CHAR_MIDDLE_DOT = 0x00B7;
     39 static const uint16_t CHAR_HYPHEN = 0x2010;
     40 
     41 // The following are structs that correspond to tables inside the hyb file format
     42 
     43 struct AlphabetTable0 {
     44     uint32_t version;
     45     uint32_t min_codepoint;
     46     uint32_t max_codepoint;
     47     uint8_t data[1];  // actually flexible array, size is known at runtime
     48 };
     49 
     50 struct AlphabetTable1 {
     51     uint32_t version;
     52     uint32_t n_entries;
     53     uint32_t data[1]; // actually flexible array, size is known at runtime
     54 
     55     static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
     56     static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
     57 };
     58 
     59 struct Trie {
     60     uint32_t version;
     61     uint32_t char_mask;
     62     uint32_t link_shift;
     63     uint32_t link_mask;
     64     uint32_t pattern_shift;
     65     uint32_t n_entries;
     66     uint32_t data[1];  // actually flexible array, size is known at runtime
     67 };
     68 
     69 struct Pattern {
     70     uint32_t version;
     71     uint32_t n_entries;
     72     uint32_t pattern_offset;
     73     uint32_t pattern_size;
     74     uint32_t data[1];  // actually flexible array, size is known at runtime
     75 
     76     // accessors
     77     static uint32_t len(uint32_t entry) { return entry >> 26; }
     78     static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
     79     const uint8_t* buf(uint32_t entry) const {
     80         return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
     81     }
     82 };
     83 
     84 struct Header {
     85     uint32_t magic;
     86     uint32_t version;
     87     uint32_t alphabet_offset;
     88     uint32_t trie_offset;
     89     uint32_t pattern_offset;
     90     uint32_t file_size;
     91 
     92     // accessors
     93     const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
     94     uint32_t alphabetVersion() const {
     95         return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
     96     }
     97     const AlphabetTable0* alphabetTable0() const {
     98         return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
     99     }
    100     const AlphabetTable1* alphabetTable1() const {
    101         return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
    102     }
    103     const Trie* trieTable() const {
    104         return reinterpret_cast<const Trie*>(bytes() + trie_offset);
    105     }
    106     const Pattern* patternTable() const {
    107         return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
    108     }
    109 };
    110 
    111 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix) {
    112     Hyphenator* result = new Hyphenator;
    113     result->patternData = patternData;
    114     result->minPrefix = minPrefix;
    115     result->minSuffix = minSuffix;
    116     return result;
    117 }
    118 
    119 void Hyphenator::hyphenate(vector<HyphenationType>* result, const uint16_t* word, size_t len,
    120         const icu::Locale& locale) {
    121     result->clear();
    122     result->resize(len);
    123     const size_t paddedLen = len + 2;  // start and stop code each count for 1
    124     if (patternData != nullptr &&
    125             len >= minPrefix + minSuffix && paddedLen <= MAX_HYPHENATED_SIZE) {
    126         uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
    127         const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
    128         if (hyphenValue != HyphenationType::DONT_BREAK) {
    129             hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
    130             return;
    131         }
    132         // TODO: try NFC normalization
    133         // TODO: handle non-BMP Unicode (requires remapping of offsets)
    134     }
    135     // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
    136     // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
    137     // would return DONT_BREAK.
    138     hyphenateWithNoPatterns(result->data(), word, len, locale);
    139 }
    140 
    141 // This function determines whether a character is like U+2010 HYPHEN in
    142 // line breaking and usage: a character immediately after which line breaks
    143 // are allowed, but words containing it should not be automatically
    144 // hyphenated using patterns. This is a curated set, created by manually
    145 // inspecting all the characters that have the Unicode line breaking
    146 // property of BA or HY and seeing which ones are hyphens.
    147 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
    148     return (c == 0x002D || // HYPHEN-MINUS
    149             c == 0x058A || // ARMENIAN HYPHEN
    150             c == 0x05BE || // HEBREW PUNCTUATION MAQAF
    151             c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
    152             c == 0x2010 || // HYPHEN
    153             c == 0x2013 || // EN DASH
    154             c == 0x2027 || // HYPHENATION POINT
    155             c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
    156             c == 0x2E40);  // DOUBLE HYPHEN
    157 }
    158 
    159 const static uint32_t HYPHEN_STR[] = {0x2010, 0};
    160 const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0};
    161 const static uint32_t MAQAF_STR[] = {0x05BE, 0};
    162 const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0};
    163 const static uint32_t ZWJ_STR[] = {0x200D, 0};
    164 const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0};
    165 
    166 const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
    167     switch (hyph) {
    168         case INSERT_HYPHEN_AT_END:
    169         case REPLACE_WITH_HYPHEN_AT_END:
    170         case INSERT_HYPHEN_AT_START:
    171             return HYPHEN_STR;
    172         case INSERT_ARMENIAN_HYPHEN_AT_END:
    173             return ARMENIAN_HYPHEN_STR;
    174         case INSERT_MAQAF_AT_END:
    175             return MAQAF_STR;
    176         case INSERT_UCAS_HYPHEN_AT_END:
    177             return UCAS_HYPHEN_STR;
    178         case INSERT_ZWJ_AND_HYPHEN_AT_END:
    179             return ZWJ_AND_HYPHEN_STR;
    180         case INSERT_ZWJ_AT_START:
    181             return ZWJ_STR;
    182         default:
    183             return nullptr;
    184     }
    185 }
    186 
    187 uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
    188     switch (type) {
    189         case HyphenationType::DONT_BREAK:
    190             return NO_EDIT;
    191         case HyphenationType::BREAK_AND_INSERT_HYPHEN:
    192             return INSERT_HYPHEN_AT_END;
    193         case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
    194             return INSERT_ARMENIAN_HYPHEN_AT_END;
    195         case HyphenationType::BREAK_AND_INSERT_MAQAF:
    196             return INSERT_MAQAF_AT_END;
    197         case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
    198             return INSERT_UCAS_HYPHEN_AT_END;
    199         case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
    200             return REPLACE_WITH_HYPHEN_AT_END;
    201         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
    202             return INSERT_ZWJ_AND_HYPHEN_AT_END;
    203         default:
    204             return BREAK_AT_END;
    205     }
    206 }
    207 
    208 uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
    209     switch (type) {
    210         case HyphenationType::DONT_BREAK:
    211             return NO_EDIT;
    212         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
    213             return INSERT_HYPHEN_AT_START;
    214         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
    215             return INSERT_ZWJ_AT_START;
    216         default:
    217             return BREAK_AT_START;
    218     }
    219 }
    220 
    221 static UScriptCode getScript(uint32_t codePoint) {
    222     UErrorCode errorCode = U_ZERO_ERROR;
    223     const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
    224     if (U_SUCCESS(errorCode)) {
    225         return script;
    226     } else {
    227         return USCRIPT_INVALID_CODE;
    228     }
    229 }
    230 
    231 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
    232     // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
    233     // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
    234     // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
    235     // to maqaf for Hebrew, we can simply add a condition here.
    236     const UScriptCode script = getScript(codePoint);
    237     if (script == USCRIPT_KANNADA
    238             || script == USCRIPT_MALAYALAM
    239             || script == USCRIPT_TAMIL
    240             || script == USCRIPT_TELUGU) {
    241         // Grantha is not included, since we don't support non-BMP hyphenation yet.
    242         return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
    243     } else if (script == USCRIPT_ARMENIAN) {
    244         return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
    245     } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
    246         return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
    247     } else {
    248         return HyphenationType::BREAK_AND_INSERT_HYPHEN;
    249     }
    250 }
    251 
    252 static inline int32_t getJoiningType(UChar32 codepoint) {
    253     return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
    254 }
    255 
    256 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
    257 // This function decides if the letters before and after the hyphen should appear as joining.
    258 static inline HyphenationType getHyphTypeForArabic(const uint16_t* word, size_t len,
    259         size_t location) {
    260     ssize_t i = location;
    261     int32_t type = U_JT_NON_JOINING;
    262     while (static_cast<size_t>(i) < len && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
    263         i++;
    264     }
    265     if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
    266         // The next character is of the type that may join the last character. See if the last
    267         // character is also of the right type.
    268         i = location - 2; // Skip the soft hyphen
    269         type = U_JT_NON_JOINING;
    270         while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
    271             i--;
    272         }
    273         if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
    274             return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
    275         }
    276     }
    277     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
    278 }
    279 
    280 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
    281 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
    282 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
    283 void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len,
    284         const icu::Locale& locale) {
    285     result[0] = HyphenationType::DONT_BREAK;
    286     for (size_t i = 1; i < len; i++) {
    287         const uint16_t prevChar = word[i - 1];
    288         if (i > 1 && isLineBreakingHyphen(prevChar)) {
    289             // Break after hyphens, but only if they don't start the word.
    290 
    291             if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN)
    292                     && strcmp(locale.getLanguage(), "pl") == 0
    293                     && getScript(word[i]) == USCRIPT_LATIN ) {
    294                 // In Polish, hyphens get repeated at the next line. To be safe,
    295                 // we will do this only if the next character is Latin.
    296                 result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
    297             } else {
    298                 result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
    299             }
    300         } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
    301             // Break after soft hyphens, but only if they don't start the word (a soft hyphen
    302             // starting the word doesn't give any useful break opportunities). The type of the break
    303             // is based on the script of the character we break on.
    304             if (getScript(word[i]) == USCRIPT_ARABIC) {
    305                 // For Arabic, we need to look and see if the characters around the soft hyphen
    306                 // actually join. If they don't, we'll just insert a normal hyphen.
    307                 result[i] = getHyphTypeForArabic(word, len, i);
    308             } else {
    309                 result[i] = hyphenationTypeBasedOnScript(word[i]);
    310             }
    311         } else if (prevChar == CHAR_MIDDLE_DOT
    312                 && minPrefix < i && i <= len - minSuffix
    313                 && ((word[i - 2] == 'l' && word[i] == 'l')
    314                         || (word[i - 2] == 'L' && word[i] == 'L'))
    315                 && strcmp(locale.getLanguage(), "ca") == 0) {
    316             // In Catalan, "ll" should break as "l-" on the first line
    317             // and "l" on the next line.
    318             result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
    319         } else {
    320             result[i] = HyphenationType::DONT_BREAK;
    321         }
    322      }
    323 }
    324 
    325 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word,
    326         size_t len) {
    327     const Header* header = getHeader();
    328     HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
    329     // TODO: check header magic
    330     uint32_t alphabetVersion = header->alphabetVersion();
    331     if (alphabetVersion == 0) {
    332         const AlphabetTable0* alphabet = header->alphabetTable0();
    333         uint32_t min_codepoint = alphabet->min_codepoint;
    334         uint32_t max_codepoint = alphabet->max_codepoint;
    335         alpha_codes[0] = 0;  // word start
    336         for (size_t i = 0; i < len; i++) {
    337             uint16_t c = word[i];
    338             if (c < min_codepoint || c >= max_codepoint) {
    339                 return HyphenationType::DONT_BREAK;
    340             }
    341             uint8_t code = alphabet->data[c - min_codepoint];
    342             if (code == 0) {
    343                 return HyphenationType::DONT_BREAK;
    344             }
    345             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
    346                 result = hyphenationTypeBasedOnScript(c);
    347             }
    348             alpha_codes[i + 1] = code;
    349         }
    350         alpha_codes[len + 1] = 0;  // word termination
    351         return result;
    352     } else if (alphabetVersion == 1) {
    353         const AlphabetTable1* alphabet = header->alphabetTable1();
    354         size_t n_entries = alphabet->n_entries;
    355         const uint32_t* begin = alphabet->data;
    356         const uint32_t* end = begin + n_entries;
    357         alpha_codes[0] = 0;
    358         for (size_t i = 0; i < len; i++) {
    359             uint16_t c = word[i];
    360             auto p = std::lower_bound(begin, end, c << 11);
    361             if (p == end) {
    362                 return HyphenationType::DONT_BREAK;
    363             }
    364             uint32_t entry = *p;
    365             if (AlphabetTable1::codepoint(entry) != c) {
    366                 return HyphenationType::DONT_BREAK;
    367             }
    368             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
    369                 result = hyphenationTypeBasedOnScript(c);
    370             }
    371             alpha_codes[i + 1] = AlphabetTable1::value(entry);
    372         }
    373         alpha_codes[len + 1] = 0;
    374         return result;
    375     }
    376     return HyphenationType::DONT_BREAK;
    377 }
    378 
    379 /**
    380  * Internal implementation, after conversion to codes. All case folding and normalization
    381  * has been done by now, and all characters have been found in the alphabet.
    382  * Note: len here is the padded length including 0 codes at start and end.
    383  **/
    384 void Hyphenator::hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len,
    385         HyphenationType hyphenValue) {
    386     static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
    387     // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
    388     uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
    389 
    390     const Header* header = getHeader();
    391     const Trie* trie = header->trieTable();
    392     const Pattern* pattern = header->patternTable();
    393     uint32_t char_mask = trie->char_mask;
    394     uint32_t link_shift = trie->link_shift;
    395     uint32_t link_mask = trie->link_mask;
    396     uint32_t pattern_shift = trie->pattern_shift;
    397     size_t maxOffset = len - minSuffix - 1;
    398     for (size_t i = 0; i < len - 1; i++) {
    399         uint32_t node = 0;  // index into Trie table
    400         for (size_t j = i; j < len; j++) {
    401             uint16_t c = codes[j];
    402             uint32_t entry = trie->data[node + c];
    403             if ((entry & char_mask) == c) {
    404                 node = (entry & link_mask) >> link_shift;
    405             } else {
    406                 break;
    407             }
    408             uint32_t pat_ix = trie->data[node] >> pattern_shift;
    409             // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
    410             // into the buf pool. This is the pattern for the substring (i..j) we just matched,
    411             // which we combine (via point-wise max) into the buffer vector.
    412             if (pat_ix != 0) {
    413                 uint32_t pat_entry = pattern->data[pat_ix];
    414                 int pat_len = Pattern::len(pat_entry);
    415                 int pat_shift = Pattern::shift(pat_entry);
    416                 const uint8_t* pat_buf = pattern->buf(pat_entry);
    417                 int offset = j + 1 - (pat_len + pat_shift);
    418                 // offset is the index within buffer that lines up with the start of pat_buf
    419                 int start = std::max((int)minPrefix - offset, 0);
    420                 int end = std::min(pat_len, (int)maxOffset - offset);
    421                 for (int k = start; k < end; k++) {
    422                     buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
    423                 }
    424             }
    425         }
    426     }
    427     // Since the above calculation does not modify values outside
    428     // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
    429     for (size_t i = minPrefix; i < maxOffset; i++) {
    430         // Hyphenation opportunities happen when the hyphenation numbers are odd.
    431         result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
    432     }
    433 }
    434 
    435 }  // namespace minikin
    436