Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "minikin/Hyphenator.h"
     18 
     19 #include <algorithm>
     20 #include <memory>
     21 #include <string>
     22 #include <vector>
     23 
     24 #include <unicode/uchar.h>
     25 #include <unicode/uscript.h>
     26 
     27 #include "minikin/Characters.h"
     28 
     29 namespace minikin {
     30 
     31 // The following are structs that correspond to tables inside the hyb file format
     32 
     33 struct AlphabetTable0 {
     34     uint32_t version;
     35     uint32_t min_codepoint;
     36     uint32_t max_codepoint;
     37     uint8_t data[1];  // actually flexible array, size is known at runtime
     38 };
     39 
     40 struct AlphabetTable1 {
     41     uint32_t version;
     42     uint32_t n_entries;
     43     uint32_t data[1];  // actually flexible array, size is known at runtime
     44 
     45     static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
     46     static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
     47 };
     48 
     49 struct Trie {
     50     uint32_t version;
     51     uint32_t char_mask;
     52     uint32_t link_shift;
     53     uint32_t link_mask;
     54     uint32_t pattern_shift;
     55     uint32_t n_entries;
     56     uint32_t data[1];  // actually flexible array, size is known at runtime
     57 };
     58 
     59 struct Pattern {
     60     uint32_t version;
     61     uint32_t n_entries;
     62     uint32_t pattern_offset;
     63     uint32_t pattern_size;
     64     uint32_t data[1];  // actually flexible array, size is known at runtime
     65 
     66     // accessors
     67     static uint32_t len(uint32_t entry) { return entry >> 26; }
     68     static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
     69     const uint8_t* buf(uint32_t entry) const {
     70         return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
     71     }
     72 };
     73 
     74 struct Header {
     75     uint32_t magic;
     76     uint32_t version;
     77     uint32_t alphabet_offset;
     78     uint32_t trie_offset;
     79     uint32_t pattern_offset;
     80     uint32_t file_size;
     81 
     82     // accessors
     83     const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
     84     uint32_t alphabetVersion() const {
     85         return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
     86     }
     87     const AlphabetTable0* alphabetTable0() const {
     88         return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
     89     }
     90     const AlphabetTable1* alphabetTable1() const {
     91         return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
     92     }
     93     const Trie* trieTable() const { return reinterpret_cast<const Trie*>(bytes() + trie_offset); }
     94     const Pattern* patternTable() const {
     95         return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
     96     }
     97 };
     98 
     99 // static
    100 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
    101                                    const std::string& locale) {
    102     HyphenationLocale hyphenLocale = HyphenationLocale::OTHER;
    103     if (locale == "pl") {
    104         hyphenLocale = HyphenationLocale::POLISH;
    105     } else if (locale == "ca") {
    106         hyphenLocale = HyphenationLocale::CATALAN;
    107     } else if (locale == "sl") {
    108         hyphenLocale = HyphenationLocale::SLOVENIAN;
    109     }
    110     return new Hyphenator(patternData, minPrefix, minSuffix, hyphenLocale);
    111 }
    112 
    113 Hyphenator::Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
    114                        HyphenationLocale hyphenLocale)
    115         : mPatternData(patternData),
    116           mMinPrefix(minPrefix),
    117           mMinSuffix(minSuffix),
    118           mHyphenationLocale(hyphenLocale) {}
    119 
    120 void Hyphenator::hyphenate(const U16StringPiece& word, HyphenationType* out) const {
    121     const size_t len = word.size();
    122     const size_t paddedLen = len + 2;  // start and stop code each count for 1
    123     if (mPatternData != nullptr && len >= mMinPrefix + mMinSuffix &&
    124         paddedLen <= MAX_HYPHENATED_SIZE) {
    125         uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
    126         const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word);
    127         if (hyphenValue != HyphenationType::DONT_BREAK) {
    128             hyphenateFromCodes(alpha_codes, paddedLen, hyphenValue, out);
    129             return;
    130         }
    131         // TODO: try NFC normalization
    132         // TODO: handle non-BMP Unicode (requires remapping of offsets)
    133     }
    134     // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
    135     // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
    136     // would return DONT_BREAK.
    137     hyphenateWithNoPatterns(word, out);
    138 }
    139 
    140 // This function determines whether a character is like U+2010 HYPHEN in
    141 // line breaking and usage: a character immediately after which line breaks
    142 // are allowed, but words containing it should not be automatically
    143 // hyphenated using patterns. This is a curated set, created by manually
    144 // inspecting all the characters that have the Unicode line breaking
    145 // property of BA or HY and seeing which ones are hyphens.
    146 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
    147     return (c == 0x002D ||  // HYPHEN-MINUS
    148             c == 0x058A ||  // ARMENIAN HYPHEN
    149             c == 0x05BE ||  // HEBREW PUNCTUATION MAQAF
    150             c == 0x1400 ||  // CANADIAN SYLLABICS HYPHEN
    151             c == 0x2010 ||  // HYPHEN
    152             c == 0x2013 ||  // EN DASH
    153             c == 0x2027 ||  // HYPHENATION POINT
    154             c == 0x2E17 ||  // DOUBLE OBLIQUE HYPHEN
    155             c == 0x2E40);   // DOUBLE HYPHEN
    156 }
    157 
    158 EndHyphenEdit editForThisLine(HyphenationType type) {
    159     switch (type) {
    160         case HyphenationType::BREAK_AND_INSERT_HYPHEN:
    161             return EndHyphenEdit::INSERT_HYPHEN;
    162         case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
    163             return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
    164         case HyphenationType::BREAK_AND_INSERT_MAQAF:
    165             return EndHyphenEdit::INSERT_MAQAF;
    166         case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
    167             return EndHyphenEdit::INSERT_UCAS_HYPHEN;
    168         case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
    169             return EndHyphenEdit::REPLACE_WITH_HYPHEN;
    170         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
    171             return EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN;
    172         case HyphenationType::DONT_BREAK:  // Hyphen edit for non breaking case doesn't make sense.
    173         default:
    174             return EndHyphenEdit::NO_EDIT;
    175     }
    176 }
    177 
    178 StartHyphenEdit editForNextLine(HyphenationType type) {
    179     switch (type) {
    180         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
    181             return StartHyphenEdit::INSERT_HYPHEN;
    182         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
    183             return StartHyphenEdit::INSERT_ZWJ;
    184         case HyphenationType::DONT_BREAK:  // Hyphen edit for non breaking case doesn't make sense.
    185         default:
    186             return StartHyphenEdit::NO_EDIT;
    187     }
    188 }
    189 
    190 static UScriptCode getScript(uint32_t codePoint) {
    191     UErrorCode errorCode = U_ZERO_ERROR;
    192     const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
    193     if (U_SUCCESS(errorCode)) {
    194         return script;
    195     } else {
    196         return USCRIPT_INVALID_CODE;
    197     }
    198 }
    199 
    200 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
    201     // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
    202     // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
    203     // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
    204     // to maqaf for Hebrew, we can simply add a condition here.
    205     const UScriptCode script = getScript(codePoint);
    206     if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
    207         script == USCRIPT_TELUGU) {
    208         // Grantha is not included, since we don't support non-BMP hyphenation yet.
    209         return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
    210     } else if (script == USCRIPT_ARMENIAN) {
    211         return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
    212     } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
    213         return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
    214     } else {
    215         return HyphenationType::BREAK_AND_INSERT_HYPHEN;
    216     }
    217 }
    218 
    219 static inline int32_t getJoiningType(UChar32 codepoint) {
    220     return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
    221 }
    222 
    223 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
    224 // This function decides if the letters before and after the hyphen should appear as joining.
    225 static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, size_t location) {
    226     ssize_t i = location;
    227     int32_t type = U_JT_NON_JOINING;
    228     while (static_cast<size_t>(i) < word.size() &&
    229            (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
    230         i++;
    231     }
    232     if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
    233         // The next character is of the type that may join the last character. See if the last
    234         // character is also of the right type.
    235         i = location - 2;  // Skip the soft hyphen
    236         type = U_JT_NON_JOINING;
    237         while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
    238             i--;
    239         }
    240         if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
    241             return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
    242         }
    243     }
    244     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
    245 }
    246 
    247 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
    248 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
    249 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
    250 void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const {
    251     out[0] = HyphenationType::DONT_BREAK;
    252     for (size_t i = 1; i < word.size(); i++) {
    253         const uint16_t prevChar = word[i - 1];
    254         if (i > 1 && isLineBreakingHyphen(prevChar)) {
    255             // Break after hyphens, but only if they don't start the word.
    256 
    257             if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
    258                 (mHyphenationLocale == HyphenationLocale::POLISH ||
    259                  mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
    260                 getScript(word[i]) == USCRIPT_LATIN) {
    261                 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
    262                 // we will do this only if the next character is Latin.
    263                 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
    264             } else {
    265                 out[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
    266             }
    267         } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
    268             // Break after soft hyphens, but only if they don't start the word (a soft hyphen
    269             // starting the word doesn't give any useful break opportunities). The type of the break
    270             // is based on the script of the character we break on.
    271             if (getScript(word[i]) == USCRIPT_ARABIC) {
    272                 // For Arabic, we need to look and see if the characters around the soft hyphen
    273                 // actually join. If they don't, we'll just insert a normal hyphen.
    274                 out[i] = getHyphTypeForArabic(word, i);
    275             } else {
    276                 out[i] = hyphenationTypeBasedOnScript(word[i]);
    277             }
    278         } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
    279                    ((word[i - 2] == 'l' && word[i] == 'l') ||
    280                     (word[i - 2] == 'L' && word[i] == 'L')) &&
    281                    mHyphenationLocale == HyphenationLocale::CATALAN) {
    282             // In Catalan, "ll" should break as "l-" on the first line
    283             // and "l" on the next line.
    284             out[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
    285         } else {
    286             out[i] = HyphenationType::DONT_BREAK;
    287         }
    288     }
    289 }
    290 
    291 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
    292                                            const U16StringPiece& word) const {
    293     const Header* header = getHeader();
    294     HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
    295     // TODO: check header magic
    296     uint32_t alphabetVersion = header->alphabetVersion();
    297     if (alphabetVersion == 0) {
    298         const AlphabetTable0* alphabet = header->alphabetTable0();
    299         uint32_t min_codepoint = alphabet->min_codepoint;
    300         uint32_t max_codepoint = alphabet->max_codepoint;
    301         alpha_codes[0] = 0;  // word start
    302         for (size_t i = 0; i < word.size(); i++) {
    303             uint16_t c = word[i];
    304             if (c < min_codepoint || c >= max_codepoint) {
    305                 return HyphenationType::DONT_BREAK;
    306             }
    307             uint8_t code = alphabet->data[c - min_codepoint];
    308             if (code == 0) {
    309                 return HyphenationType::DONT_BREAK;
    310             }
    311             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
    312                 result = hyphenationTypeBasedOnScript(c);
    313             }
    314             alpha_codes[i + 1] = code;
    315         }
    316         alpha_codes[word.size() + 1] = 0;  // word termination
    317         return result;
    318     } else if (alphabetVersion == 1) {
    319         const AlphabetTable1* alphabet = header->alphabetTable1();
    320         size_t n_entries = alphabet->n_entries;
    321         const uint32_t* begin = alphabet->data;
    322         const uint32_t* end = begin + n_entries;
    323         alpha_codes[0] = 0;
    324         for (size_t i = 0; i < word.size(); i++) {
    325             uint16_t c = word[i];
    326             auto p = std::lower_bound(begin, end, c << 11);
    327             if (p == end) {
    328                 return HyphenationType::DONT_BREAK;
    329             }
    330             uint32_t entry = *p;
    331             if (AlphabetTable1::codepoint(entry) != c) {
    332                 return HyphenationType::DONT_BREAK;
    333             }
    334             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
    335                 result = hyphenationTypeBasedOnScript(c);
    336             }
    337             alpha_codes[i + 1] = AlphabetTable1::value(entry);
    338         }
    339         alpha_codes[word.size() + 1] = 0;
    340         return result;
    341     }
    342     return HyphenationType::DONT_BREAK;
    343 }
    344 
    345 /**
    346  * Internal implementation, after conversion to codes. All case folding and normalization
    347  * has been done by now, and all characters have been found in the alphabet.
    348  * Note: len here is the padded length including 0 codes at start and end.
    349  **/
    350 void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
    351                                     HyphenationType* out) const {
    352     static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
    353     // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
    354     uint8_t* buffer = reinterpret_cast<uint8_t*>(out);
    355 
    356     const Header* header = getHeader();
    357     const Trie* trie = header->trieTable();
    358     const Pattern* pattern = header->patternTable();
    359     uint32_t char_mask = trie->char_mask;
    360     uint32_t link_shift = trie->link_shift;
    361     uint32_t link_mask = trie->link_mask;
    362     uint32_t pattern_shift = trie->pattern_shift;
    363     size_t maxOffset = len - mMinSuffix - 1;
    364     for (size_t i = 0; i < len - 1; i++) {
    365         uint32_t node = 0;  // index into Trie table
    366         for (size_t j = i; j < len; j++) {
    367             uint16_t c = codes[j];
    368             uint32_t entry = trie->data[node + c];
    369             if ((entry & char_mask) == c) {
    370                 node = (entry & link_mask) >> link_shift;
    371             } else {
    372                 break;
    373             }
    374             uint32_t pat_ix = trie->data[node] >> pattern_shift;
    375             // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
    376             // into the buf pool. This is the pattern for the substring (i..j) we just matched,
    377             // which we combine (via point-wise max) into the buffer vector.
    378             if (pat_ix != 0) {
    379                 uint32_t pat_entry = pattern->data[pat_ix];
    380                 int pat_len = Pattern::len(pat_entry);
    381                 int pat_shift = Pattern::shift(pat_entry);
    382                 const uint8_t* pat_buf = pattern->buf(pat_entry);
    383                 int offset = j + 1 - (pat_len + pat_shift);
    384                 // offset is the index within buffer that lines up with the start of pat_buf
    385                 int start = std::max((int)mMinPrefix - offset, 0);
    386                 int end = std::min(pat_len, (int)maxOffset - offset);
    387                 for (int k = start; k < end; k++) {
    388                     buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
    389                 }
    390             }
    391         }
    392     }
    393     // Since the above calculation does not modify values outside
    394     // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK.
    395     for (size_t i = mMinPrefix; i < maxOffset; i++) {
    396         // Hyphenation opportunities happen when the hyphenation numbers are odd.
    397         out[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
    398     }
    399 }
    400 
    401 }  // namespace minikin
    402