Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /**
     18  * An implementation of Liang's hyphenation algorithm.
     19  */
     20 
     21 #ifndef MINIKIN_HYPHENATOR_H
     22 #define MINIKIN_HYPHENATOR_H
     23 
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include "minikin/Characters.h"
#include "minikin/U16StringPiece.h"
     29 
     30 namespace minikin {
     31 
     32 class Hyphenator;
     33 
// Registers the hyphenator to use for the given locale string.
// This doesn't take ownership of the hyphenator, and callers don't need to manage its lifetime
// either: in Android, the Hyphenator is allocated in Zygote and never gets released.
void addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator);
// NOTE(review): presumably makes fromLocaleStr resolve to the hyphenator registered for
// toLocaleStr; the implementation is not visible in this header -- confirm.
void addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr);
     39 
     40 enum class HyphenationType : uint8_t {
     41     // Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0.
     42 
     43     // Do not break.
     44     DONT_BREAK = 0,
     45     // Break the line and insert a normal hyphen.
     46     BREAK_AND_INSERT_HYPHEN = 1,
     47     // Break the line and insert an Armenian hyphen (U+058A).
     48     BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
     49     // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
     50     BREAK_AND_INSERT_MAQAF = 3,
     51     // Break the line and insert a Canadian Syllabics hyphen (U+1400).
     52     BREAK_AND_INSERT_UCAS_HYPHEN = 4,
     53     // Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen
     54     // present or the script does not use a hyphen (e.g. in Malayalam).
     55     BREAK_AND_DONT_INSERT_HYPHEN = 5,
     56     // Break and replace the last code unit with hyphen. Used for Catalan "ll" which hyphenates
     57     // as "l-/l".
     58     BREAK_AND_REPLACE_WITH_HYPHEN = 6,
     59     // Break the line, and repeat the hyphen (which is the last character) at the beginning of the
     60     // next line. Used in Polish (where "czerwono-niebieska" should hyphenate as
     61     // "czerwono-/-niebieska") and Slovenian.
     62     BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
     63     // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
     64     // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
     65     // behavior when a soft hyphen is used in Arabic script.
     66     BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
     67 };
     68 
     69 // The hyphen edit represents an edit to the string when a word is hyphenated.
     70 // The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation
     71 // allows for more choices.
     72 // One at the beginning of the string/line and one at the end.
     73 enum class EndHyphenEdit : uint8_t {
     74     // Note that everything inserting characters must have a value greater than or equal to
     75     // INSERT_HYPHEN.
     76     NO_EDIT = 0b000,
     77     REPLACE_WITH_HYPHEN = 0b001,
     78 
     79     INSERT_HYPHEN = 0b010,
     80     INSERT_ARMENIAN_HYPHEN = 0b011,
     81     INSERT_MAQAF = 0b100,
     82     INSERT_UCAS_HYPHEN = 0b101,
     83     INSERT_ZWJ_AND_HYPHEN = 0b110,
     84 };
     85 
     86 enum class StartHyphenEdit : uint8_t {
     87     NO_EDIT = 0b00,
     88 
     89     INSERT_HYPHEN = 0b01,
     90     INSERT_ZWJ = 0b10,
     91 };
     92 
     93 typedef uint8_t HyphenEdit;
     94 constexpr uint8_t START_BITS_SHIFT = 3;
     95 // The following two masks must keep in sync with the definitions in the Java code at:
     96 // frameworks/base/graphics/java/android/graphics/Paint.java
     97 constexpr uint8_t MASK_END_OF_LINE = 0b00111;
     98 constexpr uint8_t MASK_START_OF_LINE = 0b11000;
     99 
    100 inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) {
    101     return static_cast<uint8_t>(start) << START_BITS_SHIFT | static_cast<uint8_t>(end);
    102 }
    103 
    104 inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) {
    105     return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE);
    106 }
    107 
    108 inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) {
    109     return static_cast<StartHyphenEdit>(hyphenEdit >> START_BITS_SHIFT);
    110 }
    111 
    112 inline bool isReplacement(EndHyphenEdit hyph) {
    113     return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN;
    114 }
    115 
    116 inline bool isInsertion(StartHyphenEdit hyph) {
    117     return hyph != StartHyphenEdit::NO_EDIT;
    118 }
    119 
    120 inline bool isInsertion(EndHyphenEdit hyph) {
    121     return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN);
    122 }
    123 
    124 template <typename T, size_t size>
    125 constexpr size_t ARRAYSIZE(T const (&)[size]) {
    126     return size;
    127 }
    128 constexpr uint32_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ};
    129 constexpr uint32_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN};
    130 constexpr uint32_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN};
    131 constexpr uint32_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF};
    132 constexpr uint32_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN};
    133 constexpr uint32_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN};
    134 constexpr std::pair<const uint32_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0);
    135 #define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars))
    136 
    137 inline std::pair<const uint32_t*, size_t> getHyphenString(StartHyphenEdit hyph) {
    138     if (hyph == StartHyphenEdit::INSERT_ZWJ) {
    139         return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ);
    140     } else if (hyph == StartHyphenEdit::INSERT_HYPHEN) {
    141         return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
    142     } else {
    143         return EMPTY_HYPHEN_STR;
    144     }
    145 }
    146 
    147 inline std::pair<const uint32_t*, size_t> getHyphenString(EndHyphenEdit hyph) {
    148     switch (hyph) {
    149         case EndHyphenEdit::REPLACE_WITH_HYPHEN:  // fall through
    150         case EndHyphenEdit::INSERT_HYPHEN:
    151             return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
    152         case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN:
    153             return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN);
    154         case EndHyphenEdit::INSERT_MAQAF:
    155             return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF);
    156         case EndHyphenEdit::INSERT_UCAS_HYPHEN:
    157             return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN);
    158         case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN:
    159             return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN);
    160         case EndHyphenEdit::NO_EDIT:
    161         default:
    162             return EMPTY_HYPHEN_STR;
    163     }
    164 }
    165 #undef MAKE_HYPHEN_STR
    166 
// Map a HyphenationType to the hyphen edits it implies. Only the declarations live in this
// header.
// NOTE(review): judging by the names and return types, editForThisLine gives the edit for the
// end of the line being broken and editForNextLine gives the edit for the start of the
// following line -- confirm against the implementation.
EndHyphenEdit editForThisLine(HyphenationType type);
StartHyphenEdit editForNextLine(HyphenationType type);
    169 
    170 // hyb file header; implementation details are in the .cpp file
    171 struct Header;
    172 
    173 class Hyphenator {
    174 public:
    175     // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
    176     // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
    177     // corresponding code unit offset in the word.
    178     //
    179     // out must have at least the length of the word capacity.
    180     //
    181     // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
    182     // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
    183     void hyphenate(const U16StringPiece& word, HyphenationType* out) const;
    184 
    185     // Compute the hyphenation of a word.
    186     //
    187     // out will be resized to word length.
    188     void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
    189         out->resize(word.size());
    190         return hyphenate(word, out->data());
    191     }
    192 
    193     // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
    194     // immediately after which line breaks are allowed, but words containing it should not be
    195     // automatically hyphenated.
    196     static bool isLineBreakingHyphen(uint32_t cp);
    197 
    198     // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
    199     // the caller is responsible for ensuring that the lifetime of the pattern data is
    200     // at least as long as the Hyphenator object.
    201 
    202     // This class doesn't copy or take ownership of patternData. Caller must keep the data valid
    203     // until this instance is deleted.
    204     // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
    205     static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
    206                                   const std::string& locale);
    207 
    208 private:
    209     enum class HyphenationLocale : uint8_t {
    210         OTHER = 0,
    211         CATALAN = 1,
    212         POLISH = 2,
    213         SLOVENIAN = 3,
    214     };
    215 
    216     // Use Hyphenator::loadBinary instead.
    217     Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
    218                HyphenationLocale hyphenLocale);
    219 
    220     // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
    221     void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const;
    222 
    223     // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
    224     // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
    225     // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
    226     // Note that this method writes len+2 entries into alpha_codes (including start and stop)
    227     HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const;
    228 
    229     // calculate hyphenation from patterns, assuming alphabet lookup has already been done
    230     void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
    231                             HyphenationType* out) const;
    232 
    233     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
    234     // that temporary buffers can be stack-allocated without waste, which is a slightly
    235     // different use case. It measures UTF-16 code units.
    236     static const size_t MAX_HYPHENATED_SIZE = 64;
    237 
    238     const uint8_t* mPatternData;
    239     const size_t mMinPrefix, mMinSuffix;
    240     const HyphenationLocale mHyphenationLocale;
    241 
    242     // accessors for binary data
    243     const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); }
    244 };
    245 
    246 }  // namespace minikin
    247 
    248 #endif  // MINIKIN_HYPHENATOR_H
    249