1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /** 18 * An implementation of Liang's hyphenation algorithm. 19 */ 20 21 #ifndef MINIKIN_HYPHENATOR_H 22 #define MINIKIN_HYPHENATOR_H 23 24 #include <string> 25 #include <vector> 26 27 #include "minikin/Characters.h" 28 #include "minikin/U16StringPiece.h" 29 30 namespace minikin { 31 32 class Hyphenator; 33 34 // Registers the hyphenator. 35 // This doesn't take ownership of the hyphenator but we don't need to care about the ownership. 36 // In Android, the Hyphenator is allocated in Zygote and never gets released. 37 void addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator); 38 void addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr); 39 40 enum class HyphenationType : uint8_t { 41 // Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0. 42 43 // Do not break. 44 DONT_BREAK = 0, 45 // Break the line and insert a normal hyphen. 46 BREAK_AND_INSERT_HYPHEN = 1, 47 // Break the line and insert an Armenian hyphen (U+058A). 48 BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2, 49 // Break the line and insert a maqaf (Hebrew hyphen, U+05BE). 50 BREAK_AND_INSERT_MAQAF = 3, 51 // Break the line and insert a Canadian Syllabics hyphen (U+1400). 52 BREAK_AND_INSERT_UCAS_HYPHEN = 4, 53 // Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen 54 // present or the script does not use a hyphen (e.g. in Malayalam). 55 BREAK_AND_DONT_INSERT_HYPHEN = 5, 56 // Break and replace the last code unit with hyphen. Used for Catalan "ll" which hyphenates 57 // as "l-/l". 58 BREAK_AND_REPLACE_WITH_HYPHEN = 6, 59 // Break the line, and repeat the hyphen (which is the last character) at the beginning of the 60 // next line. Used in Polish (where "czerwono-niebieska" should hyphenate as 61 // "czerwono-/-niebieska") and Slovenian. 62 BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7, 63 // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. 64 // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default 65 // behavior when a soft hyphen is used in Arabic script. 66 BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8 67 }; 68 69 // The hyphen edit represents an edit to the string when a word is hyphenated. 70 // The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation 71 // allows for more choices. 72 // One at the beginning of the string/line and one at the end. 73 enum class EndHyphenEdit : uint8_t { 74 // Note that everything inserting characters must have a value greater than or equal to 75 // INSERT_HYPHEN. 76 NO_EDIT = 0b000, 77 REPLACE_WITH_HYPHEN = 0b001, 78 79 INSERT_HYPHEN = 0b010, 80 INSERT_ARMENIAN_HYPHEN = 0b011, 81 INSERT_MAQAF = 0b100, 82 INSERT_UCAS_HYPHEN = 0b101, 83 INSERT_ZWJ_AND_HYPHEN = 0b110, 84 }; 85 86 enum class StartHyphenEdit : uint8_t { 87 NO_EDIT = 0b00, 88 89 INSERT_HYPHEN = 0b01, 90 INSERT_ZWJ = 0b10, 91 }; 92 93 typedef uint8_t HyphenEdit; 94 constexpr uint8_t START_BITS_SHIFT = 3; 95 // The following two masks must keep in sync with the definitions in the Java code at: 96 // frameworks/base/graphics/java/android/graphics/Paint.java 97 constexpr uint8_t MASK_END_OF_LINE = 0b00111; 98 constexpr uint8_t MASK_START_OF_LINE = 0b11000; 99 100 inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) { 101 return static_cast<uint8_t>(start) << START_BITS_SHIFT | static_cast<uint8_t>(end); 102 } 103 104 inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) { 105 return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE); 106 } 107 108 inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) { 109 return static_cast<StartHyphenEdit>(hyphenEdit >> START_BITS_SHIFT); 110 } 111 112 inline bool isReplacement(EndHyphenEdit hyph) { 113 return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN; 114 } 115 116 inline bool isInsertion(StartHyphenEdit hyph) { 117 return hyph != StartHyphenEdit::NO_EDIT; 118 } 119 120 inline bool isInsertion(EndHyphenEdit hyph) { 121 return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN); 122 } 123 124 template <typename T, size_t size> 125 constexpr size_t ARRAYSIZE(T const (&)[size]) { 126 return size; 127 } 128 constexpr uint32_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ}; 129 constexpr uint32_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN}; 130 constexpr uint32_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN}; 131 constexpr uint32_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF}; 132 constexpr uint32_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN}; 133 constexpr uint32_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN}; 134 constexpr std::pair<const uint32_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0); 135 #define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars)) 136 137 inline std::pair<const uint32_t*, size_t> getHyphenString(StartHyphenEdit hyph) { 138 if (hyph == StartHyphenEdit::INSERT_ZWJ) { 139 return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ); 140 } else if (hyph == StartHyphenEdit::INSERT_HYPHEN) { 141 return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN); 142 } else { 143 return EMPTY_HYPHEN_STR; 144 } 145 } 146 147 inline std::pair<const uint32_t*, size_t> getHyphenString(EndHyphenEdit hyph) { 148 switch (hyph) { 149 case EndHyphenEdit::REPLACE_WITH_HYPHEN: // fall through 150 case EndHyphenEdit::INSERT_HYPHEN: 151 return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN); 152 case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN: 153 return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN); 154 case EndHyphenEdit::INSERT_MAQAF: 155 return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF); 156 case EndHyphenEdit::INSERT_UCAS_HYPHEN: 157 return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN); 158 case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN: 159 return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN); 160 case EndHyphenEdit::NO_EDIT: 161 default: 162 return EMPTY_HYPHEN_STR; 163 } 164 } 165 #undef MAKE_HYPHEN_STR 166 167 EndHyphenEdit editForThisLine(HyphenationType type); 168 StartHyphenEdit editForNextLine(HyphenationType type); 169 170 // hyb file header; implementation details are in the .cpp file 171 struct Header; 172 173 class Hyphenator { 174 public: 175 // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in 176 // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the 177 // corresponding code unit offset in the word. 178 // 179 // out must have at least the length of the word capacity. 180 // 181 // Example: word is "hyphen", result is the following, corresponding to "hy-phen": 182 // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK] 183 void hyphenate(const U16StringPiece& word, HyphenationType* out) const; 184 185 // Compute the hyphenation of a word. 186 // 187 // out will be resized to word length. 188 void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const { 189 out->resize(word.size()); 190 return hyphenate(word, out->data()); 191 } 192 193 // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character 194 // immediately after which line breaks are allowed, but words containing it should not be 195 // automatically hyphenated. 196 static bool isLineBreakingHyphen(uint32_t cp); 197 198 // pattern data is in binary format, as described in doc/hyb_file_format.md. Note: 199 // the caller is responsible for ensuring that the lifetime of the pattern data is 200 // at least as long as the Hyphenator object. 201 202 // This class doesn't copy or take ownership of patternData. Caller must keep the data valid 203 // until this instance is deleted. 204 // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens. 205 static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, 206 const std::string& locale); 207 208 private: 209 enum class HyphenationLocale : uint8_t { 210 OTHER = 0, 211 CATALAN = 1, 212 POLISH = 2, 213 SLOVENIAN = 3, 214 }; 215 216 // Use Hyphenator::loadBinary instead. 217 Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, 218 HyphenationLocale hyphenLocale); 219 220 // apply various hyphenation rules including hard and soft hyphens, ignoring patterns 221 void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const; 222 223 // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map. 224 // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or 225 // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen. 226 // Note that this method writes len+2 entries into alpha_codes (including start and stop) 227 HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const; 228 229 // calculate hyphenation from patterns, assuming alphabet lookup has already been done 230 void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue, 231 HyphenationType* out) const; 232 233 // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so 234 // that temporary buffers can be stack-allocated without waste, which is a slightly 235 // different use case. It measures UTF-16 code units. 236 static const size_t MAX_HYPHENATED_SIZE = 64; 237 238 const uint8_t* mPatternData; 239 const size_t mMinPrefix, mMinSuffix; 240 const HyphenationLocale mHyphenationLocale; 241 242 // accessors for binary data 243 const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); } 244 }; 245 246 } // namespace minikin 247 248 #endif // MINIKIN_HYPHENATOR_H 249