1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "minikin/Hyphenator.h" 18 19 #include <algorithm> 20 #include <memory> 21 #include <string> 22 #include <vector> 23 24 #include <unicode/uchar.h> 25 #include <unicode/uscript.h> 26 27 #include "minikin/Characters.h" 28 29 namespace minikin { 30 31 // The following are structs that correspond to tables inside the hyb file format 32 33 struct AlphabetTable0 { 34 uint32_t version; 35 uint32_t min_codepoint; 36 uint32_t max_codepoint; 37 uint8_t data[1]; // actually flexible array, size is known at runtime 38 }; 39 40 struct AlphabetTable1 { 41 uint32_t version; 42 uint32_t n_entries; 43 uint32_t data[1]; // actually flexible array, size is known at runtime 44 45 static uint32_t codepoint(uint32_t entry) { return entry >> 11; } 46 static uint32_t value(uint32_t entry) { return entry & 0x7ff; } 47 }; 48 49 struct Trie { 50 uint32_t version; 51 uint32_t char_mask; 52 uint32_t link_shift; 53 uint32_t link_mask; 54 uint32_t pattern_shift; 55 uint32_t n_entries; 56 uint32_t data[1]; // actually flexible array, size is known at runtime 57 }; 58 59 struct Pattern { 60 uint32_t version; 61 uint32_t n_entries; 62 uint32_t pattern_offset; 63 uint32_t pattern_size; 64 uint32_t data[1]; // actually flexible array, size is known at runtime 65 66 // accessors 67 static uint32_t len(uint32_t entry) { return entry >> 26; } 68 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } 69 const uint8_t* buf(uint32_t entry) const { 70 return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff); 71 } 72 }; 73 74 struct Header { 75 uint32_t magic; 76 uint32_t version; 77 uint32_t alphabet_offset; 78 uint32_t trie_offset; 79 uint32_t pattern_offset; 80 uint32_t file_size; 81 82 // accessors 83 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); } 84 uint32_t alphabetVersion() const { 85 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); 86 } 87 const AlphabetTable0* alphabetTable0() const { 88 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); 89 } 90 const AlphabetTable1* alphabetTable1() const { 91 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); 92 } 93 const Trie* trieTable() const { return reinterpret_cast<const Trie*>(bytes() + trie_offset); } 94 const Pattern* patternTable() const { 95 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); 96 } 97 }; 98 99 // static 100 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, 101 const std::string& locale) { 102 HyphenationLocale hyphenLocale = HyphenationLocale::OTHER; 103 if (locale == "pl") { 104 hyphenLocale = HyphenationLocale::POLISH; 105 } else if (locale == "ca") { 106 hyphenLocale = HyphenationLocale::CATALAN; 107 } else if (locale == "sl") { 108 hyphenLocale = HyphenationLocale::SLOVENIAN; 109 } 110 return new Hyphenator(patternData, minPrefix, minSuffix, hyphenLocale); 111 } 112 113 Hyphenator::Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, 114 HyphenationLocale hyphenLocale) 115 : mPatternData(patternData), 116 mMinPrefix(minPrefix), 117 mMinSuffix(minSuffix), 118 mHyphenationLocale(hyphenLocale) {} 119 120 void Hyphenator::hyphenate(const U16StringPiece& word, HyphenationType* out) const { 121 const size_t len = word.size(); 122 const size_t paddedLen = len + 2; // start and stop code each count for 1 123 if (mPatternData != nullptr && len >= mMinPrefix + mMinSuffix && 124 paddedLen <= MAX_HYPHENATED_SIZE) { 125 uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; 126 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word); 127 if (hyphenValue != HyphenationType::DONT_BREAK) { 128 hyphenateFromCodes(alpha_codes, paddedLen, hyphenValue, out); 129 return; 130 } 131 // TODO: try NFC normalization 132 // TODO: handle non-BMP Unicode (requires remapping of offsets) 133 } 134 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the 135 // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup 136 // would return DONT_BREAK. 137 hyphenateWithNoPatterns(word, out); 138 } 139 140 // This function determines whether a character is like U+2010 HYPHEN in 141 // line breaking and usage: a character immediately after which line breaks 142 // are allowed, but words containing it should not be automatically 143 // hyphenated using patterns. This is a curated set, created by manually 144 // inspecting all the characters that have the Unicode line breaking 145 // property of BA or HY and seeing which ones are hyphens. 146 bool Hyphenator::isLineBreakingHyphen(uint32_t c) { 147 return (c == 0x002D || // HYPHEN-MINUS 148 c == 0x058A || // ARMENIAN HYPHEN 149 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 150 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 151 c == 0x2010 || // HYPHEN 152 c == 0x2013 || // EN DASH 153 c == 0x2027 || // HYPHENATION POINT 154 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 155 c == 0x2E40); // DOUBLE HYPHEN 156 } 157 158 EndHyphenEdit editForThisLine(HyphenationType type) { 159 switch (type) { 160 case HyphenationType::BREAK_AND_INSERT_HYPHEN: 161 return EndHyphenEdit::INSERT_HYPHEN; 162 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: 163 return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN; 164 case HyphenationType::BREAK_AND_INSERT_MAQAF: 165 return EndHyphenEdit::INSERT_MAQAF; 166 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN: 167 return EndHyphenEdit::INSERT_UCAS_HYPHEN; 168 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN: 169 return EndHyphenEdit::REPLACE_WITH_HYPHEN; 170 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: 171 return EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN; 172 case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense. 173 default: 174 return EndHyphenEdit::NO_EDIT; 175 } 176 } 177 178 StartHyphenEdit editForNextLine(HyphenationType type) { 179 switch (type) { 180 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: 181 return StartHyphenEdit::INSERT_HYPHEN; 182 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: 183 return StartHyphenEdit::INSERT_ZWJ; 184 case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense. 185 default: 186 return StartHyphenEdit::NO_EDIT; 187 } 188 } 189 190 static UScriptCode getScript(uint32_t codePoint) { 191 UErrorCode errorCode = U_ZERO_ERROR; 192 const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode); 193 if (U_SUCCESS(errorCode)) { 194 return script; 195 } else { 196 return USCRIPT_INVALID_CODE; 197 } 198 } 199 200 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) { 201 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen 202 // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens 203 // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch 204 // to maqaf for Hebrew, we can simply add a condition here. 205 const UScriptCode script = getScript(codePoint); 206 if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL || 207 script == USCRIPT_TELUGU) { 208 // Grantha is not included, since we don't support non-BMP hyphenation yet. 209 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 210 } else if (script == USCRIPT_ARMENIAN) { 211 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; 212 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { 213 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; 214 } else { 215 return HyphenationType::BREAK_AND_INSERT_HYPHEN; 216 } 217 } 218 219 static inline int32_t getJoiningType(UChar32 codepoint) { 220 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); 221 } 222 223 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN. 224 // This function decides if the letters before and after the hyphen should appear as joining. 225 static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, size_t location) { 226 ssize_t i = location; 227 int32_t type = U_JT_NON_JOINING; 228 while (static_cast<size_t>(i) < word.size() && 229 (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { 230 i++; 231 } 232 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) { 233 // The next character is of the type that may join the last character. See if the last 234 // character is also of the right type. 235 i = location - 2; // Skip the soft hyphen 236 type = U_JT_NON_JOINING; 237 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { 238 i--; 239 } 240 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) { 241 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ; 242 } 243 } 244 return HyphenationType::BREAK_AND_INSERT_HYPHEN; 245 } 246 247 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words 248 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections 249 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen). 250 void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const { 251 out[0] = HyphenationType::DONT_BREAK; 252 for (size_t i = 1; i < word.size(); i++) { 253 const uint16_t prevChar = word[i - 1]; 254 if (i > 1 && isLineBreakingHyphen(prevChar)) { 255 // Break after hyphens, but only if they don't start the word. 256 257 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) && 258 (mHyphenationLocale == HyphenationLocale::POLISH || 259 mHyphenationLocale == HyphenationLocale::SLOVENIAN) && 260 getScript(word[i]) == USCRIPT_LATIN) { 261 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe, 262 // we will do this only if the next character is Latin. 263 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; 264 } else { 265 out[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 266 } 267 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) { 268 // Break after soft hyphens, but only if they don't start the word (a soft hyphen 269 // starting the word doesn't give any useful break opportunities). The type of the break 270 // is based on the script of the character we break on. 271 if (getScript(word[i]) == USCRIPT_ARABIC) { 272 // For Arabic, we need to look and see if the characters around the soft hyphen 273 // actually join. If they don't, we'll just insert a normal hyphen. 274 out[i] = getHyphTypeForArabic(word, i); 275 } else { 276 out[i] = hyphenationTypeBasedOnScript(word[i]); 277 } 278 } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix && 279 ((word[i - 2] == 'l' && word[i] == 'l') || 280 (word[i - 2] == 'L' && word[i] == 'L')) && 281 mHyphenationLocale == HyphenationLocale::CATALAN) { 282 // In Catalan, "ll" should break as "l-" on the first line 283 // and "l" on the next line. 284 out[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN; 285 } else { 286 out[i] = HyphenationType::DONT_BREAK; 287 } 288 } 289 } 290 291 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, 292 const U16StringPiece& word) const { 293 const Header* header = getHeader(); 294 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN; 295 // TODO: check header magic 296 uint32_t alphabetVersion = header->alphabetVersion(); 297 if (alphabetVersion == 0) { 298 const AlphabetTable0* alphabet = header->alphabetTable0(); 299 uint32_t min_codepoint = alphabet->min_codepoint; 300 uint32_t max_codepoint = alphabet->max_codepoint; 301 alpha_codes[0] = 0; // word start 302 for (size_t i = 0; i < word.size(); i++) { 303 uint16_t c = word[i]; 304 if (c < min_codepoint || c >= max_codepoint) { 305 return HyphenationType::DONT_BREAK; 306 } 307 uint8_t code = alphabet->data[c - min_codepoint]; 308 if (code == 0) { 309 return HyphenationType::DONT_BREAK; 310 } 311 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 312 result = hyphenationTypeBasedOnScript(c); 313 } 314 alpha_codes[i + 1] = code; 315 } 316 alpha_codes[word.size() + 1] = 0; // word termination 317 return result; 318 } else if (alphabetVersion == 1) { 319 const AlphabetTable1* alphabet = header->alphabetTable1(); 320 size_t n_entries = alphabet->n_entries; 321 const uint32_t* begin = alphabet->data; 322 const uint32_t* end = begin + n_entries; 323 alpha_codes[0] = 0; 324 for (size_t i = 0; i < word.size(); i++) { 325 uint16_t c = word[i]; 326 auto p = std::lower_bound(begin, end, c << 11); 327 if (p == end) { 328 return HyphenationType::DONT_BREAK; 329 } 330 uint32_t entry = *p; 331 if (AlphabetTable1::codepoint(entry) != c) { 332 return HyphenationType::DONT_BREAK; 333 } 334 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 335 result = hyphenationTypeBasedOnScript(c); 336 } 337 alpha_codes[i + 1] = AlphabetTable1::value(entry); 338 } 339 alpha_codes[word.size() + 1] = 0; 340 return result; 341 } 342 return HyphenationType::DONT_BREAK; 343 } 344 345 /** 346 * Internal implementation, after conversion to codes. All case folding and normalization 347 * has been done by now, and all characters have been found in the alphabet. 348 * Note: len here is the padded length including 0 codes at start and end. 349 **/ 350 void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue, 351 HyphenationType* out) const { 352 static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t."); 353 // Reuse the result array as a buffer for calculating intermediate hyphenation numbers. 354 uint8_t* buffer = reinterpret_cast<uint8_t*>(out); 355 356 const Header* header = getHeader(); 357 const Trie* trie = header->trieTable(); 358 const Pattern* pattern = header->patternTable(); 359 uint32_t char_mask = trie->char_mask; 360 uint32_t link_shift = trie->link_shift; 361 uint32_t link_mask = trie->link_mask; 362 uint32_t pattern_shift = trie->pattern_shift; 363 size_t maxOffset = len - mMinSuffix - 1; 364 for (size_t i = 0; i < len - 1; i++) { 365 uint32_t node = 0; // index into Trie table 366 for (size_t j = i; j < len; j++) { 367 uint16_t c = codes[j]; 368 uint32_t entry = trie->data[node + c]; 369 if ((entry & char_mask) == c) { 370 node = (entry & link_mask) >> link_shift; 371 } else { 372 break; 373 } 374 uint32_t pat_ix = trie->data[node] >> pattern_shift; 375 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset 376 // into the buf pool. This is the pattern for the substring (i..j) we just matched, 377 // which we combine (via point-wise max) into the buffer vector. 378 if (pat_ix != 0) { 379 uint32_t pat_entry = pattern->data[pat_ix]; 380 int pat_len = Pattern::len(pat_entry); 381 int pat_shift = Pattern::shift(pat_entry); 382 const uint8_t* pat_buf = pattern->buf(pat_entry); 383 int offset = j + 1 - (pat_len + pat_shift); 384 // offset is the index within buffer that lines up with the start of pat_buf 385 int start = std::max((int)mMinPrefix - offset, 0); 386 int end = std::min(pat_len, (int)maxOffset - offset); 387 for (int k = start; k < end; k++) { 388 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]); 389 } 390 } 391 } 392 } 393 // Since the above calculation does not modify values outside 394 // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK. 395 for (size_t i = mMinPrefix; i < maxOffset; i++) { 396 // Hyphenation opportunities happen when the hyphenation numbers are odd. 397 out[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK; 398 } 399 } 400 401 } // namespace minikin 402