1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <vector> 18 #include <memory> 19 #include <algorithm> 20 #include <string> 21 #include <unicode/uchar.h> 22 #include <unicode/uscript.h> 23 24 // HACK: for reading pattern file 25 #include <fcntl.h> 26 27 #define LOG_TAG "Minikin" 28 #include "utils/Log.h" 29 30 #include "minikin/Hyphenator.h" 31 32 using std::vector; 33 34 namespace minikin { 35 36 static const uint16_t CHAR_HYPHEN_MINUS = 0x002D; 37 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; 38 static const uint16_t CHAR_MIDDLE_DOT = 0x00B7; 39 static const uint16_t CHAR_HYPHEN = 0x2010; 40 41 // The following are structs that correspond to tables inside the hyb file format 42 43 struct AlphabetTable0 { 44 uint32_t version; 45 uint32_t min_codepoint; 46 uint32_t max_codepoint; 47 uint8_t data[1]; // actually flexible array, size is known at runtime 48 }; 49 50 struct AlphabetTable1 { 51 uint32_t version; 52 uint32_t n_entries; 53 uint32_t data[1]; // actually flexible array, size is known at runtime 54 55 static uint32_t codepoint(uint32_t entry) { return entry >> 11; } 56 static uint32_t value(uint32_t entry) { return entry & 0x7ff; } 57 }; 58 59 struct Trie { 60 uint32_t version; 61 uint32_t char_mask; 62 uint32_t link_shift; 63 uint32_t link_mask; 64 uint32_t pattern_shift; 65 uint32_t n_entries; 66 uint32_t data[1]; // actually flexible array, size is known at runtime 67 }; 68 69 struct Pattern { 70 uint32_t version; 71 uint32_t n_entries; 72 uint32_t pattern_offset; 73 uint32_t pattern_size; 74 uint32_t data[1]; // actually flexible array, size is known at runtime 75 76 // accessors 77 static uint32_t len(uint32_t entry) { return entry >> 26; } 78 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } 79 const uint8_t* buf(uint32_t entry) const { 80 return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff); 81 } 82 }; 83 84 struct Header { 85 uint32_t magic; 86 uint32_t version; 87 uint32_t alphabet_offset; 88 uint32_t trie_offset; 89 uint32_t pattern_offset; 90 uint32_t file_size; 91 92 // accessors 93 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); } 94 uint32_t alphabetVersion() const { 95 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); 96 } 97 const AlphabetTable0* alphabetTable0() const { 98 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); 99 } 100 const AlphabetTable1* alphabetTable1() const { 101 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); 102 } 103 const Trie* trieTable() const { 104 return reinterpret_cast<const Trie*>(bytes() + trie_offset); 105 } 106 const Pattern* patternTable() const { 107 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); 108 } 109 }; 110 111 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix) { 112 Hyphenator* result = new Hyphenator; 113 result->patternData = patternData; 114 result->minPrefix = minPrefix; 115 result->minSuffix = minSuffix; 116 return result; 117 } 118 119 void Hyphenator::hyphenate(vector<HyphenationType>* result, const uint16_t* word, size_t len, 120 const icu::Locale& locale) { 121 result->clear(); 122 result->resize(len); 123 const size_t paddedLen = len + 2; // start and stop code each count for 1 124 if (patternData != nullptr && 125 len >= minPrefix + minSuffix && paddedLen <= MAX_HYPHENATED_SIZE) { 126 uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; 127 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len); 128 if (hyphenValue != HyphenationType::DONT_BREAK) { 129 hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue); 130 return; 131 } 132 // TODO: try NFC normalization 133 // TODO: handle non-BMP Unicode (requires remapping of offsets) 134 } 135 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the 136 // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup 137 // would return DONT_BREAK. 138 hyphenateWithNoPatterns(result->data(), word, len, locale); 139 } 140 141 // This function determines whether a character is like U+2010 HYPHEN in 142 // line breaking and usage: a character immediately after which line breaks 143 // are allowed, but words containing it should not be automatically 144 // hyphenated using patterns. This is a curated set, created by manually 145 // inspecting all the characters that have the Unicode line breaking 146 // property of BA or HY and seeing which ones are hyphens. 147 bool Hyphenator::isLineBreakingHyphen(uint32_t c) { 148 return (c == 0x002D || // HYPHEN-MINUS 149 c == 0x058A || // ARMENIAN HYPHEN 150 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 151 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 152 c == 0x2010 || // HYPHEN 153 c == 0x2013 || // EN DASH 154 c == 0x2027 || // HYPHENATION POINT 155 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 156 c == 0x2E40); // DOUBLE HYPHEN 157 } 158 159 const static uint32_t HYPHEN_STR[] = {0x2010, 0}; 160 const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0}; 161 const static uint32_t MAQAF_STR[] = {0x05BE, 0}; 162 const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0}; 163 const static uint32_t ZWJ_STR[] = {0x200D, 0}; 164 const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0}; 165 166 const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) { 167 switch (hyph) { 168 case INSERT_HYPHEN_AT_END: 169 case REPLACE_WITH_HYPHEN_AT_END: 170 case INSERT_HYPHEN_AT_START: 171 return HYPHEN_STR; 172 case INSERT_ARMENIAN_HYPHEN_AT_END: 173 return ARMENIAN_HYPHEN_STR; 174 case INSERT_MAQAF_AT_END: 175 return MAQAF_STR; 176 case INSERT_UCAS_HYPHEN_AT_END: 177 return UCAS_HYPHEN_STR; 178 case INSERT_ZWJ_AND_HYPHEN_AT_END: 179 return ZWJ_AND_HYPHEN_STR; 180 case INSERT_ZWJ_AT_START: 181 return ZWJ_STR; 182 default: 183 return nullptr; 184 } 185 } 186 187 uint32_t HyphenEdit::editForThisLine(HyphenationType type) { 188 switch (type) { 189 case HyphenationType::DONT_BREAK: 190 return NO_EDIT; 191 case HyphenationType::BREAK_AND_INSERT_HYPHEN: 192 return INSERT_HYPHEN_AT_END; 193 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: 194 return INSERT_ARMENIAN_HYPHEN_AT_END; 195 case HyphenationType::BREAK_AND_INSERT_MAQAF: 196 return INSERT_MAQAF_AT_END; 197 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN: 198 return INSERT_UCAS_HYPHEN_AT_END; 199 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN: 200 return REPLACE_WITH_HYPHEN_AT_END; 201 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: 202 return INSERT_ZWJ_AND_HYPHEN_AT_END; 203 default: 204 return BREAK_AT_END; 205 } 206 } 207 208 uint32_t HyphenEdit::editForNextLine(HyphenationType type) { 209 switch (type) { 210 case HyphenationType::DONT_BREAK: 211 return NO_EDIT; 212 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: 213 return INSERT_HYPHEN_AT_START; 214 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: 215 return INSERT_ZWJ_AT_START; 216 default: 217 return BREAK_AT_START; 218 } 219 } 220 221 static UScriptCode getScript(uint32_t codePoint) { 222 UErrorCode errorCode = U_ZERO_ERROR; 223 const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode); 224 if (U_SUCCESS(errorCode)) { 225 return script; 226 } else { 227 return USCRIPT_INVALID_CODE; 228 } 229 } 230 231 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) { 232 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen 233 // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens 234 // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch 235 // to maqaf for Hebrew, we can simply add a condition here. 236 const UScriptCode script = getScript(codePoint); 237 if (script == USCRIPT_KANNADA 238 || script == USCRIPT_MALAYALAM 239 || script == USCRIPT_TAMIL 240 || script == USCRIPT_TELUGU) { 241 // Grantha is not included, since we don't support non-BMP hyphenation yet. 242 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 243 } else if (script == USCRIPT_ARMENIAN) { 244 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; 245 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { 246 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; 247 } else { 248 return HyphenationType::BREAK_AND_INSERT_HYPHEN; 249 } 250 } 251 252 static inline int32_t getJoiningType(UChar32 codepoint) { 253 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); 254 } 255 256 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN. 257 // This function decides if the letters before and after the hyphen should appear as joining. 258 static inline HyphenationType getHyphTypeForArabic(const uint16_t* word, size_t len, 259 size_t location) { 260 ssize_t i = location; 261 int32_t type = U_JT_NON_JOINING; 262 while (static_cast<size_t>(i) < len && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { 263 i++; 264 } 265 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) { 266 // The next character is of the type that may join the last character. See if the last 267 // character is also of the right type. 268 i = location - 2; // Skip the soft hyphen 269 type = U_JT_NON_JOINING; 270 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { 271 i--; 272 } 273 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) { 274 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ; 275 } 276 } 277 return HyphenationType::BREAK_AND_INSERT_HYPHEN; 278 } 279 280 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words 281 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections 282 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen). 283 void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len, 284 const icu::Locale& locale) { 285 result[0] = HyphenationType::DONT_BREAK; 286 for (size_t i = 1; i < len; i++) { 287 const uint16_t prevChar = word[i - 1]; 288 if (i > 1 && isLineBreakingHyphen(prevChar)) { 289 // Break after hyphens, but only if they don't start the word. 290 291 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) 292 && strcmp(locale.getLanguage(), "pl") == 0 293 && getScript(word[i]) == USCRIPT_LATIN ) { 294 // In Polish, hyphens get repeated at the next line. To be safe, 295 // we will do this only if the next character is Latin. 296 result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; 297 } else { 298 result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 299 } 300 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) { 301 // Break after soft hyphens, but only if they don't start the word (a soft hyphen 302 // starting the word doesn't give any useful break opportunities). The type of the break 303 // is based on the script of the character we break on. 304 if (getScript(word[i]) == USCRIPT_ARABIC) { 305 // For Arabic, we need to look and see if the characters around the soft hyphen 306 // actually join. If they don't, we'll just insert a normal hyphen. 307 result[i] = getHyphTypeForArabic(word, len, i); 308 } else { 309 result[i] = hyphenationTypeBasedOnScript(word[i]); 310 } 311 } else if (prevChar == CHAR_MIDDLE_DOT 312 && minPrefix < i && i <= len - minSuffix 313 && ((word[i - 2] == 'l' && word[i] == 'l') 314 || (word[i - 2] == 'L' && word[i] == 'L')) 315 && strcmp(locale.getLanguage(), "ca") == 0) { 316 // In Catalan, "ll" should break as "l-" on the first line 317 // and "l" on the next line. 318 result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN; 319 } else { 320 result[i] = HyphenationType::DONT_BREAK; 321 } 322 } 323 } 324 325 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, 326 size_t len) { 327 const Header* header = getHeader(); 328 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN; 329 // TODO: check header magic 330 uint32_t alphabetVersion = header->alphabetVersion(); 331 if (alphabetVersion == 0) { 332 const AlphabetTable0* alphabet = header->alphabetTable0(); 333 uint32_t min_codepoint = alphabet->min_codepoint; 334 uint32_t max_codepoint = alphabet->max_codepoint; 335 alpha_codes[0] = 0; // word start 336 for (size_t i = 0; i < len; i++) { 337 uint16_t c = word[i]; 338 if (c < min_codepoint || c >= max_codepoint) { 339 return HyphenationType::DONT_BREAK; 340 } 341 uint8_t code = alphabet->data[c - min_codepoint]; 342 if (code == 0) { 343 return HyphenationType::DONT_BREAK; 344 } 345 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 346 result = hyphenationTypeBasedOnScript(c); 347 } 348 alpha_codes[i + 1] = code; 349 } 350 alpha_codes[len + 1] = 0; // word termination 351 return result; 352 } else if (alphabetVersion == 1) { 353 const AlphabetTable1* alphabet = header->alphabetTable1(); 354 size_t n_entries = alphabet->n_entries; 355 const uint32_t* begin = alphabet->data; 356 const uint32_t* end = begin + n_entries; 357 alpha_codes[0] = 0; 358 for (size_t i = 0; i < len; i++) { 359 uint16_t c = word[i]; 360 auto p = std::lower_bound(begin, end, c << 11); 361 if (p == end) { 362 return HyphenationType::DONT_BREAK; 363 } 364 uint32_t entry = *p; 365 if (AlphabetTable1::codepoint(entry) != c) { 366 return HyphenationType::DONT_BREAK; 367 } 368 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 369 result = hyphenationTypeBasedOnScript(c); 370 } 371 alpha_codes[i + 1] = AlphabetTable1::value(entry); 372 } 373 alpha_codes[len + 1] = 0; 374 return result; 375 } 376 return HyphenationType::DONT_BREAK; 377 } 378 379 /** 380 * Internal implementation, after conversion to codes. All case folding and normalization 381 * has been done by now, and all characters have been found in the alphabet. 382 * Note: len here is the padded length including 0 codes at start and end. 383 **/ 384 void Hyphenator::hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len, 385 HyphenationType hyphenValue) { 386 static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t."); 387 // Reuse the result array as a buffer for calculating intermediate hyphenation numbers. 388 uint8_t* buffer = reinterpret_cast<uint8_t*>(result); 389 390 const Header* header = getHeader(); 391 const Trie* trie = header->trieTable(); 392 const Pattern* pattern = header->patternTable(); 393 uint32_t char_mask = trie->char_mask; 394 uint32_t link_shift = trie->link_shift; 395 uint32_t link_mask = trie->link_mask; 396 uint32_t pattern_shift = trie->pattern_shift; 397 size_t maxOffset = len - minSuffix - 1; 398 for (size_t i = 0; i < len - 1; i++) { 399 uint32_t node = 0; // index into Trie table 400 for (size_t j = i; j < len; j++) { 401 uint16_t c = codes[j]; 402 uint32_t entry = trie->data[node + c]; 403 if ((entry & char_mask) == c) { 404 node = (entry & link_mask) >> link_shift; 405 } else { 406 break; 407 } 408 uint32_t pat_ix = trie->data[node] >> pattern_shift; 409 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset 410 // into the buf pool. This is the pattern for the substring (i..j) we just matched, 411 // which we combine (via point-wise max) into the buffer vector. 412 if (pat_ix != 0) { 413 uint32_t pat_entry = pattern->data[pat_ix]; 414 int pat_len = Pattern::len(pat_entry); 415 int pat_shift = Pattern::shift(pat_entry); 416 const uint8_t* pat_buf = pattern->buf(pat_entry); 417 int offset = j + 1 - (pat_len + pat_shift); 418 // offset is the index within buffer that lines up with the start of pat_buf 419 int start = std::max((int)minPrefix - offset, 0); 420 int end = std::min(pat_len, (int)maxOffset - offset); 421 for (int k = start; k < end; k++) { 422 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]); 423 } 424 } 425 } 426 } 427 // Since the above calculation does not modify values outside 428 // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK. 429 for (size_t i = minPrefix; i < maxOffset; i++) { 430 // Hyphenation opportunities happen when the hyphenation numbers are odd. 431 result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK; 432 } 433 } 434 435 } // namespace minikin 436