/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "dictionarydata.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

// Constructor.
// breakTypes is a bit set: bit n is set if this engine should be applied
// for break iterators of type n (UBreakIteratorType values 0..31).
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

// Returns TRUE if this engine handles character c for the given break type,
// i.e. the break type's bit is set in fTypes and c is in the engine's
// character set (fSet, populated by setCharacters()).
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}

// Find dictionary-based breaks within [startPos, endPos).
// On entry the text iterator is positioned somewhere inside the run of
// characters belonging to fSet; this function first scans (forward, or
// backward if 'reverse') to find the extent of that run, then hands the
// resulting [rangeStart, rangeEnd) to the subclass's
// divideUpDictionaryRange(), which pushes break positions onto foundBreaks.
// Returns the number of breaks found.
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    // The span to break begins at the current position in the text, and
    // extends towards the start or end of the text, depending on 'reverse'.

    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        // Walk backward while characters remain in the dictionary set.
        UBool isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        // If the loop stopped on a non-dictionary character, the range begins
        // one code point after it; otherwise it begins at 'current' (clamped
        // to startPos).
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Walk forward while characters remain in the dictionary set.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Only invoke the dictionary segmentation if this engine applies to the
    // requested break type (same bit test as handles()).
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        // Restore the iterator to the end of the scanned run; the subclass may
        // have left it anywhere inside the range.
        utext_setNativeIndex(text, current);
    }

    return result;
}

// Set (copy) the character set that defines which characters this engine
// handles. Called by subclass constructors.
void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*
 ******************************************************************
 * PossibleWord
 */

// Helper class for improving readability of the Thai/Lao/Khmer word break
// algorithm.  The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
#define POSSIBLE_WORD_LIST_MAX 20

class PossibleWord {
 private:
    // list of word candidate lengths, in increasing length order
    int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
    int32_t   count;      // Count of candidates
    int32_t   prefix;     // The longest match with a dictionary word
    int32_t   offset;     // Offset in the text of these candidates
    int       mark;       // The preferred candidate's offset
    int       current;    // The candidate we're currently looking at

 public:
    PossibleWord();
    ~PossibleWord();

    // Fill the list of candidates if needed, select the longest, and return the number found
    int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t   acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool     backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    int32_t   longestPrefix();

    // Mark the current candidate as the one we like
    void      markCurrent();
};

inline
PossibleWord::PossibleWord() {
    // offset == -1 marks the candidate list as invalid/unfilled; candidates()
    // refills it the first time it is called at any new text position.
    offset = -1;
}

inline
PossibleWord::~PossibleWord() {
}

// Populate (if the text position changed) and select the longest candidate.
// Leaves the text iterator positioned after the longest candidate word, or
// unchanged if no candidate was found. Returns the candidate count.
inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        // matches() fills 'lengths' and 'count' (out-parameter) and returns
        // the longest dictionary prefix length.
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

// Accept the marked candidate: position the text just past it and return its length.
inline int32_t
PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + lengths[mark]);
    return lengths[mark];
}

// Step down to the next-shorter candidate, positioning the text after it.
// Returns FALSE when no shorter candidate remains.
inline UBool
PossibleWord::backUp( UText *text ) {
    if (current > 0) {
        utext_setNativeIndex(text, offset + lengths[--current]);
        return TRUE;
    }
    return FALSE;
}

inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}

inline void
PossibleWord::markCurrent() {
    mark = current;
}

/*
 ******************************************************************
 * ThaiBreakEngine
 */

// How many words in a row are "good enough"?
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Elision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)

// Constructor: adopts (takes ownership of) the dictionary matcher and builds
// the Thai character classes used by the segmentation heuristics.
// On failure, 'status' carries the error from UnicodeSet::applyPattern.
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);                   // also treat SPACE as a "mark" for break suppression
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;     // fDictionary was adopted in the constructor
}

// Segment the Thai text in [rangeStart, rangeEnd) into words using a
// greedy best-of-three lookahead over dictionary matches, with heuristic
// resynchronization across non-dictionary spans, combining-mark absorption,
// and PAIYANNOI/MAIYAMOK suffix handling. Break positions (word end offsets)
// are pushed onto foundBreaks; returns the number of words found.
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Circular buffer of candidate sets, indexed modulo THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): peeki() on an empty stack is assumed benign here (it is only
    // reached after the loop above may have pushed breaks) — verify against UStack.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * LaoBreakEngine
 */

// How many words in a row are "good enough"?
#define LAO_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define LAO_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define LAO_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define LAO_MIN_WORD 2

// Minimum number of characters for two words
#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)

// Constructor: adopts (takes ownership of) the dictionary matcher and builds
// the Lao character classes used by the segmentation heuristics.
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);                   // also treat SPACE as a "mark" for break suppression
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}

LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;     // fDictionary was adopted in the constructor
}

// Segment the Lao text in [rangeStart, rangeEnd) into words. Structurally the
// same greedy best-of-three lookahead algorithm as the Thai engine, minus the
// suffix (PAIYANNOI/MAIYAMOK) handling, which does not apply to Lao.
// Break positions are pushed onto foundBreaks; returns the word count.
int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Circular buffer of candidate sets, indexed modulo LAO_LOOKAHEAD.
    PossibleWord words[LAO_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * KhmerBreakEngine
 */

// How many words in a row are "good enough"?
#define KHMER_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define KHMER_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define KHMER_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define KHMER_MIN_WORD 2

// Minimum number of characters for two words
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)

// Constructor: adopts (takes ownership of) the dictionary matcher and builds
// the Khmer character classes used by the segmentation heuristics. The
// commented-out lines preserve the Thai-derived tuning history of this engine.
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);                   // also treat SPACE as a "mark" for break suppression
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}

KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;     // fDictionary was adopted in the constructor
}

// Segment the Khmer text in [rangeStart, rangeEnd) into words. Structurally
// the same greedy best-of-three lookahead algorithm as the Thai engine; the
// suffix-handling section is retained below as commented-out code only.
// Break positions are pushed onto foundBreaks; returns the word count.
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Circular buffer of candidate sets, indexed modulo KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

#if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;

// Constructor: adopts (takes ownership of) the dictionary matcher.
// 'type' selects between the Korean (Hangul-only) character set and the
// combined Chinese/Japanese (Han + Katakana + Hiragana) character set.
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}

CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;     // fDictionary was adopted in the constructor
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Cost (penalty) assigned to a run of katakana of the given length when no
// dictionary word covers it; lengths beyond kMaxKatakanaLength get the
// maximum cost 8192.
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

// TRUE if the UTF-16 code unit is katakana (fullwidth excluding the middle
// dot U+30FB, or halfwidth forms).
static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
            (value >= 0xFF66u && value <= 0xFF9fu);
}

// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange.
// Uses a stack array of N elements, falling back to uprv_malloc for larger
// sizes. NOTE(review): copy constructor/assignment are implicitly generated
// and would double-free a heap buffer — instances must not be copied.
template<class T, size_t N>
class AutoBuffer {
 public:
    AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
        if (size > N) {
            buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
            capacity = size;
        }
    }
    ~AutoBuffer() {
        if (buffer != stackBuffer)
            uprv_free(buffer);
    }

    T* elems() {
        return buffer;
    }

    const T& operator[] (size_t i) const {
        return buffer[i];
    }

    T& operator[] (size_t i) {
        return buffer[i];
    }

    // resize without copy: contents are NOT preserved across a grow.
    void resize(size_t size) {
        if (size <= capacity)
            return;
        if (buffer != stackBuffer)
            uprv_free(buffer);
        buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
        capacity = size;
    }

 private:
    T stackBuffer[N];
    T* buffer;
    AutoBuffer();
    size_t capacity;
};


/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
// NOTE: this function continues beyond the end of the visible chunk; only the
// setup portion (extraction, NFKC normalization, position mapping, and the
// start of the dynamic-programming segmentation) is shown here.
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already normalized: charPositions is a direct code-point index map.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        // Normalize, and build the map from normalized code points back to
        // raw input indices via the incremental Normalizer iterator.
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;       // "infinity": position i not yet reachable
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        if (bestSnlp[i] == kuint32max)
            continue;                   // position i unreachable; skip

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
1066 if((count == 0 || lengths[0] != 1) && 1067 !fHangulWordSet.contains(utext_current32(&normalizedText))) { 1068 values[count] = maxSnlp; 1069 lengths[count++] = 1; 1070 } 1071 1072 for (int j = 0; j < count; j++) { 1073 uint32_t newSnlp = bestSnlp[i] + values[j]; 1074 if (newSnlp < bestSnlp[lengths[j] + i]) { 1075 bestSnlp[lengths[j] + i] = newSnlp; 1076 prev[lengths[j] + i] = i; 1077 } 1078 } 1079 1080 // In Japanese, 1081 // Katakana word in single character is pretty rare. So we apply 1082 // the following heuristic to Katakana: any continuous run of Katakana 1083 // characters is considered a candidate word with a default cost 1084 // specified in the katakanaCost table according to its length. 1085 //utext_setNativeIndex(text, rangeStart + i); 1086 utext_setNativeIndex(&normalizedText, i); 1087 bool is_katakana = isKatakana(utext_current32(&normalizedText)); 1088 if (!is_prev_katakana && is_katakana) { 1089 int j = i + 1; 1090 utext_next32(&normalizedText); 1091 // Find the end of the continuous run of Katakana characters 1092 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && 1093 isKatakana(utext_current32(&normalizedText))) { 1094 utext_next32(&normalizedText); 1095 ++j; 1096 } 1097 if ((j - i) < kMaxKatakanaGroupLength) { 1098 uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 1099 if (newSnlp < bestSnlp[j]) { 1100 bestSnlp[j] = newSnlp; 1101 prev[j] = i; 1102 } 1103 } 1104 } 1105 is_prev_katakana = is_katakana; 1106 } 1107 1108 // Start pushing the optimal offset index into t_boundary (t for tentative). 1109 // prev[numChars] is guaranteed to be meaningful. 1110 // We'll first push in the reverse order, i.e., 1111 // t_boundary[0] = numChars, and afterwards do a swap. 1112 // TODO: Replace by UVector32. 
1113 AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); 1114 1115 int numBreaks = 0; 1116 // No segmentation found, set boundary to end of range 1117 if (bestSnlp[numChars] == kuint32max) { 1118 t_boundary[numBreaks++] = numChars; 1119 } else { 1120 for (int i = numChars; i > 0; i = prev[i]) { 1121 t_boundary[numBreaks++] = i; 1122 } 1123 U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0); 1124 } 1125 1126 // Reverse offset index in t_boundary. 1127 // Don't add a break for the start of the dictionary range if there is one 1128 // there already. 1129 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 1130 t_boundary[numBreaks++] = 0; 1131 } 1132 1133 // Now that we're done, convert positions in t_bdry[] (indices in 1134 // the normalized input string) back to indices in the raw input string 1135 // while reversing t_bdry and pushing values to foundBreaks. 1136 for (int i = numBreaks-1; i >= 0; i--) { 1137 foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); 1138 } 1139 1140 utext_close(&normalizedText); 1141 return numBreaks; 1142 } 1143 #endif 1144 1145 U_NAMESPACE_END 1146 1147 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1148 1149