1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 1999-2016 International Business Machines Corporation 6 * and others. All rights reserved. 7 *************************************************************************** 8 */ 9 // 10 // file: rbbi.c Contains the implementation of the rule based break iterator 11 // runtime engine and the API implementation for 12 // class RuleBasedBreakIterator 13 // 14 15 #include "utypeinfo.h" // for 'typeid' to work 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include "unicode/rbbi.h" 22 #include "unicode/schriter.h" 23 #include "unicode/uchriter.h" 24 #include "unicode/udata.h" 25 #include "unicode/uclean.h" 26 #include "rbbidata.h" 27 #include "rbbirb.h" 28 #include "cmemory.h" 29 #include "cstring.h" 30 #include "umutex.h" 31 #include "ucln_cmn.h" 32 #include "brkeng.h" 33 34 #include "uassert.h" 35 #include "uvector.h" 36 37 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 38 #if U_LOCAL_SERVICE_HOOK 39 #include "localsvc.h" 40 #endif 41 42 #ifdef RBBI_DEBUG 43 static UBool fTrace = FALSE; 44 #endif 45 46 U_NAMESPACE_BEGIN 47 48 // The state number of the starting state 49 #define START_STATE 1 50 51 // The state-transition value indicating "stop" 52 #define STOP_STATE 0 53 54 55 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 56 57 58 //======================================================================= 59 // constructors 60 //======================================================================= 61 62 /** 63 * Constructs a RuleBasedBreakIterator that uses the already-created 64 * tables object that is passed in as a parameter. 65 */ 66 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 67 { 68 init(); 69 fData = new RBBIDataWrapper(data, status); // status checked in constructor 70 if (U_FAILURE(status)) {return;} 71 if(fData == 0) { 72 status = U_MEMORY_ALLOCATION_ERROR; 73 return; 74 } 75 } 76 77 // 78 // Construct from precompiled binary rules (tables). This constructor is public API, 79 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 80 // 81 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 82 uint32_t ruleLength, 83 UErrorCode &status) { 84 init(); 85 if (U_FAILURE(status)) { 86 return; 87 } 88 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 89 status = U_ILLEGAL_ARGUMENT_ERROR; 90 return; 91 } 92 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 93 if (data->fLength > ruleLength) { 94 status = U_ILLEGAL_ARGUMENT_ERROR; 95 return; 96 } 97 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 98 if (U_FAILURE(status)) {return;} 99 if(fData == 0) { 100 status = U_MEMORY_ALLOCATION_ERROR; 101 return; 102 } 103 } 104 105 106 //------------------------------------------------------------------------------- 107 // 108 // Constructor from a UDataMemory handle to precompiled break rules 109 // stored in an ICU data file. 110 // 111 //------------------------------------------------------------------------------- 112 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 113 { 114 init(); 115 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 116 if (U_FAILURE(status)) {return;} 117 if(fData == 0) { 118 status = U_MEMORY_ALLOCATION_ERROR; 119 return; 120 } 121 } 122 123 124 125 //------------------------------------------------------------------------------- 126 // 127 // Constructor from a set of rules supplied as a string. 128 // 129 //------------------------------------------------------------------------------- 130 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 131 UParseError &parseError, 132 UErrorCode &status) 133 { 134 init(); 135 if (U_FAILURE(status)) {return;} 136 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 137 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 138 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 139 // creates and returns a complete RBBI. From here, in a constructor, we 140 // can't just return the object created by the builder factory, hence 141 // the assignment of the factory created object to "this". 142 if (U_SUCCESS(status)) { 143 *this = *bi; 144 delete bi; 145 } 146 } 147 148 149 //------------------------------------------------------------------------------- 150 // 151 // Default Constructor. Create an empty shell that can be set up later. 152 // Used when creating a RuleBasedBreakIterator from a set 153 // of rules. 154 //------------------------------------------------------------------------------- 155 RuleBasedBreakIterator::RuleBasedBreakIterator() { 156 init(); 157 } 158 159 160 //------------------------------------------------------------------------------- 161 // 162 // Copy constructor. Will produce a break iterator with the same behavior, 163 // and which iterates over the same text, as the one passed in. 164 // 165 //------------------------------------------------------------------------------- 166 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 167 : BreakIterator(other) 168 { 169 this->init(); 170 *this = other; 171 } 172 173 174 /** 175 * Destructor 176 */ 177 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 178 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 179 // fCharIter was adopted from the outside. 180 delete fCharIter; 181 } 182 fCharIter = NULL; 183 delete fSCharIter; 184 fCharIter = NULL; 185 delete fDCharIter; 186 fDCharIter = NULL; 187 188 utext_close(fText); 189 190 if (fData != NULL) { 191 fData->removeReference(); 192 fData = NULL; 193 } 194 if (fCachedBreakPositions) { 195 uprv_free(fCachedBreakPositions); 196 fCachedBreakPositions = NULL; 197 } 198 if (fLanguageBreakEngines) { 199 delete fLanguageBreakEngines; 200 fLanguageBreakEngines = NULL; 201 } 202 if (fUnhandledBreakEngine) { 203 delete fUnhandledBreakEngine; 204 fUnhandledBreakEngine = NULL; 205 } 206 } 207 208 /** 209 * Assignment operator. Sets this iterator to have the same behavior, 210 * and iterate over the same text, as the one passed in. 211 */ 212 RuleBasedBreakIterator& 213 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 214 if (this == &that) { 215 return *this; 216 } 217 reset(); // Delete break cache information 218 fBreakType = that.fBreakType; 219 if (fLanguageBreakEngines != NULL) { 220 delete fLanguageBreakEngines; 221 fLanguageBreakEngines = NULL; // Just rebuild for now 222 } 223 // TODO: clone fLanguageBreakEngines from "that" 224 UErrorCode status = U_ZERO_ERROR; 225 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 226 227 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 228 delete fCharIter; 229 } 230 fCharIter = NULL; 231 232 if (that.fCharIter != NULL ) { 233 // This is a little bit tricky - it will intially appear that 234 // this->fCharIter is adopted, even if that->fCharIter was 235 // not adopted. That's ok. 236 fCharIter = that.fCharIter->clone(); 237 } 238 239 if (fData != NULL) { 240 fData->removeReference(); 241 fData = NULL; 242 } 243 if (that.fData != NULL) { 244 fData = that.fData->addReference(); 245 } 246 247 return *this; 248 } 249 250 251 252 //----------------------------------------------------------------------------- 253 // 254 // init() Shared initialization routine. Used by all the constructors. 255 // Initializes all fields, leaving the object in a consistent state. 256 // 257 //----------------------------------------------------------------------------- 258 void RuleBasedBreakIterator::init() { 259 UErrorCode status = U_ZERO_ERROR; 260 fText = utext_openUChars(NULL, NULL, 0, &status); 261 fCharIter = NULL; 262 fSCharIter = NULL; 263 fDCharIter = NULL; 264 fData = NULL; 265 fLastRuleStatusIndex = 0; 266 fLastStatusIndexValid = TRUE; 267 fDictionaryCharCount = 0; 268 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 269 // dictionary behavior for Break Iterators that are 270 // built from rules. Even better would be the ability to 271 // declare the type in the rules. 272 273 fCachedBreakPositions = NULL; 274 fLanguageBreakEngines = NULL; 275 fUnhandledBreakEngine = NULL; 276 fNumCachedBreakPositions = 0; 277 fPositionInCache = 0; 278 279 #ifdef RBBI_DEBUG 280 static UBool debugInitDone = FALSE; 281 if (debugInitDone == FALSE) { 282 char *debugEnv = getenv("U_RBBIDEBUG"); 283 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 284 fTrace = TRUE; 285 } 286 debugInitDone = TRUE; 287 } 288 #endif 289 } 290 291 292 293 //----------------------------------------------------------------------------- 294 // 295 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 296 // behavior, and iterating over the same text, as this one. 297 // Virtual function: does the right thing with subclasses. 298 // 299 //----------------------------------------------------------------------------- 300 BreakIterator* 301 RuleBasedBreakIterator::clone(void) const { 302 return new RuleBasedBreakIterator(*this); 303 } 304 305 /** 306 * Equality operator. Returns TRUE if both BreakIterators are of the 307 * same class, have the same behavior, and iterate over the same text. 308 */ 309 UBool 310 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 311 if (typeid(*this) != typeid(that)) { 312 return FALSE; 313 } 314 315 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 316 317 if (!utext_equals(fText, that2.fText)) { 318 // The two break iterators are operating on different text, 319 // or have a different interation position. 320 return FALSE; 321 }; 322 323 // TODO: need a check for when in a dictionary region at different offsets. 324 325 if (that2.fData == fData || 326 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 327 // The two break iterators are using the same rules. 328 return TRUE; 329 } 330 return FALSE; 331 } 332 333 /** 334 * Compute a hash code for this BreakIterator 335 * @return A hash code 336 */ 337 int32_t 338 RuleBasedBreakIterator::hashCode(void) const { 339 int32_t hash = 0; 340 if (fData != NULL) { 341 hash = fData->hashCode(); 342 } 343 return hash; 344 } 345 346 347 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 348 if (U_FAILURE(status)) { 349 return; 350 } 351 reset(); 352 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 353 354 // Set up a dummy CharacterIterator to be returned if anyone 355 // calls getText(). With input from UText, there is no reasonable 356 // way to return a characterIterator over the actual input text. 357 // Return one over an empty string instead - this is the closest 358 // we can come to signaling a failure. 359 // (GetText() is obsolete, this failure is sort of OK) 360 if (fDCharIter == NULL) { 361 static const UChar c = 0; 362 fDCharIter = new UCharCharacterIterator(&c, 0); 363 if (fDCharIter == NULL) { 364 status = U_MEMORY_ALLOCATION_ERROR; 365 return; 366 } 367 } 368 369 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 370 // existing fCharIter was adopted from the outside. Delete it now. 371 delete fCharIter; 372 } 373 fCharIter = fDCharIter; 374 375 this->first(); 376 } 377 378 379 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 380 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 381 return result; 382 } 383 384 385 386 /** 387 * Returns the description used to create this iterator 388 */ 389 const UnicodeString& 390 RuleBasedBreakIterator::getRules() const { 391 if (fData != NULL) { 392 return fData->getRuleSourceString(); 393 } else { 394 static const UnicodeString *s; 395 if (s == NULL) { 396 // TODO: something more elegant here. 397 // perhaps API should return the string by value. 398 // Note: thread unsafe init & leak are semi-ok, better than 399 // what was before. Sould be cleaned up, though. 400 s = new UnicodeString; 401 } 402 return *s; 403 } 404 } 405 406 //======================================================================= 407 // BreakIterator overrides 408 //======================================================================= 409 410 /** 411 * Return a CharacterIterator over the text being analyzed. 412 */ 413 CharacterIterator& 414 RuleBasedBreakIterator::getText() const { 415 return *fCharIter; 416 } 417 418 /** 419 * Set the iterator to analyze a new piece of text. This function resets 420 * the current iteration position to the beginning of the text. 421 * @param newText An iterator over the text to analyze. 422 */ 423 void 424 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 425 // If we are holding a CharacterIterator adopted from a 426 // previous call to this function, delete it now. 427 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 428 delete fCharIter; 429 } 430 431 fCharIter = newText; 432 UErrorCode status = U_ZERO_ERROR; 433 reset(); 434 if (newText==NULL || newText->startIndex() != 0) { 435 // startIndex !=0 wants to be an error, but there's no way to report it. 436 // Make the iterator text be an empty string. 437 fText = utext_openUChars(fText, NULL, 0, &status); 438 } else { 439 fText = utext_openCharacterIterator(fText, newText, &status); 440 } 441 this->first(); 442 } 443 444 /** 445 * Set the iterator to analyze a new piece of text. This function resets 446 * the current iteration position to the beginning of the text. 447 * @param newText An iterator over the text to analyze. 448 */ 449 void 450 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 451 UErrorCode status = U_ZERO_ERROR; 452 reset(); 453 fText = utext_openConstUnicodeString(fText, &newText, &status); 454 455 // Set up a character iterator on the string. 456 // Needed in case someone calls getText(). 457 // Can not, unfortunately, do this lazily on the (probably never) 458 // call to getText(), because getText is const. 459 if (fSCharIter == NULL) { 460 fSCharIter = new StringCharacterIterator(newText); 461 } else { 462 fSCharIter->setText(newText); 463 } 464 465 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 466 // old fCharIter was adopted from the outside. Delete it. 467 delete fCharIter; 468 } 469 fCharIter = fSCharIter; 470 471 this->first(); 472 } 473 474 475 /** 476 * Provide a new UText for the input text. Must reference text with contents identical 477 * to the original. 478 * Intended for use with text data originating in Java (garbage collected) environments 479 * where the data may be moved in memory at arbitrary times. 480 */ 481 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 482 if (U_FAILURE(status)) { 483 return *this; 484 } 485 if (input == NULL) { 486 status = U_ILLEGAL_ARGUMENT_ERROR; 487 return *this; 488 } 489 int64_t pos = utext_getNativeIndex(fText); 490 // Shallow read-only clone of the new UText into the existing input UText 491 fText = utext_clone(fText, input, FALSE, TRUE, &status); 492 if (U_FAILURE(status)) { 493 return *this; 494 } 495 utext_setNativeIndex(fText, pos); 496 if (utext_getNativeIndex(fText) != pos) { 497 // Sanity check. The new input utext is supposed to have the exact same 498 // contents as the old. If we can't set to the same position, it doesn't. 499 // The contents underlying the old utext might be invalid at this point, 500 // so it's not safe to check directly. 501 status = U_ILLEGAL_ARGUMENT_ERROR; 502 } 503 return *this; 504 } 505 506 507 /** 508 * Sets the current iteration position to the beginning of the text, position zero. 509 * @return The new iterator position, which is zero. 510 */ 511 int32_t RuleBasedBreakIterator::first(void) { 512 reset(); 513 fLastRuleStatusIndex = 0; 514 fLastStatusIndexValid = TRUE; 515 //if (fText == NULL) 516 // return BreakIterator::DONE; 517 518 utext_setNativeIndex(fText, 0); 519 return 0; 520 } 521 522 /** 523 * Sets the current iteration position to the end of the text. 524 * @return The text's past-the-end offset. 525 */ 526 int32_t RuleBasedBreakIterator::last(void) { 527 reset(); 528 if (fText == NULL) { 529 fLastRuleStatusIndex = 0; 530 fLastStatusIndexValid = TRUE; 531 return BreakIterator::DONE; 532 } 533 534 fLastStatusIndexValid = FALSE; 535 int32_t pos = (int32_t)utext_nativeLength(fText); 536 utext_setNativeIndex(fText, pos); 537 return pos; 538 } 539 540 /** 541 * Advances the iterator either forward or backward the specified number of steps. 542 * Negative values move backward, and positive values move forward. This is 543 * equivalent to repeatedly calling next() or previous(). 544 * @param n The number of steps to move. The sign indicates the direction 545 * (negative is backwards, and positive is forwards). 546 * @return The character offset of the boundary position n boundaries away from 547 * the current one. 548 */ 549 int32_t RuleBasedBreakIterator::next(int32_t n) { 550 int32_t result = current(); 551 while (n > 0) { 552 result = next(); 553 --n; 554 } 555 while (n < 0) { 556 result = previous(); 557 ++n; 558 } 559 return result; 560 } 561 562 /** 563 * Advances the iterator to the next boundary position. 564 * @return The position of the first boundary after this one. 565 */ 566 int32_t RuleBasedBreakIterator::next(void) { 567 // if we have cached break positions and we're still in the range 568 // covered by them, just move one step forward in the cache 569 if (fCachedBreakPositions != NULL) { 570 if (fPositionInCache < fNumCachedBreakPositions - 1) { 571 ++fPositionInCache; 572 int32_t pos = fCachedBreakPositions[fPositionInCache]; 573 utext_setNativeIndex(fText, pos); 574 return pos; 575 } 576 else { 577 reset(); 578 } 579 } 580 581 int32_t startPos = current(); 582 fDictionaryCharCount = 0; 583 int32_t result = handleNext(fData->fForwardTable); 584 if (fDictionaryCharCount > 0) { 585 result = checkDictionary(startPos, result, FALSE); 586 } 587 return result; 588 } 589 590 /** 591 * Advances the iterator backwards, to the last boundary preceding this one. 592 * @return The position of the last boundary position preceding this one. 593 */ 594 int32_t RuleBasedBreakIterator::previous(void) { 595 int32_t result; 596 int32_t startPos; 597 598 // if we have cached break positions and we're still in the range 599 // covered by them, just move one step backward in the cache 600 if (fCachedBreakPositions != NULL) { 601 if (fPositionInCache > 0) { 602 --fPositionInCache; 603 // If we're at the beginning of the cache, need to reevaluate the 604 // rule status 605 if (fPositionInCache <= 0) { 606 fLastStatusIndexValid = FALSE; 607 } 608 int32_t pos = fCachedBreakPositions[fPositionInCache]; 609 utext_setNativeIndex(fText, pos); 610 return pos; 611 } 612 else { 613 reset(); 614 } 615 } 616 617 // if we're already sitting at the beginning of the text, return DONE 618 if (fText == NULL || (startPos = current()) == 0) { 619 fLastRuleStatusIndex = 0; 620 fLastStatusIndexValid = TRUE; 621 return BreakIterator::DONE; 622 } 623 624 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 625 result = handlePrevious(fData->fReverseTable); 626 if (fDictionaryCharCount > 0) { 627 result = checkDictionary(result, startPos, TRUE); 628 } 629 return result; 630 } 631 632 // old rule syntax 633 // set things up. handlePrevious() will back us up to some valid 634 // break position before the current position (we back our internal 635 // iterator up one step to prevent handlePrevious() from returning 636 // the current position), but not necessarily the last one before 637 // where we started 638 639 int32_t start = current(); 640 641 (void)UTEXT_PREVIOUS32(fText); 642 int32_t lastResult = handlePrevious(fData->fReverseTable); 643 if (lastResult == UBRK_DONE) { 644 lastResult = 0; 645 utext_setNativeIndex(fText, 0); 646 } 647 result = lastResult; 648 int32_t lastTag = 0; 649 UBool breakTagValid = FALSE; 650 651 // iterate forward from the known break position until we pass our 652 // starting point. The last break position before the starting 653 // point is our return value 654 655 for (;;) { 656 result = next(); 657 if (result == BreakIterator::DONE || result >= start) { 658 break; 659 } 660 lastResult = result; 661 lastTag = fLastRuleStatusIndex; 662 breakTagValid = TRUE; 663 } 664 665 // fLastBreakTag wants to have the value for section of text preceding 666 // the result position that we are to return (in lastResult.) If 667 // the backwards rules overshot and the above loop had to do two or more 668 // next()s to move up to the desired return position, we will have a valid 669 // tag value. But, if handlePrevious() took us to exactly the correct result position, 670 // we wont have a tag value for that position, which is only set by handleNext(). 671 672 // Set the current iteration position to be the last break position 673 // before where we started, and then return that value. 674 utext_setNativeIndex(fText, lastResult); 675 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 676 fLastStatusIndexValid = breakTagValid; 677 678 // No need to check the dictionary; it will have been handled by 679 // next() 680 681 return lastResult; 682 } 683 684 /** 685 * Sets the iterator to refer to the first boundary position following 686 * the specified position. 687 * @offset The position from which to begin searching for a break position. 688 * @return The position of the first break after the current position. 689 */ 690 int32_t RuleBasedBreakIterator::following(int32_t offset) { 691 // if the offset passed in is already past the end of the text, 692 // just return DONE; if it's before the beginning, return the 693 // text's starting offset 694 if (fText == NULL || offset >= utext_nativeLength(fText)) { 695 last(); 696 return next(); 697 } 698 else if (offset < 0) { 699 return first(); 700 } 701 702 // Move requested offset to a code point start. It might be on a trail surrogate, 703 // or on a trail byte if the input is UTF-8. 704 utext_setNativeIndex(fText, offset); 705 offset = (int32_t)utext_getNativeIndex(fText); 706 707 // if we have cached break positions and offset is in the range 708 // covered by them, use them 709 // TODO: could use binary search 710 // TODO: what if offset is outside range, but break is not? 711 if (fCachedBreakPositions != NULL) { 712 if (offset >= fCachedBreakPositions[0] 713 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 714 fPositionInCache = 0; 715 // We are guaranteed not to leave the array due to range test above 716 while (offset >= fCachedBreakPositions[fPositionInCache]) { 717 ++fPositionInCache; 718 } 719 int32_t pos = fCachedBreakPositions[fPositionInCache]; 720 utext_setNativeIndex(fText, pos); 721 return pos; 722 } 723 else { 724 reset(); 725 } 726 } 727 728 // Set our internal iteration position (temporarily) 729 // to the position passed in. If this is the _beginning_ position, 730 // then we can just use next() to get our return value 731 732 int32_t result = 0; 733 734 if (fData->fSafeRevTable != NULL) { 735 // new rule syntax 736 utext_setNativeIndex(fText, offset); 737 // move forward one codepoint to prepare for moving back to a 738 // safe point. 739 // this handles offset being between a supplementary character 740 // TODO: is this still needed, with move to code point boundary handled above? 741 (void)UTEXT_NEXT32(fText); 742 // handlePrevious will move most of the time to < 1 boundary away 743 handlePrevious(fData->fSafeRevTable); 744 int32_t result = next(); 745 while (result <= offset) { 746 result = next(); 747 } 748 return result; 749 } 750 if (fData->fSafeFwdTable != NULL) { 751 // backup plan if forward safe table is not available 752 utext_setNativeIndex(fText, offset); 753 (void)UTEXT_PREVIOUS32(fText); 754 // handle next will give result >= offset 755 handleNext(fData->fSafeFwdTable); 756 // previous will give result 0 or 1 boundary away from offset, 757 // most of the time 758 // we have to 759 int32_t oldresult = previous(); 760 while (oldresult > offset) { 761 int32_t result = previous(); 762 if (result <= offset) { 763 return oldresult; 764 } 765 oldresult = result; 766 } 767 int32_t result = next(); 768 if (result <= offset) { 769 return next(); 770 } 771 return result; 772 } 773 // otherwise, we have to sync up first. Use handlePrevious() to back 774 // up to a known break position before the specified position (if 775 // we can determine that the specified position is a break position, 776 // we don't back up at all). This may or may not be the last break 777 // position at or before our starting position. Advance forward 778 // from here until we've passed the starting position. The position 779 // we stop on will be the first break position after the specified one. 780 // old rule syntax 781 782 utext_setNativeIndex(fText, offset); 783 if (offset==0 || 784 (offset==1 && utext_getNativeIndex(fText)==0)) { 785 return next(); 786 } 787 result = previous(); 788 789 while (result != BreakIterator::DONE && result <= offset) { 790 result = next(); 791 } 792 793 return result; 794 } 795 796 /** 797 * Sets the iterator to refer to the last boundary position before the 798 * specified position. 799 * @offset The position to begin searching for a break from. 800 * @return The position of the last boundary before the starting position. 801 */ 802 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 803 // if the offset passed in is already past the end of the text, 804 // just return DONE; if it's before the beginning, return the 805 // text's starting offset 806 if (fText == NULL || offset > utext_nativeLength(fText)) { 807 return last(); 808 } 809 else if (offset < 0) { 810 return first(); 811 } 812 813 // Move requested offset to a code point start. It might be on a trail surrogate, 814 // or on a trail byte if the input is UTF-8. 815 utext_setNativeIndex(fText, offset); 816 offset = (int32_t)utext_getNativeIndex(fText); 817 818 // if we have cached break positions and offset is in the range 819 // covered by them, use them 820 if (fCachedBreakPositions != NULL) { 821 // TODO: binary search? 822 // TODO: What if offset is outside range, but break is not? 823 if (offset > fCachedBreakPositions[0] 824 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 825 fPositionInCache = 0; 826 while (fPositionInCache < fNumCachedBreakPositions 827 && offset > fCachedBreakPositions[fPositionInCache]) 828 ++fPositionInCache; 829 --fPositionInCache; 830 // If we're at the beginning of the cache, need to reevaluate the 831 // rule status 832 if (fPositionInCache <= 0) { 833 fLastStatusIndexValid = FALSE; 834 } 835 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 836 return fCachedBreakPositions[fPositionInCache]; 837 } 838 else { 839 reset(); 840 } 841 } 842 843 // if we start by updating the current iteration position to the 844 // position specified by the caller, we can just use previous() 845 // to carry out this operation 846 847 if (fData->fSafeFwdTable != NULL) { 848 // new rule syntax 849 utext_setNativeIndex(fText, offset); 850 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 851 if (newOffset != offset) { 852 // Will come here if specified offset was not a code point boundary AND 853 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 854 // indices to the containing code point. 855 // For breakitereator::preceding only, these non-code-point indices need to be moved 856 // up to refer to the following codepoint. 857 (void)UTEXT_NEXT32(fText); 858 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 859 } 860 861 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 862 // rather than adjusting the position unconditionally? 863 // (Change would interact with safe rules.) 864 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 865 // affects only preceding(), seems cleaner, but is slightly different. 866 (void)UTEXT_PREVIOUS32(fText); 867 handleNext(fData->fSafeFwdTable); 868 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 869 while (result >= offset) { 870 result = previous(); 871 } 872 return result; 873 } 874 if (fData->fSafeRevTable != NULL) { 875 // backup plan if forward safe table is not available 876 // TODO: check whether this path can be discarded 877 // It's probably OK to say that rules must supply both safe tables 878 // if they use safe tables at all. We have certainly never described 879 // to anyone how to work with just one safe table. 880 utext_setNativeIndex(fText, offset); 881 (void)UTEXT_NEXT32(fText); 882 883 // handle previous will give result <= offset 884 handlePrevious(fData->fSafeRevTable); 885 886 // next will give result 0 or 1 boundary away from offset, 887 // most of the time 888 // we have to 889 int32_t oldresult = next(); 890 while (oldresult < offset) { 891 int32_t result = next(); 892 if (result >= offset) { 893 return oldresult; 894 } 895 oldresult = result; 896 } 897 int32_t result = previous(); 898 if (result >= offset) { 899 return previous(); 900 } 901 return result; 902 } 903 904 // old rule syntax 905 utext_setNativeIndex(fText, offset); 906 return previous(); 907 } 908 909 /** 910 * Returns true if the specfied position is a boundary position. As a side 911 * effect, leaves the iterator pointing to the first boundary position at 912 * or after "offset". 913 * @param offset the offset to check. 914 * @return True if "offset" is a boundary position. 915 */ 916 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 917 // the beginning index of the iterator is always a boundary position by definition 918 if (offset == 0) { 919 first(); // For side effects on current position, tag values. 920 return TRUE; 921 } 922 923 if (offset == (int32_t)utext_nativeLength(fText)) { 924 last(); // For side effects on current position, tag values. 925 return TRUE; 926 } 927 928 // out-of-range indexes are never boundary positions 929 if (offset < 0) { 930 first(); // For side effects on current position, tag values. 931 return FALSE; 932 } 933 934 if (offset > utext_nativeLength(fText)) { 935 last(); // For side effects on current position, tag values. 936 return FALSE; 937 } 938 939 // otherwise, we can use following() on the position before the specified 940 // one and return true if the position we get back is the one the user 941 // specified 942 utext_previous32From(fText, offset); 943 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 944 UBool result = following(backOne) == offset; 945 return result; 946 } 947 948 /** 949 * Returns the current iteration position. 950 * @return The current iteration position. 951 */ 952 int32_t RuleBasedBreakIterator::current(void) const { 953 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 954 return pos; 955 } 956 957 //======================================================================= 958 // implementation 959 //======================================================================= 960 961 // 962 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 963 // of user text. A variable with this enum type keeps track of where we 964 // are. The state machine only fetches user input while in the RUN mode. 965 // 966 enum RBBIRunMode { 967 RBBI_START, // state machine processing is before first char of input 968 RBBI_RUN, // state machine processing is in the user text 969 RBBI_END // state machine processing is after end of user text. 970 }; 971 972 973 // Map from look-ahead break states (corresponds to rules) to boundary positions. 974 // Allows multiple lookahead break rules to be in flight at the same time. 975 // 976 // This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers 977 // in the state table be sequential, then we can just index an array. And the 978 // table could also tell us in advance how big that array needs to be. 979 // 980 // Before ICU 57 there was just a single simple variable for a look-ahead match that 981 // was in progress. Two rules at once did not work. 982 983 static const int32_t kMaxLookaheads = 8; 984 struct LookAheadResults { 985 int32_t fUsedSlotLimit; 986 int32_t fPositions[8]; 987 int16_t fKeys[8]; 988 989 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; 990 991 int32_t getPosition(int16_t key) { 992 for (int32_t i=0; i<fUsedSlotLimit; ++i) { 993 if (fKeys[i] == key) { 994 return fPositions[i]; 995 } 996 } 997 U_ASSERT(FALSE); 998 return -1; 999 } 1000 1001 void setPosition(int16_t key, int32_t position) { 1002 int32_t i; 1003 for (i=0; i<fUsedSlotLimit; ++i) { 1004 if (fKeys[i] == key) { 1005 fPositions[i] = position; 1006 return; 1007 } 1008 } 1009 if (i >= kMaxLookaheads) { 1010 U_ASSERT(FALSE); 1011 i = kMaxLookaheads - 1; 1012 } 1013 fKeys[i] = key; 1014 fPositions[i] = position; 1015 U_ASSERT(fUsedSlotLimit == i); 1016 fUsedSlotLimit = i + 1; 1017 } 1018 }; 1019 1020 1021 //----------------------------------------------------------------------------------- 1022 // 1023 // handleNext(stateTable) 1024 // This method is the actual implementation of the rbbi next() method. 1025 // This method initializes the state machine to state 1 1026 // and advances through the text character by character until we reach the end 1027 // of the text or the state machine transitions to state 0. We update our return 1028 // value every time the state machine passes through an accepting state. 1029 // 1030 //----------------------------------------------------------------------------------- 1031 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 1032 int32_t state; 1033 uint16_t category = 0; 1034 RBBIRunMode mode; 1035 1036 RBBIStateTableRow *row; 1037 UChar32 c; 1038 LookAheadResults lookAheadMatches; 1039 int32_t result = 0; 1040 int32_t initialPosition = 0; 1041 const char *tableData = statetable->fTableData; 1042 uint32_t tableRowLen = statetable->fRowLen; 1043 1044 #ifdef RBBI_DEBUG 1045 if (fTrace) { 1046 RBBIDebugPuts("Handle Next pos char state category"); 1047 } 1048 #endif 1049 1050 // No matter what, handleNext alway correctly sets the break tag value. 1051 fLastStatusIndexValid = TRUE; 1052 fLastRuleStatusIndex = 0; 1053 1054 // if we're already at the end of the text, return DONE. 1055 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1056 result = initialPosition; 1057 c = UTEXT_NEXT32(fText); 1058 if (fData == NULL || c==U_SENTINEL) { 1059 return BreakIterator::DONE; 1060 } 1061 1062 // Set the initial state for the state machine 1063 state = START_STATE; 1064 row = (RBBIStateTableRow *) 1065 //(statetable->fTableData + (statetable->fRowLen * state)); 1066 (tableData + tableRowLen * state); 1067 1068 1069 mode = RBBI_RUN; 1070 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1071 category = 2; 1072 mode = RBBI_START; 1073 } 1074 1075 1076 // loop until we reach the end of the text or transition to state 0 1077 // 1078 for (;;) { 1079 if (c == U_SENTINEL) { 1080 // Reached end of input string. 1081 if (mode == RBBI_END) { 1082 // We have already run the loop one last time with the 1083 // character set to the psueudo {eof} value. Now it is time 1084 // to unconditionally bail out. 1085 break; 1086 } 1087 // Run the loop one last time with the fake end-of-input character category. 1088 mode = RBBI_END; 1089 category = 1; 1090 } 1091 1092 // 1093 // Get the char category. An incoming category of 1 or 2 means that 1094 // we are preset for doing the beginning or end of input, and 1095 // that we shouldn't get a category from an actual text input character. 1096 // 1097 if (mode == RBBI_RUN) { 1098 // look up the current character's character category, which tells us 1099 // which column in the state table to look at. 1100 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1101 // not the size of the character going in, which is a UChar32. 1102 // 1103 UTRIE_GET16(&fData->fTrie, c, category); 1104 1105 // Check the dictionary bit in the character's category. 1106 // Counter is only used by dictionary based iterators (subclasses). 1107 // Chars that need to be handled by a dictionary have a flag bit set 1108 // in their category values. 1109 // 1110 if ((category & 0x4000) != 0) { 1111 fDictionaryCharCount++; 1112 // And off the dictionary flag bit. 1113 category &= ~0x4000; 1114 } 1115 } 1116 1117 #ifdef RBBI_DEBUG 1118 if (fTrace) { 1119 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1120 if (0x20<=c && c<0x7f) { 1121 RBBIDebugPrintf("\"%c\" ", c); 1122 } else { 1123 RBBIDebugPrintf("%5x ", c); 1124 } 1125 RBBIDebugPrintf("%3d %3d\n", state, category); 1126 } 1127 #endif 1128 1129 // State Transition - move machine to its next state 1130 // 1131 1132 // Note: fNextState is defined as uint16_t[2], but we are casting 1133 // a generated RBBI table to RBBIStateTableRow and some tables 1134 // actually have more than 2 categories. 1135 U_ASSERT(category<fData->fHeader->fCatCount); 1136 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1137 row = (RBBIStateTableRow *) 1138 // (statetable->fTableData + (statetable->fRowLen * state)); 1139 (tableData + tableRowLen * state); 1140 1141 1142 if (row->fAccepting == -1) { 1143 // Match found, common case. 1144 if (mode != RBBI_START) { 1145 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1146 } 1147 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1148 } 1149 1150 int16_t completedRule = row->fAccepting; 1151 if (completedRule > 0) { 1152 // Lookahead match is completed. 1153 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); 1154 if (lookaheadResult >= 0) { 1155 fLastRuleStatusIndex = row->fTagIdx; 1156 UTEXT_SETNATIVEINDEX(fText, lookaheadResult); 1157 return lookaheadResult; 1158 } 1159 } 1160 int16_t rule = row->fLookAhead; 1161 if (rule != 0) { 1162 // At the position of a '/' in a look-ahead match. Record it. 1163 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1164 lookAheadMatches.setPosition(rule, pos); 1165 } 1166 1167 if (state == STOP_STATE) { 1168 // This is the normal exit from the lookup state machine. 1169 // We have advanced through the string until it is certain that no 1170 // longer match is possible, no matter what characters follow. 1171 break; 1172 } 1173 1174 // Advance to the next character. 1175 // If this is a beginning-of-input loop iteration, don't advance 1176 // the input position. The next iteration will be processing the 1177 // first real input character. 1178 if (mode == RBBI_RUN) { 1179 c = UTEXT_NEXT32(fText); 1180 } else { 1181 if (mode == RBBI_START) { 1182 mode = RBBI_RUN; 1183 } 1184 } 1185 1186 1187 } 1188 1189 // The state machine is done. Check whether it found a match... 1190 1191 // If the iterator failed to advance in the match engine, force it ahead by one. 1192 // (This really indicates a defect in the break rules. They should always match 1193 // at least one character.) 1194 if (result == initialPosition) { 1195 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1196 UTEXT_NEXT32(fText); 1197 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1198 } 1199 1200 // Leave the iterator at our result position. 1201 UTEXT_SETNATIVEINDEX(fText, result); 1202 #ifdef RBBI_DEBUG 1203 if (fTrace) { 1204 RBBIDebugPrintf("result = %d\n\n", result); 1205 } 1206 #endif 1207 return result; 1208 } 1209 1210 1211 1212 //----------------------------------------------------------------------------------- 1213 // 1214 // handlePrevious() 1215 // 1216 // Iterate backwards, according to the logic of the reverse rules. 1217 // This version handles the exact style backwards rules. 1218 // 1219 // The logic of this function is very similar to handleNext(), above. 1220 // 1221 //----------------------------------------------------------------------------------- 1222 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1223 int32_t state; 1224 uint16_t category = 0; 1225 RBBIRunMode mode; 1226 RBBIStateTableRow *row; 1227 UChar32 c; 1228 LookAheadResults lookAheadMatches; 1229 int32_t result = 0; 1230 int32_t initialPosition = 0; 1231 1232 #ifdef RBBI_DEBUG 1233 if (fTrace) { 1234 RBBIDebugPuts("Handle Previous pos char state category"); 1235 } 1236 #endif 1237 1238 // handlePrevious() never gets the rule status. 1239 // Flag the status as invalid; if the user ever asks for status, we will need 1240 // to back up, then re-find the break position using handleNext(), which does 1241 // get the status value. 1242 fLastStatusIndexValid = FALSE; 1243 fLastRuleStatusIndex = 0; 1244 1245 // if we're already at the start of the text, return DONE. 1246 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1247 return BreakIterator::DONE; 1248 } 1249 1250 // Set up the starting char. 1251 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1252 result = initialPosition; 1253 c = UTEXT_PREVIOUS32(fText); 1254 1255 // Set the initial state for the state machine 1256 state = START_STATE; 1257 row = (RBBIStateTableRow *) 1258 (statetable->fTableData + (statetable->fRowLen * state)); 1259 category = 3; 1260 mode = RBBI_RUN; 1261 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1262 category = 2; 1263 mode = RBBI_START; 1264 } 1265 1266 1267 // loop until we reach the start of the text or transition to state 0 1268 // 1269 for (;;) { 1270 if (c == U_SENTINEL) { 1271 // Reached end of input string. 1272 if (mode == RBBI_END) { 1273 // We have already run the loop one last time with the 1274 // character set to the psueudo {eof} value. Now it is time 1275 // to unconditionally bail out. 1276 if (result == initialPosition) { 1277 // Ran off start, no match found. 1278 // move one index one (towards the start, since we are doing a previous()) 1279 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1280 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1281 } 1282 break; 1283 } 1284 // Run the loop one last time with the fake end-of-input character category. 1285 mode = RBBI_END; 1286 category = 1; 1287 } 1288 1289 // 1290 // Get the char category. An incoming category of 1 or 2 means that 1291 // we are preset for doing the beginning or end of input, and 1292 // that we shouldn't get a category from an actual text input character. 1293 // 1294 if (mode == RBBI_RUN) { 1295 // look up the current character's character category, which tells us 1296 // which column in the state table to look at. 1297 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1298 // not the size of the character going in, which is a UChar32. 1299 // 1300 UTRIE_GET16(&fData->fTrie, c, category); 1301 1302 // Check the dictionary bit in the character's category. 1303 // Counter is only used by dictionary based iterators (subclasses). 1304 // Chars that need to be handled by a dictionary have a flag bit set 1305 // in their category values. 1306 // 1307 if ((category & 0x4000) != 0) { 1308 fDictionaryCharCount++; 1309 // And off the dictionary flag bit. 1310 category &= ~0x4000; 1311 } 1312 } 1313 1314 #ifdef RBBI_DEBUG 1315 if (fTrace) { 1316 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1317 if (0x20<=c && c<0x7f) { 1318 RBBIDebugPrintf("\"%c\" ", c); 1319 } else { 1320 RBBIDebugPrintf("%5x ", c); 1321 } 1322 RBBIDebugPrintf("%3d %3d\n", state, category); 1323 } 1324 #endif 1325 1326 // State Transition - move machine to its next state 1327 // 1328 1329 // Note: fNextState is defined as uint16_t[2], but we are casting 1330 // a generated RBBI table to RBBIStateTableRow and some tables 1331 // actually have more than 2 categories. 1332 U_ASSERT(category<fData->fHeader->fCatCount); 1333 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1334 row = (RBBIStateTableRow *) 1335 (statetable->fTableData + (statetable->fRowLen * state)); 1336 1337 if (row->fAccepting == -1) { 1338 // Match found, common case. 1339 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1340 } 1341 1342 int16_t completedRule = row->fAccepting; 1343 if (completedRule > 0) { 1344 // Lookahead match is completed. 1345 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); 1346 if (lookaheadResult >= 0) { 1347 UTEXT_SETNATIVEINDEX(fText, lookaheadResult); 1348 return lookaheadResult; 1349 } 1350 } 1351 int16_t rule = row->fLookAhead; 1352 if (rule != 0) { 1353 // At the position of a '/' in a look-ahead match. Record it. 1354 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1355 lookAheadMatches.setPosition(rule, pos); 1356 } 1357 1358 if (state == STOP_STATE) { 1359 // This is the normal exit from the lookup state machine. 1360 // We have advanced through the string until it is certain that no 1361 // longer match is possible, no matter what characters follow. 1362 break; 1363 } 1364 1365 // Move (backwards) to the next character to process. 1366 // If this is a beginning-of-input loop iteration, don't advance 1367 // the input position. The next iteration will be processing the 1368 // first real input character. 1369 if (mode == RBBI_RUN) { 1370 c = UTEXT_PREVIOUS32(fText); 1371 } else { 1372 if (mode == RBBI_START) { 1373 mode = RBBI_RUN; 1374 } 1375 } 1376 } 1377 1378 // The state machine is done. Check whether it found a match... 1379 1380 // If the iterator failed to advance in the match engine, force it ahead by one. 1381 // (This really indicates a defect in the break rules. They should always match 1382 // at least one character.) 1383 if (result == initialPosition) { 1384 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1385 UTEXT_PREVIOUS32(fText); 1386 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1387 } 1388 1389 // Leave the iterator at our result position. 1390 UTEXT_SETNATIVEINDEX(fText, result); 1391 #ifdef RBBI_DEBUG 1392 if (fTrace) { 1393 RBBIDebugPrintf("result = %d\n\n", result); 1394 } 1395 #endif 1396 return result; 1397 } 1398 1399 1400 void 1401 RuleBasedBreakIterator::reset() 1402 { 1403 if (fCachedBreakPositions) { 1404 uprv_free(fCachedBreakPositions); 1405 } 1406 fCachedBreakPositions = NULL; 1407 fNumCachedBreakPositions = 0; 1408 fDictionaryCharCount = 0; 1409 fPositionInCache = 0; 1410 } 1411 1412 1413 1414 //------------------------------------------------------------------------------- 1415 // 1416 // getRuleStatus() Return the break rule tag associated with the current 1417 // iterator position. If the iterator arrived at its current 1418 // position by iterating forwards, the value will have been 1419 // cached by the handleNext() function. 1420 // 1421 // If no cached status value is available, the status is 1422 // found by doing a previous() followed by a next(), which 1423 // leaves the iterator where it started, and computes the 1424 // status while doing the next(). 1425 // 1426 //------------------------------------------------------------------------------- 1427 void RuleBasedBreakIterator::makeRuleStatusValid() { 1428 if (fLastStatusIndexValid == FALSE) { 1429 // No cached status is available. 1430 if (fText == NULL || current() == 0) { 1431 // At start of text, or there is no text. Status is always zero. 1432 fLastRuleStatusIndex = 0; 1433 fLastStatusIndexValid = TRUE; 1434 } else { 1435 // Not at start of text. Find status the tedious way. 1436 int32_t pa = current(); 1437 previous(); 1438 if (fNumCachedBreakPositions > 0) { 1439 reset(); // Blow off the dictionary cache 1440 } 1441 int32_t pb = next(); 1442 if (pa != pb) { 1443 // note: the if (pa != pb) test is here only to eliminate warnings for 1444 // unused local variables on gcc. Logically, it isn't needed. 1445 U_ASSERT(pa == pb); 1446 } 1447 } 1448 } 1449 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1450 } 1451 1452 1453 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1454 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1455 nonConstThis->makeRuleStatusValid(); 1456 1457 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1458 // (the number of status values.) 1459 // This function returns the last (largest) of the array of status values. 1460 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1461 int32_t tagVal = fData->fRuleStatusTable[idx]; 1462 1463 return tagVal; 1464 } 1465 1466 1467 1468 1469 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1470 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1471 { 1472 if (U_FAILURE(status)) { 1473 return 0; 1474 } 1475 1476 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1477 nonConstThis->makeRuleStatusValid(); 1478 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1479 int32_t numValsToCopy = numVals; 1480 if (numVals > capacity) { 1481 status = U_BUFFER_OVERFLOW_ERROR; 1482 numValsToCopy = capacity; 1483 } 1484 int i; 1485 for (i=0; i<numValsToCopy; i++) { 1486 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1487 } 1488 return numVals; 1489 } 1490 1491 1492 1493 //------------------------------------------------------------------------------- 1494 // 1495 // getBinaryRules Access to the compiled form of the rules, 1496 // for use by build system tools that save the data 1497 // for standard iterator types. 1498 // 1499 //------------------------------------------------------------------------------- 1500 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1501 const uint8_t *retPtr = NULL; 1502 length = 0; 1503 1504 if (fData != NULL) { 1505 retPtr = (const uint8_t *)fData->fHeader; 1506 length = fData->fHeader->fLength; 1507 } 1508 return retPtr; 1509 } 1510 1511 1512 BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, 1513 int32_t &bufferSize, 1514 UErrorCode &status) 1515 { 1516 if (U_FAILURE(status)){ 1517 return NULL; 1518 } 1519 1520 if (bufferSize == 0) { 1521 bufferSize = 1; // preflighting for deprecated functionality 1522 return NULL; 1523 } 1524 1525 BreakIterator *clonedBI = clone(); 1526 if (clonedBI == NULL) { 1527 status = U_MEMORY_ALLOCATION_ERROR; 1528 } else { 1529 status = U_SAFECLONE_ALLOCATED_WARNING; 1530 } 1531 return (RuleBasedBreakIterator *)clonedBI; 1532 } 1533 1534 1535 //------------------------------------------------------------------------------- 1536 // 1537 // isDictionaryChar Return true if the category lookup for this char 1538 // indicates that it is in the set of dictionary lookup 1539 // chars. 1540 // 1541 // This function is intended for use by dictionary based 1542 // break iterators. 1543 // 1544 //------------------------------------------------------------------------------- 1545 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1546 if (fData == NULL) { 1547 return FALSE; 1548 } 1549 uint16_t category; 1550 UTRIE_GET16(&fData->fTrie, c, category); 1551 return (category & 0x4000) != 0; 1552 }*/ 1553 1554 1555 //------------------------------------------------------------------------------- 1556 // 1557 // checkDictionary This function handles all processing of characters in 1558 // the "dictionary" set. It will determine the appropriate 1559 // course of action, and possibly set up a cache in the 1560 // process. 1561 // 1562 //------------------------------------------------------------------------------- 1563 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1564 int32_t endPos, 1565 UBool reverse) { 1566 // Reset the old break cache first. 1567 reset(); 1568 1569 // note: code segment below assumes that dictionary chars are in the 1570 // startPos-endPos range 1571 // value returned should be next character in sequence 1572 if ((endPos - startPos) <= 1) { 1573 return (reverse ? startPos : endPos); 1574 } 1575 1576 // Starting from the starting point, scan towards the proposed result, 1577 // looking for the first dictionary character (which may be the one 1578 // we're on, if we're starting in the middle of a range). 1579 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1580 if (reverse) { 1581 UTEXT_PREVIOUS32(fText); 1582 } 1583 1584 int32_t rangeStart = startPos; 1585 int32_t rangeEnd = endPos; 1586 1587 uint16_t category; 1588 int32_t current; 1589 UErrorCode status = U_ZERO_ERROR; 1590 UStack breaks(status); 1591 int32_t foundBreakCount = 0; 1592 UChar32 c = utext_current32(fText); 1593 1594 UTRIE_GET16(&fData->fTrie, c, category); 1595 1596 // Is the character we're starting on a dictionary character? If so, we 1597 // need to back up to include the entire run; otherwise the results of 1598 // the break algorithm will differ depending on where we start. Since 1599 // the result is cached and there is typically a non-dictionary break 1600 // within a small number of words, there should be little performance impact. 1601 if (category & 0x4000) { 1602 if (reverse) { 1603 do { 1604 utext_next32(fText); // TODO: recast to work directly with postincrement. 1605 c = utext_current32(fText); 1606 UTRIE_GET16(&fData->fTrie, c, category); 1607 } while (c != U_SENTINEL && (category & 0x4000)); 1608 // Back up to the last dictionary character 1609 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1610 if (c == U_SENTINEL) { 1611 // c = fText->last32(); 1612 // TODO: why was this if needed? 1613 c = UTEXT_PREVIOUS32(fText); 1614 } 1615 else { 1616 c = UTEXT_PREVIOUS32(fText); 1617 } 1618 } 1619 else { 1620 do { 1621 c = UTEXT_PREVIOUS32(fText); 1622 UTRIE_GET16(&fData->fTrie, c, category); 1623 } 1624 while (c != U_SENTINEL && (category & 0x4000)); 1625 // Back up to the last dictionary character 1626 if (c == U_SENTINEL) { 1627 // c = fText->first32(); 1628 c = utext_current32(fText); 1629 } 1630 else { 1631 utext_next32(fText); 1632 c = utext_current32(fText); 1633 } 1634 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1635 } 1636 UTRIE_GET16(&fData->fTrie, c, category); 1637 } 1638 1639 // Loop through the text, looking for ranges of dictionary characters. 1640 // For each span, find the appropriate break engine, and ask it to find 1641 // any breaks within the span. 1642 // Note: we always do this in the forward direction, so that the break 1643 // cache is built in the right order. 1644 if (reverse) { 1645 utext_setNativeIndex(fText, rangeStart); 1646 c = utext_current32(fText); 1647 UTRIE_GET16(&fData->fTrie, c, category); 1648 } 1649 while(U_SUCCESS(status)) { 1650 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1651 utext_next32(fText); // TODO: tweak for post-increment operation 1652 c = utext_current32(fText); 1653 UTRIE_GET16(&fData->fTrie, c, category); 1654 } 1655 if (current >= rangeEnd) { 1656 break; 1657 } 1658 1659 // We now have a dictionary character. Get the appropriate language object 1660 // to deal with it. 1661 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1662 1663 // Ask the language object if there are any breaks. It will leave the text 1664 // pointer on the other side of its range, ready to search for the next one. 1665 if (lbe != NULL) { 1666 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1667 } 1668 1669 // Reload the loop variables for the next go-round 1670 c = utext_current32(fText); 1671 UTRIE_GET16(&fData->fTrie, c, category); 1672 } 1673 1674 // If we found breaks, build a new break cache. The first and last entries must 1675 // be the original starting and ending position. 1676 if (foundBreakCount > 0) { 1677 U_ASSERT(foundBreakCount == breaks.size()); 1678 int32_t totalBreaks = foundBreakCount; 1679 if (startPos < breaks.elementAti(0)) { 1680 totalBreaks += 1; 1681 } 1682 if (endPos > breaks.peeki()) { 1683 totalBreaks += 1; 1684 } 1685 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1686 if (fCachedBreakPositions != NULL) { 1687 int32_t out = 0; 1688 fNumCachedBreakPositions = totalBreaks; 1689 if (startPos < breaks.elementAti(0)) { 1690 fCachedBreakPositions[out++] = startPos; 1691 } 1692 for (int32_t i = 0; i < foundBreakCount; ++i) { 1693 fCachedBreakPositions[out++] = breaks.elementAti(i); 1694 } 1695 if (endPos > fCachedBreakPositions[out-1]) { 1696 fCachedBreakPositions[out] = endPos; 1697 } 1698 // If there are breaks, then by definition, we are replacing the original 1699 // proposed break by one of the breaks we found. Use following() and 1700 // preceding() to do the work. They should never recurse in this case. 1701 if (reverse) { 1702 return preceding(endPos); 1703 } 1704 else { 1705 return following(startPos); 1706 } 1707 } 1708 // If the allocation failed, just fall through to the "no breaks found" case. 1709 } 1710 1711 // If we get here, there were no language-based breaks. Set the text pointer 1712 // to the original proposed break. 1713 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1714 return (reverse ? startPos : endPos); 1715 } 1716 1717 U_NAMESPACE_END 1718 1719 1720 static icu::UStack *gLanguageBreakFactories = NULL; 1721 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; 1722 1723 /** 1724 * Release all static memory held by breakiterator. 1725 */ 1726 U_CDECL_BEGIN 1727 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1728 if (gLanguageBreakFactories) { 1729 delete gLanguageBreakFactories; 1730 gLanguageBreakFactories = NULL; 1731 } 1732 gLanguageBreakFactoriesInitOnce.reset(); 1733 return TRUE; 1734 } 1735 U_CDECL_END 1736 1737 U_CDECL_BEGIN 1738 static void U_CALLCONV _deleteFactory(void *obj) { 1739 delete (icu::LanguageBreakFactory *) obj; 1740 } 1741 U_CDECL_END 1742 U_NAMESPACE_BEGIN 1743 1744 static void U_CALLCONV initLanguageFactories() { 1745 UErrorCode status = U_ZERO_ERROR; 1746 U_ASSERT(gLanguageBreakFactories == NULL); 1747 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); 1748 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { 1749 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1750 gLanguageBreakFactories->push(builtIn, status); 1751 #ifdef U_LOCAL_SERVICE_HOOK 1752 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1753 if (extra != NULL) { 1754 gLanguageBreakFactories->push(extra, status); 1755 } 1756 #endif 1757 } 1758 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1759 } 1760 1761 1762 static const LanguageBreakEngine* 1763 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1764 { 1765 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); 1766 if (gLanguageBreakFactories == NULL) { 1767 return NULL; 1768 } 1769 1770 int32_t i = gLanguageBreakFactories->size(); 1771 const LanguageBreakEngine *lbe = NULL; 1772 while (--i >= 0) { 1773 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1774 lbe = factory->getEngineFor(c, breakType); 1775 if (lbe != NULL) { 1776 break; 1777 } 1778 } 1779 return lbe; 1780 } 1781 1782 1783 //------------------------------------------------------------------------------- 1784 // 1785 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1786 // the character c. 1787 // 1788 //------------------------------------------------------------------------------- 1789 const LanguageBreakEngine * 1790 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1791 const LanguageBreakEngine *lbe = NULL; 1792 UErrorCode status = U_ZERO_ERROR; 1793 1794 if (fLanguageBreakEngines == NULL) { 1795 fLanguageBreakEngines = new UStack(status); 1796 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1797 delete fLanguageBreakEngines; 1798 fLanguageBreakEngines = 0; 1799 return NULL; 1800 } 1801 } 1802 1803 int32_t i = fLanguageBreakEngines->size(); 1804 while (--i >= 0) { 1805 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1806 if (lbe->handles(c, fBreakType)) { 1807 return lbe; 1808 } 1809 } 1810 1811 // No existing dictionary took the character. See if a factory wants to 1812 // give us a new LanguageBreakEngine for this character. 1813 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1814 1815 // If we got one, use it and push it on our stack. 1816 if (lbe != NULL) { 1817 fLanguageBreakEngines->push((void *)lbe, status); 1818 // Even if we can't remember it, we can keep looking it up, so 1819 // return it even if the push fails. 1820 return lbe; 1821 } 1822 1823 // No engine is forthcoming for this character. Add it to the 1824 // reject set. Create the reject break engine if needed. 1825 if (fUnhandledBreakEngine == NULL) { 1826 fUnhandledBreakEngine = new UnhandledEngine(status); 1827 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1828 status = U_MEMORY_ALLOCATION_ERROR; 1829 } 1830 // Put it last so that scripts for which we have an engine get tried 1831 // first. 1832 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1833 // If we can't insert it, or creation failed, get rid of it 1834 if (U_FAILURE(status)) { 1835 delete fUnhandledBreakEngine; 1836 fUnhandledBreakEngine = 0; 1837 return NULL; 1838 } 1839 } 1840 1841 // Tell the reject engine about the character; at its discretion, it may 1842 // add more than just the one character. 1843 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1844 1845 return fUnhandledBreakEngine; 1846 } 1847 1848 1849 1850 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1851 return fBreakType; 1852 }*/ 1853 1854 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1855 fBreakType = type; 1856 reset(); 1857 } 1858 1859 U_NAMESPACE_END 1860 1861 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1862