1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2014 International Business Machines Corporation 4 * and others. All rights reserved. 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include "utypeinfo.h" // for 'typeid' to work 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/rbbi.h" 20 #include "unicode/schriter.h" 21 #include "unicode/uchriter.h" 22 #include "unicode/udata.h" 23 #include "unicode/uclean.h" 24 #include "rbbidata.h" 25 #include "rbbirb.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "umutex.h" 29 #include "ucln_cmn.h" 30 #include "brkeng.h" 31 32 #include "uassert.h" 33 #include "uvector.h" 34 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 36 #if U_LOCAL_SERVICE_HOOK 37 #include "localsvc.h" 38 #endif 39 40 #ifdef RBBI_DEBUG 41 static UBool fTrace = FALSE; 42 #endif 43 44 U_NAMESPACE_BEGIN 45 46 // The state number of the starting state 47 #define START_STATE 1 48 49 // The state-transition value indicating "stop" 50 #define STOP_STATE 0 51 52 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 54 55 56 //======================================================================= 57 // constructors 58 //======================================================================= 59 60 /** 61 * Constructs a RuleBasedBreakIterator that uses the already-created 62 * tables object that is passed in as a parameter. 63 */ 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 65 { 66 init(); 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor 68 if (U_FAILURE(status)) {return;} 69 if(fData == 0) { 70 status = U_MEMORY_ALLOCATION_ERROR; 71 return; 72 } 73 } 74 75 /** 76 * Same as above but does not adopt memory 77 */ 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 79 { 80 init(); 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 82 if (U_FAILURE(status)) {return;} 83 if(fData == 0) { 84 status = U_MEMORY_ALLOCATION_ERROR; 85 return; 86 } 87 } 88 89 90 // 91 // Construct from precompiled binary rules (tables). This constructor is public API, 92 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 93 // 94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 95 uint32_t ruleLength, 96 UErrorCode &status) { 97 init(); 98 if (U_FAILURE(status)) { 99 return; 100 } 101 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 102 status = U_ILLEGAL_ARGUMENT_ERROR; 103 return; 104 } 105 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 106 if (data->fLength > ruleLength) { 107 status = U_ILLEGAL_ARGUMENT_ERROR; 108 return; 109 } 110 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 111 if (U_FAILURE(status)) {return;} 112 if(fData == 0) { 113 status = U_MEMORY_ALLOCATION_ERROR; 114 return; 115 } 116 } 117 118 119 //------------------------------------------------------------------------------- 120 // 121 // Constructor from a UDataMemory handle to precompiled break rules 122 // stored in an ICU data file. 123 // 124 //------------------------------------------------------------------------------- 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 126 { 127 init(); 128 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 129 if (U_FAILURE(status)) {return;} 130 if(fData == 0) { 131 status = U_MEMORY_ALLOCATION_ERROR; 132 return; 133 } 134 } 135 136 137 138 //------------------------------------------------------------------------------- 139 // 140 // Constructor from a set of rules supplied as a string. 141 // 142 //------------------------------------------------------------------------------- 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 144 UParseError &parseError, 145 UErrorCode &status) 146 { 147 init(); 148 if (U_FAILURE(status)) {return;} 149 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 150 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 151 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 152 // creates and returns a complete RBBI. From here, in a constructor, we 153 // can't just return the object created by the builder factory, hence 154 // the assignment of the factory created object to "this". 155 if (U_SUCCESS(status)) { 156 *this = *bi; 157 delete bi; 158 } 159 } 160 161 162 //------------------------------------------------------------------------------- 163 // 164 // Default Constructor. Create an empty shell that can be set up later. 165 // Used when creating a RuleBasedBreakIterator from a set 166 // of rules. 167 //------------------------------------------------------------------------------- 168 RuleBasedBreakIterator::RuleBasedBreakIterator() { 169 init(); 170 } 171 172 173 //------------------------------------------------------------------------------- 174 // 175 // Copy constructor. Will produce a break iterator with the same behavior, 176 // and which iterates over the same text, as the one passed in. 177 // 178 //------------------------------------------------------------------------------- 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 180 : BreakIterator(other) 181 { 182 this->init(); 183 *this = other; 184 } 185 186 187 /** 188 * Destructor 189 */ 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 191 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 192 // fCharIter was adopted from the outside. 193 delete fCharIter; 194 } 195 fCharIter = NULL; 196 delete fSCharIter; 197 fCharIter = NULL; 198 delete fDCharIter; 199 fDCharIter = NULL; 200 201 utext_close(fText); 202 203 if (fData != NULL) { 204 fData->removeReference(); 205 fData = NULL; 206 } 207 if (fCachedBreakPositions) { 208 uprv_free(fCachedBreakPositions); 209 fCachedBreakPositions = NULL; 210 } 211 if (fLanguageBreakEngines) { 212 delete fLanguageBreakEngines; 213 fLanguageBreakEngines = NULL; 214 } 215 if (fUnhandledBreakEngine) { 216 delete fUnhandledBreakEngine; 217 fUnhandledBreakEngine = NULL; 218 } 219 } 220 221 /** 222 * Assignment operator. Sets this iterator to have the same behavior, 223 * and iterate over the same text, as the one passed in. 224 */ 225 RuleBasedBreakIterator& 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 227 if (this == &that) { 228 return *this; 229 } 230 reset(); // Delete break cache information 231 fBreakType = that.fBreakType; 232 if (fLanguageBreakEngines != NULL) { 233 delete fLanguageBreakEngines; 234 fLanguageBreakEngines = NULL; // Just rebuild for now 235 } 236 // TODO: clone fLanguageBreakEngines from "that" 237 UErrorCode status = U_ZERO_ERROR; 238 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 239 240 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 241 delete fCharIter; 242 } 243 fCharIter = NULL; 244 245 if (that.fCharIter != NULL ) { 246 // This is a little bit tricky - it will intially appear that 247 // this->fCharIter is adopted, even if that->fCharIter was 248 // not adopted. That's ok. 249 fCharIter = that.fCharIter->clone(); 250 } 251 252 if (fData != NULL) { 253 fData->removeReference(); 254 fData = NULL; 255 } 256 if (that.fData != NULL) { 257 fData = that.fData->addReference(); 258 } 259 260 return *this; 261 } 262 263 264 265 //----------------------------------------------------------------------------- 266 // 267 // init() Shared initialization routine. Used by all the constructors. 268 // Initializes all fields, leaving the object in a consistent state. 269 // 270 //----------------------------------------------------------------------------- 271 void RuleBasedBreakIterator::init() { 272 UErrorCode status = U_ZERO_ERROR; 273 fText = utext_openUChars(NULL, NULL, 0, &status); 274 fCharIter = NULL; 275 fSCharIter = NULL; 276 fDCharIter = NULL; 277 fData = NULL; 278 fLastRuleStatusIndex = 0; 279 fLastStatusIndexValid = TRUE; 280 fDictionaryCharCount = 0; 281 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 282 // dictionary behavior for Break Iterators that are 283 // built from rules. Even better would be the ability to 284 // declare the type in the rules. 285 286 fCachedBreakPositions = NULL; 287 fLanguageBreakEngines = NULL; 288 fUnhandledBreakEngine = NULL; 289 fNumCachedBreakPositions = 0; 290 fPositionInCache = 0; 291 292 #ifdef RBBI_DEBUG 293 static UBool debugInitDone = FALSE; 294 if (debugInitDone == FALSE) { 295 char *debugEnv = getenv("U_RBBIDEBUG"); 296 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 297 fTrace = TRUE; 298 } 299 debugInitDone = TRUE; 300 } 301 #endif 302 } 303 304 305 306 //----------------------------------------------------------------------------- 307 // 308 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 309 // behavior, and iterating over the same text, as this one. 310 // Virtual function: does the right thing with subclasses. 311 // 312 //----------------------------------------------------------------------------- 313 BreakIterator* 314 RuleBasedBreakIterator::clone(void) const { 315 return new RuleBasedBreakIterator(*this); 316 } 317 318 /** 319 * Equality operator. Returns TRUE if both BreakIterators are of the 320 * same class, have the same behavior, and iterate over the same text. 321 */ 322 UBool 323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 324 if (typeid(*this) != typeid(that)) { 325 return FALSE; 326 } 327 328 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 329 330 if (!utext_equals(fText, that2.fText)) { 331 // The two break iterators are operating on different text, 332 // or have a different interation position. 333 return FALSE; 334 }; 335 336 // TODO: need a check for when in a dictionary region at different offsets. 337 338 if (that2.fData == fData || 339 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 340 // The two break iterators are using the same rules. 341 return TRUE; 342 } 343 return FALSE; 344 } 345 346 /** 347 * Compute a hash code for this BreakIterator 348 * @return A hash code 349 */ 350 int32_t 351 RuleBasedBreakIterator::hashCode(void) const { 352 int32_t hash = 0; 353 if (fData != NULL) { 354 hash = fData->hashCode(); 355 } 356 return hash; 357 } 358 359 360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 361 if (U_FAILURE(status)) { 362 return; 363 } 364 reset(); 365 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 366 367 // Set up a dummy CharacterIterator to be returned if anyone 368 // calls getText(). With input from UText, there is no reasonable 369 // way to return a characterIterator over the actual input text. 370 // Return one over an empty string instead - this is the closest 371 // we can come to signaling a failure. 372 // (GetText() is obsolete, this failure is sort of OK) 373 if (fDCharIter == NULL) { 374 static const UChar c = 0; 375 fDCharIter = new UCharCharacterIterator(&c, 0); 376 if (fDCharIter == NULL) { 377 status = U_MEMORY_ALLOCATION_ERROR; 378 return; 379 } 380 } 381 382 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 383 // existing fCharIter was adopted from the outside. Delete it now. 384 delete fCharIter; 385 } 386 fCharIter = fDCharIter; 387 388 this->first(); 389 } 390 391 392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 393 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 394 return result; 395 } 396 397 398 399 /** 400 * Returns the description used to create this iterator 401 */ 402 const UnicodeString& 403 RuleBasedBreakIterator::getRules() const { 404 if (fData != NULL) { 405 return fData->getRuleSourceString(); 406 } else { 407 static const UnicodeString *s; 408 if (s == NULL) { 409 // TODO: something more elegant here. 410 // perhaps API should return the string by value. 411 // Note: thread unsafe init & leak are semi-ok, better than 412 // what was before. Sould be cleaned up, though. 413 s = new UnicodeString; 414 } 415 return *s; 416 } 417 } 418 419 //======================================================================= 420 // BreakIterator overrides 421 //======================================================================= 422 423 /** 424 * Return a CharacterIterator over the text being analyzed. 425 */ 426 CharacterIterator& 427 RuleBasedBreakIterator::getText() const { 428 return *fCharIter; 429 } 430 431 /** 432 * Set the iterator to analyze a new piece of text. This function resets 433 * the current iteration position to the beginning of the text. 434 * @param newText An iterator over the text to analyze. 435 */ 436 void 437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 438 // If we are holding a CharacterIterator adopted from a 439 // previous call to this function, delete it now. 440 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 441 delete fCharIter; 442 } 443 444 fCharIter = newText; 445 UErrorCode status = U_ZERO_ERROR; 446 reset(); 447 if (newText==NULL || newText->startIndex() != 0) { 448 // startIndex !=0 wants to be an error, but there's no way to report it. 449 // Make the iterator text be an empty string. 450 fText = utext_openUChars(fText, NULL, 0, &status); 451 } else { 452 fText = utext_openCharacterIterator(fText, newText, &status); 453 } 454 this->first(); 455 } 456 457 /** 458 * Set the iterator to analyze a new piece of text. This function resets 459 * the current iteration position to the beginning of the text. 460 * @param newText An iterator over the text to analyze. 461 */ 462 void 463 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 464 UErrorCode status = U_ZERO_ERROR; 465 reset(); 466 fText = utext_openConstUnicodeString(fText, &newText, &status); 467 468 // Set up a character iterator on the string. 469 // Needed in case someone calls getText(). 470 // Can not, unfortunately, do this lazily on the (probably never) 471 // call to getText(), because getText is const. 472 if (fSCharIter == NULL) { 473 fSCharIter = new StringCharacterIterator(newText); 474 } else { 475 fSCharIter->setText(newText); 476 } 477 478 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 479 // old fCharIter was adopted from the outside. Delete it. 480 delete fCharIter; 481 } 482 fCharIter = fSCharIter; 483 484 this->first(); 485 } 486 487 488 /** 489 * Provide a new UText for the input text. Must reference text with contents identical 490 * to the original. 491 * Intended for use with text data originating in Java (garbage collected) environments 492 * where the data may be moved in memory at arbitrary times. 493 */ 494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 495 if (U_FAILURE(status)) { 496 return *this; 497 } 498 if (input == NULL) { 499 status = U_ILLEGAL_ARGUMENT_ERROR; 500 return *this; 501 } 502 int64_t pos = utext_getNativeIndex(fText); 503 // Shallow read-only clone of the new UText into the existing input UText 504 fText = utext_clone(fText, input, FALSE, TRUE, &status); 505 if (U_FAILURE(status)) { 506 return *this; 507 } 508 utext_setNativeIndex(fText, pos); 509 if (utext_getNativeIndex(fText) != pos) { 510 // Sanity check. The new input utext is supposed to have the exact same 511 // contents as the old. If we can't set to the same position, it doesn't. 512 // The contents underlying the old utext might be invalid at this point, 513 // so it's not safe to check directly. 514 status = U_ILLEGAL_ARGUMENT_ERROR; 515 } 516 return *this; 517 } 518 519 520 /** 521 * Sets the current iteration position to the beginning of the text, position zero. 522 * @return The new iterator position, which is zero. 523 */ 524 int32_t RuleBasedBreakIterator::first(void) { 525 reset(); 526 fLastRuleStatusIndex = 0; 527 fLastStatusIndexValid = TRUE; 528 //if (fText == NULL) 529 // return BreakIterator::DONE; 530 531 utext_setNativeIndex(fText, 0); 532 return 0; 533 } 534 535 /** 536 * Sets the current iteration position to the end of the text. 537 * @return The text's past-the-end offset. 538 */ 539 int32_t RuleBasedBreakIterator::last(void) { 540 reset(); 541 if (fText == NULL) { 542 fLastRuleStatusIndex = 0; 543 fLastStatusIndexValid = TRUE; 544 return BreakIterator::DONE; 545 } 546 547 fLastStatusIndexValid = FALSE; 548 int32_t pos = (int32_t)utext_nativeLength(fText); 549 utext_setNativeIndex(fText, pos); 550 return pos; 551 } 552 553 /** 554 * Advances the iterator either forward or backward the specified number of steps. 555 * Negative values move backward, and positive values move forward. This is 556 * equivalent to repeatedly calling next() or previous(). 557 * @param n The number of steps to move. The sign indicates the direction 558 * (negative is backwards, and positive is forwards). 559 * @return The character offset of the boundary position n boundaries away from 560 * the current one. 561 */ 562 int32_t RuleBasedBreakIterator::next(int32_t n) { 563 int32_t result = current(); 564 while (n > 0) { 565 result = next(); 566 --n; 567 } 568 while (n < 0) { 569 result = previous(); 570 ++n; 571 } 572 return result; 573 } 574 575 /** 576 * Advances the iterator to the next boundary position. 577 * @return The position of the first boundary after this one. 578 */ 579 int32_t RuleBasedBreakIterator::next(void) { 580 // if we have cached break positions and we're still in the range 581 // covered by them, just move one step forward in the cache 582 if (fCachedBreakPositions != NULL) { 583 if (fPositionInCache < fNumCachedBreakPositions - 1) { 584 ++fPositionInCache; 585 int32_t pos = fCachedBreakPositions[fPositionInCache]; 586 utext_setNativeIndex(fText, pos); 587 return pos; 588 } 589 else { 590 reset(); 591 } 592 } 593 594 int32_t startPos = current(); 595 fDictionaryCharCount = 0; 596 int32_t result = handleNext(fData->fForwardTable); 597 if (fDictionaryCharCount > 0) { 598 result = checkDictionary(startPos, result, FALSE); 599 } 600 return result; 601 } 602 603 /** 604 * Advances the iterator backwards, to the last boundary preceding this one. 605 * @return The position of the last boundary position preceding this one. 606 */ 607 int32_t RuleBasedBreakIterator::previous(void) { 608 int32_t result; 609 int32_t startPos; 610 611 // if we have cached break positions and we're still in the range 612 // covered by them, just move one step backward in the cache 613 if (fCachedBreakPositions != NULL) { 614 if (fPositionInCache > 0) { 615 --fPositionInCache; 616 // If we're at the beginning of the cache, need to reevaluate the 617 // rule status 618 if (fPositionInCache <= 0) { 619 fLastStatusIndexValid = FALSE; 620 } 621 int32_t pos = fCachedBreakPositions[fPositionInCache]; 622 utext_setNativeIndex(fText, pos); 623 return pos; 624 } 625 else { 626 reset(); 627 } 628 } 629 630 // if we're already sitting at the beginning of the text, return DONE 631 if (fText == NULL || (startPos = current()) == 0) { 632 fLastRuleStatusIndex = 0; 633 fLastStatusIndexValid = TRUE; 634 return BreakIterator::DONE; 635 } 636 637 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 638 result = handlePrevious(fData->fReverseTable); 639 if (fDictionaryCharCount > 0) { 640 result = checkDictionary(result, startPos, TRUE); 641 } 642 return result; 643 } 644 645 // old rule syntax 646 // set things up. handlePrevious() will back us up to some valid 647 // break position before the current position (we back our internal 648 // iterator up one step to prevent handlePrevious() from returning 649 // the current position), but not necessarily the last one before 650 // where we started 651 652 int32_t start = current(); 653 654 (void)UTEXT_PREVIOUS32(fText); 655 int32_t lastResult = handlePrevious(fData->fReverseTable); 656 if (lastResult == UBRK_DONE) { 657 lastResult = 0; 658 utext_setNativeIndex(fText, 0); 659 } 660 result = lastResult; 661 int32_t lastTag = 0; 662 UBool breakTagValid = FALSE; 663 664 // iterate forward from the known break position until we pass our 665 // starting point. The last break position before the starting 666 // point is our return value 667 668 for (;;) { 669 result = next(); 670 if (result == BreakIterator::DONE || result >= start) { 671 break; 672 } 673 lastResult = result; 674 lastTag = fLastRuleStatusIndex; 675 breakTagValid = TRUE; 676 } 677 678 // fLastBreakTag wants to have the value for section of text preceding 679 // the result position that we are to return (in lastResult.) If 680 // the backwards rules overshot and the above loop had to do two or more 681 // next()s to move up to the desired return position, we will have a valid 682 // tag value. But, if handlePrevious() took us to exactly the correct result position, 683 // we wont have a tag value for that position, which is only set by handleNext(). 684 685 // Set the current iteration position to be the last break position 686 // before where we started, and then return that value. 687 utext_setNativeIndex(fText, lastResult); 688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 689 fLastStatusIndexValid = breakTagValid; 690 691 // No need to check the dictionary; it will have been handled by 692 // next() 693 694 return lastResult; 695 } 696 697 /** 698 * Sets the iterator to refer to the first boundary position following 699 * the specified position. 700 * @offset The position from which to begin searching for a break position. 701 * @return The position of the first break after the current position. 702 */ 703 int32_t RuleBasedBreakIterator::following(int32_t offset) { 704 // if the offset passed in is already past the end of the text, 705 // just return DONE; if it's before the beginning, return the 706 // text's starting offset 707 if (fText == NULL || offset >= utext_nativeLength(fText)) { 708 last(); 709 return next(); 710 } 711 else if (offset < 0) { 712 return first(); 713 } 714 715 // Move requested offset to a code point start. It might be on a trail surrogate, 716 // or on a trail byte if the input is UTF-8. 717 utext_setNativeIndex(fText, offset); 718 offset = utext_getNativeIndex(fText); 719 720 // if we have cached break positions and offset is in the range 721 // covered by them, use them 722 // TODO: could use binary search 723 // TODO: what if offset is outside range, but break is not? 724 if (fCachedBreakPositions != NULL) { 725 if (offset >= fCachedBreakPositions[0] 726 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 727 fPositionInCache = 0; 728 // We are guaranteed not to leave the array due to range test above 729 while (offset >= fCachedBreakPositions[fPositionInCache]) { 730 ++fPositionInCache; 731 } 732 int32_t pos = fCachedBreakPositions[fPositionInCache]; 733 utext_setNativeIndex(fText, pos); 734 return pos; 735 } 736 else { 737 reset(); 738 } 739 } 740 741 // Set our internal iteration position (temporarily) 742 // to the position passed in. If this is the _beginning_ position, 743 // then we can just use next() to get our return value 744 745 int32_t result = 0; 746 747 if (fData->fSafeRevTable != NULL) { 748 // new rule syntax 749 utext_setNativeIndex(fText, offset); 750 // move forward one codepoint to prepare for moving back to a 751 // safe point. 752 // this handles offset being between a supplementary character 753 // TODO: is this still needed, with move to code point boundary handled above? 754 (void)UTEXT_NEXT32(fText); 755 // handlePrevious will move most of the time to < 1 boundary away 756 handlePrevious(fData->fSafeRevTable); 757 int32_t result = next(); 758 while (result <= offset) { 759 result = next(); 760 } 761 return result; 762 } 763 if (fData->fSafeFwdTable != NULL) { 764 // backup plan if forward safe table is not available 765 utext_setNativeIndex(fText, offset); 766 (void)UTEXT_PREVIOUS32(fText); 767 // handle next will give result >= offset 768 handleNext(fData->fSafeFwdTable); 769 // previous will give result 0 or 1 boundary away from offset, 770 // most of the time 771 // we have to 772 int32_t oldresult = previous(); 773 while (oldresult > offset) { 774 int32_t result = previous(); 775 if (result <= offset) { 776 return oldresult; 777 } 778 oldresult = result; 779 } 780 int32_t result = next(); 781 if (result <= offset) { 782 return next(); 783 } 784 return result; 785 } 786 // otherwise, we have to sync up first. Use handlePrevious() to back 787 // up to a known break position before the specified position (if 788 // we can determine that the specified position is a break position, 789 // we don't back up at all). This may or may not be the last break 790 // position at or before our starting position. Advance forward 791 // from here until we've passed the starting position. The position 792 // we stop on will be the first break position after the specified one. 793 // old rule syntax 794 795 utext_setNativeIndex(fText, offset); 796 if (offset==0 || 797 (offset==1 && utext_getNativeIndex(fText)==0)) { 798 return next(); 799 } 800 result = previous(); 801 802 while (result != BreakIterator::DONE && result <= offset) { 803 result = next(); 804 } 805 806 return result; 807 } 808 809 /** 810 * Sets the iterator to refer to the last boundary position before the 811 * specified position. 812 * @offset The position to begin searching for a break from. 813 * @return The position of the last boundary before the starting position. 814 */ 815 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 816 // if the offset passed in is already past the end of the text, 817 // just return DONE; if it's before the beginning, return the 818 // text's starting offset 819 if (fText == NULL || offset > utext_nativeLength(fText)) { 820 return last(); 821 } 822 else if (offset < 0) { 823 return first(); 824 } 825 826 // Move requested offset to a code point start. It might be on a trail surrogate, 827 // or on a trail byte if the input is UTF-8. 828 utext_setNativeIndex(fText, offset); 829 offset = utext_getNativeIndex(fText); 830 831 // if we have cached break positions and offset is in the range 832 // covered by them, use them 833 if (fCachedBreakPositions != NULL) { 834 // TODO: binary search? 835 // TODO: What if offset is outside range, but break is not? 836 if (offset > fCachedBreakPositions[0] 837 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 838 fPositionInCache = 0; 839 while (fPositionInCache < fNumCachedBreakPositions 840 && offset > fCachedBreakPositions[fPositionInCache]) 841 ++fPositionInCache; 842 --fPositionInCache; 843 // If we're at the beginning of the cache, need to reevaluate the 844 // rule status 845 if (fPositionInCache <= 0) { 846 fLastStatusIndexValid = FALSE; 847 } 848 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 849 return fCachedBreakPositions[fPositionInCache]; 850 } 851 else { 852 reset(); 853 } 854 } 855 856 // if we start by updating the current iteration position to the 857 // position specified by the caller, we can just use previous() 858 // to carry out this operation 859 860 if (fData->fSafeFwdTable != NULL) { 861 // new rule syntax 862 utext_setNativeIndex(fText, offset); 863 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 864 if (newOffset != offset) { 865 // Will come here if specified offset was not a code point boundary AND 866 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 867 // indices to the containing code point. 868 // For breakitereator::preceding only, these non-code-point indices need to be moved 869 // up to refer to the following codepoint. 870 (void)UTEXT_NEXT32(fText); 871 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 872 } 873 874 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 875 // rather than adjusting the position unconditionally? 876 // (Change would interact with safe rules.) 877 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 878 // affects only preceding(), seems cleaner, but is slightly different. 879 (void)UTEXT_PREVIOUS32(fText); 880 handleNext(fData->fSafeFwdTable); 881 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 882 while (result >= offset) { 883 result = previous(); 884 } 885 return result; 886 } 887 if (fData->fSafeRevTable != NULL) { 888 // backup plan if forward safe table is not available 889 // TODO: check whether this path can be discarded 890 // It's probably OK to say that rules must supply both safe tables 891 // if they use safe tables at all. We have certainly never described 892 // to anyone how to work with just one safe table. 893 utext_setNativeIndex(fText, offset); 894 (void)UTEXT_NEXT32(fText); 895 896 // handle previous will give result <= offset 897 handlePrevious(fData->fSafeRevTable); 898 899 // next will give result 0 or 1 boundary away from offset, 900 // most of the time 901 // we have to 902 int32_t oldresult = next(); 903 while (oldresult < offset) { 904 int32_t result = next(); 905 if (result >= offset) { 906 return oldresult; 907 } 908 oldresult = result; 909 } 910 int32_t result = previous(); 911 if (result >= offset) { 912 return previous(); 913 } 914 return result; 915 } 916 917 // old rule syntax 918 utext_setNativeIndex(fText, offset); 919 return previous(); 920 } 921 922 /** 923 * Returns true if the specfied position is a boundary position. As a side 924 * effect, leaves the iterator pointing to the first boundary position at 925 * or after "offset". 926 * @param offset the offset to check. 927 * @return True if "offset" is a boundary position. 928 */ 929 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 930 // the beginning index of the iterator is always a boundary position by definition 931 if (offset == 0) { 932 first(); // For side effects on current position, tag values. 933 return TRUE; 934 } 935 936 if (offset == (int32_t)utext_nativeLength(fText)) { 937 last(); // For side effects on current position, tag values. 938 return TRUE; 939 } 940 941 // out-of-range indexes are never boundary positions 942 if (offset < 0) { 943 first(); // For side effects on current position, tag values. 944 return FALSE; 945 } 946 947 if (offset > utext_nativeLength(fText)) { 948 last(); // For side effects on current position, tag values. 949 return FALSE; 950 } 951 952 // otherwise, we can use following() on the position before the specified 953 // one and return true if the position we get back is the one the user 954 // specified 955 utext_previous32From(fText, offset); 956 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 957 UBool result = following(backOne) == offset; 958 return result; 959 } 960 961 /** 962 * Returns the current iteration position. 963 * @return The current iteration position. 964 */ 965 int32_t RuleBasedBreakIterator::current(void) const { 966 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 967 return pos; 968 } 969 970 //======================================================================= 971 // implementation 972 //======================================================================= 973 974 // 975 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 976 // of user text. A variable with this enum type keeps track of where we 977 // are. The state machine only fetches user input while in the RUN mode. 978 // 979 enum RBBIRunMode { 980 RBBI_START, // state machine processing is before first char of input 981 RBBI_RUN, // state machine processing is in the user text 982 RBBI_END // state machine processing is after end of user text. 983 }; 984 985 986 //----------------------------------------------------------------------------------- 987 // 988 // handleNext(stateTable) 989 // This method is the actual implementation of the rbbi next() method. 990 // This method initializes the state machine to state 1 991 // and advances through the text character by character until we reach the end 992 // of the text or the state machine transitions to state 0. We update our return 993 // value every time the state machine passes through an accepting state. 994 // 995 //----------------------------------------------------------------------------------- 996 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 997 int32_t state; 998 uint16_t category = 0; 999 RBBIRunMode mode; 1000 1001 RBBIStateTableRow *row; 1002 UChar32 c; 1003 int32_t lookaheadStatus = 0; 1004 int32_t lookaheadTagIdx = 0; 1005 int32_t result = 0; 1006 int32_t initialPosition = 0; 1007 int32_t lookaheadResult = 0; 1008 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1009 const char *tableData = statetable->fTableData; 1010 uint32_t tableRowLen = statetable->fRowLen; 1011 1012 #ifdef RBBI_DEBUG 1013 if (fTrace) { 1014 RBBIDebugPuts("Handle Next pos char state category"); 1015 } 1016 #endif 1017 1018 // No matter what, handleNext alway correctly sets the break tag value. 1019 fLastStatusIndexValid = TRUE; 1020 fLastRuleStatusIndex = 0; 1021 1022 // if we're already at the end of the text, return DONE. 1023 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1024 result = initialPosition; 1025 c = UTEXT_NEXT32(fText); 1026 if (fData == NULL || c==U_SENTINEL) { 1027 return BreakIterator::DONE; 1028 } 1029 1030 // Set the initial state for the state machine 1031 state = START_STATE; 1032 row = (RBBIStateTableRow *) 1033 //(statetable->fTableData + (statetable->fRowLen * state)); 1034 (tableData + tableRowLen * state); 1035 1036 1037 mode = RBBI_RUN; 1038 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1039 category = 2; 1040 mode = RBBI_START; 1041 } 1042 1043 1044 // loop until we reach the end of the text or transition to state 0 1045 // 1046 for (;;) { 1047 if (c == U_SENTINEL) { 1048 // Reached end of input string. 1049 if (mode == RBBI_END) { 1050 // We have already run the loop one last time with the 1051 // character set to the psueudo {eof} value. Now it is time 1052 // to unconditionally bail out. 1053 if (lookaheadResult > result) { 1054 // We ran off the end of the string with a pending look-ahead match. 1055 // Treat this as if the look-ahead condition had been met, and return 1056 // the match at the / position from the look-ahead rule. 1057 result = lookaheadResult; 1058 fLastRuleStatusIndex = lookaheadTagIdx; 1059 lookaheadStatus = 0; 1060 } 1061 break; 1062 } 1063 // Run the loop one last time with the fake end-of-input character category. 1064 mode = RBBI_END; 1065 category = 1; 1066 } 1067 1068 // 1069 // Get the char category. An incoming category of 1 or 2 means that 1070 // we are preset for doing the beginning or end of input, and 1071 // that we shouldn't get a category from an actual text input character. 1072 // 1073 if (mode == RBBI_RUN) { 1074 // look up the current character's character category, which tells us 1075 // which column in the state table to look at. 1076 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1077 // not the size of the character going in, which is a UChar32. 1078 // 1079 UTRIE_GET16(&fData->fTrie, c, category); 1080 1081 // Check the dictionary bit in the character's category. 1082 // Counter is only used by dictionary based iterators (subclasses). 1083 // Chars that need to be handled by a dictionary have a flag bit set 1084 // in their category values. 1085 // 1086 if ((category & 0x4000) != 0) { 1087 fDictionaryCharCount++; 1088 // And off the dictionary flag bit. 1089 category &= ~0x4000; 1090 } 1091 } 1092 1093 #ifdef RBBI_DEBUG 1094 if (fTrace) { 1095 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1096 if (0x20<=c && c<0x7f) { 1097 RBBIDebugPrintf("\"%c\" ", c); 1098 } else { 1099 RBBIDebugPrintf("%5x ", c); 1100 } 1101 RBBIDebugPrintf("%3d %3d\n", state, category); 1102 } 1103 #endif 1104 1105 // State Transition - move machine to its next state 1106 // 1107 1108 // Note: fNextState is defined as uint16_t[2], but we are casting 1109 // a generated RBBI table to RBBIStateTableRow and some tables 1110 // actually have more than 2 categories. 1111 U_ASSERT(category<fData->fHeader->fCatCount); 1112 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1113 row = (RBBIStateTableRow *) 1114 // (statetable->fTableData + (statetable->fRowLen * state)); 1115 (tableData + tableRowLen * state); 1116 1117 1118 if (row->fAccepting == -1) { 1119 // Match found, common case. 1120 if (mode != RBBI_START) { 1121 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1122 } 1123 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1124 } 1125 1126 if (row->fLookAhead != 0) { 1127 if (lookaheadStatus != 0 1128 && row->fAccepting == lookaheadStatus) { 1129 // Lookahead match is completed. 1130 result = lookaheadResult; 1131 fLastRuleStatusIndex = lookaheadTagIdx; 1132 lookaheadStatus = 0; 1133 // TODO: make a standalone hard break in a rule work. 1134 if (lookAheadHardBreak) { 1135 UTEXT_SETNATIVEINDEX(fText, result); 1136 return result; 1137 } 1138 // Look-ahead completed, but other rules may match further. Continue on 1139 // TODO: junk this feature? I don't think it's used anywhwere. 1140 goto continueOn; 1141 } 1142 1143 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1144 lookaheadResult = r; 1145 lookaheadStatus = row->fLookAhead; 1146 lookaheadTagIdx = row->fTagIdx; 1147 goto continueOn; 1148 } 1149 1150 1151 if (row->fAccepting != 0) { 1152 // Because this is an accepting state, any in-progress look-ahead match 1153 // is no longer relavant. Clear out the pending lookahead status. 1154 lookaheadStatus = 0; // clear out any pending look-ahead match. 1155 } 1156 1157 continueOn: 1158 if (state == STOP_STATE) { 1159 // This is the normal exit from the lookup state machine. 1160 // We have advanced through the string until it is certain that no 1161 // longer match is possible, no matter what characters follow. 1162 break; 1163 } 1164 1165 // Advance to the next character. 1166 // If this is a beginning-of-input loop iteration, don't advance 1167 // the input position. The next iteration will be processing the 1168 // first real input character. 1169 if (mode == RBBI_RUN) { 1170 c = UTEXT_NEXT32(fText); 1171 } else { 1172 if (mode == RBBI_START) { 1173 mode = RBBI_RUN; 1174 } 1175 } 1176 1177 1178 } 1179 1180 // The state machine is done. Check whether it found a match... 1181 1182 // If the iterator failed to advance in the match engine, force it ahead by one. 1183 // (This really indicates a defect in the break rules. They should always match 1184 // at least one character.) 1185 if (result == initialPosition) { 1186 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1187 UTEXT_NEXT32(fText); 1188 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1189 } 1190 1191 // Leave the iterator at our result position. 1192 UTEXT_SETNATIVEINDEX(fText, result); 1193 #ifdef RBBI_DEBUG 1194 if (fTrace) { 1195 RBBIDebugPrintf("result = %d\n\n", result); 1196 } 1197 #endif 1198 return result; 1199 } 1200 1201 1202 1203 //----------------------------------------------------------------------------------- 1204 // 1205 // handlePrevious() 1206 // 1207 // Iterate backwards, according to the logic of the reverse rules. 1208 // This version handles the exact style backwards rules. 1209 // 1210 // The logic of this function is very similar to handleNext(), above. 1211 // 1212 //----------------------------------------------------------------------------------- 1213 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1214 int32_t state; 1215 uint16_t category = 0; 1216 RBBIRunMode mode; 1217 RBBIStateTableRow *row; 1218 UChar32 c; 1219 int32_t lookaheadStatus = 0; 1220 int32_t result = 0; 1221 int32_t initialPosition = 0; 1222 int32_t lookaheadResult = 0; 1223 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1224 1225 #ifdef RBBI_DEBUG 1226 if (fTrace) { 1227 RBBIDebugPuts("Handle Previous pos char state category"); 1228 } 1229 #endif 1230 1231 // handlePrevious() never gets the rule status. 1232 // Flag the status as invalid; if the user ever asks for status, we will need 1233 // to back up, then re-find the break position using handleNext(), which does 1234 // get the status value. 1235 fLastStatusIndexValid = FALSE; 1236 fLastRuleStatusIndex = 0; 1237 1238 // if we're already at the start of the text, return DONE. 1239 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1240 return BreakIterator::DONE; 1241 } 1242 1243 // Set up the starting char. 1244 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1245 result = initialPosition; 1246 c = UTEXT_PREVIOUS32(fText); 1247 1248 // Set the initial state for the state machine 1249 state = START_STATE; 1250 row = (RBBIStateTableRow *) 1251 (statetable->fTableData + (statetable->fRowLen * state)); 1252 category = 3; 1253 mode = RBBI_RUN; 1254 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1255 category = 2; 1256 mode = RBBI_START; 1257 } 1258 1259 1260 // loop until we reach the start of the text or transition to state 0 1261 // 1262 for (;;) { 1263 if (c == U_SENTINEL) { 1264 // Reached end of input string. 1265 if (mode == RBBI_END) { 1266 // We have already run the loop one last time with the 1267 // character set to the psueudo {eof} value. Now it is time 1268 // to unconditionally bail out. 1269 if (lookaheadResult < result) { 1270 // We ran off the end of the string with a pending look-ahead match. 1271 // Treat this as if the look-ahead condition had been met, and return 1272 // the match at the / position from the look-ahead rule. 1273 result = lookaheadResult; 1274 lookaheadStatus = 0; 1275 } else if (result == initialPosition) { 1276 // Ran off start, no match found. 1277 // move one index one (towards the start, since we are doing a previous()) 1278 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1279 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1280 } 1281 break; 1282 } 1283 // Run the loop one last time with the fake end-of-input character category. 1284 mode = RBBI_END; 1285 category = 1; 1286 } 1287 1288 // 1289 // Get the char category. An incoming category of 1 or 2 means that 1290 // we are preset for doing the beginning or end of input, and 1291 // that we shouldn't get a category from an actual text input character. 1292 // 1293 if (mode == RBBI_RUN) { 1294 // look up the current character's character category, which tells us 1295 // which column in the state table to look at. 1296 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1297 // not the size of the character going in, which is a UChar32. 1298 // 1299 UTRIE_GET16(&fData->fTrie, c, category); 1300 1301 // Check the dictionary bit in the character's category. 1302 // Counter is only used by dictionary based iterators (subclasses). 1303 // Chars that need to be handled by a dictionary have a flag bit set 1304 // in their category values. 1305 // 1306 if ((category & 0x4000) != 0) { 1307 fDictionaryCharCount++; 1308 // And off the dictionary flag bit. 1309 category &= ~0x4000; 1310 } 1311 } 1312 1313 #ifdef RBBI_DEBUG 1314 if (fTrace) { 1315 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1316 if (0x20<=c && c<0x7f) { 1317 RBBIDebugPrintf("\"%c\" ", c); 1318 } else { 1319 RBBIDebugPrintf("%5x ", c); 1320 } 1321 RBBIDebugPrintf("%3d %3d\n", state, category); 1322 } 1323 #endif 1324 1325 // State Transition - move machine to its next state 1326 // 1327 1328 // Note: fNextState is defined as uint16_t[2], but we are casting 1329 // a generated RBBI table to RBBIStateTableRow and some tables 1330 // actually have more than 2 categories. 1331 U_ASSERT(category<fData->fHeader->fCatCount); 1332 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1333 row = (RBBIStateTableRow *) 1334 (statetable->fTableData + (statetable->fRowLen * state)); 1335 1336 if (row->fAccepting == -1) { 1337 // Match found, common case. 1338 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1339 } 1340 1341 if (row->fLookAhead != 0) { 1342 if (lookaheadStatus != 0 1343 && row->fAccepting == lookaheadStatus) { 1344 // Lookahead match is completed. 1345 result = lookaheadResult; 1346 lookaheadStatus = 0; 1347 // TODO: make a standalone hard break in a rule work. 1348 if (lookAheadHardBreak) { 1349 UTEXT_SETNATIVEINDEX(fText, result); 1350 return result; 1351 } 1352 // Look-ahead completed, but other rules may match further. Continue on 1353 // TODO: junk this feature? I don't think it's used anywhwere. 1354 goto continueOn; 1355 } 1356 1357 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1358 lookaheadResult = r; 1359 lookaheadStatus = row->fLookAhead; 1360 goto continueOn; 1361 } 1362 1363 1364 if (row->fAccepting != 0) { 1365 // Because this is an accepting state, any in-progress look-ahead match 1366 // is no longer relavant. Clear out the pending lookahead status. 1367 lookaheadStatus = 0; 1368 } 1369 1370 continueOn: 1371 if (state == STOP_STATE) { 1372 // This is the normal exit from the lookup state machine. 1373 // We have advanced through the string until it is certain that no 1374 // longer match is possible, no matter what characters follow. 1375 break; 1376 } 1377 1378 // Move (backwards) to the next character to process. 1379 // If this is a beginning-of-input loop iteration, don't advance 1380 // the input position. The next iteration will be processing the 1381 // first real input character. 1382 if (mode == RBBI_RUN) { 1383 c = UTEXT_PREVIOUS32(fText); 1384 } else { 1385 if (mode == RBBI_START) { 1386 mode = RBBI_RUN; 1387 } 1388 } 1389 } 1390 1391 // The state machine is done. Check whether it found a match... 1392 1393 // If the iterator failed to advance in the match engine, force it ahead by one. 1394 // (This really indicates a defect in the break rules. They should always match 1395 // at least one character.) 1396 if (result == initialPosition) { 1397 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1398 UTEXT_PREVIOUS32(fText); 1399 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1400 } 1401 1402 // Leave the iterator at our result position. 1403 UTEXT_SETNATIVEINDEX(fText, result); 1404 #ifdef RBBI_DEBUG 1405 if (fTrace) { 1406 RBBIDebugPrintf("result = %d\n\n", result); 1407 } 1408 #endif 1409 return result; 1410 } 1411 1412 1413 void 1414 RuleBasedBreakIterator::reset() 1415 { 1416 if (fCachedBreakPositions) { 1417 uprv_free(fCachedBreakPositions); 1418 } 1419 fCachedBreakPositions = NULL; 1420 fNumCachedBreakPositions = 0; 1421 fDictionaryCharCount = 0; 1422 fPositionInCache = 0; 1423 } 1424 1425 1426 1427 //------------------------------------------------------------------------------- 1428 // 1429 // getRuleStatus() Return the break rule tag associated with the current 1430 // iterator position. If the iterator arrived at its current 1431 // position by iterating forwards, the value will have been 1432 // cached by the handleNext() function. 1433 // 1434 // If no cached status value is available, the status is 1435 // found by doing a previous() followed by a next(), which 1436 // leaves the iterator where it started, and computes the 1437 // status while doing the next(). 1438 // 1439 //------------------------------------------------------------------------------- 1440 void RuleBasedBreakIterator::makeRuleStatusValid() { 1441 if (fLastStatusIndexValid == FALSE) { 1442 // No cached status is available. 1443 if (fText == NULL || current() == 0) { 1444 // At start of text, or there is no text. Status is always zero. 1445 fLastRuleStatusIndex = 0; 1446 fLastStatusIndexValid = TRUE; 1447 } else { 1448 // Not at start of text. Find status the tedious way. 1449 int32_t pa = current(); 1450 previous(); 1451 if (fNumCachedBreakPositions > 0) { 1452 reset(); // Blow off the dictionary cache 1453 } 1454 int32_t pb = next(); 1455 if (pa != pb) { 1456 // note: the if (pa != pb) test is here only to eliminate warnings for 1457 // unused local variables on gcc. Logically, it isn't needed. 1458 U_ASSERT(pa == pb); 1459 } 1460 } 1461 } 1462 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1463 } 1464 1465 1466 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1467 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1468 nonConstThis->makeRuleStatusValid(); 1469 1470 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1471 // (the number of status values.) 1472 // This function returns the last (largest) of the array of status values. 1473 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1474 int32_t tagVal = fData->fRuleStatusTable[idx]; 1475 1476 return tagVal; 1477 } 1478 1479 1480 1481 1482 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1483 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1484 { 1485 if (U_FAILURE(status)) { 1486 return 0; 1487 } 1488 1489 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1490 nonConstThis->makeRuleStatusValid(); 1491 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1492 int32_t numValsToCopy = numVals; 1493 if (numVals > capacity) { 1494 status = U_BUFFER_OVERFLOW_ERROR; 1495 numValsToCopy = capacity; 1496 } 1497 int i; 1498 for (i=0; i<numValsToCopy; i++) { 1499 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1500 } 1501 return numVals; 1502 } 1503 1504 1505 1506 //------------------------------------------------------------------------------- 1507 // 1508 // getBinaryRules Access to the compiled form of the rules, 1509 // for use by build system tools that save the data 1510 // for standard iterator types. 1511 // 1512 //------------------------------------------------------------------------------- 1513 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1514 const uint8_t *retPtr = NULL; 1515 length = 0; 1516 1517 if (fData != NULL) { 1518 retPtr = (const uint8_t *)fData->fHeader; 1519 length = fData->fHeader->fLength; 1520 } 1521 return retPtr; 1522 } 1523 1524 1525 BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, 1526 int32_t &bufferSize, 1527 UErrorCode &status) 1528 { 1529 if (U_FAILURE(status)){ 1530 return NULL; 1531 } 1532 1533 if (bufferSize == 0) { 1534 bufferSize = 1; // preflighting for deprecated functionality 1535 return NULL; 1536 } 1537 1538 BreakIterator *clonedBI = clone(); 1539 if (clonedBI == NULL) { 1540 status = U_MEMORY_ALLOCATION_ERROR; 1541 } else { 1542 status = U_SAFECLONE_ALLOCATED_WARNING; 1543 } 1544 return (RuleBasedBreakIterator *)clonedBI; 1545 } 1546 1547 1548 //------------------------------------------------------------------------------- 1549 // 1550 // isDictionaryChar Return true if the category lookup for this char 1551 // indicates that it is in the set of dictionary lookup 1552 // chars. 1553 // 1554 // This function is intended for use by dictionary based 1555 // break iterators. 1556 // 1557 //------------------------------------------------------------------------------- 1558 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1559 if (fData == NULL) { 1560 return FALSE; 1561 } 1562 uint16_t category; 1563 UTRIE_GET16(&fData->fTrie, c, category); 1564 return (category & 0x4000) != 0; 1565 }*/ 1566 1567 1568 //------------------------------------------------------------------------------- 1569 // 1570 // checkDictionary This function handles all processing of characters in 1571 // the "dictionary" set. It will determine the appropriate 1572 // course of action, and possibly set up a cache in the 1573 // process. 1574 // 1575 //------------------------------------------------------------------------------- 1576 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1577 int32_t endPos, 1578 UBool reverse) { 1579 // Reset the old break cache first. 1580 reset(); 1581 1582 // note: code segment below assumes that dictionary chars are in the 1583 // startPos-endPos range 1584 // value returned should be next character in sequence 1585 if ((endPos - startPos) <= 1) { 1586 return (reverse ? startPos : endPos); 1587 } 1588 1589 // Starting from the starting point, scan towards the proposed result, 1590 // looking for the first dictionary character (which may be the one 1591 // we're on, if we're starting in the middle of a range). 1592 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1593 if (reverse) { 1594 UTEXT_PREVIOUS32(fText); 1595 } 1596 1597 int32_t rangeStart = startPos; 1598 int32_t rangeEnd = endPos; 1599 1600 uint16_t category; 1601 int32_t current; 1602 UErrorCode status = U_ZERO_ERROR; 1603 UStack breaks(status); 1604 int32_t foundBreakCount = 0; 1605 UChar32 c = utext_current32(fText); 1606 1607 UTRIE_GET16(&fData->fTrie, c, category); 1608 1609 // Is the character we're starting on a dictionary character? If so, we 1610 // need to back up to include the entire run; otherwise the results of 1611 // the break algorithm will differ depending on where we start. Since 1612 // the result is cached and there is typically a non-dictionary break 1613 // within a small number of words, there should be little performance impact. 1614 if (category & 0x4000) { 1615 if (reverse) { 1616 do { 1617 utext_next32(fText); // TODO: recast to work directly with postincrement. 1618 c = utext_current32(fText); 1619 UTRIE_GET16(&fData->fTrie, c, category); 1620 } while (c != U_SENTINEL && (category & 0x4000)); 1621 // Back up to the last dictionary character 1622 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1623 if (c == U_SENTINEL) { 1624 // c = fText->last32(); 1625 // TODO: why was this if needed? 1626 c = UTEXT_PREVIOUS32(fText); 1627 } 1628 else { 1629 c = UTEXT_PREVIOUS32(fText); 1630 } 1631 } 1632 else { 1633 do { 1634 c = UTEXT_PREVIOUS32(fText); 1635 UTRIE_GET16(&fData->fTrie, c, category); 1636 } 1637 while (c != U_SENTINEL && (category & 0x4000)); 1638 // Back up to the last dictionary character 1639 if (c == U_SENTINEL) { 1640 // c = fText->first32(); 1641 c = utext_current32(fText); 1642 } 1643 else { 1644 utext_next32(fText); 1645 c = utext_current32(fText); 1646 } 1647 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1648 } 1649 UTRIE_GET16(&fData->fTrie, c, category); 1650 } 1651 1652 // Loop through the text, looking for ranges of dictionary characters. 1653 // For each span, find the appropriate break engine, and ask it to find 1654 // any breaks within the span. 1655 // Note: we always do this in the forward direction, so that the break 1656 // cache is built in the right order. 1657 if (reverse) { 1658 utext_setNativeIndex(fText, rangeStart); 1659 c = utext_current32(fText); 1660 UTRIE_GET16(&fData->fTrie, c, category); 1661 } 1662 while(U_SUCCESS(status)) { 1663 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1664 utext_next32(fText); // TODO: tweak for post-increment operation 1665 c = utext_current32(fText); 1666 UTRIE_GET16(&fData->fTrie, c, category); 1667 } 1668 if (current >= rangeEnd) { 1669 break; 1670 } 1671 1672 // We now have a dictionary character. Get the appropriate language object 1673 // to deal with it. 1674 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1675 1676 // Ask the language object if there are any breaks. It will leave the text 1677 // pointer on the other side of its range, ready to search for the next one. 1678 if (lbe != NULL) { 1679 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1680 } 1681 1682 // Reload the loop variables for the next go-round 1683 c = utext_current32(fText); 1684 UTRIE_GET16(&fData->fTrie, c, category); 1685 } 1686 1687 // If we found breaks, build a new break cache. The first and last entries must 1688 // be the original starting and ending position. 1689 if (foundBreakCount > 0) { 1690 U_ASSERT(foundBreakCount == breaks.size()); 1691 int32_t totalBreaks = foundBreakCount; 1692 if (startPos < breaks.elementAti(0)) { 1693 totalBreaks += 1; 1694 } 1695 if (endPos > breaks.peeki()) { 1696 totalBreaks += 1; 1697 } 1698 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1699 if (fCachedBreakPositions != NULL) { 1700 int32_t out = 0; 1701 fNumCachedBreakPositions = totalBreaks; 1702 if (startPos < breaks.elementAti(0)) { 1703 fCachedBreakPositions[out++] = startPos; 1704 } 1705 for (int32_t i = 0; i < foundBreakCount; ++i) { 1706 fCachedBreakPositions[out++] = breaks.elementAti(i); 1707 } 1708 if (endPos > fCachedBreakPositions[out-1]) { 1709 fCachedBreakPositions[out] = endPos; 1710 } 1711 // If there are breaks, then by definition, we are replacing the original 1712 // proposed break by one of the breaks we found. Use following() and 1713 // preceding() to do the work. They should never recurse in this case. 1714 if (reverse) { 1715 return preceding(endPos); 1716 } 1717 else { 1718 return following(startPos); 1719 } 1720 } 1721 // If the allocation failed, just fall through to the "no breaks found" case. 1722 } 1723 1724 // If we get here, there were no language-based breaks. Set the text pointer 1725 // to the original proposed break. 1726 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1727 return (reverse ? startPos : endPos); 1728 } 1729 1730 U_NAMESPACE_END 1731 1732 1733 static icu::UStack *gLanguageBreakFactories = NULL; 1734 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; 1735 1736 /** 1737 * Release all static memory held by breakiterator. 1738 */ 1739 U_CDECL_BEGIN 1740 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1741 if (gLanguageBreakFactories) { 1742 delete gLanguageBreakFactories; 1743 gLanguageBreakFactories = NULL; 1744 } 1745 gLanguageBreakFactoriesInitOnce.reset(); 1746 return TRUE; 1747 } 1748 U_CDECL_END 1749 1750 U_CDECL_BEGIN 1751 static void U_CALLCONV _deleteFactory(void *obj) { 1752 delete (icu::LanguageBreakFactory *) obj; 1753 } 1754 U_CDECL_END 1755 U_NAMESPACE_BEGIN 1756 1757 static void U_CALLCONV initLanguageFactories() { 1758 UErrorCode status = U_ZERO_ERROR; 1759 U_ASSERT(gLanguageBreakFactories == NULL); 1760 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); 1761 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { 1762 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1763 gLanguageBreakFactories->push(builtIn, status); 1764 #ifdef U_LOCAL_SERVICE_HOOK 1765 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1766 if (extra != NULL) { 1767 gLanguageBreakFactories->push(extra, status); 1768 } 1769 #endif 1770 } 1771 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1772 } 1773 1774 1775 static const LanguageBreakEngine* 1776 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1777 { 1778 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); 1779 if (gLanguageBreakFactories == NULL) { 1780 return NULL; 1781 } 1782 1783 int32_t i = gLanguageBreakFactories->size(); 1784 const LanguageBreakEngine *lbe = NULL; 1785 while (--i >= 0) { 1786 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1787 lbe = factory->getEngineFor(c, breakType); 1788 if (lbe != NULL) { 1789 break; 1790 } 1791 } 1792 return lbe; 1793 } 1794 1795 1796 //------------------------------------------------------------------------------- 1797 // 1798 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1799 // the character c. 1800 // 1801 //------------------------------------------------------------------------------- 1802 const LanguageBreakEngine * 1803 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1804 const LanguageBreakEngine *lbe = NULL; 1805 UErrorCode status = U_ZERO_ERROR; 1806 1807 if (fLanguageBreakEngines == NULL) { 1808 fLanguageBreakEngines = new UStack(status); 1809 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1810 delete fLanguageBreakEngines; 1811 fLanguageBreakEngines = 0; 1812 return NULL; 1813 } 1814 } 1815 1816 int32_t i = fLanguageBreakEngines->size(); 1817 while (--i >= 0) { 1818 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1819 if (lbe->handles(c, fBreakType)) { 1820 return lbe; 1821 } 1822 } 1823 1824 // No existing dictionary took the character. See if a factory wants to 1825 // give us a new LanguageBreakEngine for this character. 1826 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1827 1828 // If we got one, use it and push it on our stack. 1829 if (lbe != NULL) { 1830 fLanguageBreakEngines->push((void *)lbe, status); 1831 // Even if we can't remember it, we can keep looking it up, so 1832 // return it even if the push fails. 1833 return lbe; 1834 } 1835 1836 // No engine is forthcoming for this character. Add it to the 1837 // reject set. Create the reject break engine if needed. 1838 if (fUnhandledBreakEngine == NULL) { 1839 fUnhandledBreakEngine = new UnhandledEngine(status); 1840 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1841 status = U_MEMORY_ALLOCATION_ERROR; 1842 } 1843 // Put it last so that scripts for which we have an engine get tried 1844 // first. 1845 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1846 // If we can't insert it, or creation failed, get rid of it 1847 if (U_FAILURE(status)) { 1848 delete fUnhandledBreakEngine; 1849 fUnhandledBreakEngine = 0; 1850 return NULL; 1851 } 1852 } 1853 1854 // Tell the reject engine about the character; at its discretion, it may 1855 // add more than just the one character. 1856 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1857 1858 return fUnhandledBreakEngine; 1859 } 1860 1861 1862 1863 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1864 return fBreakType; 1865 }*/ 1866 1867 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1868 fBreakType = type; 1869 reset(); 1870 } 1871 1872 U_NAMESPACE_END 1873 1874 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1875