1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2014 International Business Machines Corporation 4 * and others. All rights reserved. 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include "utypeinfo.h" // for 'typeid' to work 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/rbbi.h" 20 #include "unicode/schriter.h" 21 #include "unicode/uchriter.h" 22 #include "unicode/udata.h" 23 #include "unicode/uclean.h" 24 #include "rbbidata.h" 25 #include "rbbirb.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "umutex.h" 29 #include "ucln_cmn.h" 30 #include "brkeng.h" 31 32 #include "uassert.h" 33 #include "uvector.h" 34 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 36 #if U_LOCAL_SERVICE_HOOK 37 #include "localsvc.h" 38 #endif 39 40 #ifdef RBBI_DEBUG 41 static UBool fTrace = FALSE; 42 #endif 43 44 U_NAMESPACE_BEGIN 45 46 // The state number of the starting state 47 #define START_STATE 1 48 49 // The state-transition value indicating "stop" 50 #define STOP_STATE 0 51 52 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 54 55 56 //======================================================================= 57 // constructors 58 //======================================================================= 59 60 /** 61 * Constructs a RuleBasedBreakIterator that uses the already-created 62 * tables object that is passed in as a parameter. 63 */ 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 65 { 66 init(); 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor 68 if (U_FAILURE(status)) {return;} 69 if(fData == 0) { 70 status = U_MEMORY_ALLOCATION_ERROR; 71 return; 72 } 73 } 74 75 /** 76 * Same as above but does not adopt memory 77 */ 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 79 { 80 init(); 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 82 if (U_FAILURE(status)) {return;} 83 if(fData == 0) { 84 status = U_MEMORY_ALLOCATION_ERROR; 85 return; 86 } 87 } 88 89 90 // 91 // Construct from precompiled binary rules (tables). This constructor is public API, 92 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 93 // 94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 95 uint32_t ruleLength, 96 UErrorCode &status) { 97 init(); 98 if (U_FAILURE(status)) { 99 return; 100 } 101 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 102 status = U_ILLEGAL_ARGUMENT_ERROR; 103 return; 104 } 105 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 106 if (data->fLength > ruleLength) { 107 status = U_ILLEGAL_ARGUMENT_ERROR; 108 return; 109 } 110 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 111 if (U_FAILURE(status)) {return;} 112 if(fData == 0) { 113 status = U_MEMORY_ALLOCATION_ERROR; 114 return; 115 } 116 } 117 118 119 //------------------------------------------------------------------------------- 120 // 121 // Constructor from a UDataMemory handle to precompiled break rules 122 // stored in an ICU data file. 123 // 124 //------------------------------------------------------------------------------- 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 126 { 127 init(); 128 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 129 if (U_FAILURE(status)) {return;} 130 if(fData == 0) { 131 status = U_MEMORY_ALLOCATION_ERROR; 132 return; 133 } 134 } 135 136 137 138 //------------------------------------------------------------------------------- 139 // 140 // Constructor from a set of rules supplied as a string. 141 // 142 //------------------------------------------------------------------------------- 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 144 UParseError &parseError, 145 UErrorCode &status) 146 { 147 init(); 148 if (U_FAILURE(status)) {return;} 149 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 150 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 151 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 152 // creates and returns a complete RBBI. From here, in a constructor, we 153 // can't just return the object created by the builder factory, hence 154 // the assignment of the factory created object to "this". 155 if (U_SUCCESS(status)) { 156 *this = *bi; 157 delete bi; 158 } 159 } 160 161 162 //------------------------------------------------------------------------------- 163 // 164 // Default Constructor. Create an empty shell that can be set up later. 165 // Used when creating a RuleBasedBreakIterator from a set 166 // of rules. 167 //------------------------------------------------------------------------------- 168 RuleBasedBreakIterator::RuleBasedBreakIterator() { 169 init(); 170 } 171 172 173 //------------------------------------------------------------------------------- 174 // 175 // Copy constructor. Will produce a break iterator with the same behavior, 176 // and which iterates over the same text, as the one passed in. 177 // 178 //------------------------------------------------------------------------------- 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 180 : BreakIterator(other) 181 { 182 this->init(); 183 *this = other; 184 } 185 186 187 /** 188 * Destructor 189 */ 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 191 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 192 // fCharIter was adopted from the outside. 193 delete fCharIter; 194 } 195 fCharIter = NULL; 196 delete fSCharIter; 197 fCharIter = NULL; 198 delete fDCharIter; 199 fDCharIter = NULL; 200 201 utext_close(fText); 202 203 if (fData != NULL) { 204 fData->removeReference(); 205 fData = NULL; 206 } 207 if (fCachedBreakPositions) { 208 uprv_free(fCachedBreakPositions); 209 fCachedBreakPositions = NULL; 210 } 211 if (fLanguageBreakEngines) { 212 delete fLanguageBreakEngines; 213 fLanguageBreakEngines = NULL; 214 } 215 if (fUnhandledBreakEngine) { 216 delete fUnhandledBreakEngine; 217 fUnhandledBreakEngine = NULL; 218 } 219 } 220 221 /** 222 * Assignment operator. Sets this iterator to have the same behavior, 223 * and iterate over the same text, as the one passed in. 224 */ 225 RuleBasedBreakIterator& 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 227 if (this == &that) { 228 return *this; 229 } 230 reset(); // Delete break cache information 231 fBreakType = that.fBreakType; 232 if (fLanguageBreakEngines != NULL) { 233 delete fLanguageBreakEngines; 234 fLanguageBreakEngines = NULL; // Just rebuild for now 235 } 236 // TODO: clone fLanguageBreakEngines from "that" 237 UErrorCode status = U_ZERO_ERROR; 238 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 239 240 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 241 delete fCharIter; 242 } 243 fCharIter = NULL; 244 245 if (that.fCharIter != NULL ) { 246 // This is a little bit tricky - it will intially appear that 247 // this->fCharIter is adopted, even if that->fCharIter was 248 // not adopted. That's ok. 249 fCharIter = that.fCharIter->clone(); 250 } 251 252 if (fData != NULL) { 253 fData->removeReference(); 254 fData = NULL; 255 } 256 if (that.fData != NULL) { 257 fData = that.fData->addReference(); 258 } 259 260 return *this; 261 } 262 263 264 265 //----------------------------------------------------------------------------- 266 // 267 // init() Shared initialization routine. Used by all the constructors. 268 // Initializes all fields, leaving the object in a consistent state. 269 // 270 //----------------------------------------------------------------------------- 271 void RuleBasedBreakIterator::init() { 272 UErrorCode status = U_ZERO_ERROR; 273 fText = utext_openUChars(NULL, NULL, 0, &status); 274 fCharIter = NULL; 275 fSCharIter = NULL; 276 fDCharIter = NULL; 277 fData = NULL; 278 fLastRuleStatusIndex = 0; 279 fLastStatusIndexValid = TRUE; 280 fDictionaryCharCount = 0; 281 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 282 // dictionary behavior for Break Iterators that are 283 // built from rules. Even better would be the ability to 284 // declare the type in the rules. 285 286 fCachedBreakPositions = NULL; 287 fLanguageBreakEngines = NULL; 288 fUnhandledBreakEngine = NULL; 289 fNumCachedBreakPositions = 0; 290 fPositionInCache = 0; 291 292 #ifdef RBBI_DEBUG 293 static UBool debugInitDone = FALSE; 294 if (debugInitDone == FALSE) { 295 char *debugEnv = getenv("U_RBBIDEBUG"); 296 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 297 fTrace = TRUE; 298 } 299 debugInitDone = TRUE; 300 } 301 #endif 302 } 303 304 305 306 //----------------------------------------------------------------------------- 307 // 308 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 309 // behavior, and iterating over the same text, as this one. 310 // Virtual function: does the right thing with subclasses. 311 // 312 //----------------------------------------------------------------------------- 313 BreakIterator* 314 RuleBasedBreakIterator::clone(void) const { 315 return new RuleBasedBreakIterator(*this); 316 } 317 318 /** 319 * Equality operator. Returns TRUE if both BreakIterators are of the 320 * same class, have the same behavior, and iterate over the same text. 321 */ 322 UBool 323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 324 if (typeid(*this) != typeid(that)) { 325 return FALSE; 326 } 327 328 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 329 330 if (!utext_equals(fText, that2.fText)) { 331 // The two break iterators are operating on different text, 332 // or have a different interation position. 333 return FALSE; 334 }; 335 336 // TODO: need a check for when in a dictionary region at different offsets. 337 338 if (that2.fData == fData || 339 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 340 // The two break iterators are using the same rules. 341 return TRUE; 342 } 343 return FALSE; 344 } 345 346 /** 347 * Compute a hash code for this BreakIterator 348 * @return A hash code 349 */ 350 int32_t 351 RuleBasedBreakIterator::hashCode(void) const { 352 int32_t hash = 0; 353 if (fData != NULL) { 354 hash = fData->hashCode(); 355 } 356 return hash; 357 } 358 359 360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 361 if (U_FAILURE(status)) { 362 return; 363 } 364 reset(); 365 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 366 367 // Set up a dummy CharacterIterator to be returned if anyone 368 // calls getText(). With input from UText, there is no reasonable 369 // way to return a characterIterator over the actual input text. 370 // Return one over an empty string instead - this is the closest 371 // we can come to signaling a failure. 372 // (GetText() is obsolete, this failure is sort of OK) 373 if (fDCharIter == NULL) { 374 static const UChar c = 0; 375 fDCharIter = new UCharCharacterIterator(&c, 0); 376 if (fDCharIter == NULL) { 377 status = U_MEMORY_ALLOCATION_ERROR; 378 return; 379 } 380 } 381 382 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 383 // existing fCharIter was adopted from the outside. Delete it now. 384 delete fCharIter; 385 } 386 fCharIter = fDCharIter; 387 388 this->first(); 389 } 390 391 392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 393 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 394 return result; 395 } 396 397 398 399 /** 400 * Returns the description used to create this iterator 401 */ 402 const UnicodeString& 403 RuleBasedBreakIterator::getRules() const { 404 if (fData != NULL) { 405 return fData->getRuleSourceString(); 406 } else { 407 static const UnicodeString *s; 408 if (s == NULL) { 409 // TODO: something more elegant here. 410 // perhaps API should return the string by value. 411 // Note: thread unsafe init & leak are semi-ok, better than 412 // what was before. Sould be cleaned up, though. 413 s = new UnicodeString; 414 } 415 return *s; 416 } 417 } 418 419 //======================================================================= 420 // BreakIterator overrides 421 //======================================================================= 422 423 /** 424 * Return a CharacterIterator over the text being analyzed. 425 */ 426 CharacterIterator& 427 RuleBasedBreakIterator::getText() const { 428 return *fCharIter; 429 } 430 431 /** 432 * Set the iterator to analyze a new piece of text. This function resets 433 * the current iteration position to the beginning of the text. 434 * @param newText An iterator over the text to analyze. 435 */ 436 void 437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 438 // If we are holding a CharacterIterator adopted from a 439 // previous call to this function, delete it now. 440 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 441 delete fCharIter; 442 } 443 444 fCharIter = newText; 445 UErrorCode status = U_ZERO_ERROR; 446 reset(); 447 if (newText==NULL || newText->startIndex() != 0) { 448 // startIndex !=0 wants to be an error, but there's no way to report it. 449 // Make the iterator text be an empty string. 450 fText = utext_openUChars(fText, NULL, 0, &status); 451 } else { 452 fText = utext_openCharacterIterator(fText, newText, &status); 453 } 454 this->first(); 455 } 456 457 /** 458 * Set the iterator to analyze a new piece of text. This function resets 459 * the current iteration position to the beginning of the text. 460 * @param newText An iterator over the text to analyze. 461 */ 462 void 463 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 464 UErrorCode status = U_ZERO_ERROR; 465 reset(); 466 fText = utext_openConstUnicodeString(fText, &newText, &status); 467 468 // Set up a character iterator on the string. 469 // Needed in case someone calls getText(). 470 // Can not, unfortunately, do this lazily on the (probably never) 471 // call to getText(), because getText is const. 472 if (fSCharIter == NULL) { 473 fSCharIter = new StringCharacterIterator(newText); 474 } else { 475 fSCharIter->setText(newText); 476 } 477 478 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 479 // old fCharIter was adopted from the outside. Delete it. 480 delete fCharIter; 481 } 482 fCharIter = fSCharIter; 483 484 this->first(); 485 } 486 487 488 /** 489 * Provide a new UText for the input text. Must reference text with contents identical 490 * to the original. 491 * Intended for use with text data originating in Java (garbage collected) environments 492 * where the data may be moved in memory at arbitrary times. 493 */ 494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 495 if (U_FAILURE(status)) { 496 return *this; 497 } 498 if (input == NULL) { 499 status = U_ILLEGAL_ARGUMENT_ERROR; 500 return *this; 501 } 502 int64_t pos = utext_getNativeIndex(fText); 503 // Shallow read-only clone of the new UText into the existing input UText 504 fText = utext_clone(fText, input, FALSE, TRUE, &status); 505 if (U_FAILURE(status)) { 506 return *this; 507 } 508 utext_setNativeIndex(fText, pos); 509 if (utext_getNativeIndex(fText) != pos) { 510 // Sanity check. The new input utext is supposed to have the exact same 511 // contents as the old. If we can't set to the same position, it doesn't. 512 // The contents underlying the old utext might be invalid at this point, 513 // so it's not safe to check directly. 514 status = U_ILLEGAL_ARGUMENT_ERROR; 515 } 516 return *this; 517 } 518 519 520 /** 521 * Sets the current iteration position to the beginning of the text. 522 * @return The offset of the beginning of the text. 523 */ 524 int32_t RuleBasedBreakIterator::first(void) { 525 reset(); 526 fLastRuleStatusIndex = 0; 527 fLastStatusIndexValid = TRUE; 528 //if (fText == NULL) 529 // return BreakIterator::DONE; 530 531 utext_setNativeIndex(fText, 0); 532 return 0; 533 } 534 535 /** 536 * Sets the current iteration position to the end of the text. 537 * @return The text's past-the-end offset. 538 */ 539 int32_t RuleBasedBreakIterator::last(void) { 540 reset(); 541 if (fText == NULL) { 542 fLastRuleStatusIndex = 0; 543 fLastStatusIndexValid = TRUE; 544 return BreakIterator::DONE; 545 } 546 547 fLastStatusIndexValid = FALSE; 548 int32_t pos = (int32_t)utext_nativeLength(fText); 549 utext_setNativeIndex(fText, pos); 550 return pos; 551 } 552 553 /** 554 * Advances the iterator either forward or backward the specified number of steps. 555 * Negative values move backward, and positive values move forward. This is 556 * equivalent to repeatedly calling next() or previous(). 557 * @param n The number of steps to move. The sign indicates the direction 558 * (negative is backwards, and positive is forwards). 559 * @return The character offset of the boundary position n boundaries away from 560 * the current one. 561 */ 562 int32_t RuleBasedBreakIterator::next(int32_t n) { 563 int32_t result = current(); 564 while (n > 0) { 565 result = next(); 566 --n; 567 } 568 while (n < 0) { 569 result = previous(); 570 ++n; 571 } 572 return result; 573 } 574 575 /** 576 * Advances the iterator to the next boundary position. 577 * @return The position of the first boundary after this one. 578 */ 579 int32_t RuleBasedBreakIterator::next(void) { 580 // if we have cached break positions and we're still in the range 581 // covered by them, just move one step forward in the cache 582 if (fCachedBreakPositions != NULL) { 583 if (fPositionInCache < fNumCachedBreakPositions - 1) { 584 ++fPositionInCache; 585 int32_t pos = fCachedBreakPositions[fPositionInCache]; 586 utext_setNativeIndex(fText, pos); 587 return pos; 588 } 589 else { 590 reset(); 591 } 592 } 593 594 int32_t startPos = current(); 595 fDictionaryCharCount = 0; 596 int32_t result = handleNext(fData->fForwardTable); 597 if (fDictionaryCharCount > 0) { 598 result = checkDictionary(startPos, result, FALSE); 599 } 600 return result; 601 } 602 603 /** 604 * Advances the iterator backwards, to the last boundary preceding this one. 605 * @return The position of the last boundary position preceding this one. 606 */ 607 int32_t RuleBasedBreakIterator::previous(void) { 608 int32_t result; 609 int32_t startPos; 610 611 // if we have cached break positions and we're still in the range 612 // covered by them, just move one step backward in the cache 613 if (fCachedBreakPositions != NULL) { 614 if (fPositionInCache > 0) { 615 --fPositionInCache; 616 // If we're at the beginning of the cache, need to reevaluate the 617 // rule status 618 if (fPositionInCache <= 0) { 619 fLastStatusIndexValid = FALSE; 620 } 621 int32_t pos = fCachedBreakPositions[fPositionInCache]; 622 utext_setNativeIndex(fText, pos); 623 return pos; 624 } 625 else { 626 reset(); 627 } 628 } 629 630 // if we're already sitting at the beginning of the text, return DONE 631 if (fText == NULL || (startPos = current()) == 0) { 632 fLastRuleStatusIndex = 0; 633 fLastStatusIndexValid = TRUE; 634 return BreakIterator::DONE; 635 } 636 637 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 638 result = handlePrevious(fData->fReverseTable); 639 if (fDictionaryCharCount > 0) { 640 result = checkDictionary(result, startPos, TRUE); 641 } 642 return result; 643 } 644 645 // old rule syntax 646 // set things up. handlePrevious() will back us up to some valid 647 // break position before the current position (we back our internal 648 // iterator up one step to prevent handlePrevious() from returning 649 // the current position), but not necessarily the last one before 650 // where we started 651 652 int32_t start = current(); 653 654 (void)UTEXT_PREVIOUS32(fText); 655 int32_t lastResult = handlePrevious(fData->fReverseTable); 656 if (lastResult == UBRK_DONE) { 657 lastResult = 0; 658 utext_setNativeIndex(fText, 0); 659 } 660 result = lastResult; 661 int32_t lastTag = 0; 662 UBool breakTagValid = FALSE; 663 664 // iterate forward from the known break position until we pass our 665 // starting point. The last break position before the starting 666 // point is our return value 667 668 for (;;) { 669 result = next(); 670 if (result == BreakIterator::DONE || result >= start) { 671 break; 672 } 673 lastResult = result; 674 lastTag = fLastRuleStatusIndex; 675 breakTagValid = TRUE; 676 } 677 678 // fLastBreakTag wants to have the value for section of text preceding 679 // the result position that we are to return (in lastResult.) If 680 // the backwards rules overshot and the above loop had to do two or more 681 // next()s to move up to the desired return position, we will have a valid 682 // tag value. But, if handlePrevious() took us to exactly the correct result position, 683 // we wont have a tag value for that position, which is only set by handleNext(). 684 685 // Set the current iteration position to be the last break position 686 // before where we started, and then return that value. 687 utext_setNativeIndex(fText, lastResult); 688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 689 fLastStatusIndexValid = breakTagValid; 690 691 // No need to check the dictionary; it will have been handled by 692 // next() 693 694 return lastResult; 695 } 696 697 /** 698 * Sets the iterator to refer to the first boundary position following 699 * the specified position. 700 * @offset The position from which to begin searching for a break position. 701 * @return The position of the first break after the current position. 702 */ 703 int32_t RuleBasedBreakIterator::following(int32_t offset) { 704 // if we have cached break positions and offset is in the range 705 // covered by them, use them 706 // TODO: could use binary search 707 // TODO: what if offset is outside range, but break is not? 708 if (fCachedBreakPositions != NULL) { 709 if (offset >= fCachedBreakPositions[0] 710 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 711 fPositionInCache = 0; 712 // We are guaranteed not to leave the array due to range test above 713 while (offset >= fCachedBreakPositions[fPositionInCache]) { 714 ++fPositionInCache; 715 } 716 int32_t pos = fCachedBreakPositions[fPositionInCache]; 717 utext_setNativeIndex(fText, pos); 718 return pos; 719 } 720 else { 721 reset(); 722 } 723 } 724 725 // if the offset passed in is already past the end of the text, 726 // just return DONE; if it's before the beginning, return the 727 // text's starting offset 728 fLastRuleStatusIndex = 0; 729 fLastStatusIndexValid = TRUE; 730 if (fText == NULL || offset >= utext_nativeLength(fText)) { 731 last(); 732 return next(); 733 } 734 else if (offset < 0) { 735 return first(); 736 } 737 738 // otherwise, set our internal iteration position (temporarily) 739 // to the position passed in. If this is the _beginning_ position, 740 // then we can just use next() to get our return value 741 742 int32_t result = 0; 743 744 if (fData->fSafeRevTable != NULL) { 745 // new rule syntax 746 utext_setNativeIndex(fText, offset); 747 // move forward one codepoint to prepare for moving back to a 748 // safe point. 749 // this handles offset being between a supplementary character 750 (void)UTEXT_NEXT32(fText); 751 // handlePrevious will move most of the time to < 1 boundary away 752 handlePrevious(fData->fSafeRevTable); 753 int32_t result = next(); 754 while (result <= offset) { 755 result = next(); 756 } 757 return result; 758 } 759 if (fData->fSafeFwdTable != NULL) { 760 // backup plan if forward safe table is not available 761 utext_setNativeIndex(fText, offset); 762 (void)UTEXT_PREVIOUS32(fText); 763 // handle next will give result >= offset 764 handleNext(fData->fSafeFwdTable); 765 // previous will give result 0 or 1 boundary away from offset, 766 // most of the time 767 // we have to 768 int32_t oldresult = previous(); 769 while (oldresult > offset) { 770 int32_t result = previous(); 771 if (result <= offset) { 772 return oldresult; 773 } 774 oldresult = result; 775 } 776 int32_t result = next(); 777 if (result <= offset) { 778 return next(); 779 } 780 return result; 781 } 782 // otherwise, we have to sync up first. Use handlePrevious() to back 783 // up to a known break position before the specified position (if 784 // we can determine that the specified position is a break position, 785 // we don't back up at all). This may or may not be the last break 786 // position at or before our starting position. Advance forward 787 // from here until we've passed the starting position. The position 788 // we stop on will be the first break position after the specified one. 789 // old rule syntax 790 791 utext_setNativeIndex(fText, offset); 792 if (offset==0 || 793 (offset==1 && utext_getNativeIndex(fText)==0)) { 794 return next(); 795 } 796 result = previous(); 797 798 while (result != BreakIterator::DONE && result <= offset) { 799 result = next(); 800 } 801 802 return result; 803 } 804 805 /** 806 * Sets the iterator to refer to the last boundary position before the 807 * specified position. 808 * @offset The position to begin searching for a break from. 809 * @return The position of the last boundary before the starting position. 810 */ 811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 812 // if we have cached break positions and offset is in the range 813 // covered by them, use them 814 if (fCachedBreakPositions != NULL) { 815 // TODO: binary search? 816 // TODO: What if offset is outside range, but break is not? 817 if (offset > fCachedBreakPositions[0] 818 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 819 fPositionInCache = 0; 820 while (fPositionInCache < fNumCachedBreakPositions 821 && offset > fCachedBreakPositions[fPositionInCache]) 822 ++fPositionInCache; 823 --fPositionInCache; 824 // If we're at the beginning of the cache, need to reevaluate the 825 // rule status 826 if (fPositionInCache <= 0) { 827 fLastStatusIndexValid = FALSE; 828 } 829 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 830 return fCachedBreakPositions[fPositionInCache]; 831 } 832 else { 833 reset(); 834 } 835 } 836 837 // if the offset passed in is already past the end of the text, 838 // just return DONE; if it's before the beginning, return the 839 // text's starting offset 840 if (fText == NULL || offset > utext_nativeLength(fText)) { 841 // return BreakIterator::DONE; 842 return last(); 843 } 844 else if (offset < 0) { 845 return first(); 846 } 847 848 // if we start by updating the current iteration position to the 849 // position specified by the caller, we can just use previous() 850 // to carry out this operation 851 852 if (fData->fSafeFwdTable != NULL) { 853 // new rule syntax 854 utext_setNativeIndex(fText, offset); 855 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 856 if (newOffset != offset) { 857 // Will come here if specified offset was not a code point boundary AND 858 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 859 // indices to the containing code point. 860 // For breakitereator::preceding only, these non-code-point indices need to be moved 861 // up to refer to the following codepoint. 862 (void)UTEXT_NEXT32(fText); 863 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 864 } 865 866 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 867 // rather than adjusting the position unconditionally? 868 // (Change would interact with safe rules.) 869 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 870 // affects only preceding(), seems cleaner, but is slightly different. 871 (void)UTEXT_PREVIOUS32(fText); 872 handleNext(fData->fSafeFwdTable); 873 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 874 while (result >= offset) { 875 result = previous(); 876 } 877 return result; 878 } 879 if (fData->fSafeRevTable != NULL) { 880 // backup plan if forward safe table is not available 881 // TODO: check whether this path can be discarded 882 // It's probably OK to say that rules must supply both safe tables 883 // if they use safe tables at all. We have certainly never described 884 // to anyone how to work with just one safe table. 885 utext_setNativeIndex(fText, offset); 886 (void)UTEXT_NEXT32(fText); 887 888 // handle previous will give result <= offset 889 handlePrevious(fData->fSafeRevTable); 890 891 // next will give result 0 or 1 boundary away from offset, 892 // most of the time 893 // we have to 894 int32_t oldresult = next(); 895 while (oldresult < offset) { 896 int32_t result = next(); 897 if (result >= offset) { 898 return oldresult; 899 } 900 oldresult = result; 901 } 902 int32_t result = previous(); 903 if (result >= offset) { 904 return previous(); 905 } 906 return result; 907 } 908 909 // old rule syntax 910 utext_setNativeIndex(fText, offset); 911 return previous(); 912 } 913 914 /** 915 * Returns true if the specfied position is a boundary position. As a side 916 * effect, leaves the iterator pointing to the first boundary position at 917 * or after "offset". 918 * @param offset the offset to check. 919 * @return True if "offset" is a boundary position. 920 */ 921 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 922 // the beginning index of the iterator is always a boundary position by definition 923 if (offset == 0) { 924 first(); // For side effects on current position, tag values. 925 return TRUE; 926 } 927 928 if (offset == (int32_t)utext_nativeLength(fText)) { 929 last(); // For side effects on current position, tag values. 930 return TRUE; 931 } 932 933 // out-of-range indexes are never boundary positions 934 if (offset < 0) { 935 first(); // For side effects on current position, tag values. 936 return FALSE; 937 } 938 939 if (offset > utext_nativeLength(fText)) { 940 last(); // For side effects on current position, tag values. 941 return FALSE; 942 } 943 944 // otherwise, we can use following() on the position before the specified 945 // one and return true if the position we get back is the one the user 946 // specified 947 utext_previous32From(fText, offset); 948 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 949 UBool result = following(backOne) == offset; 950 return result; 951 } 952 953 /** 954 * Returns the current iteration position. 955 * @return The current iteration position. 956 */ 957 int32_t RuleBasedBreakIterator::current(void) const { 958 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 959 return pos; 960 } 961 962 //======================================================================= 963 // implementation 964 //======================================================================= 965 966 // 967 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 968 // of user text. A variable with this enum type keeps track of where we 969 // are. The state machine only fetches user input while in the RUN mode. 970 // 971 enum RBBIRunMode { 972 RBBI_START, // state machine processing is before first char of input 973 RBBI_RUN, // state machine processing is in the user text 974 RBBI_END // state machine processing is after end of user text. 975 }; 976 977 978 //----------------------------------------------------------------------------------- 979 // 980 // handleNext(stateTable) 981 // This method is the actual implementation of the rbbi next() method. 982 // This method initializes the state machine to state 1 983 // and advances through the text character by character until we reach the end 984 // of the text or the state machine transitions to state 0. We update our return 985 // value every time the state machine passes through an accepting state. 986 // 987 //----------------------------------------------------------------------------------- 988 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 989 int32_t state; 990 uint16_t category = 0; 991 RBBIRunMode mode; 992 993 RBBIStateTableRow *row; 994 UChar32 c; 995 int32_t lookaheadStatus = 0; 996 int32_t lookaheadTagIdx = 0; 997 int32_t result = 0; 998 int32_t initialPosition = 0; 999 int32_t lookaheadResult = 0; 1000 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1001 const char *tableData = statetable->fTableData; 1002 uint32_t tableRowLen = statetable->fRowLen; 1003 1004 #ifdef RBBI_DEBUG 1005 if (fTrace) { 1006 RBBIDebugPuts("Handle Next pos char state category"); 1007 } 1008 #endif 1009 1010 // No matter what, handleNext alway correctly sets the break tag value. 1011 fLastStatusIndexValid = TRUE; 1012 fLastRuleStatusIndex = 0; 1013 1014 // if we're already at the end of the text, return DONE. 1015 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1016 result = initialPosition; 1017 c = UTEXT_NEXT32(fText); 1018 if (fData == NULL || c==U_SENTINEL) { 1019 return BreakIterator::DONE; 1020 } 1021 1022 // Set the initial state for the state machine 1023 state = START_STATE; 1024 row = (RBBIStateTableRow *) 1025 //(statetable->fTableData + (statetable->fRowLen * state)); 1026 (tableData + tableRowLen * state); 1027 1028 1029 mode = RBBI_RUN; 1030 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1031 category = 2; 1032 mode = RBBI_START; 1033 } 1034 1035 1036 // loop until we reach the end of the text or transition to state 0 1037 // 1038 for (;;) { 1039 if (c == U_SENTINEL) { 1040 // Reached end of input string. 1041 if (mode == RBBI_END) { 1042 // We have already run the loop one last time with the 1043 // character set to the psueudo {eof} value. Now it is time 1044 // to unconditionally bail out. 1045 if (lookaheadResult > result) { 1046 // We ran off the end of the string with a pending look-ahead match. 1047 // Treat this as if the look-ahead condition had been met, and return 1048 // the match at the / position from the look-ahead rule. 1049 result = lookaheadResult; 1050 fLastRuleStatusIndex = lookaheadTagIdx; 1051 lookaheadStatus = 0; 1052 } 1053 break; 1054 } 1055 // Run the loop one last time with the fake end-of-input character category. 1056 mode = RBBI_END; 1057 category = 1; 1058 } 1059 1060 // 1061 // Get the char category. An incoming category of 1 or 2 means that 1062 // we are preset for doing the beginning or end of input, and 1063 // that we shouldn't get a category from an actual text input character. 1064 // 1065 if (mode == RBBI_RUN) { 1066 // look up the current character's character category, which tells us 1067 // which column in the state table to look at. 1068 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1069 // not the size of the character going in, which is a UChar32. 1070 // 1071 UTRIE_GET16(&fData->fTrie, c, category); 1072 1073 // Check the dictionary bit in the character's category. 1074 // Counter is only used by dictionary based iterators (subclasses). 1075 // Chars that need to be handled by a dictionary have a flag bit set 1076 // in their category values. 1077 // 1078 if ((category & 0x4000) != 0) { 1079 fDictionaryCharCount++; 1080 // And off the dictionary flag bit. 1081 category &= ~0x4000; 1082 } 1083 } 1084 1085 #ifdef RBBI_DEBUG 1086 if (fTrace) { 1087 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1088 if (0x20<=c && c<0x7f) { 1089 RBBIDebugPrintf("\"%c\" ", c); 1090 } else { 1091 RBBIDebugPrintf("%5x ", c); 1092 } 1093 RBBIDebugPrintf("%3d %3d\n", state, category); 1094 } 1095 #endif 1096 1097 // State Transition - move machine to its next state 1098 // 1099 1100 // Note: fNextState is defined as uint16_t[2], but we are casting 1101 // a generated RBBI table to RBBIStateTableRow and some tables 1102 // actually have more than 2 categories. 1103 U_ASSERT(category<fData->fHeader->fCatCount); 1104 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1105 row = (RBBIStateTableRow *) 1106 // (statetable->fTableData + (statetable->fRowLen * state)); 1107 (tableData + tableRowLen * state); 1108 1109 1110 if (row->fAccepting == -1) { 1111 // Match found, common case. 1112 if (mode != RBBI_START) { 1113 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1114 } 1115 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1116 } 1117 1118 if (row->fLookAhead != 0) { 1119 if (lookaheadStatus != 0 1120 && row->fAccepting == lookaheadStatus) { 1121 // Lookahead match is completed. 1122 result = lookaheadResult; 1123 fLastRuleStatusIndex = lookaheadTagIdx; 1124 lookaheadStatus = 0; 1125 // TODO: make a standalone hard break in a rule work. 1126 if (lookAheadHardBreak) { 1127 UTEXT_SETNATIVEINDEX(fText, result); 1128 return result; 1129 } 1130 // Look-ahead completed, but other rules may match further. Continue on 1131 // TODO: junk this feature? I don't think it's used anywhwere. 1132 goto continueOn; 1133 } 1134 1135 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1136 lookaheadResult = r; 1137 lookaheadStatus = row->fLookAhead; 1138 lookaheadTagIdx = row->fTagIdx; 1139 goto continueOn; 1140 } 1141 1142 1143 if (row->fAccepting != 0) { 1144 // Because this is an accepting state, any in-progress look-ahead match 1145 // is no longer relavant. Clear out the pending lookahead status. 1146 lookaheadStatus = 0; // clear out any pending look-ahead match. 1147 } 1148 1149 continueOn: 1150 if (state == STOP_STATE) { 1151 // This is the normal exit from the lookup state machine. 1152 // We have advanced through the string until it is certain that no 1153 // longer match is possible, no matter what characters follow. 1154 break; 1155 } 1156 1157 // Advance to the next character. 1158 // If this is a beginning-of-input loop iteration, don't advance 1159 // the input position. The next iteration will be processing the 1160 // first real input character. 1161 if (mode == RBBI_RUN) { 1162 c = UTEXT_NEXT32(fText); 1163 } else { 1164 if (mode == RBBI_START) { 1165 mode = RBBI_RUN; 1166 } 1167 } 1168 1169 1170 } 1171 1172 // The state machine is done. Check whether it found a match... 1173 1174 // If the iterator failed to advance in the match engine, force it ahead by one. 1175 // (This really indicates a defect in the break rules. They should always match 1176 // at least one character.) 1177 if (result == initialPosition) { 1178 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1179 UTEXT_NEXT32(fText); 1180 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1181 } 1182 1183 // Leave the iterator at our result position. 1184 UTEXT_SETNATIVEINDEX(fText, result); 1185 #ifdef RBBI_DEBUG 1186 if (fTrace) { 1187 RBBIDebugPrintf("result = %d\n\n", result); 1188 } 1189 #endif 1190 return result; 1191 } 1192 1193 1194 1195 //----------------------------------------------------------------------------------- 1196 // 1197 // handlePrevious() 1198 // 1199 // Iterate backwards, according to the logic of the reverse rules. 1200 // This version handles the exact style backwards rules. 1201 // 1202 // The logic of this function is very similar to handleNext(), above. 1203 // 1204 //----------------------------------------------------------------------------------- 1205 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1206 int32_t state; 1207 uint16_t category = 0; 1208 RBBIRunMode mode; 1209 RBBIStateTableRow *row; 1210 UChar32 c; 1211 int32_t lookaheadStatus = 0; 1212 int32_t result = 0; 1213 int32_t initialPosition = 0; 1214 int32_t lookaheadResult = 0; 1215 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1216 1217 #ifdef RBBI_DEBUG 1218 if (fTrace) { 1219 RBBIDebugPuts("Handle Previous pos char state category"); 1220 } 1221 #endif 1222 1223 // handlePrevious() never gets the rule status. 1224 // Flag the status as invalid; if the user ever asks for status, we will need 1225 // to back up, then re-find the break position using handleNext(), which does 1226 // get the status value. 1227 fLastStatusIndexValid = FALSE; 1228 fLastRuleStatusIndex = 0; 1229 1230 // if we're already at the start of the text, return DONE. 1231 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1232 return BreakIterator::DONE; 1233 } 1234 1235 // Set up the starting char. 1236 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1237 result = initialPosition; 1238 c = UTEXT_PREVIOUS32(fText); 1239 1240 // Set the initial state for the state machine 1241 state = START_STATE; 1242 row = (RBBIStateTableRow *) 1243 (statetable->fTableData + (statetable->fRowLen * state)); 1244 category = 3; 1245 mode = RBBI_RUN; 1246 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1247 category = 2; 1248 mode = RBBI_START; 1249 } 1250 1251 1252 // loop until we reach the start of the text or transition to state 0 1253 // 1254 for (;;) { 1255 if (c == U_SENTINEL) { 1256 // Reached end of input string. 1257 if (mode == RBBI_END) { 1258 // We have already run the loop one last time with the 1259 // character set to the psueudo {eof} value. Now it is time 1260 // to unconditionally bail out. 1261 if (lookaheadResult < result) { 1262 // We ran off the end of the string with a pending look-ahead match. 1263 // Treat this as if the look-ahead condition had been met, and return 1264 // the match at the / position from the look-ahead rule. 1265 result = lookaheadResult; 1266 lookaheadStatus = 0; 1267 } else if (result == initialPosition) { 1268 // Ran off start, no match found. 1269 // move one index one (towards the start, since we are doing a previous()) 1270 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1271 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1272 } 1273 break; 1274 } 1275 // Run the loop one last time with the fake end-of-input character category. 1276 mode = RBBI_END; 1277 category = 1; 1278 } 1279 1280 // 1281 // Get the char category. An incoming category of 1 or 2 means that 1282 // we are preset for doing the beginning or end of input, and 1283 // that we shouldn't get a category from an actual text input character. 1284 // 1285 if (mode == RBBI_RUN) { 1286 // look up the current character's character category, which tells us 1287 // which column in the state table to look at. 1288 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1289 // not the size of the character going in, which is a UChar32. 1290 // 1291 UTRIE_GET16(&fData->fTrie, c, category); 1292 1293 // Check the dictionary bit in the character's category. 1294 // Counter is only used by dictionary based iterators (subclasses). 1295 // Chars that need to be handled by a dictionary have a flag bit set 1296 // in their category values. 1297 // 1298 if ((category & 0x4000) != 0) { 1299 fDictionaryCharCount++; 1300 // And off the dictionary flag bit. 1301 category &= ~0x4000; 1302 } 1303 } 1304 1305 #ifdef RBBI_DEBUG 1306 if (fTrace) { 1307 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1308 if (0x20<=c && c<0x7f) { 1309 RBBIDebugPrintf("\"%c\" ", c); 1310 } else { 1311 RBBIDebugPrintf("%5x ", c); 1312 } 1313 RBBIDebugPrintf("%3d %3d\n", state, category); 1314 } 1315 #endif 1316 1317 // State Transition - move machine to its next state 1318 // 1319 1320 // Note: fNextState is defined as uint16_t[2], but we are casting 1321 // a generated RBBI table to RBBIStateTableRow and some tables 1322 // actually have more than 2 categories. 1323 U_ASSERT(category<fData->fHeader->fCatCount); 1324 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1325 row = (RBBIStateTableRow *) 1326 (statetable->fTableData + (statetable->fRowLen * state)); 1327 1328 if (row->fAccepting == -1) { 1329 // Match found, common case. 1330 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1331 } 1332 1333 if (row->fLookAhead != 0) { 1334 if (lookaheadStatus != 0 1335 && row->fAccepting == lookaheadStatus) { 1336 // Lookahead match is completed. 1337 result = lookaheadResult; 1338 lookaheadStatus = 0; 1339 // TODO: make a standalone hard break in a rule work. 1340 if (lookAheadHardBreak) { 1341 UTEXT_SETNATIVEINDEX(fText, result); 1342 return result; 1343 } 1344 // Look-ahead completed, but other rules may match further. Continue on 1345 // TODO: junk this feature? I don't think it's used anywhwere. 1346 goto continueOn; 1347 } 1348 1349 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1350 lookaheadResult = r; 1351 lookaheadStatus = row->fLookAhead; 1352 goto continueOn; 1353 } 1354 1355 1356 if (row->fAccepting != 0) { 1357 // Because this is an accepting state, any in-progress look-ahead match 1358 // is no longer relavant. Clear out the pending lookahead status. 1359 lookaheadStatus = 0; 1360 } 1361 1362 continueOn: 1363 if (state == STOP_STATE) { 1364 // This is the normal exit from the lookup state machine. 1365 // We have advanced through the string until it is certain that no 1366 // longer match is possible, no matter what characters follow. 1367 break; 1368 } 1369 1370 // Move (backwards) to the next character to process. 1371 // If this is a beginning-of-input loop iteration, don't advance 1372 // the input position. The next iteration will be processing the 1373 // first real input character. 1374 if (mode == RBBI_RUN) { 1375 c = UTEXT_PREVIOUS32(fText); 1376 } else { 1377 if (mode == RBBI_START) { 1378 mode = RBBI_RUN; 1379 } 1380 } 1381 } 1382 1383 // The state machine is done. Check whether it found a match... 1384 1385 // If the iterator failed to advance in the match engine, force it ahead by one. 1386 // (This really indicates a defect in the break rules. They should always match 1387 // at least one character.) 1388 if (result == initialPosition) { 1389 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1390 UTEXT_PREVIOUS32(fText); 1391 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1392 } 1393 1394 // Leave the iterator at our result position. 1395 UTEXT_SETNATIVEINDEX(fText, result); 1396 #ifdef RBBI_DEBUG 1397 if (fTrace) { 1398 RBBIDebugPrintf("result = %d\n\n", result); 1399 } 1400 #endif 1401 return result; 1402 } 1403 1404 1405 void 1406 RuleBasedBreakIterator::reset() 1407 { 1408 if (fCachedBreakPositions) { 1409 uprv_free(fCachedBreakPositions); 1410 } 1411 fCachedBreakPositions = NULL; 1412 fNumCachedBreakPositions = 0; 1413 fDictionaryCharCount = 0; 1414 fPositionInCache = 0; 1415 } 1416 1417 1418 1419 //------------------------------------------------------------------------------- 1420 // 1421 // getRuleStatus() Return the break rule tag associated with the current 1422 // iterator position. If the iterator arrived at its current 1423 // position by iterating forwards, the value will have been 1424 // cached by the handleNext() function. 1425 // 1426 // If no cached status value is available, the status is 1427 // found by doing a previous() followed by a next(), which 1428 // leaves the iterator where it started, and computes the 1429 // status while doing the next(). 1430 // 1431 //------------------------------------------------------------------------------- 1432 void RuleBasedBreakIterator::makeRuleStatusValid() { 1433 if (fLastStatusIndexValid == FALSE) { 1434 // No cached status is available. 1435 if (fText == NULL || current() == 0) { 1436 // At start of text, or there is no text. Status is always zero. 1437 fLastRuleStatusIndex = 0; 1438 fLastStatusIndexValid = TRUE; 1439 } else { 1440 // Not at start of text. Find status the tedious way. 1441 int32_t pa = current(); 1442 previous(); 1443 if (fNumCachedBreakPositions > 0) { 1444 reset(); // Blow off the dictionary cache 1445 } 1446 int32_t pb = next(); 1447 if (pa != pb) { 1448 // note: the if (pa != pb) test is here only to eliminate warnings for 1449 // unused local variables on gcc. Logically, it isn't needed. 1450 U_ASSERT(pa == pb); 1451 } 1452 } 1453 } 1454 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1455 } 1456 1457 1458 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1459 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1460 nonConstThis->makeRuleStatusValid(); 1461 1462 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1463 // (the number of status values.) 1464 // This function returns the last (largest) of the array of status values. 1465 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1466 int32_t tagVal = fData->fRuleStatusTable[idx]; 1467 1468 return tagVal; 1469 } 1470 1471 1472 1473 1474 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1475 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1476 { 1477 if (U_FAILURE(status)) { 1478 return 0; 1479 } 1480 1481 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1482 nonConstThis->makeRuleStatusValid(); 1483 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1484 int32_t numValsToCopy = numVals; 1485 if (numVals > capacity) { 1486 status = U_BUFFER_OVERFLOW_ERROR; 1487 numValsToCopy = capacity; 1488 } 1489 int i; 1490 for (i=0; i<numValsToCopy; i++) { 1491 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1492 } 1493 return numVals; 1494 } 1495 1496 1497 1498 //------------------------------------------------------------------------------- 1499 // 1500 // getBinaryRules Access to the compiled form of the rules, 1501 // for use by build system tools that save the data 1502 // for standard iterator types. 1503 // 1504 //------------------------------------------------------------------------------- 1505 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1506 const uint8_t *retPtr = NULL; 1507 length = 0; 1508 1509 if (fData != NULL) { 1510 retPtr = (const uint8_t *)fData->fHeader; 1511 length = fData->fHeader->fLength; 1512 } 1513 return retPtr; 1514 } 1515 1516 1517 BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, 1518 int32_t &bufferSize, 1519 UErrorCode &status) 1520 { 1521 if (U_FAILURE(status)){ 1522 return NULL; 1523 } 1524 1525 if (bufferSize == 0) { 1526 bufferSize = 1; // preflighting for deprecated functionality 1527 return NULL; 1528 } 1529 1530 BreakIterator *clonedBI = clone(); 1531 if (clonedBI == NULL) { 1532 status = U_MEMORY_ALLOCATION_ERROR; 1533 } else { 1534 status = U_SAFECLONE_ALLOCATED_WARNING; 1535 } 1536 return (RuleBasedBreakIterator *)clonedBI; 1537 } 1538 1539 1540 //------------------------------------------------------------------------------- 1541 // 1542 // isDictionaryChar Return true if the category lookup for this char 1543 // indicates that it is in the set of dictionary lookup 1544 // chars. 1545 // 1546 // This function is intended for use by dictionary based 1547 // break iterators. 1548 // 1549 //------------------------------------------------------------------------------- 1550 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1551 if (fData == NULL) { 1552 return FALSE; 1553 } 1554 uint16_t category; 1555 UTRIE_GET16(&fData->fTrie, c, category); 1556 return (category & 0x4000) != 0; 1557 }*/ 1558 1559 1560 //------------------------------------------------------------------------------- 1561 // 1562 // checkDictionary This function handles all processing of characters in 1563 // the "dictionary" set. It will determine the appropriate 1564 // course of action, and possibly set up a cache in the 1565 // process. 1566 // 1567 //------------------------------------------------------------------------------- 1568 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1569 int32_t endPos, 1570 UBool reverse) { 1571 // Reset the old break cache first. 1572 reset(); 1573 1574 // note: code segment below assumes that dictionary chars are in the 1575 // startPos-endPos range 1576 // value returned should be next character in sequence 1577 if ((endPos - startPos) <= 1) { 1578 return (reverse ? startPos : endPos); 1579 } 1580 1581 // Bug 5532. The dictionary code will crash if the input text is UTF-8 1582 // because native indexes are different from UTF-16 indexes. 1583 // Temporary hack: skip dictionary lookup for UTF-8 encoded text. 1584 // It wont give the right breaks, but it's better than a crash. 1585 // 1586 // Check the type of the UText by checking its pFuncs field, which 1587 // is UText's function dispatch table. It will be the same for all 1588 // UTF-8 UTexts and different for any other UText type. 1589 // 1590 // We have no other type of UText available with non-UTF-16 native indexing. 1591 // This whole check will go away once the dictionary code is fixed. 1592 static const void *utext_utf8Funcs; 1593 if (utext_utf8Funcs == NULL) { 1594 // Cache the UTF-8 UText function pointer value. 1595 UErrorCode status = U_ZERO_ERROR; 1596 UText tempUText = UTEXT_INITIALIZER; 1597 utext_openUTF8(&tempUText, NULL, 0, &status); 1598 utext_utf8Funcs = tempUText.pFuncs; 1599 utext_close(&tempUText); 1600 } 1601 if (fText->pFuncs == utext_utf8Funcs) { 1602 return (reverse ? startPos : endPos); 1603 } 1604 1605 // Starting from the starting point, scan towards the proposed result, 1606 // looking for the first dictionary character (which may be the one 1607 // we're on, if we're starting in the middle of a range). 1608 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1609 if (reverse) { 1610 UTEXT_PREVIOUS32(fText); 1611 } 1612 1613 int32_t rangeStart = startPos; 1614 int32_t rangeEnd = endPos; 1615 1616 uint16_t category; 1617 int32_t current; 1618 UErrorCode status = U_ZERO_ERROR; 1619 UStack breaks(status); 1620 int32_t foundBreakCount = 0; 1621 UChar32 c = utext_current32(fText); 1622 1623 UTRIE_GET16(&fData->fTrie, c, category); 1624 1625 // Is the character we're starting on a dictionary character? If so, we 1626 // need to back up to include the entire run; otherwise the results of 1627 // the break algorithm will differ depending on where we start. Since 1628 // the result is cached and there is typically a non-dictionary break 1629 // within a small number of words, there should be little performance impact. 1630 if (category & 0x4000) { 1631 if (reverse) { 1632 do { 1633 utext_next32(fText); // TODO: recast to work directly with postincrement. 1634 c = utext_current32(fText); 1635 UTRIE_GET16(&fData->fTrie, c, category); 1636 } while (c != U_SENTINEL && (category & 0x4000)); 1637 // Back up to the last dictionary character 1638 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1639 if (c == U_SENTINEL) { 1640 // c = fText->last32(); 1641 // TODO: why was this if needed? 1642 c = UTEXT_PREVIOUS32(fText); 1643 } 1644 else { 1645 c = UTEXT_PREVIOUS32(fText); 1646 } 1647 } 1648 else { 1649 do { 1650 c = UTEXT_PREVIOUS32(fText); 1651 UTRIE_GET16(&fData->fTrie, c, category); 1652 } 1653 while (c != U_SENTINEL && (category & 0x4000)); 1654 // Back up to the last dictionary character 1655 if (c == U_SENTINEL) { 1656 // c = fText->first32(); 1657 c = utext_current32(fText); 1658 } 1659 else { 1660 utext_next32(fText); 1661 c = utext_current32(fText); 1662 } 1663 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1664 } 1665 UTRIE_GET16(&fData->fTrie, c, category); 1666 } 1667 1668 // Loop through the text, looking for ranges of dictionary characters. 1669 // For each span, find the appropriate break engine, and ask it to find 1670 // any breaks within the span. 1671 // Note: we always do this in the forward direction, so that the break 1672 // cache is built in the right order. 1673 if (reverse) { 1674 utext_setNativeIndex(fText, rangeStart); 1675 c = utext_current32(fText); 1676 UTRIE_GET16(&fData->fTrie, c, category); 1677 } 1678 while(U_SUCCESS(status)) { 1679 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1680 utext_next32(fText); // TODO: tweak for post-increment operation 1681 c = utext_current32(fText); 1682 UTRIE_GET16(&fData->fTrie, c, category); 1683 } 1684 if (current >= rangeEnd) { 1685 break; 1686 } 1687 1688 // We now have a dictionary character. Get the appropriate language object 1689 // to deal with it. 1690 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1691 1692 // Ask the language object if there are any breaks. It will leave the text 1693 // pointer on the other side of its range, ready to search for the next one. 1694 if (lbe != NULL) { 1695 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1696 } 1697 1698 // Reload the loop variables for the next go-round 1699 c = utext_current32(fText); 1700 UTRIE_GET16(&fData->fTrie, c, category); 1701 } 1702 1703 // If we found breaks, build a new break cache. The first and last entries must 1704 // be the original starting and ending position. 1705 if (foundBreakCount > 0) { 1706 U_ASSERT(foundBreakCount == breaks.size()); 1707 int32_t totalBreaks = foundBreakCount; 1708 if (startPos < breaks.elementAti(0)) { 1709 totalBreaks += 1; 1710 } 1711 if (endPos > breaks.peeki()) { 1712 totalBreaks += 1; 1713 } 1714 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1715 if (fCachedBreakPositions != NULL) { 1716 int32_t out = 0; 1717 fNumCachedBreakPositions = totalBreaks; 1718 if (startPos < breaks.elementAti(0)) { 1719 fCachedBreakPositions[out++] = startPos; 1720 } 1721 for (int32_t i = 0; i < foundBreakCount; ++i) { 1722 fCachedBreakPositions[out++] = breaks.elementAti(i); 1723 } 1724 if (endPos > fCachedBreakPositions[out-1]) { 1725 fCachedBreakPositions[out] = endPos; 1726 } 1727 // If there are breaks, then by definition, we are replacing the original 1728 // proposed break by one of the breaks we found. Use following() and 1729 // preceding() to do the work. They should never recurse in this case. 1730 if (reverse) { 1731 return preceding(endPos); 1732 } 1733 else { 1734 return following(startPos); 1735 } 1736 } 1737 // If the allocation failed, just fall through to the "no breaks found" case. 1738 } 1739 1740 // If we get here, there were no language-based breaks. Set the text pointer 1741 // to the original proposed break. 1742 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1743 return (reverse ? startPos : endPos); 1744 } 1745 1746 // defined in ucln_cmn.h 1747 1748 U_NAMESPACE_END 1749 1750 1751 static icu::UStack *gLanguageBreakFactories = NULL; 1752 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; 1753 1754 /** 1755 * Release all static memory held by breakiterator. 1756 */ 1757 U_CDECL_BEGIN 1758 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1759 if (gLanguageBreakFactories) { 1760 delete gLanguageBreakFactories; 1761 gLanguageBreakFactories = NULL; 1762 } 1763 gLanguageBreakFactoriesInitOnce.reset(); 1764 return TRUE; 1765 } 1766 U_CDECL_END 1767 1768 U_CDECL_BEGIN 1769 static void U_CALLCONV _deleteFactory(void *obj) { 1770 delete (icu::LanguageBreakFactory *) obj; 1771 } 1772 U_CDECL_END 1773 U_NAMESPACE_BEGIN 1774 1775 static void U_CALLCONV initLanguageFactories() { 1776 UErrorCode status = U_ZERO_ERROR; 1777 U_ASSERT(gLanguageBreakFactories == NULL); 1778 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); 1779 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { 1780 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1781 gLanguageBreakFactories->push(builtIn, status); 1782 #ifdef U_LOCAL_SERVICE_HOOK 1783 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1784 if (extra != NULL) { 1785 gLanguageBreakFactories->push(extra, status); 1786 } 1787 #endif 1788 } 1789 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1790 } 1791 1792 1793 static const LanguageBreakEngine* 1794 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1795 { 1796 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); 1797 if (gLanguageBreakFactories == NULL) { 1798 return NULL; 1799 } 1800 1801 int32_t i = gLanguageBreakFactories->size(); 1802 const LanguageBreakEngine *lbe = NULL; 1803 while (--i >= 0) { 1804 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1805 lbe = factory->getEngineFor(c, breakType); 1806 if (lbe != NULL) { 1807 break; 1808 } 1809 } 1810 return lbe; 1811 } 1812 1813 1814 //------------------------------------------------------------------------------- 1815 // 1816 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1817 // the character c. 1818 // 1819 //------------------------------------------------------------------------------- 1820 const LanguageBreakEngine * 1821 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1822 const LanguageBreakEngine *lbe = NULL; 1823 UErrorCode status = U_ZERO_ERROR; 1824 1825 if (fLanguageBreakEngines == NULL) { 1826 fLanguageBreakEngines = new UStack(status); 1827 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1828 delete fLanguageBreakEngines; 1829 fLanguageBreakEngines = 0; 1830 return NULL; 1831 } 1832 } 1833 1834 int32_t i = fLanguageBreakEngines->size(); 1835 while (--i >= 0) { 1836 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1837 if (lbe->handles(c, fBreakType)) { 1838 return lbe; 1839 } 1840 } 1841 1842 // No existing dictionary took the character. See if a factory wants to 1843 // give us a new LanguageBreakEngine for this character. 1844 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1845 1846 // If we got one, use it and push it on our stack. 1847 if (lbe != NULL) { 1848 fLanguageBreakEngines->push((void *)lbe, status); 1849 // Even if we can't remember it, we can keep looking it up, so 1850 // return it even if the push fails. 1851 return lbe; 1852 } 1853 1854 // No engine is forthcoming for this character. Add it to the 1855 // reject set. Create the reject break engine if needed. 1856 if (fUnhandledBreakEngine == NULL) { 1857 fUnhandledBreakEngine = new UnhandledEngine(status); 1858 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1859 status = U_MEMORY_ALLOCATION_ERROR; 1860 } 1861 // Put it last so that scripts for which we have an engine get tried 1862 // first. 1863 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1864 // If we can't insert it, or creation failed, get rid of it 1865 if (U_FAILURE(status)) { 1866 delete fUnhandledBreakEngine; 1867 fUnhandledBreakEngine = 0; 1868 return NULL; 1869 } 1870 } 1871 1872 // Tell the reject engine about the character; at its discretion, it may 1873 // add more than just the one character. 1874 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1875 1876 return fUnhandledBreakEngine; 1877 } 1878 1879 1880 1881 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1882 return fBreakType; 1883 }*/ 1884 1885 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1886 fBreakType = type; 1887 reset(); 1888 } 1889 1890 U_NAMESPACE_END 1891 1892 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1893