1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2012 International Business Machines Corporation 4 * and others. All rights reserved. 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include "utypeinfo.h" // for 'typeid' to work 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/rbbi.h" 20 #include "unicode/schriter.h" 21 #include "unicode/uchriter.h" 22 #include "unicode/udata.h" 23 #include "unicode/uclean.h" 24 #include "rbbidata.h" 25 #include "rbbirb.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "umutex.h" 29 #include "ucln_cmn.h" 30 #include "brkeng.h" 31 32 #include "uassert.h" 33 #include "uvector.h" 34 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 36 #if U_LOCAL_SERVICE_HOOK 37 #include "localsvc.h" 38 #endif 39 40 #ifdef RBBI_DEBUG 41 static UBool fTrace = FALSE; 42 #endif 43 44 U_NAMESPACE_BEGIN 45 46 // The state number of the starting state 47 #define START_STATE 1 48 49 // The state-transition value indicating "stop" 50 #define STOP_STATE 0 51 52 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 54 55 56 //======================================================================= 57 // constructors 58 //======================================================================= 59 60 /** 61 * Constructs a RuleBasedBreakIterator that uses the already-created 62 * tables object that is passed in as a parameter. 63 */ 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 65 { 66 init(); 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor 68 if (U_FAILURE(status)) {return;} 69 if(fData == 0) { 70 status = U_MEMORY_ALLOCATION_ERROR; 71 return; 72 } 73 } 74 75 /** 76 * Same as above but does not adopt memory 77 */ 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 79 { 80 init(); 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 82 if (U_FAILURE(status)) {return;} 83 if(fData == 0) { 84 status = U_MEMORY_ALLOCATION_ERROR; 85 return; 86 } 87 } 88 89 90 // 91 // Construct from precompiled binary rules (tables). This constructor is public API, 92 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 93 // 94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 95 uint32_t ruleLength, 96 UErrorCode &status) { 97 init(); 98 if (U_FAILURE(status)) { 99 return; 100 } 101 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 102 status = U_ILLEGAL_ARGUMENT_ERROR; 103 return; 104 } 105 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 106 if (data->fLength > ruleLength) { 107 status = U_ILLEGAL_ARGUMENT_ERROR; 108 return; 109 } 110 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 111 if (U_FAILURE(status)) {return;} 112 if(fData == 0) { 113 status = U_MEMORY_ALLOCATION_ERROR; 114 return; 115 } 116 } 117 118 119 //------------------------------------------------------------------------------- 120 // 121 // Constructor from a UDataMemory handle to precompiled break rules 122 // stored in an ICU data file. 123 // 124 //------------------------------------------------------------------------------- 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 126 { 127 init(); 128 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 129 if (U_FAILURE(status)) {return;} 130 if(fData == 0) { 131 status = U_MEMORY_ALLOCATION_ERROR; 132 return; 133 } 134 } 135 136 137 138 //------------------------------------------------------------------------------- 139 // 140 // Constructor from a set of rules supplied as a string. 141 // 142 //------------------------------------------------------------------------------- 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 144 UParseError &parseError, 145 UErrorCode &status) 146 { 147 init(); 148 if (U_FAILURE(status)) {return;} 149 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 150 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 151 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 152 // creates and returns a complete RBBI. From here, in a constructor, we 153 // can't just return the object created by the builder factory, hence 154 // the assignment of the factory created object to "this". 155 if (U_SUCCESS(status)) { 156 *this = *bi; 157 delete bi; 158 } 159 } 160 161 162 //------------------------------------------------------------------------------- 163 // 164 // Default Constructor. Create an empty shell that can be set up later. 165 // Used when creating a RuleBasedBreakIterator from a set 166 // of rules. 167 //------------------------------------------------------------------------------- 168 RuleBasedBreakIterator::RuleBasedBreakIterator() { 169 init(); 170 } 171 172 173 //------------------------------------------------------------------------------- 174 // 175 // Copy constructor. Will produce a break iterator with the same behavior, 176 // and which iterates over the same text, as the one passed in. 177 // 178 //------------------------------------------------------------------------------- 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 180 : BreakIterator(other) 181 { 182 this->init(); 183 *this = other; 184 } 185 186 187 /** 188 * Destructor 189 */ 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 191 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 192 // fCharIter was adopted from the outside. 193 delete fCharIter; 194 } 195 fCharIter = NULL; 196 delete fSCharIter; 197 fCharIter = NULL; 198 delete fDCharIter; 199 fDCharIter = NULL; 200 201 utext_close(fText); 202 203 if (fData != NULL) { 204 fData->removeReference(); 205 fData = NULL; 206 } 207 if (fCachedBreakPositions) { 208 uprv_free(fCachedBreakPositions); 209 fCachedBreakPositions = NULL; 210 } 211 if (fLanguageBreakEngines) { 212 delete fLanguageBreakEngines; 213 fLanguageBreakEngines = NULL; 214 } 215 if (fUnhandledBreakEngine) { 216 delete fUnhandledBreakEngine; 217 fUnhandledBreakEngine = NULL; 218 } 219 } 220 221 /** 222 * Assignment operator. Sets this iterator to have the same behavior, 223 * and iterate over the same text, as the one passed in. 224 */ 225 RuleBasedBreakIterator& 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 227 if (this == &that) { 228 return *this; 229 } 230 reset(); // Delete break cache information 231 fBreakType = that.fBreakType; 232 if (fLanguageBreakEngines != NULL) { 233 delete fLanguageBreakEngines; 234 fLanguageBreakEngines = NULL; // Just rebuild for now 235 } 236 // TODO: clone fLanguageBreakEngines from "that" 237 UErrorCode status = U_ZERO_ERROR; 238 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 239 240 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 241 delete fCharIter; 242 } 243 fCharIter = NULL; 244 245 if (that.fCharIter != NULL ) { 246 // This is a little bit tricky - it will intially appear that 247 // this->fCharIter is adopted, even if that->fCharIter was 248 // not adopted. That's ok. 249 fCharIter = that.fCharIter->clone(); 250 } 251 252 if (fData != NULL) { 253 fData->removeReference(); 254 fData = NULL; 255 } 256 if (that.fData != NULL) { 257 fData = that.fData->addReference(); 258 } 259 260 return *this; 261 } 262 263 264 265 //----------------------------------------------------------------------------- 266 // 267 // init() Shared initialization routine. Used by all the constructors. 268 // Initializes all fields, leaving the object in a consistent state. 269 // 270 //----------------------------------------------------------------------------- 271 void RuleBasedBreakIterator::init() { 272 UErrorCode status = U_ZERO_ERROR; 273 fBufferClone = FALSE; 274 fText = utext_openUChars(NULL, NULL, 0, &status); 275 fCharIter = NULL; 276 fSCharIter = NULL; 277 fDCharIter = NULL; 278 fData = NULL; 279 fLastRuleStatusIndex = 0; 280 fLastStatusIndexValid = TRUE; 281 fDictionaryCharCount = 0; 282 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 283 // dictionary behavior for Break Iterators that are 284 // built from rules. Even better would be the ability to 285 // declare the type in the rules. 286 287 fCachedBreakPositions = NULL; 288 fLanguageBreakEngines = NULL; 289 fUnhandledBreakEngine = NULL; 290 fNumCachedBreakPositions = 0; 291 fPositionInCache = 0; 292 293 #ifdef RBBI_DEBUG 294 static UBool debugInitDone = FALSE; 295 if (debugInitDone == FALSE) { 296 char *debugEnv = getenv("U_RBBIDEBUG"); 297 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 298 fTrace = TRUE; 299 } 300 debugInitDone = TRUE; 301 } 302 #endif 303 } 304 305 306 307 //----------------------------------------------------------------------------- 308 // 309 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 310 // behavior, and iterating over the same text, as this one. 311 // Virtual function: does the right thing with subclasses. 312 // 313 //----------------------------------------------------------------------------- 314 BreakIterator* 315 RuleBasedBreakIterator::clone(void) const { 316 return new RuleBasedBreakIterator(*this); 317 } 318 319 /** 320 * Equality operator. Returns TRUE if both BreakIterators are of the 321 * same class, have the same behavior, and iterate over the same text. 322 */ 323 UBool 324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 325 if (typeid(*this) != typeid(that)) { 326 return FALSE; 327 } 328 329 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 330 331 if (!utext_equals(fText, that2.fText)) { 332 // The two break iterators are operating on different text, 333 // or have a different interation position. 334 return FALSE; 335 }; 336 337 // TODO: need a check for when in a dictionary region at different offsets. 338 339 if (that2.fData == fData || 340 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 341 // The two break iterators are using the same rules. 342 return TRUE; 343 } 344 return FALSE; 345 } 346 347 /** 348 * Compute a hash code for this BreakIterator 349 * @return A hash code 350 */ 351 int32_t 352 RuleBasedBreakIterator::hashCode(void) const { 353 int32_t hash = 0; 354 if (fData != NULL) { 355 hash = fData->hashCode(); 356 } 357 return hash; 358 } 359 360 361 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 362 if (U_FAILURE(status)) { 363 return; 364 } 365 reset(); 366 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 367 368 // Set up a dummy CharacterIterator to be returned if anyone 369 // calls getText(). With input from UText, there is no reasonable 370 // way to return a characterIterator over the actual input text. 371 // Return one over an empty string instead - this is the closest 372 // we can come to signaling a failure. 373 // (GetText() is obsolete, this failure is sort of OK) 374 if (fDCharIter == NULL) { 375 static const UChar c = 0; 376 fDCharIter = new UCharCharacterIterator(&c, 0); 377 if (fDCharIter == NULL) { 378 status = U_MEMORY_ALLOCATION_ERROR; 379 return; 380 } 381 } 382 383 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 384 // existing fCharIter was adopted from the outside. Delete it now. 385 delete fCharIter; 386 } 387 fCharIter = fDCharIter; 388 389 this->first(); 390 } 391 392 393 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 394 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 395 return result; 396 } 397 398 399 400 /** 401 * Returns the description used to create this iterator 402 */ 403 const UnicodeString& 404 RuleBasedBreakIterator::getRules() const { 405 if (fData != NULL) { 406 return fData->getRuleSourceString(); 407 } else { 408 static const UnicodeString *s; 409 if (s == NULL) { 410 // TODO: something more elegant here. 411 // perhaps API should return the string by value. 412 // Note: thread unsafe init & leak are semi-ok, better than 413 // what was before. Sould be cleaned up, though. 414 s = new UnicodeString; 415 } 416 return *s; 417 } 418 } 419 420 //======================================================================= 421 // BreakIterator overrides 422 //======================================================================= 423 424 /** 425 * Return a CharacterIterator over the text being analyzed. 426 */ 427 CharacterIterator& 428 RuleBasedBreakIterator::getText() const { 429 return *fCharIter; 430 } 431 432 /** 433 * Set the iterator to analyze a new piece of text. This function resets 434 * the current iteration position to the beginning of the text. 435 * @param newText An iterator over the text to analyze. 436 */ 437 void 438 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 439 // If we are holding a CharacterIterator adopted from a 440 // previous call to this function, delete it now. 441 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 442 delete fCharIter; 443 } 444 445 fCharIter = newText; 446 UErrorCode status = U_ZERO_ERROR; 447 reset(); 448 if (newText==NULL || newText->startIndex() != 0) { 449 // startIndex !=0 wants to be an error, but there's no way to report it. 450 // Make the iterator text be an empty string. 451 fText = utext_openUChars(fText, NULL, 0, &status); 452 } else { 453 fText = utext_openCharacterIterator(fText, newText, &status); 454 } 455 this->first(); 456 } 457 458 /** 459 * Set the iterator to analyze a new piece of text. This function resets 460 * the current iteration position to the beginning of the text. 461 * @param newText An iterator over the text to analyze. 462 */ 463 void 464 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 465 UErrorCode status = U_ZERO_ERROR; 466 reset(); 467 fText = utext_openConstUnicodeString(fText, &newText, &status); 468 469 // Set up a character iterator on the string. 470 // Needed in case someone calls getText(). 471 // Can not, unfortunately, do this lazily on the (probably never) 472 // call to getText(), because getText is const. 473 if (fSCharIter == NULL) { 474 fSCharIter = new StringCharacterIterator(newText); 475 } else { 476 fSCharIter->setText(newText); 477 } 478 479 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 480 // old fCharIter was adopted from the outside. Delete it. 481 delete fCharIter; 482 } 483 fCharIter = fSCharIter; 484 485 this->first(); 486 } 487 488 489 /** 490 * Provide a new UText for the input text. Must reference text with contents identical 491 * to the original. 492 * Intended for use with text data originating in Java (garbage collected) environments 493 * where the data may be moved in memory at arbitrary times. 494 */ 495 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 496 if (U_FAILURE(status)) { 497 return *this; 498 } 499 if (input == NULL) { 500 status = U_ILLEGAL_ARGUMENT_ERROR; 501 return *this; 502 } 503 int64_t pos = utext_getNativeIndex(fText); 504 // Shallow read-only clone of the new UText into the existing input UText 505 fText = utext_clone(fText, input, FALSE, TRUE, &status); 506 if (U_FAILURE(status)) { 507 return *this; 508 } 509 utext_setNativeIndex(fText, pos); 510 if (utext_getNativeIndex(fText) != pos) { 511 // Sanity check. The new input utext is supposed to have the exact same 512 // contents as the old. If we can't set to the same position, it doesn't. 513 // The contents underlying the old utext might be invalid at this point, 514 // so it's not safe to check directly. 515 status = U_ILLEGAL_ARGUMENT_ERROR; 516 } 517 return *this; 518 } 519 520 521 /** 522 * Sets the current iteration position to the beginning of the text. 523 * @return The offset of the beginning of the text. 524 */ 525 int32_t RuleBasedBreakIterator::first(void) { 526 reset(); 527 fLastRuleStatusIndex = 0; 528 fLastStatusIndexValid = TRUE; 529 //if (fText == NULL) 530 // return BreakIterator::DONE; 531 532 utext_setNativeIndex(fText, 0); 533 return 0; 534 } 535 536 /** 537 * Sets the current iteration position to the end of the text. 538 * @return The text's past-the-end offset. 539 */ 540 int32_t RuleBasedBreakIterator::last(void) { 541 reset(); 542 if (fText == NULL) { 543 fLastRuleStatusIndex = 0; 544 fLastStatusIndexValid = TRUE; 545 return BreakIterator::DONE; 546 } 547 548 fLastStatusIndexValid = FALSE; 549 int32_t pos = (int32_t)utext_nativeLength(fText); 550 utext_setNativeIndex(fText, pos); 551 return pos; 552 } 553 554 /** 555 * Advances the iterator either forward or backward the specified number of steps. 556 * Negative values move backward, and positive values move forward. This is 557 * equivalent to repeatedly calling next() or previous(). 558 * @param n The number of steps to move. The sign indicates the direction 559 * (negative is backwards, and positive is forwards). 560 * @return The character offset of the boundary position n boundaries away from 561 * the current one. 562 */ 563 int32_t RuleBasedBreakIterator::next(int32_t n) { 564 int32_t result = current(); 565 while (n > 0) { 566 result = next(); 567 --n; 568 } 569 while (n < 0) { 570 result = previous(); 571 ++n; 572 } 573 return result; 574 } 575 576 /** 577 * Advances the iterator to the next boundary position. 578 * @return The position of the first boundary after this one. 579 */ 580 int32_t RuleBasedBreakIterator::next(void) { 581 // if we have cached break positions and we're still in the range 582 // covered by them, just move one step forward in the cache 583 if (fCachedBreakPositions != NULL) { 584 if (fPositionInCache < fNumCachedBreakPositions - 1) { 585 ++fPositionInCache; 586 int32_t pos = fCachedBreakPositions[fPositionInCache]; 587 utext_setNativeIndex(fText, pos); 588 return pos; 589 } 590 else { 591 reset(); 592 } 593 } 594 595 int32_t startPos = current(); 596 int32_t result = handleNext(fData->fForwardTable); 597 if (fDictionaryCharCount > 0) { 598 result = checkDictionary(startPos, result, FALSE); 599 } 600 return result; 601 } 602 603 /** 604 * Advances the iterator backwards, to the last boundary preceding this one. 605 * @return The position of the last boundary position preceding this one. 606 */ 607 int32_t RuleBasedBreakIterator::previous(void) { 608 int32_t result; 609 int32_t startPos; 610 611 // if we have cached break positions and we're still in the range 612 // covered by them, just move one step backward in the cache 613 if (fCachedBreakPositions != NULL) { 614 if (fPositionInCache > 0) { 615 --fPositionInCache; 616 // If we're at the beginning of the cache, need to reevaluate the 617 // rule status 618 if (fPositionInCache <= 0) { 619 fLastStatusIndexValid = FALSE; 620 } 621 int32_t pos = fCachedBreakPositions[fPositionInCache]; 622 utext_setNativeIndex(fText, pos); 623 return pos; 624 } 625 else { 626 reset(); 627 } 628 } 629 630 // if we're already sitting at the beginning of the text, return DONE 631 if (fText == NULL || (startPos = current()) == 0) { 632 fLastRuleStatusIndex = 0; 633 fLastStatusIndexValid = TRUE; 634 return BreakIterator::DONE; 635 } 636 637 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 638 result = handlePrevious(fData->fReverseTable); 639 if (fDictionaryCharCount > 0) { 640 result = checkDictionary(result, startPos, TRUE); 641 } 642 return result; 643 } 644 645 // old rule syntax 646 // set things up. handlePrevious() will back us up to some valid 647 // break position before the current position (we back our internal 648 // iterator up one step to prevent handlePrevious() from returning 649 // the current position), but not necessarily the last one before 650 651 // where we started 652 653 int32_t start = current(); 654 655 (void)UTEXT_PREVIOUS32(fText); 656 int32_t lastResult = handlePrevious(fData->fReverseTable); 657 if (lastResult == UBRK_DONE) { 658 lastResult = 0; 659 utext_setNativeIndex(fText, 0); 660 } 661 result = lastResult; 662 int32_t lastTag = 0; 663 UBool breakTagValid = FALSE; 664 665 // iterate forward from the known break position until we pass our 666 // starting point. The last break position before the starting 667 // point is our return value 668 669 for (;;) { 670 result = next(); 671 if (result == BreakIterator::DONE || result >= start) { 672 break; 673 } 674 lastResult = result; 675 lastTag = fLastRuleStatusIndex; 676 breakTagValid = TRUE; 677 } 678 679 // fLastBreakTag wants to have the value for section of text preceding 680 // the result position that we are to return (in lastResult.) If 681 // the backwards rules overshot and the above loop had to do two or more 682 // next()s to move up to the desired return position, we will have a valid 683 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 684 // we wont have a tag value for that position, which is only set by handleNext(). 685 686 // set the current iteration position to be the last break position 687 // before where we started, and then return that value 688 utext_setNativeIndex(fText, lastResult); 689 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 690 fLastStatusIndexValid = breakTagValid; 691 692 // No need to check the dictionary; it will have been handled by 693 // next() 694 695 return lastResult; 696 } 697 698 /** 699 * Sets the iterator to refer to the first boundary position following 700 * the specified position. 701 * @offset The position from which to begin searching for a break position. 702 * @return The position of the first break after the current position. 703 */ 704 int32_t RuleBasedBreakIterator::following(int32_t offset) { 705 // if we have cached break positions and offset is in the range 706 // covered by them, use them 707 // TODO: could use binary search 708 // TODO: what if offset is outside range, but break is not? 709 if (fCachedBreakPositions != NULL) { 710 if (offset >= fCachedBreakPositions[0] 711 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 712 fPositionInCache = 0; 713 // We are guaranteed not to leave the array due to range test above 714 while (offset >= fCachedBreakPositions[fPositionInCache]) { 715 ++fPositionInCache; 716 } 717 int32_t pos = fCachedBreakPositions[fPositionInCache]; 718 utext_setNativeIndex(fText, pos); 719 return pos; 720 } 721 else { 722 reset(); 723 } 724 } 725 726 // if the offset passed in is already past the end of the text, 727 // just return DONE; if it's before the beginning, return the 728 // text's starting offset 729 fLastRuleStatusIndex = 0; 730 fLastStatusIndexValid = TRUE; 731 if (fText == NULL || offset >= utext_nativeLength(fText)) { 732 last(); 733 return next(); 734 } 735 else if (offset < 0) { 736 return first(); 737 } 738 739 // otherwise, set our internal iteration position (temporarily) 740 // to the position passed in. If this is the _beginning_ position, 741 // then we can just use next() to get our return value 742 743 int32_t result = 0; 744 745 if (fData->fSafeRevTable != NULL) { 746 // new rule syntax 747 utext_setNativeIndex(fText, offset); 748 // move forward one codepoint to prepare for moving back to a 749 // safe point. 750 // this handles offset being between a supplementary character 751 (void)UTEXT_NEXT32(fText); 752 // handlePrevious will move most of the time to < 1 boundary away 753 handlePrevious(fData->fSafeRevTable); 754 int32_t result = next(); 755 while (result <= offset) { 756 result = next(); 757 } 758 return result; 759 } 760 if (fData->fSafeFwdTable != NULL) { 761 // backup plan if forward safe table is not available 762 utext_setNativeIndex(fText, offset); 763 (void)UTEXT_PREVIOUS32(fText); 764 // handle next will give result >= offset 765 handleNext(fData->fSafeFwdTable); 766 // previous will give result 0 or 1 boundary away from offset, 767 // most of the time 768 // we have to 769 int32_t oldresult = previous(); 770 while (oldresult > offset) { 771 int32_t result = previous(); 772 if (result <= offset) { 773 return oldresult; 774 } 775 oldresult = result; 776 } 777 int32_t result = next(); 778 if (result <= offset) { 779 return next(); 780 } 781 return result; 782 } 783 // otherwise, we have to sync up first. Use handlePrevious() to back 784 // up to a known break position before the specified position (if 785 // we can determine that the specified position is a break position, 786 // we don't back up at all). This may or may not be the last break 787 // position at or before our starting position. Advance forward 788 // from here until we've passed the starting position. The position 789 // we stop on will be the first break position after the specified one. 790 // old rule syntax 791 792 utext_setNativeIndex(fText, offset); 793 if (offset==0 || 794 (offset==1 && utext_getNativeIndex(fText)==0)) { 795 return next(); 796 } 797 result = previous(); 798 799 while (result != BreakIterator::DONE && result <= offset) { 800 result = next(); 801 } 802 803 return result; 804 } 805 806 /** 807 * Sets the iterator to refer to the last boundary position before the 808 * specified position. 809 * @offset The position to begin searching for a break from. 810 * @return The position of the last boundary before the starting position. 811 */ 812 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 813 // if we have cached break positions and offset is in the range 814 // covered by them, use them 815 if (fCachedBreakPositions != NULL) { 816 // TODO: binary search? 817 // TODO: What if offset is outside range, but break is not? 818 if (offset > fCachedBreakPositions[0] 819 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 820 fPositionInCache = 0; 821 while (fPositionInCache < fNumCachedBreakPositions 822 && offset > fCachedBreakPositions[fPositionInCache]) 823 ++fPositionInCache; 824 --fPositionInCache; 825 // If we're at the beginning of the cache, need to reevaluate the 826 // rule status 827 if (fPositionInCache <= 0) { 828 fLastStatusIndexValid = FALSE; 829 } 830 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 831 return fCachedBreakPositions[fPositionInCache]; 832 } 833 else { 834 reset(); 835 } 836 } 837 838 // if the offset passed in is already past the end of the text, 839 // just return DONE; if it's before the beginning, return the 840 // text's starting offset 841 if (fText == NULL || offset > utext_nativeLength(fText)) { 842 // return BreakIterator::DONE; 843 return last(); 844 } 845 else if (offset < 0) { 846 return first(); 847 } 848 849 // if we start by updating the current iteration position to the 850 // position specified by the caller, we can just use previous() 851 // to carry out this operation 852 853 if (fData->fSafeFwdTable != NULL) { 854 // new rule syntax 855 utext_setNativeIndex(fText, offset); 856 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 857 if (newOffset != offset) { 858 // Will come here if specified offset was not a code point boundary AND 859 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 860 // indices to the containing code point. 861 // For breakitereator::preceding only, these non-code-point indices need to be moved 862 // up to refer to the following codepoint. 863 (void)UTEXT_NEXT32(fText); 864 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 865 } 866 867 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 868 // rather than adjusting the position unconditionally? 869 // (Change would interact with safe rules.) 870 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 871 // affects only preceding(), seems cleaner, but is slightly different. 872 (void)UTEXT_PREVIOUS32(fText); 873 handleNext(fData->fSafeFwdTable); 874 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 875 while (result >= offset) { 876 result = previous(); 877 } 878 return result; 879 } 880 if (fData->fSafeRevTable != NULL) { 881 // backup plan if forward safe table is not available 882 // TODO: check whether this path can be discarded 883 // It's probably OK to say that rules must supply both safe tables 884 // if they use safe tables at all. We have certainly never described 885 // to anyone how to work with just one safe table. 886 utext_setNativeIndex(fText, offset); 887 (void)UTEXT_NEXT32(fText); 888 889 // handle previous will give result <= offset 890 handlePrevious(fData->fSafeRevTable); 891 892 // next will give result 0 or 1 boundary away from offset, 893 // most of the time 894 // we have to 895 int32_t oldresult = next(); 896 while (oldresult < offset) { 897 int32_t result = next(); 898 if (result >= offset) { 899 return oldresult; 900 } 901 oldresult = result; 902 } 903 int32_t result = previous(); 904 if (result >= offset) { 905 return previous(); 906 } 907 return result; 908 } 909 910 // old rule syntax 911 utext_setNativeIndex(fText, offset); 912 return previous(); 913 } 914 915 /** 916 * Returns true if the specfied position is a boundary position. As a side 917 * effect, leaves the iterator pointing to the first boundary position at 918 * or after "offset". 919 * @param offset the offset to check. 920 * @return True if "offset" is a boundary position. 921 */ 922 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 923 // the beginning index of the iterator is always a boundary position by definition 924 if (offset == 0) { 925 first(); // For side effects on current position, tag values. 926 return TRUE; 927 } 928 929 if (offset == (int32_t)utext_nativeLength(fText)) { 930 last(); // For side effects on current position, tag values. 931 return TRUE; 932 } 933 934 // out-of-range indexes are never boundary positions 935 if (offset < 0) { 936 first(); // For side effects on current position, tag values. 937 return FALSE; 938 } 939 940 if (offset > utext_nativeLength(fText)) { 941 last(); // For side effects on current position, tag values. 942 return FALSE; 943 } 944 945 // otherwise, we can use following() on the position before the specified 946 // one and return true if the position we get back is the one the user 947 // specified 948 utext_previous32From(fText, offset); 949 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 950 UBool result = following(backOne) == offset; 951 return result; 952 } 953 954 /** 955 * Returns the current iteration position. 956 * @return The current iteration position. 957 */ 958 int32_t RuleBasedBreakIterator::current(void) const { 959 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 960 return pos; 961 } 962 963 //======================================================================= 964 // implementation 965 //======================================================================= 966 967 // 968 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 969 // of user text. A variable with this enum type keeps track of where we 970 // are. The state machine only fetches user input while in the RUN mode. 971 // 972 enum RBBIRunMode { 973 RBBI_START, // state machine processing is before first char of input 974 RBBI_RUN, // state machine processing is in the user text 975 RBBI_END // state machine processing is after end of user text. 976 }; 977 978 979 //----------------------------------------------------------------------------------- 980 // 981 // handleNext(stateTable) 982 // This method is the actual implementation of the rbbi next() method. 983 // This method initializes the state machine to state 1 984 // and advances through the text character by character until we reach the end 985 // of the text or the state machine transitions to state 0. We update our return 986 // value every time the state machine passes through an accepting state. 987 // 988 //----------------------------------------------------------------------------------- 989 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 990 int32_t state; 991 uint16_t category = 0; 992 RBBIRunMode mode; 993 994 RBBIStateTableRow *row; 995 UChar32 c; 996 int32_t lookaheadStatus = 0; 997 int32_t lookaheadTagIdx = 0; 998 int32_t result = 0; 999 int32_t initialPosition = 0; 1000 int32_t lookaheadResult = 0; 1001 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1002 const char *tableData = statetable->fTableData; 1003 uint32_t tableRowLen = statetable->fRowLen; 1004 1005 #ifdef RBBI_DEBUG 1006 if (fTrace) { 1007 RBBIDebugPuts("Handle Next pos char state category"); 1008 } 1009 #endif 1010 1011 // No matter what, handleNext alway correctly sets the break tag value. 1012 fLastStatusIndexValid = TRUE; 1013 fLastRuleStatusIndex = 0; 1014 1015 // if we're already at the end of the text, return DONE. 1016 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1017 result = initialPosition; 1018 c = UTEXT_NEXT32(fText); 1019 if (fData == NULL || c==U_SENTINEL) { 1020 return BreakIterator::DONE; 1021 } 1022 1023 // Set the initial state for the state machine 1024 state = START_STATE; 1025 row = (RBBIStateTableRow *) 1026 //(statetable->fTableData + (statetable->fRowLen * state)); 1027 (tableData + tableRowLen * state); 1028 1029 1030 mode = RBBI_RUN; 1031 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1032 category = 2; 1033 mode = RBBI_START; 1034 } 1035 1036 1037 // loop until we reach the end of the text or transition to state 0 1038 // 1039 for (;;) { 1040 if (c == U_SENTINEL) { 1041 // Reached end of input string. 1042 if (mode == RBBI_END) { 1043 // We have already run the loop one last time with the 1044 // character set to the psueudo {eof} value. Now it is time 1045 // to unconditionally bail out. 1046 if (lookaheadResult > result) { 1047 // We ran off the end of the string with a pending look-ahead match. 1048 // Treat this as if the look-ahead condition had been met, and return 1049 // the match at the / position from the look-ahead rule. 1050 result = lookaheadResult; 1051 fLastRuleStatusIndex = lookaheadTagIdx; 1052 lookaheadStatus = 0; 1053 } 1054 break; 1055 } 1056 // Run the loop one last time with the fake end-of-input character category. 1057 mode = RBBI_END; 1058 category = 1; 1059 } 1060 1061 // 1062 // Get the char category. An incoming category of 1 or 2 means that 1063 // we are preset for doing the beginning or end of input, and 1064 // that we shouldn't get a category from an actual text input character. 1065 // 1066 if (mode == RBBI_RUN) { 1067 // look up the current character's character category, which tells us 1068 // which column in the state table to look at. 1069 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1070 // not the size of the character going in, which is a UChar32. 1071 // 1072 UTRIE_GET16(&fData->fTrie, c, category); 1073 1074 // Check the dictionary bit in the character's category. 1075 // Counter is only used by dictionary based iterators (subclasses). 1076 // Chars that need to be handled by a dictionary have a flag bit set 1077 // in their category values. 1078 // 1079 if ((category & 0x4000) != 0) { 1080 fDictionaryCharCount++; 1081 // And off the dictionary flag bit. 1082 category &= ~0x4000; 1083 } 1084 } 1085 1086 #ifdef RBBI_DEBUG 1087 if (fTrace) { 1088 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1089 if (0x20<=c && c<0x7f) { 1090 RBBIDebugPrintf("\"%c\" ", c); 1091 } else { 1092 RBBIDebugPrintf("%5x ", c); 1093 } 1094 RBBIDebugPrintf("%3d %3d\n", state, category); 1095 } 1096 #endif 1097 1098 // State Transition - move machine to its next state 1099 // 1100 1101 // Note: fNextState is defined as uint16_t[2], but we are casting 1102 // a generated RBBI table to RBBIStateTableRow and some tables 1103 // actually have more than 2 categories. 1104 U_ASSERT(category<fData->fHeader->fCatCount); 1105 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1106 row = (RBBIStateTableRow *) 1107 // (statetable->fTableData + (statetable->fRowLen * state)); 1108 (tableData + tableRowLen * state); 1109 1110 1111 if (row->fAccepting == -1) { 1112 // Match found, common case. 1113 if (mode != RBBI_START) { 1114 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1115 } 1116 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1117 } 1118 1119 if (row->fLookAhead != 0) { 1120 if (lookaheadStatus != 0 1121 && row->fAccepting == lookaheadStatus) { 1122 // Lookahead match is completed. 1123 result = lookaheadResult; 1124 fLastRuleStatusIndex = lookaheadTagIdx; 1125 lookaheadStatus = 0; 1126 // TODO: make a standalone hard break in a rule work. 1127 if (lookAheadHardBreak) { 1128 UTEXT_SETNATIVEINDEX(fText, result); 1129 return result; 1130 } 1131 // Look-ahead completed, but other rules may match further. Continue on 1132 // TODO: junk this feature? I don't think it's used anywhwere. 1133 goto continueOn; 1134 } 1135 1136 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1137 lookaheadResult = r; 1138 lookaheadStatus = row->fLookAhead; 1139 lookaheadTagIdx = row->fTagIdx; 1140 goto continueOn; 1141 } 1142 1143 1144 if (row->fAccepting != 0) { 1145 // Because this is an accepting state, any in-progress look-ahead match 1146 // is no longer relavant. Clear out the pending lookahead status. 1147 lookaheadStatus = 0; // clear out any pending look-ahead match. 1148 } 1149 1150 continueOn: 1151 if (state == STOP_STATE) { 1152 // This is the normal exit from the lookup state machine. 1153 // We have advanced through the string until it is certain that no 1154 // longer match is possible, no matter what characters follow. 1155 break; 1156 } 1157 1158 // Advance to the next character. 1159 // If this is a beginning-of-input loop iteration, don't advance 1160 // the input position. The next iteration will be processing the 1161 // first real input character. 1162 if (mode == RBBI_RUN) { 1163 c = UTEXT_NEXT32(fText); 1164 } else { 1165 if (mode == RBBI_START) { 1166 mode = RBBI_RUN; 1167 } 1168 } 1169 1170 1171 } 1172 1173 // The state machine is done. Check whether it found a match... 1174 1175 // If the iterator failed to advance in the match engine, force it ahead by one. 1176 // (This really indicates a defect in the break rules. They should always match 1177 // at least one character.) 1178 if (result == initialPosition) { 1179 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1180 UTEXT_NEXT32(fText); 1181 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1182 } 1183 1184 // Leave the iterator at our result position. 1185 UTEXT_SETNATIVEINDEX(fText, result); 1186 #ifdef RBBI_DEBUG 1187 if (fTrace) { 1188 RBBIDebugPrintf("result = %d\n\n", result); 1189 } 1190 #endif 1191 return result; 1192 } 1193 1194 1195 1196 //----------------------------------------------------------------------------------- 1197 // 1198 // handlePrevious() 1199 // 1200 // Iterate backwards, according to the logic of the reverse rules. 1201 // This version handles the exact style backwards rules. 1202 // 1203 // The logic of this function is very similar to handleNext(), above. 1204 // 1205 //----------------------------------------------------------------------------------- 1206 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1207 int32_t state; 1208 uint16_t category = 0; 1209 RBBIRunMode mode; 1210 RBBIStateTableRow *row; 1211 UChar32 c; 1212 int32_t lookaheadStatus = 0; 1213 int32_t result = 0; 1214 int32_t initialPosition = 0; 1215 int32_t lookaheadResult = 0; 1216 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1217 1218 #ifdef RBBI_DEBUG 1219 if (fTrace) { 1220 RBBIDebugPuts("Handle Previous pos char state category"); 1221 } 1222 #endif 1223 1224 // handlePrevious() never gets the rule status. 1225 // Flag the status as invalid; if the user ever asks for status, we will need 1226 // to back up, then re-find the break position using handleNext(), which does 1227 // get the status value. 1228 fLastStatusIndexValid = FALSE; 1229 fLastRuleStatusIndex = 0; 1230 1231 // if we're already at the start of the text, return DONE. 1232 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1233 return BreakIterator::DONE; 1234 } 1235 1236 // Set up the starting char. 1237 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1238 result = initialPosition; 1239 c = UTEXT_PREVIOUS32(fText); 1240 1241 // Set the initial state for the state machine 1242 state = START_STATE; 1243 row = (RBBIStateTableRow *) 1244 (statetable->fTableData + (statetable->fRowLen * state)); 1245 category = 3; 1246 mode = RBBI_RUN; 1247 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1248 category = 2; 1249 mode = RBBI_START; 1250 } 1251 1252 1253 // loop until we reach the start of the text or transition to state 0 1254 // 1255 for (;;) { 1256 if (c == U_SENTINEL) { 1257 // Reached end of input string. 1258 if (mode == RBBI_END) { 1259 // We have already run the loop one last time with the 1260 // character set to the psueudo {eof} value. Now it is time 1261 // to unconditionally bail out. 1262 if (lookaheadResult < result) { 1263 // We ran off the end of the string with a pending look-ahead match. 1264 // Treat this as if the look-ahead condition had been met, and return 1265 // the match at the / position from the look-ahead rule. 1266 result = lookaheadResult; 1267 lookaheadStatus = 0; 1268 } else if (result == initialPosition) { 1269 // Ran off start, no match found. 1270 // move one index one (towards the start, since we are doing a previous()) 1271 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1272 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1273 } 1274 break; 1275 } 1276 // Run the loop one last time with the fake end-of-input character category. 1277 mode = RBBI_END; 1278 category = 1; 1279 } 1280 1281 // 1282 // Get the char category. An incoming category of 1 or 2 means that 1283 // we are preset for doing the beginning or end of input, and 1284 // that we shouldn't get a category from an actual text input character. 1285 // 1286 if (mode == RBBI_RUN) { 1287 // look up the current character's character category, which tells us 1288 // which column in the state table to look at. 1289 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1290 // not the size of the character going in, which is a UChar32. 1291 // 1292 UTRIE_GET16(&fData->fTrie, c, category); 1293 1294 // Check the dictionary bit in the character's category. 1295 // Counter is only used by dictionary based iterators (subclasses). 1296 // Chars that need to be handled by a dictionary have a flag bit set 1297 // in their category values. 1298 // 1299 if ((category & 0x4000) != 0) { 1300 fDictionaryCharCount++; 1301 // And off the dictionary flag bit. 1302 category &= ~0x4000; 1303 } 1304 } 1305 1306 #ifdef RBBI_DEBUG 1307 if (fTrace) { 1308 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1309 if (0x20<=c && c<0x7f) { 1310 RBBIDebugPrintf("\"%c\" ", c); 1311 } else { 1312 RBBIDebugPrintf("%5x ", c); 1313 } 1314 RBBIDebugPrintf("%3d %3d\n", state, category); 1315 } 1316 #endif 1317 1318 // State Transition - move machine to its next state 1319 // 1320 1321 // Note: fNextState is defined as uint16_t[2], but we are casting 1322 // a generated RBBI table to RBBIStateTableRow and some tables 1323 // actually have more than 2 categories. 1324 U_ASSERT(category<fData->fHeader->fCatCount); 1325 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1326 row = (RBBIStateTableRow *) 1327 (statetable->fTableData + (statetable->fRowLen * state)); 1328 1329 if (row->fAccepting == -1) { 1330 // Match found, common case. 1331 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1332 } 1333 1334 if (row->fLookAhead != 0) { 1335 if (lookaheadStatus != 0 1336 && row->fAccepting == lookaheadStatus) { 1337 // Lookahead match is completed. 1338 result = lookaheadResult; 1339 lookaheadStatus = 0; 1340 // TODO: make a standalone hard break in a rule work. 1341 if (lookAheadHardBreak) { 1342 UTEXT_SETNATIVEINDEX(fText, result); 1343 return result; 1344 } 1345 // Look-ahead completed, but other rules may match further. Continue on 1346 // TODO: junk this feature? I don't think it's used anywhwere. 1347 goto continueOn; 1348 } 1349 1350 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1351 lookaheadResult = r; 1352 lookaheadStatus = row->fLookAhead; 1353 goto continueOn; 1354 } 1355 1356 1357 if (row->fAccepting != 0) { 1358 // Because this is an accepting state, any in-progress look-ahead match 1359 // is no longer relavant. Clear out the pending lookahead status. 1360 lookaheadStatus = 0; 1361 } 1362 1363 continueOn: 1364 if (state == STOP_STATE) { 1365 // This is the normal exit from the lookup state machine. 1366 // We have advanced through the string until it is certain that no 1367 // longer match is possible, no matter what characters follow. 1368 break; 1369 } 1370 1371 // Move (backwards) to the next character to process. 1372 // If this is a beginning-of-input loop iteration, don't advance 1373 // the input position. The next iteration will be processing the 1374 // first real input character. 1375 if (mode == RBBI_RUN) { 1376 c = UTEXT_PREVIOUS32(fText); 1377 } else { 1378 if (mode == RBBI_START) { 1379 mode = RBBI_RUN; 1380 } 1381 } 1382 } 1383 1384 // The state machine is done. Check whether it found a match... 1385 1386 // If the iterator failed to advance in the match engine, force it ahead by one. 1387 // (This really indicates a defect in the break rules. They should always match 1388 // at least one character.) 1389 if (result == initialPosition) { 1390 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1391 UTEXT_PREVIOUS32(fText); 1392 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1393 } 1394 1395 // Leave the iterator at our result position. 1396 UTEXT_SETNATIVEINDEX(fText, result); 1397 #ifdef RBBI_DEBUG 1398 if (fTrace) { 1399 RBBIDebugPrintf("result = %d\n\n", result); 1400 } 1401 #endif 1402 return result; 1403 } 1404 1405 1406 void 1407 RuleBasedBreakIterator::reset() 1408 { 1409 if (fCachedBreakPositions) { 1410 uprv_free(fCachedBreakPositions); 1411 } 1412 fCachedBreakPositions = NULL; 1413 fNumCachedBreakPositions = 0; 1414 fDictionaryCharCount = 0; 1415 fPositionInCache = 0; 1416 } 1417 1418 1419 1420 //------------------------------------------------------------------------------- 1421 // 1422 // getRuleStatus() Return the break rule tag associated with the current 1423 // iterator position. If the iterator arrived at its current 1424 // position by iterating forwards, the value will have been 1425 // cached by the handleNext() function. 1426 // 1427 // If no cached status value is available, the status is 1428 // found by doing a previous() followed by a next(), which 1429 // leaves the iterator where it started, and computes the 1430 // status while doing the next(). 1431 // 1432 //------------------------------------------------------------------------------- 1433 void RuleBasedBreakIterator::makeRuleStatusValid() { 1434 if (fLastStatusIndexValid == FALSE) { 1435 // No cached status is available. 1436 if (fText == NULL || current() == 0) { 1437 // At start of text, or there is no text. Status is always zero. 1438 fLastRuleStatusIndex = 0; 1439 fLastStatusIndexValid = TRUE; 1440 } else { 1441 // Not at start of text. Find status the tedious way. 1442 int32_t pa = current(); 1443 previous(); 1444 if (fNumCachedBreakPositions > 0) { 1445 reset(); // Blow off the dictionary cache 1446 } 1447 int32_t pb = next(); 1448 if (pa != pb) { 1449 // note: the if (pa != pb) test is here only to eliminate warnings for 1450 // unused local variables on gcc. Logically, it isn't needed. 1451 U_ASSERT(pa == pb); 1452 } 1453 } 1454 } 1455 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1456 } 1457 1458 1459 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1460 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1461 nonConstThis->makeRuleStatusValid(); 1462 1463 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1464 // (the number of status values.) 1465 // This function returns the last (largest) of the array of status values. 1466 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1467 int32_t tagVal = fData->fRuleStatusTable[idx]; 1468 1469 return tagVal; 1470 } 1471 1472 1473 1474 1475 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1476 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1477 { 1478 if (U_FAILURE(status)) { 1479 return 0; 1480 } 1481 1482 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1483 nonConstThis->makeRuleStatusValid(); 1484 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1485 int32_t numValsToCopy = numVals; 1486 if (numVals > capacity) { 1487 status = U_BUFFER_OVERFLOW_ERROR; 1488 numValsToCopy = capacity; 1489 } 1490 int i; 1491 for (i=0; i<numValsToCopy; i++) { 1492 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1493 } 1494 return numVals; 1495 } 1496 1497 1498 1499 //------------------------------------------------------------------------------- 1500 // 1501 // getBinaryRules Access to the compiled form of the rules, 1502 // for use by build system tools that save the data 1503 // for standard iterator types. 1504 // 1505 //------------------------------------------------------------------------------- 1506 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1507 const uint8_t *retPtr = NULL; 1508 length = 0; 1509 1510 if (fData != NULL) { 1511 retPtr = (const uint8_t *)fData->fHeader; 1512 length = fData->fHeader->fLength; 1513 } 1514 return retPtr; 1515 } 1516 1517 1518 1519 1520 //------------------------------------------------------------------------------- 1521 // 1522 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated. 1523 // Saving one heap allocation isn't worth the trouble. 1524 // Cloning shouldn't be done in tight loops, and 1525 // making the clone copy involves other heap operations anyway. 1526 // And the application code for correctly dealing with buffer 1527 // size problems and the eventual object destruction is ugly. 1528 // 1529 //------------------------------------------------------------------------------- 1530 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, 1531 int32_t &bufferSize, 1532 UErrorCode &status) 1533 { 1534 if (U_FAILURE(status)){ 1535 return NULL; 1536 } 1537 1538 // 1539 // If user buffer size is zero this is a preflight operation to 1540 // obtain the needed buffer size, allowing for worst case misalignment. 1541 // 1542 if (bufferSize == 0) { 1543 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); 1544 return NULL; 1545 } 1546 1547 1548 // 1549 // Check the alignment and size of the user supplied buffer. 1550 // Allocate heap memory if the user supplied memory is insufficient. 1551 // 1552 char *buf = (char *)stackBuffer; 1553 uint32_t s = bufferSize; 1554 1555 if (stackBuffer == NULL) { 1556 s = 0; // Ignore size, force allocation if user didn't give us a buffer. 1557 } 1558 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 1559 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); 1560 s -= offsetUp; 1561 buf += offsetUp; 1562 } 1563 if (s < sizeof(RuleBasedBreakIterator)) { 1564 // Not enough room in the caller-supplied buffer. 1565 // Do a plain-vanilla heap based clone and return that, along with 1566 // a warning that the clone was allocated. 1567 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); 1568 if (clonedBI == 0) { 1569 status = U_MEMORY_ALLOCATION_ERROR; 1570 } else { 1571 status = U_SAFECLONE_ALLOCATED_WARNING; 1572 } 1573 return clonedBI; 1574 } 1575 1576 // 1577 // Clone the source BI into the caller-supplied buffer. 1578 // 1579 RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this); 1580 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) 1581 1582 return clone; 1583 } 1584 1585 1586 //------------------------------------------------------------------------------- 1587 // 1588 // isDictionaryChar Return true if the category lookup for this char 1589 // indicates that it is in the set of dictionary lookup 1590 // chars. 1591 // 1592 // This function is intended for use by dictionary based 1593 // break iterators. 1594 // 1595 //------------------------------------------------------------------------------- 1596 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1597 if (fData == NULL) { 1598 return FALSE; 1599 } 1600 uint16_t category; 1601 UTRIE_GET16(&fData->fTrie, c, category); 1602 return (category & 0x4000) != 0; 1603 }*/ 1604 1605 1606 //------------------------------------------------------------------------------- 1607 // 1608 // checkDictionary This function handles all processing of characters in 1609 // the "dictionary" set. It will determine the appropriate 1610 // course of action, and possibly set up a cache in the 1611 // process. 1612 // 1613 //------------------------------------------------------------------------------- 1614 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1615 int32_t endPos, 1616 UBool reverse) { 1617 // Reset the old break cache first. 1618 reset(); 1619 1620 // note: code segment below assumes that dictionary chars are in the 1621 // startPos-endPos range 1622 // value returned should be next character in sequence 1623 if ((endPos - startPos) <= 1) { 1624 return (reverse ? startPos : endPos); 1625 } 1626 1627 // Bug 5532. The dictionary code will crash if the input text is UTF-8 1628 // because native indexes are different from UTF-16 indexes. 1629 // Temporary hack: skip dictionary lookup for UTF-8 encoded text. 1630 // It wont give the right breaks, but it's better than a crash. 1631 // 1632 // Check the type of the UText by checking its pFuncs field, which 1633 // is UText's function dispatch table. It will be the same for all 1634 // UTF-8 UTexts and different for any other UText type. 1635 // 1636 // We have no other type of UText available with non-UTF-16 native indexing. 1637 // This whole check will go away once the dictionary code is fixed. 1638 static const void *utext_utf8Funcs; 1639 if (utext_utf8Funcs == NULL) { 1640 // Cache the UTF-8 UText function pointer value. 1641 UErrorCode status = U_ZERO_ERROR; 1642 UText tempUText = UTEXT_INITIALIZER; 1643 utext_openUTF8(&tempUText, NULL, 0, &status); 1644 utext_utf8Funcs = tempUText.pFuncs; 1645 utext_close(&tempUText); 1646 } 1647 if (fText->pFuncs == utext_utf8Funcs) { 1648 return (reverse ? startPos : endPos); 1649 } 1650 1651 // Starting from the starting point, scan towards the proposed result, 1652 // looking for the first dictionary character (which may be the one 1653 // we're on, if we're starting in the middle of a range). 1654 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1655 if (reverse) { 1656 UTEXT_PREVIOUS32(fText); 1657 } 1658 1659 int32_t rangeStart = startPos; 1660 int32_t rangeEnd = endPos; 1661 1662 uint16_t category; 1663 int32_t current; 1664 UErrorCode status = U_ZERO_ERROR; 1665 UStack breaks(status); 1666 int32_t foundBreakCount = 0; 1667 UChar32 c = utext_current32(fText); 1668 1669 UTRIE_GET16(&fData->fTrie, c, category); 1670 1671 // Is the character we're starting on a dictionary character? If so, we 1672 // need to back up to include the entire run; otherwise the results of 1673 // the break algorithm will differ depending on where we start. Since 1674 // the result is cached and there is typically a non-dictionary break 1675 // within a small number of words, there should be little performance impact. 1676 if (category & 0x4000) { 1677 if (reverse) { 1678 do { 1679 utext_next32(fText); // TODO: recast to work directly with postincrement. 1680 c = utext_current32(fText); 1681 UTRIE_GET16(&fData->fTrie, c, category); 1682 } while (c != U_SENTINEL && (category & 0x4000)); 1683 // Back up to the last dictionary character 1684 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1685 if (c == U_SENTINEL) { 1686 // c = fText->last32(); 1687 // TODO: why was this if needed? 1688 c = UTEXT_PREVIOUS32(fText); 1689 } 1690 else { 1691 c = UTEXT_PREVIOUS32(fText); 1692 } 1693 } 1694 else { 1695 do { 1696 c = UTEXT_PREVIOUS32(fText); 1697 UTRIE_GET16(&fData->fTrie, c, category); 1698 } 1699 while (c != U_SENTINEL && (category & 0x4000)); 1700 // Back up to the last dictionary character 1701 if (c == U_SENTINEL) { 1702 // c = fText->first32(); 1703 c = utext_current32(fText); 1704 } 1705 else { 1706 utext_next32(fText); 1707 c = utext_current32(fText); 1708 } 1709 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1710 } 1711 UTRIE_GET16(&fData->fTrie, c, category); 1712 } 1713 1714 // Loop through the text, looking for ranges of dictionary characters. 1715 // For each span, find the appropriate break engine, and ask it to find 1716 // any breaks within the span. 1717 // Note: we always do this in the forward direction, so that the break 1718 // cache is built in the right order. 1719 if (reverse) { 1720 utext_setNativeIndex(fText, rangeStart); 1721 c = utext_current32(fText); 1722 UTRIE_GET16(&fData->fTrie, c, category); 1723 } 1724 while(U_SUCCESS(status)) { 1725 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1726 utext_next32(fText); // TODO: tweak for post-increment operation 1727 c = utext_current32(fText); 1728 UTRIE_GET16(&fData->fTrie, c, category); 1729 } 1730 if (current >= rangeEnd) { 1731 break; 1732 } 1733 1734 // We now have a dictionary character. Get the appropriate language object 1735 // to deal with it. 1736 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1737 1738 // Ask the language object if there are any breaks. It will leave the text 1739 // pointer on the other side of its range, ready to search for the next one. 1740 if (lbe != NULL) { 1741 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1742 } 1743 1744 // Reload the loop variables for the next go-round 1745 c = utext_current32(fText); 1746 UTRIE_GET16(&fData->fTrie, c, category); 1747 } 1748 1749 // If we found breaks, build a new break cache. The first and last entries must 1750 // be the original starting and ending position. 1751 if (foundBreakCount > 0) { 1752 int32_t totalBreaks = foundBreakCount; 1753 if (startPos < breaks.elementAti(0)) { 1754 totalBreaks += 1; 1755 } 1756 if (endPos > breaks.peeki()) { 1757 totalBreaks += 1; 1758 } 1759 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1760 if (fCachedBreakPositions != NULL) { 1761 int32_t out = 0; 1762 fNumCachedBreakPositions = totalBreaks; 1763 if (startPos < breaks.elementAti(0)) { 1764 fCachedBreakPositions[out++] = startPos; 1765 } 1766 for (int32_t i = 0; i < foundBreakCount; ++i) { 1767 fCachedBreakPositions[out++] = breaks.elementAti(i); 1768 } 1769 if (endPos > fCachedBreakPositions[out-1]) { 1770 fCachedBreakPositions[out] = endPos; 1771 } 1772 // If there are breaks, then by definition, we are replacing the original 1773 // proposed break by one of the breaks we found. Use following() and 1774 // preceding() to do the work. They should never recurse in this case. 1775 if (reverse) { 1776 return preceding(endPos); 1777 } 1778 else { 1779 return following(startPos); 1780 } 1781 } 1782 // If the allocation failed, just fall through to the "no breaks found" case. 1783 } 1784 1785 // If we get here, there were no language-based breaks. Set the text pointer 1786 // to the original proposed break. 1787 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1788 return (reverse ? startPos : endPos); 1789 } 1790 1791 U_NAMESPACE_END 1792 1793 // defined in ucln_cmn.h 1794 1795 static icu::UStack *gLanguageBreakFactories = NULL; 1796 1797 /** 1798 * Release all static memory held by breakiterator. 1799 */ 1800 U_CDECL_BEGIN 1801 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1802 if (gLanguageBreakFactories) { 1803 delete gLanguageBreakFactories; 1804 gLanguageBreakFactories = NULL; 1805 } 1806 return TRUE; 1807 } 1808 U_CDECL_END 1809 1810 U_CDECL_BEGIN 1811 static void U_CALLCONV _deleteFactory(void *obj) { 1812 delete (icu::LanguageBreakFactory *) obj; 1813 } 1814 U_CDECL_END 1815 U_NAMESPACE_BEGIN 1816 1817 static const LanguageBreakEngine* 1818 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1819 { 1820 UBool needsInit; 1821 UErrorCode status = U_ZERO_ERROR; 1822 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); 1823 1824 if (needsInit) { 1825 UStack *factories = new UStack(_deleteFactory, NULL, status); 1826 if (factories != NULL && U_SUCCESS(status)) { 1827 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1828 factories->push(builtIn, status); 1829 #ifdef U_LOCAL_SERVICE_HOOK 1830 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1831 if (extra != NULL) { 1832 factories->push(extra, status); 1833 } 1834 #endif 1835 } 1836 umtx_lock(NULL); 1837 if (gLanguageBreakFactories == NULL) { 1838 gLanguageBreakFactories = factories; 1839 factories = NULL; 1840 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1841 } 1842 umtx_unlock(NULL); 1843 delete factories; 1844 } 1845 1846 if (gLanguageBreakFactories == NULL) { 1847 return NULL; 1848 } 1849 1850 int32_t i = gLanguageBreakFactories->size(); 1851 const LanguageBreakEngine *lbe = NULL; 1852 while (--i >= 0) { 1853 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1854 lbe = factory->getEngineFor(c, breakType); 1855 if (lbe != NULL) { 1856 break; 1857 } 1858 } 1859 return lbe; 1860 } 1861 1862 1863 //------------------------------------------------------------------------------- 1864 // 1865 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1866 // the character c. 1867 // 1868 //------------------------------------------------------------------------------- 1869 const LanguageBreakEngine * 1870 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1871 const LanguageBreakEngine *lbe = NULL; 1872 UErrorCode status = U_ZERO_ERROR; 1873 1874 if (fLanguageBreakEngines == NULL) { 1875 fLanguageBreakEngines = new UStack(status); 1876 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1877 delete fLanguageBreakEngines; 1878 fLanguageBreakEngines = 0; 1879 return NULL; 1880 } 1881 } 1882 1883 int32_t i = fLanguageBreakEngines->size(); 1884 while (--i >= 0) { 1885 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1886 if (lbe->handles(c, fBreakType)) { 1887 return lbe; 1888 } 1889 } 1890 1891 // No existing dictionary took the character. See if a factory wants to 1892 // give us a new LanguageBreakEngine for this character. 1893 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1894 1895 // If we got one, use it and push it on our stack. 1896 if (lbe != NULL) { 1897 fLanguageBreakEngines->push((void *)lbe, status); 1898 // Even if we can't remember it, we can keep looking it up, so 1899 // return it even if the push fails. 1900 return lbe; 1901 } 1902 1903 // No engine is forthcoming for this character. Add it to the 1904 // reject set. Create the reject break engine if needed. 1905 if (fUnhandledBreakEngine == NULL) { 1906 fUnhandledBreakEngine = new UnhandledEngine(status); 1907 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1908 status = U_MEMORY_ALLOCATION_ERROR; 1909 } 1910 // Put it last so that scripts for which we have an engine get tried 1911 // first. 1912 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1913 // If we can't insert it, or creation failed, get rid of it 1914 if (U_FAILURE(status)) { 1915 delete fUnhandledBreakEngine; 1916 fUnhandledBreakEngine = 0; 1917 return NULL; 1918 } 1919 } 1920 1921 // Tell the reject engine about the character; at its discretion, it may 1922 // add more than just the one character. 1923 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1924 1925 return fUnhandledBreakEngine; 1926 } 1927 1928 1929 1930 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1931 return fBreakType; 1932 }*/ 1933 1934 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1935 fBreakType = type; 1936 reset(); 1937 } 1938 1939 U_NAMESPACE_END 1940 1941 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1942