1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2011 International Business Machines Corporation 4 * and others. All rights reserved. 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include <typeinfo> // for 'typeid' to work 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/rbbi.h" 20 #include "unicode/schriter.h" 21 #include "unicode/uchriter.h" 22 #include "unicode/udata.h" 23 #include "unicode/uclean.h" 24 #include "rbbidata.h" 25 #include "rbbirb.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "umutex.h" 29 #include "ucln_cmn.h" 30 #include "brkeng.h" 31 32 #include "uassert.h" 33 #include "uvector.h" 34 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 36 #if U_LOCAL_SERVICE_HOOK 37 #include "localsvc.h" 38 #endif 39 40 #ifdef RBBI_DEBUG 41 static UBool fTrace = FALSE; 42 #endif 43 44 U_NAMESPACE_BEGIN 45 46 // The state number of the starting state 47 #define START_STATE 1 48 49 // The state-transition value indicating "stop" 50 #define STOP_STATE 0 51 52 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 54 55 56 //======================================================================= 57 // constructors 58 //======================================================================= 59 60 /** 61 * Constructs a RuleBasedBreakIterator that uses the already-created 62 * tables object that is passed in as a parameter. 63 */ 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 65 { 66 init(); 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor 68 if (U_FAILURE(status)) {return;} 69 if(fData == 0) { 70 status = U_MEMORY_ALLOCATION_ERROR; 71 return; 72 } 73 } 74 75 /** 76 * Same as above but does not adopt memory 77 */ 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 79 { 80 init(); 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 82 if (U_FAILURE(status)) {return;} 83 if(fData == 0) { 84 status = U_MEMORY_ALLOCATION_ERROR; 85 return; 86 } 87 } 88 89 90 // 91 // Construct from precompiled binary rules (tables). This constructor is public API, 92 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 93 // 94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 95 uint32_t ruleLength, 96 UErrorCode &status) { 97 init(); 98 if (U_FAILURE(status)) { 99 return; 100 } 101 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 102 status = U_ILLEGAL_ARGUMENT_ERROR; 103 return; 104 } 105 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 106 if (data->fLength > ruleLength) { 107 status = U_ILLEGAL_ARGUMENT_ERROR; 108 return; 109 } 110 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 111 if (U_FAILURE(status)) {return;} 112 if(fData == 0) { 113 status = U_MEMORY_ALLOCATION_ERROR; 114 return; 115 } 116 } 117 118 119 //------------------------------------------------------------------------------- 120 // 121 // Constructor from a UDataMemory handle to precompiled break rules 122 // stored in an ICU data file. 123 // 124 //------------------------------------------------------------------------------- 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 126 { 127 init(); 128 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 129 if (U_FAILURE(status)) {return;} 130 if(fData == 0) { 131 status = U_MEMORY_ALLOCATION_ERROR; 132 return; 133 } 134 } 135 136 137 138 //------------------------------------------------------------------------------- 139 // 140 // Constructor from a set of rules supplied as a string. 141 // 142 //------------------------------------------------------------------------------- 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 144 UParseError &parseError, 145 UErrorCode &status) 146 { 147 init(); 148 if (U_FAILURE(status)) {return;} 149 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 150 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 151 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 152 // creates and returns a complete RBBI. From here, in a constructor, we 153 // can't just return the object created by the builder factory, hence 154 // the assignment of the factory created object to "this". 155 if (U_SUCCESS(status)) { 156 *this = *bi; 157 delete bi; 158 } 159 } 160 161 162 //------------------------------------------------------------------------------- 163 // 164 // Default Constructor. Create an empty shell that can be set up later. 165 // Used when creating a RuleBasedBreakIterator from a set 166 // of rules. 167 //------------------------------------------------------------------------------- 168 RuleBasedBreakIterator::RuleBasedBreakIterator() { 169 init(); 170 } 171 172 173 //------------------------------------------------------------------------------- 174 // 175 // Copy constructor. Will produce a break iterator with the same behavior, 176 // and which iterates over the same text, as the one passed in. 177 // 178 //------------------------------------------------------------------------------- 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 180 : BreakIterator(other) 181 { 182 this->init(); 183 *this = other; 184 } 185 186 187 /** 188 * Destructor 189 */ 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 191 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 192 // fCharIter was adopted from the outside. 193 delete fCharIter; 194 } 195 fCharIter = NULL; 196 delete fSCharIter; 197 fCharIter = NULL; 198 delete fDCharIter; 199 fDCharIter = NULL; 200 201 utext_close(fText); 202 203 if (fData != NULL) { 204 fData->removeReference(); 205 fData = NULL; 206 } 207 if (fCachedBreakPositions) { 208 uprv_free(fCachedBreakPositions); 209 fCachedBreakPositions = NULL; 210 } 211 if (fLanguageBreakEngines) { 212 delete fLanguageBreakEngines; 213 fLanguageBreakEngines = NULL; 214 } 215 if (fUnhandledBreakEngine) { 216 delete fUnhandledBreakEngine; 217 fUnhandledBreakEngine = NULL; 218 } 219 } 220 221 /** 222 * Assignment operator. Sets this iterator to have the same behavior, 223 * and iterate over the same text, as the one passed in. 224 */ 225 RuleBasedBreakIterator& 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 227 if (this == &that) { 228 return *this; 229 } 230 reset(); // Delete break cache information 231 fBreakType = that.fBreakType; 232 if (fLanguageBreakEngines != NULL) { 233 delete fLanguageBreakEngines; 234 fLanguageBreakEngines = NULL; // Just rebuild for now 235 } 236 // TODO: clone fLanguageBreakEngines from "that" 237 UErrorCode status = U_ZERO_ERROR; 238 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 239 240 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 241 delete fCharIter; 242 } 243 fCharIter = NULL; 244 245 if (that.fCharIter != NULL ) { 246 // This is a little bit tricky - it will intially appear that 247 // this->fCharIter is adopted, even if that->fCharIter was 248 // not adopted. That's ok. 249 fCharIter = that.fCharIter->clone(); 250 } 251 252 if (fData != NULL) { 253 fData->removeReference(); 254 fData = NULL; 255 } 256 if (that.fData != NULL) { 257 fData = that.fData->addReference(); 258 } 259 260 return *this; 261 } 262 263 264 265 //----------------------------------------------------------------------------- 266 // 267 // init() Shared initialization routine. Used by all the constructors. 268 // Initializes all fields, leaving the object in a consistent state. 269 // 270 //----------------------------------------------------------------------------- 271 void RuleBasedBreakIterator::init() { 272 UErrorCode status = U_ZERO_ERROR; 273 fBufferClone = FALSE; 274 fText = utext_openUChars(NULL, NULL, 0, &status); 275 fCharIter = NULL; 276 fSCharIter = NULL; 277 fDCharIter = NULL; 278 fData = NULL; 279 fLastRuleStatusIndex = 0; 280 fLastStatusIndexValid = TRUE; 281 fDictionaryCharCount = 0; 282 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 283 // dictionary behavior for Break Iterators that are 284 // built from rules. Even better would be the ability to 285 // declare the type in the rules. 286 287 fCachedBreakPositions = NULL; 288 fLanguageBreakEngines = NULL; 289 fUnhandledBreakEngine = NULL; 290 fNumCachedBreakPositions = 0; 291 fPositionInCache = 0; 292 293 #ifdef RBBI_DEBUG 294 static UBool debugInitDone = FALSE; 295 if (debugInitDone == FALSE) { 296 char *debugEnv = getenv("U_RBBIDEBUG"); 297 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 298 fTrace = TRUE; 299 } 300 debugInitDone = TRUE; 301 } 302 #endif 303 } 304 305 306 307 //----------------------------------------------------------------------------- 308 // 309 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 310 // behavior, and iterating over the same text, as this one. 311 // Virtual function: does the right thing with subclasses. 312 // 313 //----------------------------------------------------------------------------- 314 BreakIterator* 315 RuleBasedBreakIterator::clone(void) const { 316 return new RuleBasedBreakIterator(*this); 317 } 318 319 /** 320 * Equality operator. Returns TRUE if both BreakIterators are of the 321 * same class, have the same behavior, and iterate over the same text. 322 */ 323 UBool 324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 325 if (typeid(*this) != typeid(that)) { 326 return FALSE; 327 } 328 329 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 330 331 if (!utext_equals(fText, that2.fText)) { 332 // The two break iterators are operating on different text, 333 // or have a different interation position. 334 return FALSE; 335 }; 336 337 // TODO: need a check for when in a dictionary region at different offsets. 338 339 if (that2.fData == fData || 340 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 341 // The two break iterators are using the same rules. 342 return TRUE; 343 } 344 return FALSE; 345 } 346 347 /** 348 * Compute a hash code for this BreakIterator 349 * @return A hash code 350 */ 351 int32_t 352 RuleBasedBreakIterator::hashCode(void) const { 353 int32_t hash = 0; 354 if (fData != NULL) { 355 hash = fData->hashCode(); 356 } 357 return hash; 358 } 359 360 361 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 362 if (U_FAILURE(status)) { 363 return; 364 } 365 reset(); 366 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 367 368 // Set up a dummy CharacterIterator to be returned if anyone 369 // calls getText(). With input from UText, there is no reasonable 370 // way to return a characterIterator over the actual input text. 371 // Return one over an empty string instead - this is the closest 372 // we can come to signaling a failure. 373 // (GetText() is obsolete, this failure is sort of OK) 374 if (fDCharIter == NULL) { 375 static const UChar c = 0; 376 fDCharIter = new UCharCharacterIterator(&c, 0); 377 if (fDCharIter == NULL) { 378 status = U_MEMORY_ALLOCATION_ERROR; 379 return; 380 } 381 } 382 383 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 384 // existing fCharIter was adopted from the outside. Delete it now. 385 delete fCharIter; 386 } 387 fCharIter = fDCharIter; 388 389 this->first(); 390 } 391 392 393 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 394 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 395 return result; 396 } 397 398 399 400 /** 401 * Returns the description used to create this iterator 402 */ 403 const UnicodeString& 404 RuleBasedBreakIterator::getRules() const { 405 if (fData != NULL) { 406 return fData->getRuleSourceString(); 407 } else { 408 static const UnicodeString *s; 409 if (s == NULL) { 410 // TODO: something more elegant here. 411 // perhaps API should return the string by value. 412 // Note: thread unsafe init & leak are semi-ok, better than 413 // what was before. Sould be cleaned up, though. 414 s = new UnicodeString; 415 } 416 return *s; 417 } 418 } 419 420 //======================================================================= 421 // BreakIterator overrides 422 //======================================================================= 423 424 /** 425 * Return a CharacterIterator over the text being analyzed. 426 */ 427 CharacterIterator& 428 RuleBasedBreakIterator::getText() const { 429 return *fCharIter; 430 } 431 432 /** 433 * Set the iterator to analyze a new piece of text. This function resets 434 * the current iteration position to the beginning of the text. 435 * @param newText An iterator over the text to analyze. 436 */ 437 void 438 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 439 // If we are holding a CharacterIterator adopted from a 440 // previous call to this function, delete it now. 441 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 442 delete fCharIter; 443 } 444 445 fCharIter = newText; 446 UErrorCode status = U_ZERO_ERROR; 447 reset(); 448 if (newText==NULL || newText->startIndex() != 0) { 449 // startIndex !=0 wants to be an error, but there's no way to report it. 450 // Make the iterator text be an empty string. 451 fText = utext_openUChars(fText, NULL, 0, &status); 452 } else { 453 fText = utext_openCharacterIterator(fText, newText, &status); 454 } 455 this->first(); 456 } 457 458 /** 459 * Set the iterator to analyze a new piece of text. This function resets 460 * the current iteration position to the beginning of the text. 461 * @param newText An iterator over the text to analyze. 462 */ 463 void 464 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 465 UErrorCode status = U_ZERO_ERROR; 466 reset(); 467 fText = utext_openConstUnicodeString(fText, &newText, &status); 468 469 // Set up a character iterator on the string. 470 // Needed in case someone calls getText(). 471 // Can not, unfortunately, do this lazily on the (probably never) 472 // call to getText(), because getText is const. 473 if (fSCharIter == NULL) { 474 fSCharIter = new StringCharacterIterator(newText); 475 } else { 476 fSCharIter->setText(newText); 477 } 478 479 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 480 // old fCharIter was adopted from the outside. Delete it. 481 delete fCharIter; 482 } 483 fCharIter = fSCharIter; 484 485 this->first(); 486 } 487 488 489 490 /** 491 * Sets the current iteration position to the beginning of the text. 492 * @return The offset of the beginning of the text. 493 */ 494 int32_t RuleBasedBreakIterator::first(void) { 495 reset(); 496 fLastRuleStatusIndex = 0; 497 fLastStatusIndexValid = TRUE; 498 //if (fText == NULL) 499 // return BreakIterator::DONE; 500 501 utext_setNativeIndex(fText, 0); 502 return 0; 503 } 504 505 /** 506 * Sets the current iteration position to the end of the text. 507 * @return The text's past-the-end offset. 508 */ 509 int32_t RuleBasedBreakIterator::last(void) { 510 reset(); 511 if (fText == NULL) { 512 fLastRuleStatusIndex = 0; 513 fLastStatusIndexValid = TRUE; 514 return BreakIterator::DONE; 515 } 516 517 fLastStatusIndexValid = FALSE; 518 int32_t pos = (int32_t)utext_nativeLength(fText); 519 utext_setNativeIndex(fText, pos); 520 return pos; 521 } 522 523 /** 524 * Advances the iterator either forward or backward the specified number of steps. 525 * Negative values move backward, and positive values move forward. This is 526 * equivalent to repeatedly calling next() or previous(). 527 * @param n The number of steps to move. The sign indicates the direction 528 * (negative is backwards, and positive is forwards). 529 * @return The character offset of the boundary position n boundaries away from 530 * the current one. 531 */ 532 int32_t RuleBasedBreakIterator::next(int32_t n) { 533 int32_t result = current(); 534 while (n > 0) { 535 result = next(); 536 --n; 537 } 538 while (n < 0) { 539 result = previous(); 540 ++n; 541 } 542 return result; 543 } 544 545 /** 546 * Advances the iterator to the next boundary position. 547 * @return The position of the first boundary after this one. 548 */ 549 int32_t RuleBasedBreakIterator::next(void) { 550 // if we have cached break positions and we're still in the range 551 // covered by them, just move one step forward in the cache 552 if (fCachedBreakPositions != NULL) { 553 if (fPositionInCache < fNumCachedBreakPositions - 1) { 554 ++fPositionInCache; 555 int32_t pos = fCachedBreakPositions[fPositionInCache]; 556 utext_setNativeIndex(fText, pos); 557 return pos; 558 } 559 else { 560 reset(); 561 } 562 } 563 564 int32_t startPos = current(); 565 int32_t result = handleNext(fData->fForwardTable); 566 if (fDictionaryCharCount > 0) { 567 result = checkDictionary(startPos, result, FALSE); 568 } 569 return result; 570 } 571 572 /** 573 * Advances the iterator backwards, to the last boundary preceding this one. 574 * @return The position of the last boundary position preceding this one. 575 */ 576 int32_t RuleBasedBreakIterator::previous(void) { 577 int32_t result; 578 int32_t startPos; 579 580 // if we have cached break positions and we're still in the range 581 // covered by them, just move one step backward in the cache 582 if (fCachedBreakPositions != NULL) { 583 if (fPositionInCache > 0) { 584 --fPositionInCache; 585 // If we're at the beginning of the cache, need to reevaluate the 586 // rule status 587 if (fPositionInCache <= 0) { 588 fLastStatusIndexValid = FALSE; 589 } 590 int32_t pos = fCachedBreakPositions[fPositionInCache]; 591 utext_setNativeIndex(fText, pos); 592 return pos; 593 } 594 else { 595 reset(); 596 } 597 } 598 599 // if we're already sitting at the beginning of the text, return DONE 600 if (fText == NULL || (startPos = current()) == 0) { 601 fLastRuleStatusIndex = 0; 602 fLastStatusIndexValid = TRUE; 603 return BreakIterator::DONE; 604 } 605 606 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 607 result = handlePrevious(fData->fReverseTable); 608 if (fDictionaryCharCount > 0) { 609 result = checkDictionary(result, startPos, TRUE); 610 } 611 return result; 612 } 613 614 // old rule syntax 615 // set things up. handlePrevious() will back us up to some valid 616 // break position before the current position (we back our internal 617 // iterator up one step to prevent handlePrevious() from returning 618 // the current position), but not necessarily the last one before 619 620 // where we started 621 622 int32_t start = current(); 623 624 (void)UTEXT_PREVIOUS32(fText); 625 int32_t lastResult = handlePrevious(fData->fReverseTable); 626 if (lastResult == UBRK_DONE) { 627 lastResult = 0; 628 utext_setNativeIndex(fText, 0); 629 } 630 result = lastResult; 631 int32_t lastTag = 0; 632 UBool breakTagValid = FALSE; 633 634 // iterate forward from the known break position until we pass our 635 // starting point. The last break position before the starting 636 // point is our return value 637 638 for (;;) { 639 result = next(); 640 if (result == BreakIterator::DONE || result >= start) { 641 break; 642 } 643 lastResult = result; 644 lastTag = fLastRuleStatusIndex; 645 breakTagValid = TRUE; 646 } 647 648 // fLastBreakTag wants to have the value for section of text preceding 649 // the result position that we are to return (in lastResult.) If 650 // the backwards rules overshot and the above loop had to do two or more 651 // next()s to move up to the desired return position, we will have a valid 652 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 653 // we wont have a tag value for that position, which is only set by handleNext(). 654 655 // set the current iteration position to be the last break position 656 // before where we started, and then return that value 657 utext_setNativeIndex(fText, lastResult); 658 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 659 fLastStatusIndexValid = breakTagValid; 660 661 // No need to check the dictionary; it will have been handled by 662 // next() 663 664 return lastResult; 665 } 666 667 /** 668 * Sets the iterator to refer to the first boundary position following 669 * the specified position. 670 * @offset The position from which to begin searching for a break position. 671 * @return The position of the first break after the current position. 672 */ 673 int32_t RuleBasedBreakIterator::following(int32_t offset) { 674 // if we have cached break positions and offset is in the range 675 // covered by them, use them 676 // TODO: could use binary search 677 // TODO: what if offset is outside range, but break is not? 678 if (fCachedBreakPositions != NULL) { 679 if (offset >= fCachedBreakPositions[0] 680 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 681 fPositionInCache = 0; 682 // We are guaranteed not to leave the array due to range test above 683 while (offset >= fCachedBreakPositions[fPositionInCache]) { 684 ++fPositionInCache; 685 } 686 int32_t pos = fCachedBreakPositions[fPositionInCache]; 687 utext_setNativeIndex(fText, pos); 688 return pos; 689 } 690 else { 691 reset(); 692 } 693 } 694 695 // if the offset passed in is already past the end of the text, 696 // just return DONE; if it's before the beginning, return the 697 // text's starting offset 698 fLastRuleStatusIndex = 0; 699 fLastStatusIndexValid = TRUE; 700 if (fText == NULL || offset >= utext_nativeLength(fText)) { 701 last(); 702 return next(); 703 } 704 else if (offset < 0) { 705 return first(); 706 } 707 708 // otherwise, set our internal iteration position (temporarily) 709 // to the position passed in. If this is the _beginning_ position, 710 // then we can just use next() to get our return value 711 712 int32_t result = 0; 713 714 if (fData->fSafeRevTable != NULL) { 715 // new rule syntax 716 utext_setNativeIndex(fText, offset); 717 // move forward one codepoint to prepare for moving back to a 718 // safe point. 719 // this handles offset being between a supplementary character 720 (void)UTEXT_NEXT32(fText); 721 // handlePrevious will move most of the time to < 1 boundary away 722 handlePrevious(fData->fSafeRevTable); 723 int32_t result = next(); 724 while (result <= offset) { 725 result = next(); 726 } 727 return result; 728 } 729 if (fData->fSafeFwdTable != NULL) { 730 // backup plan if forward safe table is not available 731 utext_setNativeIndex(fText, offset); 732 (void)UTEXT_PREVIOUS32(fText); 733 // handle next will give result >= offset 734 handleNext(fData->fSafeFwdTable); 735 // previous will give result 0 or 1 boundary away from offset, 736 // most of the time 737 // we have to 738 int32_t oldresult = previous(); 739 while (oldresult > offset) { 740 int32_t result = previous(); 741 if (result <= offset) { 742 return oldresult; 743 } 744 oldresult = result; 745 } 746 int32_t result = next(); 747 if (result <= offset) { 748 return next(); 749 } 750 return result; 751 } 752 // otherwise, we have to sync up first. Use handlePrevious() to back 753 // up to a known break position before the specified position (if 754 // we can determine that the specified position is a break position, 755 // we don't back up at all). This may or may not be the last break 756 // position at or before our starting position. Advance forward 757 // from here until we've passed the starting position. The position 758 // we stop on will be the first break position after the specified one. 759 // old rule syntax 760 761 utext_setNativeIndex(fText, offset); 762 if (offset==0 || 763 (offset==1 && utext_getNativeIndex(fText)==0)) { 764 return next(); 765 } 766 result = previous(); 767 768 while (result != BreakIterator::DONE && result <= offset) { 769 result = next(); 770 } 771 772 return result; 773 } 774 775 /** 776 * Sets the iterator to refer to the last boundary position before the 777 * specified position. 778 * @offset The position to begin searching for a break from. 779 * @return The position of the last boundary before the starting position. 780 */ 781 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 782 // if we have cached break positions and offset is in the range 783 // covered by them, use them 784 if (fCachedBreakPositions != NULL) { 785 // TODO: binary search? 786 // TODO: What if offset is outside range, but break is not? 787 if (offset > fCachedBreakPositions[0] 788 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 789 fPositionInCache = 0; 790 while (fPositionInCache < fNumCachedBreakPositions 791 && offset > fCachedBreakPositions[fPositionInCache]) 792 ++fPositionInCache; 793 --fPositionInCache; 794 // If we're at the beginning of the cache, need to reevaluate the 795 // rule status 796 if (fPositionInCache <= 0) { 797 fLastStatusIndexValid = FALSE; 798 } 799 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 800 return fCachedBreakPositions[fPositionInCache]; 801 } 802 else { 803 reset(); 804 } 805 } 806 807 // if the offset passed in is already past the end of the text, 808 // just return DONE; if it's before the beginning, return the 809 // text's starting offset 810 if (fText == NULL || offset > utext_nativeLength(fText)) { 811 // return BreakIterator::DONE; 812 return last(); 813 } 814 else if (offset < 0) { 815 return first(); 816 } 817 818 // if we start by updating the current iteration position to the 819 // position specified by the caller, we can just use previous() 820 // to carry out this operation 821 822 if (fData->fSafeFwdTable != NULL) { 823 // new rule syntax 824 utext_setNativeIndex(fText, offset); 825 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 826 if (newOffset != offset) { 827 // Will come here if specified offset was not a code point boundary AND 828 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 829 // indices to the containing code point. 830 // For breakitereator::preceding only, these non-code-point indices need to be moved 831 // up to refer to the following codepoint. 832 (void)UTEXT_NEXT32(fText); 833 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 834 } 835 836 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 837 // rather than adjusting the position unconditionally? 838 // (Change would interact with safe rules.) 839 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 840 // affects only preceding(), seems cleaner, but is slightly different. 841 (void)UTEXT_PREVIOUS32(fText); 842 handleNext(fData->fSafeFwdTable); 843 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 844 while (result >= offset) { 845 result = previous(); 846 } 847 return result; 848 } 849 if (fData->fSafeRevTable != NULL) { 850 // backup plan if forward safe table is not available 851 // TODO: check whether this path can be discarded 852 // It's probably OK to say that rules must supply both safe tables 853 // if they use safe tables at all. We have certainly never described 854 // to anyone how to work with just one safe table. 855 utext_setNativeIndex(fText, offset); 856 (void)UTEXT_NEXT32(fText); 857 858 // handle previous will give result <= offset 859 handlePrevious(fData->fSafeRevTable); 860 861 // next will give result 0 or 1 boundary away from offset, 862 // most of the time 863 // we have to 864 int32_t oldresult = next(); 865 while (oldresult < offset) { 866 int32_t result = next(); 867 if (result >= offset) { 868 return oldresult; 869 } 870 oldresult = result; 871 } 872 int32_t result = previous(); 873 if (result >= offset) { 874 return previous(); 875 } 876 return result; 877 } 878 879 // old rule syntax 880 utext_setNativeIndex(fText, offset); 881 return previous(); 882 } 883 884 /** 885 * Returns true if the specfied position is a boundary position. As a side 886 * effect, leaves the iterator pointing to the first boundary position at 887 * or after "offset". 888 * @param offset the offset to check. 889 * @return True if "offset" is a boundary position. 890 */ 891 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 892 // the beginning index of the iterator is always a boundary position by definition 893 if (offset == 0) { 894 first(); // For side effects on current position, tag values. 895 return TRUE; 896 } 897 898 if (offset == (int32_t)utext_nativeLength(fText)) { 899 last(); // For side effects on current position, tag values. 900 return TRUE; 901 } 902 903 // out-of-range indexes are never boundary positions 904 if (offset < 0) { 905 first(); // For side effects on current position, tag values. 906 return FALSE; 907 } 908 909 if (offset > utext_nativeLength(fText)) { 910 last(); // For side effects on current position, tag values. 911 return FALSE; 912 } 913 914 // otherwise, we can use following() on the position before the specified 915 // one and return true if the position we get back is the one the user 916 // specified 917 utext_previous32From(fText, offset); 918 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 919 UBool result = following(backOne) == offset; 920 return result; 921 } 922 923 /** 924 * Returns the current iteration position. 925 * @return The current iteration position. 926 */ 927 int32_t RuleBasedBreakIterator::current(void) const { 928 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 929 return pos; 930 } 931 932 //======================================================================= 933 // implementation 934 //======================================================================= 935 936 // 937 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 938 // of user text. A variable with this enum type keeps track of where we 939 // are. The state machine only fetches user input while in the RUN mode. 940 // 941 enum RBBIRunMode { 942 RBBI_START, // state machine processing is before first char of input 943 RBBI_RUN, // state machine processing is in the user text 944 RBBI_END // state machine processing is after end of user text. 945 }; 946 947 948 //----------------------------------------------------------------------------------- 949 // 950 // handleNext(stateTable) 951 // This method is the actual implementation of the rbbi next() method. 952 // This method initializes the state machine to state 1 953 // and advances through the text character by character until we reach the end 954 // of the text or the state machine transitions to state 0. We update our return 955 // value every time the state machine passes through an accepting state. 956 // 957 //----------------------------------------------------------------------------------- 958 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 959 int32_t state; 960 int16_t category = 0; 961 RBBIRunMode mode; 962 963 RBBIStateTableRow *row; 964 UChar32 c; 965 int32_t lookaheadStatus = 0; 966 int32_t lookaheadTagIdx = 0; 967 int32_t result = 0; 968 int32_t initialPosition = 0; 969 int32_t lookaheadResult = 0; 970 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 971 const char *tableData = statetable->fTableData; 972 uint32_t tableRowLen = statetable->fRowLen; 973 974 #ifdef RBBI_DEBUG 975 if (fTrace) { 976 RBBIDebugPuts("Handle Next pos char state category"); 977 } 978 #endif 979 980 // No matter what, handleNext alway correctly sets the break tag value. 981 fLastStatusIndexValid = TRUE; 982 fLastRuleStatusIndex = 0; 983 984 // if we're already at the end of the text, return DONE. 985 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 986 result = initialPosition; 987 c = UTEXT_NEXT32(fText); 988 if (fData == NULL || c==U_SENTINEL) { 989 return BreakIterator::DONE; 990 } 991 992 // Set the initial state for the state machine 993 state = START_STATE; 994 row = (RBBIStateTableRow *) 995 //(statetable->fTableData + (statetable->fRowLen * state)); 996 (tableData + tableRowLen * state); 997 998 999 mode = RBBI_RUN; 1000 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1001 category = 2; 1002 mode = RBBI_START; 1003 } 1004 1005 1006 // loop until we reach the end of the text or transition to state 0 1007 // 1008 for (;;) { 1009 if (c == U_SENTINEL) { 1010 // Reached end of input string. 1011 if (mode == RBBI_END) { 1012 // We have already run the loop one last time with the 1013 // character set to the psueudo {eof} value. Now it is time 1014 // to unconditionally bail out. 1015 if (lookaheadResult > result) { 1016 // We ran off the end of the string with a pending look-ahead match. 1017 // Treat this as if the look-ahead condition had been met, and return 1018 // the match at the / position from the look-ahead rule. 1019 result = lookaheadResult; 1020 fLastRuleStatusIndex = lookaheadTagIdx; 1021 lookaheadStatus = 0; 1022 } 1023 break; 1024 } 1025 // Run the loop one last time with the fake end-of-input character category. 1026 mode = RBBI_END; 1027 category = 1; 1028 } 1029 1030 // 1031 // Get the char category. An incoming category of 1 or 2 means that 1032 // we are preset for doing the beginning or end of input, and 1033 // that we shouldn't get a category from an actual text input character. 1034 // 1035 if (mode == RBBI_RUN) { 1036 // look up the current character's character category, which tells us 1037 // which column in the state table to look at. 1038 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1039 // not the size of the character going in, which is a UChar32. 1040 // 1041 UTRIE_GET16(&fData->fTrie, c, category); 1042 1043 // Check the dictionary bit in the character's category. 1044 // Counter is only used by dictionary based iterators (subclasses). 1045 // Chars that need to be handled by a dictionary have a flag bit set 1046 // in their category values. 1047 // 1048 if ((category & 0x4000) != 0) { 1049 fDictionaryCharCount++; 1050 // And off the dictionary flag bit. 1051 category &= ~0x4000; 1052 } 1053 } 1054 1055 #ifdef RBBI_DEBUG 1056 if (fTrace) { 1057 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1058 if (0x20<=c && c<0x7f) { 1059 RBBIDebugPrintf("\"%c\" ", c); 1060 } else { 1061 RBBIDebugPrintf("%5x ", c); 1062 } 1063 RBBIDebugPrintf("%3d %3d\n", state, category); 1064 } 1065 #endif 1066 1067 // State Transition - move machine to its next state 1068 // 1069 state = row->fNextState[category]; 1070 row = (RBBIStateTableRow *) 1071 // (statetable->fTableData + (statetable->fRowLen * state)); 1072 (tableData + tableRowLen * state); 1073 1074 1075 if (row->fAccepting == -1) { 1076 // Match found, common case. 1077 if (mode != RBBI_START) { 1078 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1079 } 1080 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1081 } 1082 1083 if (row->fLookAhead != 0) { 1084 if (lookaheadStatus != 0 1085 && row->fAccepting == lookaheadStatus) { 1086 // Lookahead match is completed. 1087 result = lookaheadResult; 1088 fLastRuleStatusIndex = lookaheadTagIdx; 1089 lookaheadStatus = 0; 1090 // TODO: make a standalone hard break in a rule work. 1091 if (lookAheadHardBreak) { 1092 UTEXT_SETNATIVEINDEX(fText, result); 1093 return result; 1094 } 1095 // Look-ahead completed, but other rules may match further. Continue on 1096 // TODO: junk this feature? I don't think it's used anywhwere. 1097 goto continueOn; 1098 } 1099 1100 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1101 lookaheadResult = r; 1102 lookaheadStatus = row->fLookAhead; 1103 lookaheadTagIdx = row->fTagIdx; 1104 goto continueOn; 1105 } 1106 1107 1108 if (row->fAccepting != 0) { 1109 // Because this is an accepting state, any in-progress look-ahead match 1110 // is no longer relavant. Clear out the pending lookahead status. 1111 lookaheadStatus = 0; // clear out any pending look-ahead match. 1112 } 1113 1114 continueOn: 1115 if (state == STOP_STATE) { 1116 // This is the normal exit from the lookup state machine. 1117 // We have advanced through the string until it is certain that no 1118 // longer match is possible, no matter what characters follow. 1119 break; 1120 } 1121 1122 // Advance to the next character. 1123 // If this is a beginning-of-input loop iteration, don't advance 1124 // the input position. The next iteration will be processing the 1125 // first real input character. 1126 if (mode == RBBI_RUN) { 1127 c = UTEXT_NEXT32(fText); 1128 } else { 1129 if (mode == RBBI_START) { 1130 mode = RBBI_RUN; 1131 } 1132 } 1133 1134 1135 } 1136 1137 // The state machine is done. Check whether it found a match... 1138 1139 // If the iterator failed to advance in the match engine, force it ahead by one. 1140 // (This really indicates a defect in the break rules. They should always match 1141 // at least one character.) 1142 if (result == initialPosition) { 1143 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1144 UTEXT_NEXT32(fText); 1145 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1146 } 1147 1148 // Leave the iterator at our result position. 1149 UTEXT_SETNATIVEINDEX(fText, result); 1150 #ifdef RBBI_DEBUG 1151 if (fTrace) { 1152 RBBIDebugPrintf("result = %d\n\n", result); 1153 } 1154 #endif 1155 return result; 1156 } 1157 1158 1159 1160 //----------------------------------------------------------------------------------- 1161 // 1162 // handlePrevious() 1163 // 1164 // Iterate backwards, according to the logic of the reverse rules. 1165 // This version handles the exact style backwards rules. 1166 // 1167 // The logic of this function is very similar to handleNext(), above. 1168 // 1169 //----------------------------------------------------------------------------------- 1170 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1171 int32_t state; 1172 int16_t category = 0; 1173 RBBIRunMode mode; 1174 RBBIStateTableRow *row; 1175 UChar32 c; 1176 int32_t lookaheadStatus = 0; 1177 int32_t result = 0; 1178 int32_t initialPosition = 0; 1179 int32_t lookaheadResult = 0; 1180 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1181 1182 #ifdef RBBI_DEBUG 1183 if (fTrace) { 1184 RBBIDebugPuts("Handle Previous pos char state category"); 1185 } 1186 #endif 1187 1188 // handlePrevious() never gets the rule status. 1189 // Flag the status as invalid; if the user ever asks for status, we will need 1190 // to back up, then re-find the break position using handleNext(), which does 1191 // get the status value. 1192 fLastStatusIndexValid = FALSE; 1193 fLastRuleStatusIndex = 0; 1194 1195 // if we're already at the start of the text, return DONE. 1196 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1197 return BreakIterator::DONE; 1198 } 1199 1200 // Set up the starting char. 1201 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1202 result = initialPosition; 1203 c = UTEXT_PREVIOUS32(fText); 1204 1205 // Set the initial state for the state machine 1206 state = START_STATE; 1207 row = (RBBIStateTableRow *) 1208 (statetable->fTableData + (statetable->fRowLen * state)); 1209 category = 3; 1210 mode = RBBI_RUN; 1211 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1212 category = 2; 1213 mode = RBBI_START; 1214 } 1215 1216 1217 // loop until we reach the start of the text or transition to state 0 1218 // 1219 for (;;) { 1220 if (c == U_SENTINEL) { 1221 // Reached end of input string. 1222 if (mode == RBBI_END) { 1223 // We have already run the loop one last time with the 1224 // character set to the psueudo {eof} value. Now it is time 1225 // to unconditionally bail out. 1226 if (lookaheadResult < result) { 1227 // We ran off the end of the string with a pending look-ahead match. 1228 // Treat this as if the look-ahead condition had been met, and return 1229 // the match at the / position from the look-ahead rule. 1230 result = lookaheadResult; 1231 lookaheadStatus = 0; 1232 } else if (result == initialPosition) { 1233 // Ran off start, no match found. 1234 // move one index one (towards the start, since we are doing a previous()) 1235 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1236 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1237 } 1238 break; 1239 } 1240 // Run the loop one last time with the fake end-of-input character category. 1241 mode = RBBI_END; 1242 category = 1; 1243 } 1244 1245 // 1246 // Get the char category. An incoming category of 1 or 2 means that 1247 // we are preset for doing the beginning or end of input, and 1248 // that we shouldn't get a category from an actual text input character. 1249 // 1250 if (mode == RBBI_RUN) { 1251 // look up the current character's character category, which tells us 1252 // which column in the state table to look at. 1253 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1254 // not the size of the character going in, which is a UChar32. 1255 // 1256 UTRIE_GET16(&fData->fTrie, c, category); 1257 1258 // Check the dictionary bit in the character's category. 1259 // Counter is only used by dictionary based iterators (subclasses). 1260 // Chars that need to be handled by a dictionary have a flag bit set 1261 // in their category values. 1262 // 1263 if ((category & 0x4000) != 0) { 1264 fDictionaryCharCount++; 1265 // And off the dictionary flag bit. 1266 category &= ~0x4000; 1267 } 1268 } 1269 1270 #ifdef RBBI_DEBUG 1271 if (fTrace) { 1272 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1273 if (0x20<=c && c<0x7f) { 1274 RBBIDebugPrintf("\"%c\" ", c); 1275 } else { 1276 RBBIDebugPrintf("%5x ", c); 1277 } 1278 RBBIDebugPrintf("%3d %3d\n", state, category); 1279 } 1280 #endif 1281 1282 // State Transition - move machine to its next state 1283 // 1284 state = row->fNextState[category]; 1285 row = (RBBIStateTableRow *) 1286 (statetable->fTableData + (statetable->fRowLen * state)); 1287 1288 if (row->fAccepting == -1) { 1289 // Match found, common case. 1290 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1291 } 1292 1293 if (row->fLookAhead != 0) { 1294 if (lookaheadStatus != 0 1295 && row->fAccepting == lookaheadStatus) { 1296 // Lookahead match is completed. 1297 result = lookaheadResult; 1298 lookaheadStatus = 0; 1299 // TODO: make a standalone hard break in a rule work. 1300 if (lookAheadHardBreak) { 1301 UTEXT_SETNATIVEINDEX(fText, result); 1302 return result; 1303 } 1304 // Look-ahead completed, but other rules may match further. Continue on 1305 // TODO: junk this feature? I don't think it's used anywhwere. 1306 goto continueOn; 1307 } 1308 1309 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1310 lookaheadResult = r; 1311 lookaheadStatus = row->fLookAhead; 1312 goto continueOn; 1313 } 1314 1315 1316 if (row->fAccepting != 0) { 1317 // Because this is an accepting state, any in-progress look-ahead match 1318 // is no longer relavant. Clear out the pending lookahead status. 1319 lookaheadStatus = 0; 1320 } 1321 1322 continueOn: 1323 if (state == STOP_STATE) { 1324 // This is the normal exit from the lookup state machine. 1325 // We have advanced through the string until it is certain that no 1326 // longer match is possible, no matter what characters follow. 1327 break; 1328 } 1329 1330 // Move (backwards) to the next character to process. 1331 // If this is a beginning-of-input loop iteration, don't advance 1332 // the input position. The next iteration will be processing the 1333 // first real input character. 1334 if (mode == RBBI_RUN) { 1335 c = UTEXT_PREVIOUS32(fText); 1336 } else { 1337 if (mode == RBBI_START) { 1338 mode = RBBI_RUN; 1339 } 1340 } 1341 } 1342 1343 // The state machine is done. Check whether it found a match... 1344 1345 // If the iterator failed to advance in the match engine, force it ahead by one. 1346 // (This really indicates a defect in the break rules. They should always match 1347 // at least one character.) 1348 if (result == initialPosition) { 1349 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1350 UTEXT_PREVIOUS32(fText); 1351 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1352 } 1353 1354 // Leave the iterator at our result position. 1355 UTEXT_SETNATIVEINDEX(fText, result); 1356 #ifdef RBBI_DEBUG 1357 if (fTrace) { 1358 RBBIDebugPrintf("result = %d\n\n", result); 1359 } 1360 #endif 1361 return result; 1362 } 1363 1364 1365 void 1366 RuleBasedBreakIterator::reset() 1367 { 1368 if (fCachedBreakPositions) { 1369 uprv_free(fCachedBreakPositions); 1370 } 1371 fCachedBreakPositions = NULL; 1372 fNumCachedBreakPositions = 0; 1373 fDictionaryCharCount = 0; 1374 fPositionInCache = 0; 1375 } 1376 1377 1378 1379 //------------------------------------------------------------------------------- 1380 // 1381 // getRuleStatus() Return the break rule tag associated with the current 1382 // iterator position. If the iterator arrived at its current 1383 // position by iterating forwards, the value will have been 1384 // cached by the handleNext() function. 1385 // 1386 // If no cached status value is available, the status is 1387 // found by doing a previous() followed by a next(), which 1388 // leaves the iterator where it started, and computes the 1389 // status while doing the next(). 1390 // 1391 //------------------------------------------------------------------------------- 1392 void RuleBasedBreakIterator::makeRuleStatusValid() { 1393 if (fLastStatusIndexValid == FALSE) { 1394 // No cached status is available. 1395 if (fText == NULL || current() == 0) { 1396 // At start of text, or there is no text. Status is always zero. 1397 fLastRuleStatusIndex = 0; 1398 fLastStatusIndexValid = TRUE; 1399 } else { 1400 // Not at start of text. Find status the tedious way. 1401 int32_t pa = current(); 1402 previous(); 1403 if (fNumCachedBreakPositions > 0) { 1404 reset(); // Blow off the dictionary cache 1405 } 1406 int32_t pb = next(); 1407 if (pa != pb) { 1408 // note: the if (pa != pb) test is here only to eliminate warnings for 1409 // unused local variables on gcc. Logically, it isn't needed. 1410 U_ASSERT(pa == pb); 1411 } 1412 } 1413 } 1414 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1415 } 1416 1417 1418 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1419 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1420 nonConstThis->makeRuleStatusValid(); 1421 1422 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1423 // (the number of status values.) 1424 // This function returns the last (largest) of the array of status values. 1425 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1426 int32_t tagVal = fData->fRuleStatusTable[idx]; 1427 1428 return tagVal; 1429 } 1430 1431 1432 1433 1434 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1435 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1436 { 1437 if (U_FAILURE(status)) { 1438 return 0; 1439 } 1440 1441 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1442 nonConstThis->makeRuleStatusValid(); 1443 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1444 int32_t numValsToCopy = numVals; 1445 if (numVals > capacity) { 1446 status = U_BUFFER_OVERFLOW_ERROR; 1447 numValsToCopy = capacity; 1448 } 1449 int i; 1450 for (i=0; i<numValsToCopy; i++) { 1451 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1452 } 1453 return numVals; 1454 } 1455 1456 1457 1458 //------------------------------------------------------------------------------- 1459 // 1460 // getBinaryRules Access to the compiled form of the rules, 1461 // for use by build system tools that save the data 1462 // for standard iterator types. 1463 // 1464 //------------------------------------------------------------------------------- 1465 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1466 const uint8_t *retPtr = NULL; 1467 length = 0; 1468 1469 if (fData != NULL) { 1470 retPtr = (const uint8_t *)fData->fHeader; 1471 length = fData->fHeader->fLength; 1472 } 1473 return retPtr; 1474 } 1475 1476 1477 1478 1479 //------------------------------------------------------------------------------- 1480 // 1481 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated. 1482 // Saving one heap allocation isn't worth the trouble. 1483 // Cloning shouldn't be done in tight loops, and 1484 // making the clone copy involves other heap operations anyway. 1485 // And the application code for correctly dealing with buffer 1486 // size problems and the eventual object destruction is ugly. 1487 // 1488 //------------------------------------------------------------------------------- 1489 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, 1490 int32_t &bufferSize, 1491 UErrorCode &status) 1492 { 1493 if (U_FAILURE(status)){ 1494 return NULL; 1495 } 1496 1497 // 1498 // If user buffer size is zero this is a preflight operation to 1499 // obtain the needed buffer size, allowing for worst case misalignment. 1500 // 1501 if (bufferSize == 0) { 1502 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); 1503 return NULL; 1504 } 1505 1506 1507 // 1508 // Check the alignment and size of the user supplied buffer. 1509 // Allocate heap memory if the user supplied memory is insufficient. 1510 // 1511 char *buf = (char *)stackBuffer; 1512 uint32_t s = bufferSize; 1513 1514 if (stackBuffer == NULL) { 1515 s = 0; // Ignore size, force allocation if user didn't give us a buffer. 1516 } 1517 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 1518 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); 1519 s -= offsetUp; 1520 buf += offsetUp; 1521 } 1522 if (s < sizeof(RuleBasedBreakIterator)) { 1523 // Not enough room in the caller-supplied buffer. 1524 // Do a plain-vanilla heap based clone and return that, along with 1525 // a warning that the clone was allocated. 1526 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); 1527 if (clonedBI == 0) { 1528 status = U_MEMORY_ALLOCATION_ERROR; 1529 } else { 1530 status = U_SAFECLONE_ALLOCATED_WARNING; 1531 } 1532 return clonedBI; 1533 } 1534 1535 // 1536 // Clone the source BI into the caller-supplied buffer. 1537 // 1538 RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this); 1539 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) 1540 1541 return clone; 1542 } 1543 1544 1545 //------------------------------------------------------------------------------- 1546 // 1547 // isDictionaryChar Return true if the category lookup for this char 1548 // indicates that it is in the set of dictionary lookup 1549 // chars. 1550 // 1551 // This function is intended for use by dictionary based 1552 // break iterators. 1553 // 1554 //------------------------------------------------------------------------------- 1555 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1556 if (fData == NULL) { 1557 return FALSE; 1558 } 1559 uint16_t category; 1560 UTRIE_GET16(&fData->fTrie, c, category); 1561 return (category & 0x4000) != 0; 1562 }*/ 1563 1564 1565 //------------------------------------------------------------------------------- 1566 // 1567 // checkDictionary This function handles all processing of characters in 1568 // the "dictionary" set. It will determine the appropriate 1569 // course of action, and possibly set up a cache in the 1570 // process. 1571 // 1572 //------------------------------------------------------------------------------- 1573 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1574 int32_t endPos, 1575 UBool reverse) { 1576 // Reset the old break cache first. 1577 uint32_t dictionaryCount = fDictionaryCharCount; 1578 reset(); 1579 1580 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 1581 return (reverse ? startPos : endPos); 1582 } 1583 1584 // Bug 5532. The dictionary code will crash if the input text is UTF-8 1585 // because native indexes are different from UTF-16 indexes. 1586 // Temporary hack: skip dictionary lookup for UTF-8 encoded text. 1587 // It wont give the right breaks, but it's better than a crash. 1588 // 1589 // Check the type of the UText by checking its pFuncs field, which 1590 // is UText's function dispatch table. It will be the same for all 1591 // UTF-8 UTexts and different for any other UText type. 1592 // 1593 // We have no other type of UText available with non-UTF-16 native indexing. 1594 // This whole check will go away once the dictionary code is fixed. 1595 static const void *utext_utf8Funcs; 1596 if (utext_utf8Funcs == NULL) { 1597 // Cache the UTF-8 UText function pointer value. 1598 UErrorCode status = U_ZERO_ERROR; 1599 UText tempUText = UTEXT_INITIALIZER; 1600 utext_openUTF8(&tempUText, NULL, 0, &status); 1601 utext_utf8Funcs = tempUText.pFuncs; 1602 utext_close(&tempUText); 1603 } 1604 if (fText->pFuncs == utext_utf8Funcs) { 1605 return (reverse ? startPos : endPos); 1606 } 1607 1608 // Starting from the starting point, scan towards the proposed result, 1609 // looking for the first dictionary character (which may be the one 1610 // we're on, if we're starting in the middle of a range). 1611 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1612 if (reverse) { 1613 UTEXT_PREVIOUS32(fText); 1614 } 1615 1616 int32_t rangeStart = startPos; 1617 int32_t rangeEnd = endPos; 1618 1619 uint16_t category; 1620 int32_t current; 1621 UErrorCode status = U_ZERO_ERROR; 1622 UStack breaks(status); 1623 int32_t foundBreakCount = 0; 1624 UChar32 c = utext_current32(fText); 1625 1626 UTRIE_GET16(&fData->fTrie, c, category); 1627 1628 // Is the character we're starting on a dictionary character? If so, we 1629 // need to back up to include the entire run; otherwise the results of 1630 // the break algorithm will differ depending on where we start. Since 1631 // the result is cached and there is typically a non-dictionary break 1632 // within a small number of words, there should be little performance impact. 1633 if (category & 0x4000) { 1634 if (reverse) { 1635 do { 1636 utext_next32(fText); // TODO: recast to work directly with postincrement. 1637 c = utext_current32(fText); 1638 UTRIE_GET16(&fData->fTrie, c, category); 1639 } while (c != U_SENTINEL && (category & 0x4000)); 1640 // Back up to the last dictionary character 1641 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1642 if (c == U_SENTINEL) { 1643 // c = fText->last32(); 1644 // TODO: why was this if needed? 1645 c = UTEXT_PREVIOUS32(fText); 1646 } 1647 else { 1648 c = UTEXT_PREVIOUS32(fText); 1649 } 1650 } 1651 else { 1652 do { 1653 c = UTEXT_PREVIOUS32(fText); 1654 UTRIE_GET16(&fData->fTrie, c, category); 1655 } 1656 while (c != U_SENTINEL && (category & 0x4000)); 1657 // Back up to the last dictionary character 1658 if (c == U_SENTINEL) { 1659 // c = fText->first32(); 1660 c = utext_current32(fText); 1661 } 1662 else { 1663 utext_next32(fText); 1664 c = utext_current32(fText); 1665 } 1666 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1667 } 1668 UTRIE_GET16(&fData->fTrie, c, category); 1669 } 1670 1671 // Loop through the text, looking for ranges of dictionary characters. 1672 // For each span, find the appropriate break engine, and ask it to find 1673 // any breaks within the span. 1674 // Note: we always do this in the forward direction, so that the break 1675 // cache is built in the right order. 1676 if (reverse) { 1677 utext_setNativeIndex(fText, rangeStart); 1678 c = utext_current32(fText); 1679 UTRIE_GET16(&fData->fTrie, c, category); 1680 } 1681 while(U_SUCCESS(status)) { 1682 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1683 utext_next32(fText); // TODO: tweak for post-increment operation 1684 c = utext_current32(fText); 1685 UTRIE_GET16(&fData->fTrie, c, category); 1686 } 1687 if (current >= rangeEnd) { 1688 break; 1689 } 1690 1691 // We now have a dictionary character. Get the appropriate language object 1692 // to deal with it. 1693 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1694 1695 // Ask the language object if there are any breaks. It will leave the text 1696 // pointer on the other side of its range, ready to search for the next one. 1697 if (lbe != NULL) { 1698 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1699 } 1700 1701 // Reload the loop variables for the next go-round 1702 c = utext_current32(fText); 1703 UTRIE_GET16(&fData->fTrie, c, category); 1704 } 1705 1706 // If we found breaks, build a new break cache. The first and last entries must 1707 // be the original starting and ending position. 1708 if (foundBreakCount > 0) { 1709 int32_t totalBreaks = foundBreakCount; 1710 if (startPos < breaks.elementAti(0)) { 1711 totalBreaks += 1; 1712 } 1713 if (endPos > breaks.peeki()) { 1714 totalBreaks += 1; 1715 } 1716 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1717 if (fCachedBreakPositions != NULL) { 1718 int32_t out = 0; 1719 fNumCachedBreakPositions = totalBreaks; 1720 if (startPos < breaks.elementAti(0)) { 1721 fCachedBreakPositions[out++] = startPos; 1722 } 1723 for (int32_t i = 0; i < foundBreakCount; ++i) { 1724 fCachedBreakPositions[out++] = breaks.elementAti(i); 1725 } 1726 if (endPos > fCachedBreakPositions[out-1]) { 1727 fCachedBreakPositions[out] = endPos; 1728 } 1729 // If there are breaks, then by definition, we are replacing the original 1730 // proposed break by one of the breaks we found. Use following() and 1731 // preceding() to do the work. They should never recurse in this case. 1732 if (reverse) { 1733 return preceding(endPos - 1); 1734 } 1735 else { 1736 return following(startPos); 1737 } 1738 } 1739 // If the allocation failed, just fall through to the "no breaks found" case. 1740 } 1741 1742 // If we get here, there were no language-based breaks. Set the text pointer 1743 // to the original proposed break. 1744 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1745 return (reverse ? startPos : endPos); 1746 } 1747 1748 U_NAMESPACE_END 1749 1750 // defined in ucln_cmn.h 1751 1752 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; 1753 1754 /** 1755 * Release all static memory held by breakiterator. 1756 */ 1757 U_CDECL_BEGIN 1758 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1759 if (gLanguageBreakFactories) { 1760 delete gLanguageBreakFactories; 1761 gLanguageBreakFactories = NULL; 1762 } 1763 return TRUE; 1764 } 1765 U_CDECL_END 1766 1767 U_CDECL_BEGIN 1768 static void U_CALLCONV _deleteFactory(void *obj) { 1769 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; 1770 } 1771 U_CDECL_END 1772 U_NAMESPACE_BEGIN 1773 1774 static const LanguageBreakEngine* 1775 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1776 { 1777 UBool needsInit; 1778 UErrorCode status = U_ZERO_ERROR; 1779 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); 1780 1781 if (needsInit) { 1782 UStack *factories = new UStack(_deleteFactory, NULL, status); 1783 if (factories != NULL && U_SUCCESS(status)) { 1784 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1785 factories->push(builtIn, status); 1786 #ifdef U_LOCAL_SERVICE_HOOK 1787 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1788 if (extra != NULL) { 1789 factories->push(extra, status); 1790 } 1791 #endif 1792 } 1793 umtx_lock(NULL); 1794 if (gLanguageBreakFactories == NULL) { 1795 gLanguageBreakFactories = factories; 1796 factories = NULL; 1797 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1798 } 1799 umtx_unlock(NULL); 1800 delete factories; 1801 } 1802 1803 if (gLanguageBreakFactories == NULL) { 1804 return NULL; 1805 } 1806 1807 int32_t i = gLanguageBreakFactories->size(); 1808 const LanguageBreakEngine *lbe = NULL; 1809 while (--i >= 0) { 1810 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1811 lbe = factory->getEngineFor(c, breakType); 1812 if (lbe != NULL) { 1813 break; 1814 } 1815 } 1816 return lbe; 1817 } 1818 1819 1820 //------------------------------------------------------------------------------- 1821 // 1822 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1823 // the characer c. 1824 // 1825 //------------------------------------------------------------------------------- 1826 const LanguageBreakEngine * 1827 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1828 const LanguageBreakEngine *lbe = NULL; 1829 UErrorCode status = U_ZERO_ERROR; 1830 1831 if (fLanguageBreakEngines == NULL) { 1832 fLanguageBreakEngines = new UStack(status); 1833 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1834 delete fLanguageBreakEngines; 1835 fLanguageBreakEngines = 0; 1836 return NULL; 1837 } 1838 } 1839 1840 int32_t i = fLanguageBreakEngines->size(); 1841 while (--i >= 0) { 1842 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1843 if (lbe->handles(c, fBreakType)) { 1844 return lbe; 1845 } 1846 } 1847 1848 // No existing dictionary took the character. See if a factory wants to 1849 // give us a new LanguageBreakEngine for this character. 1850 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1851 1852 // If we got one, use it and push it on our stack. 1853 if (lbe != NULL) { 1854 fLanguageBreakEngines->push((void *)lbe, status); 1855 // Even if we can't remember it, we can keep looking it up, so 1856 // return it even if the push fails. 1857 return lbe; 1858 } 1859 1860 // No engine is forthcoming for this character. Add it to the 1861 // reject set. Create the reject break engine if needed. 1862 if (fUnhandledBreakEngine == NULL) { 1863 fUnhandledBreakEngine = new UnhandledEngine(status); 1864 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1865 status = U_MEMORY_ALLOCATION_ERROR; 1866 } 1867 // Put it last so that scripts for which we have an engine get tried 1868 // first. 1869 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1870 // If we can't insert it, or creation failed, get rid of it 1871 if (U_FAILURE(status)) { 1872 delete fUnhandledBreakEngine; 1873 fUnhandledBreakEngine = 0; 1874 return NULL; 1875 } 1876 } 1877 1878 // Tell the reject engine about the character; at its discretion, it may 1879 // add more than just the one character. 1880 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1881 1882 return fUnhandledBreakEngine; 1883 } 1884 1885 1886 1887 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1888 return fBreakType; 1889 }*/ 1890 1891 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1892 fBreakType = type; 1893 reset(); 1894 } 1895 1896 U_NAMESPACE_END 1897 1898 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1899