1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2009 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_BREAK_ITERATION 16 17 #include "unicode/rbbi.h" 18 #include "unicode/schriter.h" 19 #include "unicode/uchriter.h" 20 #include "unicode/udata.h" 21 #include "unicode/uclean.h" 22 #include "rbbidata.h" 23 #include "rbbirb.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "umutex.h" 27 #include "ucln_cmn.h" 28 #include "brkeng.h" 29 30 #include "uassert.h" 31 #include "uvector.h" 32 33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 34 #if U_LOCAL_SERVICE_HOOK 35 #include "localsvc.h" 36 #endif 37 38 #ifdef RBBI_DEBUG 39 static UBool fTrace = FALSE; 40 #endif 41 42 U_NAMESPACE_BEGIN 43 44 // The state number of the starting state 45 #define START_STATE 1 46 47 // The state-transition value indicating "stop" 48 #define STOP_STATE 0 49 50 51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 52 53 54 //======================================================================= 55 // constructors 56 //======================================================================= 57 58 /** 59 * Constructs a RuleBasedBreakIterator that uses the already-created 60 * tables object that is passed in as a parameter. 61 */ 62 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 63 { 64 init(); 65 fData = new RBBIDataWrapper(data, status); // status checked in constructor 66 if (U_FAILURE(status)) {return;} 67 if(fData == 0) { 68 status = U_MEMORY_ALLOCATION_ERROR; 69 return; 70 } 71 } 72 73 /** 74 * Same as above but does not adopt memory 75 */ 76 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 77 { 78 init(); 79 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 80 if (U_FAILURE(status)) {return;} 81 if(fData == 0) { 82 status = U_MEMORY_ALLOCATION_ERROR; 83 return; 84 } 85 } 86 87 //------------------------------------------------------------------------------- 88 // 89 // Constructor from a UDataMemory handle to precompiled break rules 90 // stored in an ICU data file. 91 // 92 //------------------------------------------------------------------------------- 93 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 94 { 95 init(); 96 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 97 if (U_FAILURE(status)) {return;} 98 if(fData == 0) { 99 status = U_MEMORY_ALLOCATION_ERROR; 100 return; 101 } 102 } 103 104 105 106 //------------------------------------------------------------------------------- 107 // 108 // Constructor from a set of rules supplied as a string. 109 // 110 //------------------------------------------------------------------------------- 111 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 112 UParseError &parseError, 113 UErrorCode &status) 114 { 115 init(); 116 if (U_FAILURE(status)) {return;} 117 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 118 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 119 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 120 // creates and returns a complete RBBI. From here, in a constructor, we 121 // can't just return the object created by the builder factory, hence 122 // the assignment of the factory created object to "this". 123 if (U_SUCCESS(status)) { 124 *this = *bi; 125 delete bi; 126 } 127 } 128 129 130 //------------------------------------------------------------------------------- 131 // 132 // Default Constructor. Create an empty shell that can be set up later. 133 // Used when creating a RuleBasedBreakIterator from a set 134 // of rules. 135 //------------------------------------------------------------------------------- 136 RuleBasedBreakIterator::RuleBasedBreakIterator() { 137 init(); 138 } 139 140 141 //------------------------------------------------------------------------------- 142 // 143 // Copy constructor. Will produce a break iterator with the same behavior, 144 // and which iterates over the same text, as the one passed in. 145 // 146 //------------------------------------------------------------------------------- 147 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 148 : BreakIterator(other) 149 { 150 this->init(); 151 *this = other; 152 } 153 154 155 /** 156 * Destructor 157 */ 158 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 159 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 160 // fCharIter was adopted from the outside. 161 delete fCharIter; 162 } 163 fCharIter = NULL; 164 delete fSCharIter; 165 fCharIter = NULL; 166 delete fDCharIter; 167 fDCharIter = NULL; 168 169 utext_close(fText); 170 171 if (fData != NULL) { 172 fData->removeReference(); 173 fData = NULL; 174 } 175 if (fCachedBreakPositions) { 176 uprv_free(fCachedBreakPositions); 177 fCachedBreakPositions = NULL; 178 } 179 if (fLanguageBreakEngines) { 180 delete fLanguageBreakEngines; 181 fLanguageBreakEngines = NULL; 182 } 183 if (fUnhandledBreakEngine) { 184 delete fUnhandledBreakEngine; 185 fUnhandledBreakEngine = NULL; 186 } 187 } 188 189 /** 190 * Assignment operator. Sets this iterator to have the same behavior, 191 * and iterate over the same text, as the one passed in. 192 */ 193 RuleBasedBreakIterator& 194 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 195 if (this == &that) { 196 return *this; 197 } 198 reset(); // Delete break cache information 199 fBreakType = that.fBreakType; 200 if (fLanguageBreakEngines != NULL) { 201 delete fLanguageBreakEngines; 202 fLanguageBreakEngines = NULL; // Just rebuild for now 203 } 204 // TODO: clone fLanguageBreakEngines from "that" 205 UErrorCode status = U_ZERO_ERROR; 206 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 207 208 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 209 delete fCharIter; 210 } 211 fCharIter = NULL; 212 213 if (that.fCharIter != NULL ) { 214 // This is a little bit tricky - it will intially appear that 215 // this->fCharIter is adopted, even if that->fCharIter was 216 // not adopted. That's ok. 217 fCharIter = that.fCharIter->clone(); 218 } 219 220 if (fData != NULL) { 221 fData->removeReference(); 222 fData = NULL; 223 } 224 if (that.fData != NULL) { 225 fData = that.fData->addReference(); 226 } 227 228 return *this; 229 } 230 231 232 233 //----------------------------------------------------------------------------- 234 // 235 // init() Shared initialization routine. Used by all the constructors. 236 // Initializes all fields, leaving the object in a consistent state. 237 // 238 //----------------------------------------------------------------------------- 239 void RuleBasedBreakIterator::init() { 240 UErrorCode status = U_ZERO_ERROR; 241 fBufferClone = FALSE; 242 fText = utext_openUChars(NULL, NULL, 0, &status); 243 fCharIter = NULL; 244 fSCharIter = NULL; 245 fDCharIter = NULL; 246 fData = NULL; 247 fLastRuleStatusIndex = 0; 248 fLastStatusIndexValid = TRUE; 249 fDictionaryCharCount = 0; 250 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 251 // dictionary behavior for Break Iterators that are 252 // built from rules. Even better would be the ability to 253 // declare the type in the rules. 254 255 fCachedBreakPositions = NULL; 256 fLanguageBreakEngines = NULL; 257 fUnhandledBreakEngine = NULL; 258 fNumCachedBreakPositions = 0; 259 fPositionInCache = 0; 260 261 #ifdef RBBI_DEBUG 262 static UBool debugInitDone = FALSE; 263 if (debugInitDone == FALSE) { 264 char *debugEnv = getenv("U_RBBIDEBUG"); 265 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 266 fTrace = TRUE; 267 } 268 debugInitDone = TRUE; 269 } 270 #endif 271 } 272 273 274 275 //----------------------------------------------------------------------------- 276 // 277 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 278 // behavior, and iterating over the same text, as this one. 279 // Virtual function: does the right thing with subclasses. 280 // 281 //----------------------------------------------------------------------------- 282 BreakIterator* 283 RuleBasedBreakIterator::clone(void) const { 284 return new RuleBasedBreakIterator(*this); 285 } 286 287 /** 288 * Equality operator. Returns TRUE if both BreakIterators are of the 289 * same class, have the same behavior, and iterate over the same text. 290 */ 291 UBool 292 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 293 if (that.getDynamicClassID() != getDynamicClassID()) { 294 return FALSE; 295 } 296 297 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 298 299 if (!utext_equals(fText, that2.fText)) { 300 // The two break iterators are operating on different text, 301 // or have a different interation position. 302 return FALSE; 303 }; 304 305 // TODO: need a check for when in a dictionary region at different offsets. 306 307 if (that2.fData == fData || 308 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 309 // The two break iterators are using the same rules. 310 return TRUE; 311 } 312 return FALSE; 313 } 314 315 /** 316 * Compute a hash code for this BreakIterator 317 * @return A hash code 318 */ 319 int32_t 320 RuleBasedBreakIterator::hashCode(void) const { 321 int32_t hash = 0; 322 if (fData != NULL) { 323 hash = fData->hashCode(); 324 } 325 return hash; 326 } 327 328 329 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 330 if (U_FAILURE(status)) { 331 return; 332 } 333 reset(); 334 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 335 336 // Set up a dummy CharacterIterator to be returned if anyone 337 // calls getText(). With input from UText, there is no reasonable 338 // way to return a characterIterator over the actual input text. 339 // Return one over an empty string instead - this is the closest 340 // we can come to signaling a failure. 341 // (GetText() is obsolete, this failure is sort of OK) 342 if (fDCharIter == NULL) { 343 static const UChar c = 0; 344 fDCharIter = new UCharCharacterIterator(&c, 0); 345 if (fDCharIter == NULL) { 346 status = U_MEMORY_ALLOCATION_ERROR; 347 return; 348 } 349 } 350 351 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 352 // existing fCharIter was adopted from the outside. Delete it now. 353 delete fCharIter; 354 } 355 fCharIter = fDCharIter; 356 357 this->first(); 358 } 359 360 361 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 362 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 363 return result; 364 } 365 366 367 368 /** 369 * Returns the description used to create this iterator 370 */ 371 const UnicodeString& 372 RuleBasedBreakIterator::getRules() const { 373 if (fData != NULL) { 374 return fData->getRuleSourceString(); 375 } else { 376 static const UnicodeString *s; 377 if (s == NULL) { 378 // TODO: something more elegant here. 379 // perhaps API should return the string by value. 380 // Note: thread unsafe init & leak are semi-ok, better than 381 // what was before. Sould be cleaned up, though. 382 s = new UnicodeString; 383 } 384 return *s; 385 } 386 } 387 388 //======================================================================= 389 // BreakIterator overrides 390 //======================================================================= 391 392 /** 393 * Return a CharacterIterator over the text being analyzed. 394 */ 395 CharacterIterator& 396 RuleBasedBreakIterator::getText() const { 397 return *fCharIter; 398 } 399 400 /** 401 * Set the iterator to analyze a new piece of text. This function resets 402 * the current iteration position to the beginning of the text. 403 * @param newText An iterator over the text to analyze. 404 */ 405 void 406 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 407 // If we are holding a CharacterIterator adopted from a 408 // previous call to this function, delete it now. 409 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 410 delete fCharIter; 411 } 412 413 fCharIter = newText; 414 UErrorCode status = U_ZERO_ERROR; 415 reset(); 416 if (newText==NULL || newText->startIndex() != 0) { 417 // startIndex !=0 wants to be an error, but there's no way to report it. 418 // Make the iterator text be an empty string. 419 fText = utext_openUChars(fText, NULL, 0, &status); 420 } else { 421 fText = utext_openCharacterIterator(fText, newText, &status); 422 } 423 this->first(); 424 } 425 426 /** 427 * Set the iterator to analyze a new piece of text. This function resets 428 * the current iteration position to the beginning of the text. 429 * @param newText An iterator over the text to analyze. 430 */ 431 void 432 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 433 UErrorCode status = U_ZERO_ERROR; 434 reset(); 435 fText = utext_openConstUnicodeString(fText, &newText, &status); 436 437 // Set up a character iterator on the string. 438 // Needed in case someone calls getText(). 439 // Can not, unfortunately, do this lazily on the (probably never) 440 // call to getText(), because getText is const. 441 if (fSCharIter == NULL) { 442 fSCharIter = new StringCharacterIterator(newText); 443 } else { 444 fSCharIter->setText(newText); 445 } 446 447 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 448 // old fCharIter was adopted from the outside. Delete it. 449 delete fCharIter; 450 } 451 fCharIter = fSCharIter; 452 453 this->first(); 454 } 455 456 457 458 /** 459 * Sets the current iteration position to the beginning of the text. 460 * @return The offset of the beginning of the text. 461 */ 462 int32_t RuleBasedBreakIterator::first(void) { 463 reset(); 464 fLastRuleStatusIndex = 0; 465 fLastStatusIndexValid = TRUE; 466 //if (fText == NULL) 467 // return BreakIterator::DONE; 468 469 utext_setNativeIndex(fText, 0); 470 return 0; 471 } 472 473 /** 474 * Sets the current iteration position to the end of the text. 475 * @return The text's past-the-end offset. 476 */ 477 int32_t RuleBasedBreakIterator::last(void) { 478 reset(); 479 if (fText == NULL) { 480 fLastRuleStatusIndex = 0; 481 fLastStatusIndexValid = TRUE; 482 return BreakIterator::DONE; 483 } 484 485 fLastStatusIndexValid = FALSE; 486 int32_t pos = (int32_t)utext_nativeLength(fText); 487 utext_setNativeIndex(fText, pos); 488 return pos; 489 } 490 491 /** 492 * Advances the iterator either forward or backward the specified number of steps. 493 * Negative values move backward, and positive values move forward. This is 494 * equivalent to repeatedly calling next() or previous(). 495 * @param n The number of steps to move. The sign indicates the direction 496 * (negative is backwards, and positive is forwards). 497 * @return The character offset of the boundary position n boundaries away from 498 * the current one. 499 */ 500 int32_t RuleBasedBreakIterator::next(int32_t n) { 501 int32_t result = current(); 502 while (n > 0) { 503 result = next(); 504 --n; 505 } 506 while (n < 0) { 507 result = previous(); 508 ++n; 509 } 510 return result; 511 } 512 513 /** 514 * Advances the iterator to the next boundary position. 515 * @return The position of the first boundary after this one. 516 */ 517 int32_t RuleBasedBreakIterator::next(void) { 518 // if we have cached break positions and we're still in the range 519 // covered by them, just move one step forward in the cache 520 if (fCachedBreakPositions != NULL) { 521 if (fPositionInCache < fNumCachedBreakPositions - 1) { 522 ++fPositionInCache; 523 int32_t pos = fCachedBreakPositions[fPositionInCache]; 524 utext_setNativeIndex(fText, pos); 525 return pos; 526 } 527 else { 528 reset(); 529 } 530 } 531 532 int32_t startPos = current(); 533 int32_t result = handleNext(fData->fForwardTable); 534 if (fDictionaryCharCount > 0) { 535 result = checkDictionary(startPos, result, FALSE); 536 } 537 return result; 538 } 539 540 /** 541 * Advances the iterator backwards, to the last boundary preceding this one. 542 * @return The position of the last boundary position preceding this one. 543 */ 544 int32_t RuleBasedBreakIterator::previous(void) { 545 int32_t result; 546 int32_t startPos; 547 548 // if we have cached break positions and we're still in the range 549 // covered by them, just move one step backward in the cache 550 if (fCachedBreakPositions != NULL) { 551 if (fPositionInCache > 0) { 552 --fPositionInCache; 553 // If we're at the beginning of the cache, need to reevaluate the 554 // rule status 555 if (fPositionInCache <= 0) { 556 fLastStatusIndexValid = FALSE; 557 } 558 int32_t pos = fCachedBreakPositions[fPositionInCache]; 559 utext_setNativeIndex(fText, pos); 560 return pos; 561 } 562 else { 563 reset(); 564 } 565 } 566 567 // if we're already sitting at the beginning of the text, return DONE 568 if (fText == NULL || (startPos = current()) == 0) { 569 fLastRuleStatusIndex = 0; 570 fLastStatusIndexValid = TRUE; 571 return BreakIterator::DONE; 572 } 573 574 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 575 result = handlePrevious(fData->fReverseTable); 576 if (fDictionaryCharCount > 0) { 577 result = checkDictionary(result, startPos, TRUE); 578 } 579 return result; 580 } 581 582 // old rule syntax 583 // set things up. handlePrevious() will back us up to some valid 584 // break position before the current position (we back our internal 585 // iterator up one step to prevent handlePrevious() from returning 586 // the current position), but not necessarily the last one before 587 588 // where we started 589 590 int32_t start = current(); 591 592 UTEXT_PREVIOUS32(fText); 593 int32_t lastResult = handlePrevious(fData->fReverseTable); 594 if (lastResult == UBRK_DONE) { 595 lastResult = 0; 596 utext_setNativeIndex(fText, 0); 597 } 598 result = lastResult; 599 int32_t lastTag = 0; 600 UBool breakTagValid = FALSE; 601 602 // iterate forward from the known break position until we pass our 603 // starting point. The last break position before the starting 604 // point is our return value 605 606 for (;;) { 607 result = next(); 608 if (result == BreakIterator::DONE || result >= start) { 609 break; 610 } 611 lastResult = result; 612 lastTag = fLastRuleStatusIndex; 613 breakTagValid = TRUE; 614 } 615 616 // fLastBreakTag wants to have the value for section of text preceding 617 // the result position that we are to return (in lastResult.) If 618 // the backwards rules overshot and the above loop had to do two or more 619 // next()s to move up to the desired return position, we will have a valid 620 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 621 // we wont have a tag value for that position, which is only set by handleNext(). 622 623 // set the current iteration position to be the last break position 624 // before where we started, and then return that value 625 utext_setNativeIndex(fText, lastResult); 626 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 627 fLastStatusIndexValid = breakTagValid; 628 629 // No need to check the dictionary; it will have been handled by 630 // next() 631 632 return lastResult; 633 } 634 635 /** 636 * Sets the iterator to refer to the first boundary position following 637 * the specified position. 638 * @offset The position from which to begin searching for a break position. 639 * @return The position of the first break after the current position. 640 */ 641 int32_t RuleBasedBreakIterator::following(int32_t offset) { 642 // if we have cached break positions and offset is in the range 643 // covered by them, use them 644 // TODO: could use binary search 645 // TODO: what if offset is outside range, but break is not? 646 if (fCachedBreakPositions != NULL) { 647 if (offset >= fCachedBreakPositions[0] 648 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 649 fPositionInCache = 0; 650 // We are guaranteed not to leave the array due to range test above 651 while (offset >= fCachedBreakPositions[fPositionInCache]) { 652 ++fPositionInCache; 653 } 654 int32_t pos = fCachedBreakPositions[fPositionInCache]; 655 utext_setNativeIndex(fText, pos); 656 return pos; 657 } 658 else { 659 reset(); 660 } 661 } 662 663 // if the offset passed in is already past the end of the text, 664 // just return DONE; if it's before the beginning, return the 665 // text's starting offset 666 fLastRuleStatusIndex = 0; 667 fLastStatusIndexValid = TRUE; 668 if (fText == NULL || offset >= utext_nativeLength(fText)) { 669 last(); 670 return next(); 671 } 672 else if (offset < 0) { 673 return first(); 674 } 675 676 // otherwise, set our internal iteration position (temporarily) 677 // to the position passed in. If this is the _beginning_ position, 678 // then we can just use next() to get our return value 679 680 int32_t result = 0; 681 682 if (fData->fSafeRevTable != NULL) { 683 // new rule syntax 684 utext_setNativeIndex(fText, offset); 685 // move forward one codepoint to prepare for moving back to a 686 // safe point. 687 // this handles offset being between a supplementary character 688 UTEXT_NEXT32(fText); 689 // handlePrevious will move most of the time to < 1 boundary away 690 handlePrevious(fData->fSafeRevTable); 691 int32_t result = next(); 692 while (result <= offset) { 693 result = next(); 694 } 695 return result; 696 } 697 if (fData->fSafeFwdTable != NULL) { 698 // backup plan if forward safe table is not available 699 utext_setNativeIndex(fText, offset); 700 UTEXT_PREVIOUS32(fText); 701 // handle next will give result >= offset 702 handleNext(fData->fSafeFwdTable); 703 // previous will give result 0 or 1 boundary away from offset, 704 // most of the time 705 // we have to 706 int32_t oldresult = previous(); 707 while (oldresult > offset) { 708 int32_t result = previous(); 709 if (result <= offset) { 710 return oldresult; 711 } 712 oldresult = result; 713 } 714 int32_t result = next(); 715 if (result <= offset) { 716 return next(); 717 } 718 return result; 719 } 720 // otherwise, we have to sync up first. Use handlePrevious() to back 721 // up to a known break position before the specified position (if 722 // we can determine that the specified position is a break position, 723 // we don't back up at all). This may or may not be the last break 724 // position at or before our starting position. Advance forward 725 // from here until we've passed the starting position. The position 726 // we stop on will be the first break position after the specified one. 727 // old rule syntax 728 729 utext_setNativeIndex(fText, offset); 730 if (offset==0 || 731 offset==1 && utext_getNativeIndex(fText)==0) { 732 return next(); 733 } 734 result = previous(); 735 736 while (result != BreakIterator::DONE && result <= offset) { 737 result = next(); 738 } 739 740 return result; 741 } 742 743 /** 744 * Sets the iterator to refer to the last boundary position before the 745 * specified position. 746 * @offset The position to begin searching for a break from. 747 * @return The position of the last boundary before the starting position. 748 */ 749 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 750 // if we have cached break positions and offset is in the range 751 // covered by them, use them 752 if (fCachedBreakPositions != NULL) { 753 // TODO: binary search? 754 // TODO: What if offset is outside range, but break is not? 755 if (offset > fCachedBreakPositions[0] 756 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 757 fPositionInCache = 0; 758 while (fPositionInCache < fNumCachedBreakPositions 759 && offset > fCachedBreakPositions[fPositionInCache]) 760 ++fPositionInCache; 761 --fPositionInCache; 762 // If we're at the beginning of the cache, need to reevaluate the 763 // rule status 764 if (fPositionInCache <= 0) { 765 fLastStatusIndexValid = FALSE; 766 } 767 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 768 return fCachedBreakPositions[fPositionInCache]; 769 } 770 else { 771 reset(); 772 } 773 } 774 775 // if the offset passed in is already past the end of the text, 776 // just return DONE; if it's before the beginning, return the 777 // text's starting offset 778 if (fText == NULL || offset > utext_nativeLength(fText)) { 779 // return BreakIterator::DONE; 780 return last(); 781 } 782 else if (offset < 0) { 783 return first(); 784 } 785 786 // if we start by updating the current iteration position to the 787 // position specified by the caller, we can just use previous() 788 // to carry out this operation 789 790 if (fData->fSafeFwdTable != NULL) { 791 // new rule syntax 792 utext_setNativeIndex(fText, offset); 793 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 794 if (newOffset != offset) { 795 // Will come here if specified offset was not a code point boundary AND 796 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 797 // indices to the containing code point. 798 // For breakitereator::preceding only, these non-code-point indices need to be moved 799 // up to refer to the following codepoint. 800 UTEXT_NEXT32(fText); 801 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 802 } 803 804 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 805 // rather than adjusting the position unconditionally? 806 // (Change would interact with safe rules.) 807 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 808 // affects only preceding(), seems cleaner, but is slightly different. 809 UTEXT_PREVIOUS32(fText); 810 handleNext(fData->fSafeFwdTable); 811 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 812 while (result >= offset) { 813 result = previous(); 814 } 815 return result; 816 } 817 if (fData->fSafeRevTable != NULL) { 818 // backup plan if forward safe table is not available 819 // TODO: check whether this path can be discarded 820 // It's probably OK to say that rules must supply both safe tables 821 // if they use safe tables at all. We have certainly never described 822 // to anyone how to work with just one safe table. 823 utext_setNativeIndex(fText, offset); 824 UTEXT_NEXT32(fText); 825 826 // handle previous will give result <= offset 827 handlePrevious(fData->fSafeRevTable); 828 829 // next will give result 0 or 1 boundary away from offset, 830 // most of the time 831 // we have to 832 int32_t oldresult = next(); 833 while (oldresult < offset) { 834 int32_t result = next(); 835 if (result >= offset) { 836 return oldresult; 837 } 838 oldresult = result; 839 } 840 int32_t result = previous(); 841 if (result >= offset) { 842 return previous(); 843 } 844 return result; 845 } 846 847 // old rule syntax 848 utext_setNativeIndex(fText, offset); 849 return previous(); 850 } 851 852 /** 853 * Returns true if the specfied position is a boundary position. As a side 854 * effect, leaves the iterator pointing to the first boundary position at 855 * or after "offset". 856 * @param offset the offset to check. 857 * @return True if "offset" is a boundary position. 858 */ 859 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 860 // the beginning index of the iterator is always a boundary position by definition 861 if (offset == 0) { 862 first(); // For side effects on current position, tag values. 863 return TRUE; 864 } 865 866 if (offset == (int32_t)utext_nativeLength(fText)) { 867 last(); // For side effects on current position, tag values. 868 return TRUE; 869 } 870 871 // out-of-range indexes are never boundary positions 872 if (offset < 0) { 873 first(); // For side effects on current position, tag values. 874 return FALSE; 875 } 876 877 if (offset > utext_nativeLength(fText)) { 878 last(); // For side effects on current position, tag values. 879 return FALSE; 880 } 881 882 // otherwise, we can use following() on the position before the specified 883 // one and return true if the position we get back is the one the user 884 // specified 885 utext_previous32From(fText, offset); 886 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 887 UBool result = following(backOne) == offset; 888 return result; 889 } 890 891 /** 892 * Returns the current iteration position. 893 * @return The current iteration position. 894 */ 895 int32_t RuleBasedBreakIterator::current(void) const { 896 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 897 return pos; 898 } 899 900 //======================================================================= 901 // implementation 902 //======================================================================= 903 904 // 905 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 906 // of user text. A variable with this enum type keeps track of where we 907 // are. The state machine only fetches user input while in the RUN mode. 908 // 909 enum RBBIRunMode { 910 RBBI_START, // state machine processing is before first char of input 911 RBBI_RUN, // state machine processing is in the user text 912 RBBI_END // state machine processing is after end of user text. 913 }; 914 915 916 //----------------------------------------------------------------------------------- 917 // 918 // handleNext(stateTable) 919 // This method is the actual implementation of the rbbi next() method. 920 // This method initializes the state machine to state 1 921 // and advances through the text character by character until we reach the end 922 // of the text or the state machine transitions to state 0. We update our return 923 // value every time the state machine passes through an accepting state. 924 // 925 //----------------------------------------------------------------------------------- 926 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 927 int32_t state; 928 int16_t category = 0; 929 RBBIRunMode mode; 930 931 RBBIStateTableRow *row; 932 UChar32 c; 933 int32_t lookaheadStatus = 0; 934 int32_t lookaheadTagIdx = 0; 935 int32_t result = 0; 936 int32_t initialPosition = 0; 937 int32_t lookaheadResult = 0; 938 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 939 const char *tableData = statetable->fTableData; 940 uint32_t tableRowLen = statetable->fRowLen; 941 942 #ifdef RBBI_DEBUG 943 if (fTrace) { 944 RBBIDebugPuts("Handle Next pos char state category"); 945 } 946 #endif 947 948 // No matter what, handleNext alway correctly sets the break tag value. 949 fLastStatusIndexValid = TRUE; 950 fLastRuleStatusIndex = 0; 951 952 // if we're already at the end of the text, return DONE. 953 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 954 result = initialPosition; 955 c = UTEXT_NEXT32(fText); 956 if (fData == NULL || c==U_SENTINEL) { 957 return BreakIterator::DONE; 958 } 959 960 // Set the initial state for the state machine 961 state = START_STATE; 962 row = (RBBIStateTableRow *) 963 //(statetable->fTableData + (statetable->fRowLen * state)); 964 (tableData + tableRowLen * state); 965 966 967 mode = RBBI_RUN; 968 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 969 category = 2; 970 mode = RBBI_START; 971 } 972 973 974 // loop until we reach the end of the text or transition to state 0 975 // 976 for (;;) { 977 if (c == U_SENTINEL) { 978 // Reached end of input string. 979 if (mode == RBBI_END) { 980 // We have already run the loop one last time with the 981 // character set to the psueudo {eof} value. Now it is time 982 // to unconditionally bail out. 983 if (lookaheadResult > result) { 984 // We ran off the end of the string with a pending look-ahead match. 985 // Treat this as if the look-ahead condition had been met, and return 986 // the match at the / position from the look-ahead rule. 987 result = lookaheadResult; 988 fLastRuleStatusIndex = lookaheadTagIdx; 989 lookaheadStatus = 0; 990 } 991 break; 992 } 993 // Run the loop one last time with the fake end-of-input character category. 994 mode = RBBI_END; 995 category = 1; 996 } 997 998 // 999 // Get the char category. An incoming category of 1 or 2 means that 1000 // we are preset for doing the beginning or end of input, and 1001 // that we shouldn't get a category from an actual text input character. 1002 // 1003 if (mode == RBBI_RUN) { 1004 // look up the current character's character category, which tells us 1005 // which column in the state table to look at. 1006 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1007 // not the size of the character going in, which is a UChar32. 1008 // 1009 UTRIE_GET16(&fData->fTrie, c, category); 1010 1011 // Check the dictionary bit in the character's category. 1012 // Counter is only used by dictionary based iterators (subclasses). 1013 // Chars that need to be handled by a dictionary have a flag bit set 1014 // in their category values. 1015 // 1016 if ((category & 0x4000) != 0) { 1017 fDictionaryCharCount++; 1018 // And off the dictionary flag bit. 1019 category &= ~0x4000; 1020 } 1021 } 1022 1023 #ifdef RBBI_DEBUG 1024 if (fTrace) { 1025 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1026 if (0x20<=c && c<0x7f) { 1027 RBBIDebugPrintf("\"%c\" ", c); 1028 } else { 1029 RBBIDebugPrintf("%5x ", c); 1030 } 1031 RBBIDebugPrintf("%3d %3d\n", state, category); 1032 } 1033 #endif 1034 1035 // State Transition - move machine to its next state 1036 // 1037 state = row->fNextState[category]; 1038 row = (RBBIStateTableRow *) 1039 // (statetable->fTableData + (statetable->fRowLen * state)); 1040 (tableData + tableRowLen * state); 1041 1042 1043 if (row->fAccepting == -1) { 1044 // Match found, common case. 1045 if (mode != RBBI_START) { 1046 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1047 } 1048 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1049 } 1050 1051 if (row->fLookAhead != 0) { 1052 if (lookaheadStatus != 0 1053 && row->fAccepting == lookaheadStatus) { 1054 // Lookahead match is completed. 1055 result = lookaheadResult; 1056 fLastRuleStatusIndex = lookaheadTagIdx; 1057 lookaheadStatus = 0; 1058 // TODO: make a standalone hard break in a rule work. 1059 if (lookAheadHardBreak) { 1060 UTEXT_SETNATIVEINDEX(fText, result); 1061 return result; 1062 } 1063 // Look-ahead completed, but other rules may match further. Continue on 1064 // TODO: junk this feature? I don't think it's used anywhwere. 1065 goto continueOn; 1066 } 1067 1068 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1069 lookaheadResult = r; 1070 lookaheadStatus = row->fLookAhead; 1071 lookaheadTagIdx = row->fTagIdx; 1072 goto continueOn; 1073 } 1074 1075 1076 if (row->fAccepting != 0) { 1077 // Because this is an accepting state, any in-progress look-ahead match 1078 // is no longer relavant. Clear out the pending lookahead status. 1079 lookaheadStatus = 0; // clear out any pending look-ahead match. 1080 } 1081 1082 continueOn: 1083 if (state == STOP_STATE) { 1084 // This is the normal exit from the lookup state machine. 1085 // We have advanced through the string until it is certain that no 1086 // longer match is possible, no matter what characters follow. 1087 break; 1088 } 1089 1090 // Advance to the next character. 1091 // If this is a beginning-of-input loop iteration, don't advance 1092 // the input position. The next iteration will be processing the 1093 // first real input character. 1094 if (mode == RBBI_RUN) { 1095 c = UTEXT_NEXT32(fText); 1096 } else { 1097 if (mode == RBBI_START) { 1098 mode = RBBI_RUN; 1099 } 1100 } 1101 1102 1103 } 1104 1105 // The state machine is done. Check whether it found a match... 1106 1107 // If the iterator failed to advance in the match engine, force it ahead by one. 1108 // (This really indicates a defect in the break rules. They should always match 1109 // at least one character.) 1110 if (result == initialPosition) { 1111 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1112 UTEXT_NEXT32(fText); 1113 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1114 } 1115 1116 // Leave the iterator at our result position. 1117 UTEXT_SETNATIVEINDEX(fText, result); 1118 #ifdef RBBI_DEBUG 1119 if (fTrace) { 1120 RBBIDebugPrintf("result = %d\n\n", result); 1121 } 1122 #endif 1123 return result; 1124 } 1125 1126 1127 1128 //----------------------------------------------------------------------------------- 1129 // 1130 // handlePrevious() 1131 // 1132 // Iterate backwards, according to the logic of the reverse rules. 1133 // This version handles the exact style backwards rules. 1134 // 1135 // The logic of this function is very similar to handleNext(), above. 1136 // 1137 //----------------------------------------------------------------------------------- 1138 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1139 int32_t state; 1140 int16_t category = 0; 1141 RBBIRunMode mode; 1142 RBBIStateTableRow *row; 1143 UChar32 c; 1144 int32_t lookaheadStatus = 0; 1145 int32_t result = 0; 1146 int32_t initialPosition = 0; 1147 int32_t lookaheadResult = 0; 1148 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1149 1150 #ifdef RBBI_DEBUG 1151 if (fTrace) { 1152 RBBIDebugPuts("Handle Previous pos char state category"); 1153 } 1154 #endif 1155 1156 // handlePrevious() never gets the rule status. 1157 // Flag the status as invalid; if the user ever asks for status, we will need 1158 // to back up, then re-find the break position using handleNext(), which does 1159 // get the status value. 1160 fLastStatusIndexValid = FALSE; 1161 fLastRuleStatusIndex = 0; 1162 1163 // if we're already at the start of the text, return DONE. 1164 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1165 return BreakIterator::DONE; 1166 } 1167 1168 // Set up the starting char. 1169 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1170 result = initialPosition; 1171 c = UTEXT_PREVIOUS32(fText); 1172 1173 // Set the initial state for the state machine 1174 state = START_STATE; 1175 row = (RBBIStateTableRow *) 1176 (statetable->fTableData + (statetable->fRowLen * state)); 1177 category = 3; 1178 mode = RBBI_RUN; 1179 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1180 category = 2; 1181 mode = RBBI_START; 1182 } 1183 1184 1185 // loop until we reach the start of the text or transition to state 0 1186 // 1187 for (;;) { 1188 if (c == U_SENTINEL) { 1189 // Reached end of input string. 1190 if (mode == RBBI_END || 1191 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { 1192 // We have already run the loop one last time with the 1193 // character set to the psueudo {eof} value. Now it is time 1194 // to unconditionally bail out. 1195 // (Or we have an old format binary rule file that does not support {eof}.) 1196 if (lookaheadResult < result) { 1197 // We ran off the end of the string with a pending look-ahead match. 1198 // Treat this as if the look-ahead condition had been met, and return 1199 // the match at the / position from the look-ahead rule. 1200 result = lookaheadResult; 1201 lookaheadStatus = 0; 1202 } else if (result == initialPosition) { 1203 // Ran off start, no match found. 1204 // move one index one (towards the start, since we are doing a previous()) 1205 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1206 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1207 } 1208 break; 1209 } 1210 // Run the loop one last time with the fake end-of-input character category. 1211 mode = RBBI_END; 1212 category = 1; 1213 } 1214 1215 // 1216 // Get the char category. An incoming category of 1 or 2 means that 1217 // we are preset for doing the beginning or end of input, and 1218 // that we shouldn't get a category from an actual text input character. 1219 // 1220 if (mode == RBBI_RUN) { 1221 // look up the current character's character category, which tells us 1222 // which column in the state table to look at. 1223 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1224 // not the size of the character going in, which is a UChar32. 1225 // 1226 UTRIE_GET16(&fData->fTrie, c, category); 1227 1228 // Check the dictionary bit in the character's category. 1229 // Counter is only used by dictionary based iterators (subclasses). 1230 // Chars that need to be handled by a dictionary have a flag bit set 1231 // in their category values. 1232 // 1233 if ((category & 0x4000) != 0) { 1234 fDictionaryCharCount++; 1235 // And off the dictionary flag bit. 1236 category &= ~0x4000; 1237 } 1238 } 1239 1240 #ifdef RBBI_DEBUG 1241 if (fTrace) { 1242 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1243 if (0x20<=c && c<0x7f) { 1244 RBBIDebugPrintf("\"%c\" ", c); 1245 } else { 1246 RBBIDebugPrintf("%5x ", c); 1247 } 1248 RBBIDebugPrintf("%3d %3d\n", state, category); 1249 } 1250 #endif 1251 1252 // State Transition - move machine to its next state 1253 // 1254 state = row->fNextState[category]; 1255 row = (RBBIStateTableRow *) 1256 (statetable->fTableData + (statetable->fRowLen * state)); 1257 1258 if (row->fAccepting == -1) { 1259 // Match found, common case. 1260 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1261 } 1262 1263 if (row->fLookAhead != 0) { 1264 if (lookaheadStatus != 0 1265 && row->fAccepting == lookaheadStatus) { 1266 // Lookahead match is completed. 1267 result = lookaheadResult; 1268 lookaheadStatus = 0; 1269 // TODO: make a standalone hard break in a rule work. 1270 if (lookAheadHardBreak) { 1271 UTEXT_SETNATIVEINDEX(fText, result); 1272 return result; 1273 } 1274 // Look-ahead completed, but other rules may match further. Continue on 1275 // TODO: junk this feature? I don't think it's used anywhwere. 1276 goto continueOn; 1277 } 1278 1279 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1280 lookaheadResult = r; 1281 lookaheadStatus = row->fLookAhead; 1282 goto continueOn; 1283 } 1284 1285 1286 if (row->fAccepting != 0) { 1287 // Because this is an accepting state, any in-progress look-ahead match 1288 // is no longer relavant. Clear out the pending lookahead status. 1289 lookaheadStatus = 0; 1290 } 1291 1292 continueOn: 1293 if (state == STOP_STATE) { 1294 // This is the normal exit from the lookup state machine. 1295 // We have advanced through the string until it is certain that no 1296 // longer match is possible, no matter what characters follow. 1297 break; 1298 } 1299 1300 // Move (backwards) to the next character to process. 1301 // If this is a beginning-of-input loop iteration, don't advance 1302 // the input position. The next iteration will be processing the 1303 // first real input character. 1304 if (mode == RBBI_RUN) { 1305 c = UTEXT_PREVIOUS32(fText); 1306 } else { 1307 if (mode == RBBI_START) { 1308 mode = RBBI_RUN; 1309 } 1310 } 1311 } 1312 1313 // The state machine is done. Check whether it found a match... 1314 1315 // If the iterator failed to advance in the match engine, force it ahead by one. 1316 // (This really indicates a defect in the break rules. They should always match 1317 // at least one character.) 1318 if (result == initialPosition) { 1319 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1320 UTEXT_PREVIOUS32(fText); 1321 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1322 } 1323 1324 // Leave the iterator at our result position. 1325 UTEXT_SETNATIVEINDEX(fText, result); 1326 #ifdef RBBI_DEBUG 1327 if (fTrace) { 1328 RBBIDebugPrintf("result = %d\n\n", result); 1329 } 1330 #endif 1331 return result; 1332 } 1333 1334 1335 void 1336 RuleBasedBreakIterator::reset() 1337 { 1338 if (fCachedBreakPositions) { 1339 uprv_free(fCachedBreakPositions); 1340 } 1341 fCachedBreakPositions = NULL; 1342 fNumCachedBreakPositions = 0; 1343 fDictionaryCharCount = 0; 1344 fPositionInCache = 0; 1345 } 1346 1347 1348 1349 //------------------------------------------------------------------------------- 1350 // 1351 // getRuleStatus() Return the break rule tag associated with the current 1352 // iterator position. If the iterator arrived at its current 1353 // position by iterating forwards, the value will have been 1354 // cached by the handleNext() function. 1355 // 1356 // If no cached status value is available, the status is 1357 // found by doing a previous() followed by a next(), which 1358 // leaves the iterator where it started, and computes the 1359 // status while doing the next(). 1360 // 1361 //------------------------------------------------------------------------------- 1362 void RuleBasedBreakIterator::makeRuleStatusValid() { 1363 if (fLastStatusIndexValid == FALSE) { 1364 // No cached status is available. 1365 if (fText == NULL || current() == 0) { 1366 // At start of text, or there is no text. Status is always zero. 1367 fLastRuleStatusIndex = 0; 1368 fLastStatusIndexValid = TRUE; 1369 } else { 1370 // Not at start of text. Find status the tedious way. 1371 int32_t pa = current(); 1372 previous(); 1373 if (fNumCachedBreakPositions > 0) { 1374 reset(); // Blow off the dictionary cache 1375 } 1376 int32_t pb = next(); 1377 if (pa != pb) { 1378 // note: the if (pa != pb) test is here only to eliminate warnings for 1379 // unused local variables on gcc. Logically, it isn't needed. 1380 U_ASSERT(pa == pb); 1381 } 1382 } 1383 } 1384 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1385 } 1386 1387 1388 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1389 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1390 nonConstThis->makeRuleStatusValid(); 1391 1392 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1393 // (the number of status values.) 1394 // This function returns the last (largest) of the array of status values. 1395 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1396 int32_t tagVal = fData->fRuleStatusTable[idx]; 1397 1398 return tagVal; 1399 } 1400 1401 1402 1403 1404 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1405 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1406 { 1407 if (U_FAILURE(status)) { 1408 return 0; 1409 } 1410 1411 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1412 nonConstThis->makeRuleStatusValid(); 1413 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1414 int32_t numValsToCopy = numVals; 1415 if (numVals > capacity) { 1416 status = U_BUFFER_OVERFLOW_ERROR; 1417 numValsToCopy = capacity; 1418 } 1419 int i; 1420 for (i=0; i<numValsToCopy; i++) { 1421 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1422 } 1423 return numVals; 1424 } 1425 1426 1427 1428 //------------------------------------------------------------------------------- 1429 // 1430 // getBinaryRules Access to the compiled form of the rules, 1431 // for use by build system tools that save the data 1432 // for standard iterator types. 1433 // 1434 //------------------------------------------------------------------------------- 1435 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1436 const uint8_t *retPtr = NULL; 1437 length = 0; 1438 1439 if (fData != NULL) { 1440 retPtr = (const uint8_t *)fData->fHeader; 1441 length = fData->fHeader->fLength; 1442 } 1443 return retPtr; 1444 } 1445 1446 1447 1448 1449 //------------------------------------------------------------------------------- 1450 // 1451 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated. 1452 // Saving one heap allocation isn't worth the trouble. 1453 // Cloning shouldn't be done in tight loops, and 1454 // making the clone copy involves other heap operations anyway. 1455 // And the application code for correctly dealing with buffer 1456 // size problems and the eventual object destruction is ugly. 1457 // 1458 //------------------------------------------------------------------------------- 1459 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, 1460 int32_t &bufferSize, 1461 UErrorCode &status) 1462 { 1463 if (U_FAILURE(status)){ 1464 return NULL; 1465 } 1466 1467 // 1468 // If user buffer size is zero this is a preflight operation to 1469 // obtain the needed buffer size, allowing for worst case misalignment. 1470 // 1471 if (bufferSize == 0) { 1472 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); 1473 return NULL; 1474 } 1475 1476 1477 // 1478 // Check the alignment and size of the user supplied buffer. 1479 // Allocate heap memory if the user supplied memory is insufficient. 1480 // 1481 char *buf = (char *)stackBuffer; 1482 uint32_t s = bufferSize; 1483 1484 if (stackBuffer == NULL) { 1485 s = 0; // Ignore size, force allocation if user didn't give us a buffer. 1486 } 1487 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 1488 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); 1489 s -= offsetUp; 1490 buf += offsetUp; 1491 } 1492 if (s < sizeof(RuleBasedBreakIterator)) { 1493 // Not enough room in the caller-supplied buffer. 1494 // Do a plain-vanilla heap based clone and return that, along with 1495 // a warning that the clone was allocated. 1496 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); 1497 if (clonedBI == 0) { 1498 status = U_MEMORY_ALLOCATION_ERROR; 1499 } else { 1500 status = U_SAFECLONE_ALLOCATED_WARNING; 1501 } 1502 return clonedBI; 1503 } 1504 1505 // 1506 // Clone the source BI into the caller-supplied buffer. 1507 // TODO: using an overloaded operator new to directly initialize the 1508 // copy in the user's buffer would be better, but it doesn't seem 1509 // to get along with namespaces. Investigate why. 1510 // 1511 // The memcpy is only safe with an empty (default constructed) 1512 // break iterator. Use on others can screw up reference counts 1513 // to data. memcpy-ing objects is not really a good idea... 1514 // 1515 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy 1516 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; 1517 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part 1518 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) 1519 *clone = *this; // clone = the real BI we want. 1520 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) 1521 1522 return clone; 1523 } 1524 1525 1526 //------------------------------------------------------------------------------- 1527 // 1528 // isDictionaryChar Return true if the category lookup for this char 1529 // indicates that it is in the set of dictionary lookup 1530 // chars. 1531 // 1532 // This function is intended for use by dictionary based 1533 // break iterators. 1534 // 1535 //------------------------------------------------------------------------------- 1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1537 if (fData == NULL) { 1538 return FALSE; 1539 } 1540 uint16_t category; 1541 UTRIE_GET16(&fData->fTrie, c, category); 1542 return (category & 0x4000) != 0; 1543 }*/ 1544 1545 1546 //------------------------------------------------------------------------------- 1547 // 1548 // checkDictionary This function handles all processing of characters in 1549 // the "dictionary" set. It will determine the appropriate 1550 // course of action, and possibly set up a cache in the 1551 // process. 1552 // 1553 //------------------------------------------------------------------------------- 1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1555 int32_t endPos, 1556 UBool reverse) { 1557 // Reset the old break cache first. 1558 uint32_t dictionaryCount = fDictionaryCharCount; 1559 reset(); 1560 1561 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 1562 return (reverse ? startPos : endPos); 1563 } 1564 1565 // Starting from the starting point, scan towards the proposed result, 1566 // looking for the first dictionary character (which may be the one 1567 // we're on, if we're starting in the middle of a range). 1568 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1569 if (reverse) { 1570 UTEXT_PREVIOUS32(fText); 1571 } 1572 1573 int32_t rangeStart = startPos; 1574 int32_t rangeEnd = endPos; 1575 1576 uint16_t category; 1577 int32_t current; 1578 UErrorCode status = U_ZERO_ERROR; 1579 UStack breaks(status); 1580 int32_t foundBreakCount = 0; 1581 UChar32 c = utext_current32(fText); 1582 1583 UTRIE_GET16(&fData->fTrie, c, category); 1584 1585 // Is the character we're starting on a dictionary character? If so, we 1586 // need to back up to include the entire run; otherwise the results of 1587 // the break algorithm will differ depending on where we start. Since 1588 // the result is cached and there is typically a non-dictionary break 1589 // within a small number of words, there should be little performance impact. 1590 if (category & 0x4000) { 1591 if (reverse) { 1592 do { 1593 utext_next32(fText); // TODO: recast to work directly with postincrement. 1594 c = utext_current32(fText); 1595 UTRIE_GET16(&fData->fTrie, c, category); 1596 } while (c != U_SENTINEL && (category & 0x4000)); 1597 // Back up to the last dictionary character 1598 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1599 if (c == U_SENTINEL) { 1600 // c = fText->last32(); 1601 // TODO: why was this if needed? 1602 c = UTEXT_PREVIOUS32(fText); 1603 } 1604 else { 1605 c = UTEXT_PREVIOUS32(fText); 1606 } 1607 } 1608 else { 1609 do { 1610 c = UTEXT_PREVIOUS32(fText); 1611 UTRIE_GET16(&fData->fTrie, c, category); 1612 } 1613 while (c != U_SENTINEL && (category & 0x4000)); 1614 // Back up to the last dictionary character 1615 if (c == U_SENTINEL) { 1616 // c = fText->first32(); 1617 c = utext_current32(fText); 1618 } 1619 else { 1620 utext_next32(fText); 1621 c = utext_current32(fText); 1622 } 1623 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1624 } 1625 UTRIE_GET16(&fData->fTrie, c, category); 1626 } 1627 1628 // Loop through the text, looking for ranges of dictionary characters. 1629 // For each span, find the appropriate break engine, and ask it to find 1630 // any breaks within the span. 1631 // Note: we always do this in the forward direction, so that the break 1632 // cache is built in the right order. 1633 if (reverse) { 1634 utext_setNativeIndex(fText, rangeStart); 1635 c = utext_current32(fText); 1636 UTRIE_GET16(&fData->fTrie, c, category); 1637 } 1638 while(U_SUCCESS(status)) { 1639 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1640 utext_next32(fText); // TODO: tweak for post-increment operation 1641 c = utext_current32(fText); 1642 UTRIE_GET16(&fData->fTrie, c, category); 1643 } 1644 if (current >= rangeEnd) { 1645 break; 1646 } 1647 1648 // We now have a dictionary character. Get the appropriate language object 1649 // to deal with it. 1650 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1651 1652 // Ask the language object if there are any breaks. It will leave the text 1653 // pointer on the other side of its range, ready to search for the next one. 1654 if (lbe != NULL) { 1655 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1656 } 1657 1658 // Reload the loop variables for the next go-round 1659 c = utext_current32(fText); 1660 UTRIE_GET16(&fData->fTrie, c, category); 1661 } 1662 1663 // If we found breaks, build a new break cache. The first and last entries must 1664 // be the original starting and ending position. 1665 if (foundBreakCount > 0) { 1666 int32_t totalBreaks = foundBreakCount; 1667 if (startPos < breaks.elementAti(0)) { 1668 totalBreaks += 1; 1669 } 1670 if (endPos > breaks.peeki()) { 1671 totalBreaks += 1; 1672 } 1673 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1674 if (fCachedBreakPositions != NULL) { 1675 int32_t out = 0; 1676 fNumCachedBreakPositions = totalBreaks; 1677 if (startPos < breaks.elementAti(0)) { 1678 fCachedBreakPositions[out++] = startPos; 1679 } 1680 for (int32_t i = 0; i < foundBreakCount; ++i) { 1681 fCachedBreakPositions[out++] = breaks.elementAti(i); 1682 } 1683 if (endPos > fCachedBreakPositions[out-1]) { 1684 fCachedBreakPositions[out] = endPos; 1685 } 1686 // If there are breaks, then by definition, we are replacing the original 1687 // proposed break by one of the breaks we found. Use following() and 1688 // preceding() to do the work. They should never recurse in this case. 1689 if (reverse) { 1690 return preceding(endPos - 1); 1691 } 1692 else { 1693 return following(startPos); 1694 } 1695 } 1696 // If the allocation failed, just fall through to the "no breaks found" case. 1697 } 1698 1699 // If we get here, there were no language-based breaks. Set the text pointer 1700 // to the original proposed break. 1701 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1702 return (reverse ? startPos : endPos); 1703 } 1704 1705 U_NAMESPACE_END 1706 1707 // defined in ucln_cmn.h 1708 1709 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; 1710 1711 /** 1712 * Release all static memory held by breakiterator. 1713 */ 1714 U_CDECL_BEGIN 1715 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1716 if (gLanguageBreakFactories) { 1717 delete gLanguageBreakFactories; 1718 gLanguageBreakFactories = NULL; 1719 } 1720 return TRUE; 1721 } 1722 U_CDECL_END 1723 1724 U_CDECL_BEGIN 1725 static void U_CALLCONV _deleteFactory(void *obj) { 1726 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; 1727 } 1728 U_CDECL_END 1729 U_NAMESPACE_BEGIN 1730 1731 static const LanguageBreakEngine* 1732 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1733 { 1734 UBool needsInit; 1735 UErrorCode status = U_ZERO_ERROR; 1736 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); 1737 1738 if (needsInit) { 1739 UStack *factories = new UStack(_deleteFactory, NULL, status); 1740 if (factories != NULL && U_SUCCESS(status)) { 1741 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1742 factories->push(builtIn, status); 1743 #ifdef U_LOCAL_SERVICE_HOOK 1744 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1745 if (extra != NULL) { 1746 factories->push(extra, status); 1747 } 1748 #endif 1749 } 1750 umtx_lock(NULL); 1751 if (gLanguageBreakFactories == NULL) { 1752 gLanguageBreakFactories = factories; 1753 factories = NULL; 1754 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1755 } 1756 umtx_unlock(NULL); 1757 delete factories; 1758 } 1759 1760 if (gLanguageBreakFactories == NULL) { 1761 return NULL; 1762 } 1763 1764 int32_t i = gLanguageBreakFactories->size(); 1765 const LanguageBreakEngine *lbe = NULL; 1766 while (--i >= 0) { 1767 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1768 lbe = factory->getEngineFor(c, breakType); 1769 if (lbe != NULL) { 1770 break; 1771 } 1772 } 1773 return lbe; 1774 } 1775 1776 1777 //------------------------------------------------------------------------------- 1778 // 1779 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1780 // the characer c. 1781 // 1782 //------------------------------------------------------------------------------- 1783 const LanguageBreakEngine * 1784 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1785 const LanguageBreakEngine *lbe = NULL; 1786 UErrorCode status = U_ZERO_ERROR; 1787 1788 if (fLanguageBreakEngines == NULL) { 1789 fLanguageBreakEngines = new UStack(status); 1790 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1791 delete fLanguageBreakEngines; 1792 fLanguageBreakEngines = 0; 1793 return NULL; 1794 } 1795 } 1796 1797 int32_t i = fLanguageBreakEngines->size(); 1798 while (--i >= 0) { 1799 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1800 if (lbe->handles(c, fBreakType)) { 1801 return lbe; 1802 } 1803 } 1804 1805 // No existing dictionary took the character. See if a factory wants to 1806 // give us a new LanguageBreakEngine for this character. 1807 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1808 1809 // If we got one, use it and push it on our stack. 1810 if (lbe != NULL) { 1811 fLanguageBreakEngines->push((void *)lbe, status); 1812 // Even if we can't remember it, we can keep looking it up, so 1813 // return it even if the push fails. 1814 return lbe; 1815 } 1816 1817 // No engine is forthcoming for this character. Add it to the 1818 // reject set. Create the reject break engine if needed. 1819 if (fUnhandledBreakEngine == NULL) { 1820 fUnhandledBreakEngine = new UnhandledEngine(status); 1821 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1822 status = U_MEMORY_ALLOCATION_ERROR; 1823 } 1824 // Put it last so that scripts for which we have an engine get tried 1825 // first. 1826 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1827 // If we can't insert it, or creation failed, get rid of it 1828 if (U_FAILURE(status)) { 1829 delete fUnhandledBreakEngine; 1830 fUnhandledBreakEngine = 0; 1831 return NULL; 1832 } 1833 } 1834 1835 // Tell the reject engine about the character; at its discretion, it may 1836 // add more than just the one character. 1837 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1838 1839 return fUnhandledBreakEngine; 1840 } 1841 1842 1843 1844 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1845 return fBreakType; 1846 }*/ 1847 1848 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1849 fBreakType = type; 1850 reset(); 1851 } 1852 1853 U_NAMESPACE_END 1854 1855 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1856