1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2010 International Business Machines Corporation 4 * and others. All rights reserved. 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include <typeinfo> // for 'typeid' to work 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/rbbi.h" 20 #include "unicode/schriter.h" 21 #include "unicode/uchriter.h" 22 #include "unicode/udata.h" 23 #include "unicode/uclean.h" 24 #include "rbbidata.h" 25 #include "rbbirb.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "umutex.h" 29 #include "ucln_cmn.h" 30 #include "brkeng.h" 31 32 #include "uassert.h" 33 #include "uvector.h" 34 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 36 #if U_LOCAL_SERVICE_HOOK 37 #include "localsvc.h" 38 #endif 39 40 #ifdef RBBI_DEBUG 41 static UBool fTrace = FALSE; 42 #endif 43 44 U_NAMESPACE_BEGIN 45 46 // The state number of the starting state 47 #define START_STATE 1 48 49 // The state-transition value indicating "stop" 50 #define STOP_STATE 0 51 52 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 54 55 56 //======================================================================= 57 // constructors 58 //======================================================================= 59 60 /** 61 * Constructs a RuleBasedBreakIterator that uses the already-created 62 * tables object that is passed in as a parameter. 63 */ 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 65 { 66 init(); 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor 68 if (U_FAILURE(status)) {return;} 69 if(fData == 0) { 70 status = U_MEMORY_ALLOCATION_ERROR; 71 return; 72 } 73 } 74 75 /** 76 * Same as above but does not adopt memory 77 */ 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 79 { 80 init(); 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 82 if (U_FAILURE(status)) {return;} 83 if(fData == 0) { 84 status = U_MEMORY_ALLOCATION_ERROR; 85 return; 86 } 87 } 88 89 //------------------------------------------------------------------------------- 90 // 91 // Constructor from a UDataMemory handle to precompiled break rules 92 // stored in an ICU data file. 93 // 94 //------------------------------------------------------------------------------- 95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 96 { 97 init(); 98 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 99 if (U_FAILURE(status)) {return;} 100 if(fData == 0) { 101 status = U_MEMORY_ALLOCATION_ERROR; 102 return; 103 } 104 } 105 106 107 108 //------------------------------------------------------------------------------- 109 // 110 // Constructor from a set of rules supplied as a string. 111 // 112 //------------------------------------------------------------------------------- 113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 114 UParseError &parseError, 115 UErrorCode &status) 116 { 117 init(); 118 if (U_FAILURE(status)) {return;} 119 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 120 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 121 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 122 // creates and returns a complete RBBI. From here, in a constructor, we 123 // can't just return the object created by the builder factory, hence 124 // the assignment of the factory created object to "this". 125 if (U_SUCCESS(status)) { 126 *this = *bi; 127 delete bi; 128 } 129 } 130 131 132 //------------------------------------------------------------------------------- 133 // 134 // Default Constructor. Create an empty shell that can be set up later. 135 // Used when creating a RuleBasedBreakIterator from a set 136 // of rules. 137 //------------------------------------------------------------------------------- 138 RuleBasedBreakIterator::RuleBasedBreakIterator() { 139 init(); 140 } 141 142 143 //------------------------------------------------------------------------------- 144 // 145 // Copy constructor. Will produce a break iterator with the same behavior, 146 // and which iterates over the same text, as the one passed in. 147 // 148 //------------------------------------------------------------------------------- 149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 150 : BreakIterator(other) 151 { 152 this->init(); 153 *this = other; 154 } 155 156 157 /** 158 * Destructor 159 */ 160 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 161 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 162 // fCharIter was adopted from the outside. 163 delete fCharIter; 164 } 165 fCharIter = NULL; 166 delete fSCharIter; 167 fCharIter = NULL; 168 delete fDCharIter; 169 fDCharIter = NULL; 170 171 utext_close(fText); 172 173 if (fData != NULL) { 174 fData->removeReference(); 175 fData = NULL; 176 } 177 if (fCachedBreakPositions) { 178 uprv_free(fCachedBreakPositions); 179 fCachedBreakPositions = NULL; 180 } 181 if (fLanguageBreakEngines) { 182 delete fLanguageBreakEngines; 183 fLanguageBreakEngines = NULL; 184 } 185 if (fUnhandledBreakEngine) { 186 delete fUnhandledBreakEngine; 187 fUnhandledBreakEngine = NULL; 188 } 189 } 190 191 /** 192 * Assignment operator. Sets this iterator to have the same behavior, 193 * and iterate over the same text, as the one passed in. 194 */ 195 RuleBasedBreakIterator& 196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 197 if (this == &that) { 198 return *this; 199 } 200 reset(); // Delete break cache information 201 fBreakType = that.fBreakType; 202 if (fLanguageBreakEngines != NULL) { 203 delete fLanguageBreakEngines; 204 fLanguageBreakEngines = NULL; // Just rebuild for now 205 } 206 // TODO: clone fLanguageBreakEngines from "that" 207 UErrorCode status = U_ZERO_ERROR; 208 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 209 210 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 211 delete fCharIter; 212 } 213 fCharIter = NULL; 214 215 if (that.fCharIter != NULL ) { 216 // This is a little bit tricky - it will intially appear that 217 // this->fCharIter is adopted, even if that->fCharIter was 218 // not adopted. That's ok. 219 fCharIter = that.fCharIter->clone(); 220 } 221 222 if (fData != NULL) { 223 fData->removeReference(); 224 fData = NULL; 225 } 226 if (that.fData != NULL) { 227 fData = that.fData->addReference(); 228 } 229 230 return *this; 231 } 232 233 234 235 //----------------------------------------------------------------------------- 236 // 237 // init() Shared initialization routine. Used by all the constructors. 238 // Initializes all fields, leaving the object in a consistent state. 239 // 240 //----------------------------------------------------------------------------- 241 void RuleBasedBreakIterator::init() { 242 UErrorCode status = U_ZERO_ERROR; 243 fBufferClone = FALSE; 244 fText = utext_openUChars(NULL, NULL, 0, &status); 245 fCharIter = NULL; 246 fSCharIter = NULL; 247 fDCharIter = NULL; 248 fData = NULL; 249 fLastRuleStatusIndex = 0; 250 fLastStatusIndexValid = TRUE; 251 fDictionaryCharCount = 0; 252 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 253 // dictionary behavior for Break Iterators that are 254 // built from rules. Even better would be the ability to 255 // declare the type in the rules. 256 257 fCachedBreakPositions = NULL; 258 fLanguageBreakEngines = NULL; 259 fUnhandledBreakEngine = NULL; 260 fNumCachedBreakPositions = 0; 261 fPositionInCache = 0; 262 263 #ifdef RBBI_DEBUG 264 static UBool debugInitDone = FALSE; 265 if (debugInitDone == FALSE) { 266 char *debugEnv = getenv("U_RBBIDEBUG"); 267 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 268 fTrace = TRUE; 269 } 270 debugInitDone = TRUE; 271 } 272 #endif 273 } 274 275 276 277 //----------------------------------------------------------------------------- 278 // 279 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 280 // behavior, and iterating over the same text, as this one. 281 // Virtual function: does the right thing with subclasses. 282 // 283 //----------------------------------------------------------------------------- 284 BreakIterator* 285 RuleBasedBreakIterator::clone(void) const { 286 return new RuleBasedBreakIterator(*this); 287 } 288 289 /** 290 * Equality operator. Returns TRUE if both BreakIterators are of the 291 * same class, have the same behavior, and iterate over the same text. 292 */ 293 UBool 294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 295 if (typeid(*this) != typeid(that)) { 296 return FALSE; 297 } 298 299 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 300 301 if (!utext_equals(fText, that2.fText)) { 302 // The two break iterators are operating on different text, 303 // or have a different interation position. 304 return FALSE; 305 }; 306 307 // TODO: need a check for when in a dictionary region at different offsets. 308 309 if (that2.fData == fData || 310 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 311 // The two break iterators are using the same rules. 312 return TRUE; 313 } 314 return FALSE; 315 } 316 317 /** 318 * Compute a hash code for this BreakIterator 319 * @return A hash code 320 */ 321 int32_t 322 RuleBasedBreakIterator::hashCode(void) const { 323 int32_t hash = 0; 324 if (fData != NULL) { 325 hash = fData->hashCode(); 326 } 327 return hash; 328 } 329 330 331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 332 if (U_FAILURE(status)) { 333 return; 334 } 335 reset(); 336 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 337 338 // Set up a dummy CharacterIterator to be returned if anyone 339 // calls getText(). With input from UText, there is no reasonable 340 // way to return a characterIterator over the actual input text. 341 // Return one over an empty string instead - this is the closest 342 // we can come to signaling a failure. 343 // (GetText() is obsolete, this failure is sort of OK) 344 if (fDCharIter == NULL) { 345 static const UChar c = 0; 346 fDCharIter = new UCharCharacterIterator(&c, 0); 347 if (fDCharIter == NULL) { 348 status = U_MEMORY_ALLOCATION_ERROR; 349 return; 350 } 351 } 352 353 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 354 // existing fCharIter was adopted from the outside. Delete it now. 355 delete fCharIter; 356 } 357 fCharIter = fDCharIter; 358 359 this->first(); 360 } 361 362 363 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 364 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 365 return result; 366 } 367 368 369 370 /** 371 * Returns the description used to create this iterator 372 */ 373 const UnicodeString& 374 RuleBasedBreakIterator::getRules() const { 375 if (fData != NULL) { 376 return fData->getRuleSourceString(); 377 } else { 378 static const UnicodeString *s; 379 if (s == NULL) { 380 // TODO: something more elegant here. 381 // perhaps API should return the string by value. 382 // Note: thread unsafe init & leak are semi-ok, better than 383 // what was before. Sould be cleaned up, though. 384 s = new UnicodeString; 385 } 386 return *s; 387 } 388 } 389 390 //======================================================================= 391 // BreakIterator overrides 392 //======================================================================= 393 394 /** 395 * Return a CharacterIterator over the text being analyzed. 396 */ 397 CharacterIterator& 398 RuleBasedBreakIterator::getText() const { 399 return *fCharIter; 400 } 401 402 /** 403 * Set the iterator to analyze a new piece of text. This function resets 404 * the current iteration position to the beginning of the text. 405 * @param newText An iterator over the text to analyze. 406 */ 407 void 408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 409 // If we are holding a CharacterIterator adopted from a 410 // previous call to this function, delete it now. 411 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 412 delete fCharIter; 413 } 414 415 fCharIter = newText; 416 UErrorCode status = U_ZERO_ERROR; 417 reset(); 418 if (newText==NULL || newText->startIndex() != 0) { 419 // startIndex !=0 wants to be an error, but there's no way to report it. 420 // Make the iterator text be an empty string. 421 fText = utext_openUChars(fText, NULL, 0, &status); 422 } else { 423 fText = utext_openCharacterIterator(fText, newText, &status); 424 } 425 this->first(); 426 } 427 428 /** 429 * Set the iterator to analyze a new piece of text. This function resets 430 * the current iteration position to the beginning of the text. 431 * @param newText An iterator over the text to analyze. 432 */ 433 void 434 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 435 UErrorCode status = U_ZERO_ERROR; 436 reset(); 437 fText = utext_openConstUnicodeString(fText, &newText, &status); 438 439 // Set up a character iterator on the string. 440 // Needed in case someone calls getText(). 441 // Can not, unfortunately, do this lazily on the (probably never) 442 // call to getText(), because getText is const. 443 if (fSCharIter == NULL) { 444 fSCharIter = new StringCharacterIterator(newText); 445 } else { 446 fSCharIter->setText(newText); 447 } 448 449 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 450 // old fCharIter was adopted from the outside. Delete it. 451 delete fCharIter; 452 } 453 fCharIter = fSCharIter; 454 455 this->first(); 456 } 457 458 459 460 /** 461 * Sets the current iteration position to the beginning of the text. 462 * @return The offset of the beginning of the text. 463 */ 464 int32_t RuleBasedBreakIterator::first(void) { 465 reset(); 466 fLastRuleStatusIndex = 0; 467 fLastStatusIndexValid = TRUE; 468 //if (fText == NULL) 469 // return BreakIterator::DONE; 470 471 utext_setNativeIndex(fText, 0); 472 return 0; 473 } 474 475 /** 476 * Sets the current iteration position to the end of the text. 477 * @return The text's past-the-end offset. 478 */ 479 int32_t RuleBasedBreakIterator::last(void) { 480 reset(); 481 if (fText == NULL) { 482 fLastRuleStatusIndex = 0; 483 fLastStatusIndexValid = TRUE; 484 return BreakIterator::DONE; 485 } 486 487 fLastStatusIndexValid = FALSE; 488 int32_t pos = (int32_t)utext_nativeLength(fText); 489 utext_setNativeIndex(fText, pos); 490 return pos; 491 } 492 493 /** 494 * Advances the iterator either forward or backward the specified number of steps. 495 * Negative values move backward, and positive values move forward. This is 496 * equivalent to repeatedly calling next() or previous(). 497 * @param n The number of steps to move. The sign indicates the direction 498 * (negative is backwards, and positive is forwards). 499 * @return The character offset of the boundary position n boundaries away from 500 * the current one. 501 */ 502 int32_t RuleBasedBreakIterator::next(int32_t n) { 503 int32_t result = current(); 504 while (n > 0) { 505 result = next(); 506 --n; 507 } 508 while (n < 0) { 509 result = previous(); 510 ++n; 511 } 512 return result; 513 } 514 515 /** 516 * Advances the iterator to the next boundary position. 517 * @return The position of the first boundary after this one. 518 */ 519 int32_t RuleBasedBreakIterator::next(void) { 520 // if we have cached break positions and we're still in the range 521 // covered by them, just move one step forward in the cache 522 if (fCachedBreakPositions != NULL) { 523 if (fPositionInCache < fNumCachedBreakPositions - 1) { 524 ++fPositionInCache; 525 int32_t pos = fCachedBreakPositions[fPositionInCache]; 526 utext_setNativeIndex(fText, pos); 527 return pos; 528 } 529 else { 530 reset(); 531 } 532 } 533 534 int32_t startPos = current(); 535 int32_t result = handleNext(fData->fForwardTable); 536 if (fDictionaryCharCount > 0) { 537 result = checkDictionary(startPos, result, FALSE); 538 } 539 return result; 540 } 541 542 /** 543 * Advances the iterator backwards, to the last boundary preceding this one. 544 * @return The position of the last boundary position preceding this one. 545 */ 546 int32_t RuleBasedBreakIterator::previous(void) { 547 int32_t result; 548 int32_t startPos; 549 550 // if we have cached break positions and we're still in the range 551 // covered by them, just move one step backward in the cache 552 if (fCachedBreakPositions != NULL) { 553 if (fPositionInCache > 0) { 554 --fPositionInCache; 555 // If we're at the beginning of the cache, need to reevaluate the 556 // rule status 557 if (fPositionInCache <= 0) { 558 fLastStatusIndexValid = FALSE; 559 } 560 int32_t pos = fCachedBreakPositions[fPositionInCache]; 561 utext_setNativeIndex(fText, pos); 562 return pos; 563 } 564 else { 565 reset(); 566 } 567 } 568 569 // if we're already sitting at the beginning of the text, return DONE 570 if (fText == NULL || (startPos = current()) == 0) { 571 fLastRuleStatusIndex = 0; 572 fLastStatusIndexValid = TRUE; 573 return BreakIterator::DONE; 574 } 575 576 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 577 result = handlePrevious(fData->fReverseTable); 578 if (fDictionaryCharCount > 0) { 579 result = checkDictionary(result, startPos, TRUE); 580 } 581 return result; 582 } 583 584 // old rule syntax 585 // set things up. handlePrevious() will back us up to some valid 586 // break position before the current position (we back our internal 587 // iterator up one step to prevent handlePrevious() from returning 588 // the current position), but not necessarily the last one before 589 590 // where we started 591 592 int32_t start = current(); 593 594 UTEXT_PREVIOUS32(fText); 595 int32_t lastResult = handlePrevious(fData->fReverseTable); 596 if (lastResult == UBRK_DONE) { 597 lastResult = 0; 598 utext_setNativeIndex(fText, 0); 599 } 600 result = lastResult; 601 int32_t lastTag = 0; 602 UBool breakTagValid = FALSE; 603 604 // iterate forward from the known break position until we pass our 605 // starting point. The last break position before the starting 606 // point is our return value 607 608 for (;;) { 609 result = next(); 610 if (result == BreakIterator::DONE || result >= start) { 611 break; 612 } 613 lastResult = result; 614 lastTag = fLastRuleStatusIndex; 615 breakTagValid = TRUE; 616 } 617 618 // fLastBreakTag wants to have the value for section of text preceding 619 // the result position that we are to return (in lastResult.) If 620 // the backwards rules overshot and the above loop had to do two or more 621 // next()s to move up to the desired return position, we will have a valid 622 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 623 // we wont have a tag value for that position, which is only set by handleNext(). 624 625 // set the current iteration position to be the last break position 626 // before where we started, and then return that value 627 utext_setNativeIndex(fText, lastResult); 628 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 629 fLastStatusIndexValid = breakTagValid; 630 631 // No need to check the dictionary; it will have been handled by 632 // next() 633 634 return lastResult; 635 } 636 637 /** 638 * Sets the iterator to refer to the first boundary position following 639 * the specified position. 640 * @offset The position from which to begin searching for a break position. 641 * @return The position of the first break after the current position. 642 */ 643 int32_t RuleBasedBreakIterator::following(int32_t offset) { 644 // if we have cached break positions and offset is in the range 645 // covered by them, use them 646 // TODO: could use binary search 647 // TODO: what if offset is outside range, but break is not? 648 if (fCachedBreakPositions != NULL) { 649 if (offset >= fCachedBreakPositions[0] 650 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 651 fPositionInCache = 0; 652 // We are guaranteed not to leave the array due to range test above 653 while (offset >= fCachedBreakPositions[fPositionInCache]) { 654 ++fPositionInCache; 655 } 656 int32_t pos = fCachedBreakPositions[fPositionInCache]; 657 utext_setNativeIndex(fText, pos); 658 return pos; 659 } 660 else { 661 reset(); 662 } 663 } 664 665 // if the offset passed in is already past the end of the text, 666 // just return DONE; if it's before the beginning, return the 667 // text's starting offset 668 fLastRuleStatusIndex = 0; 669 fLastStatusIndexValid = TRUE; 670 if (fText == NULL || offset >= utext_nativeLength(fText)) { 671 last(); 672 return next(); 673 } 674 else if (offset < 0) { 675 return first(); 676 } 677 678 // otherwise, set our internal iteration position (temporarily) 679 // to the position passed in. If this is the _beginning_ position, 680 // then we can just use next() to get our return value 681 682 int32_t result = 0; 683 684 if (fData->fSafeRevTable != NULL) { 685 // new rule syntax 686 utext_setNativeIndex(fText, offset); 687 // move forward one codepoint to prepare for moving back to a 688 // safe point. 689 // this handles offset being between a supplementary character 690 UTEXT_NEXT32(fText); 691 // handlePrevious will move most of the time to < 1 boundary away 692 handlePrevious(fData->fSafeRevTable); 693 int32_t result = next(); 694 while (result <= offset) { 695 result = next(); 696 } 697 return result; 698 } 699 if (fData->fSafeFwdTable != NULL) { 700 // backup plan if forward safe table is not available 701 utext_setNativeIndex(fText, offset); 702 UTEXT_PREVIOUS32(fText); 703 // handle next will give result >= offset 704 handleNext(fData->fSafeFwdTable); 705 // previous will give result 0 or 1 boundary away from offset, 706 // most of the time 707 // we have to 708 int32_t oldresult = previous(); 709 while (oldresult > offset) { 710 int32_t result = previous(); 711 if (result <= offset) { 712 return oldresult; 713 } 714 oldresult = result; 715 } 716 int32_t result = next(); 717 if (result <= offset) { 718 return next(); 719 } 720 return result; 721 } 722 // otherwise, we have to sync up first. Use handlePrevious() to back 723 // up to a known break position before the specified position (if 724 // we can determine that the specified position is a break position, 725 // we don't back up at all). This may or may not be the last break 726 // position at or before our starting position. Advance forward 727 // from here until we've passed the starting position. The position 728 // we stop on will be the first break position after the specified one. 729 // old rule syntax 730 731 utext_setNativeIndex(fText, offset); 732 if (offset==0 || 733 (offset==1 && utext_getNativeIndex(fText)==0)) { 734 return next(); 735 } 736 result = previous(); 737 738 while (result != BreakIterator::DONE && result <= offset) { 739 result = next(); 740 } 741 742 return result; 743 } 744 745 /** 746 * Sets the iterator to refer to the last boundary position before the 747 * specified position. 748 * @offset The position to begin searching for a break from. 749 * @return The position of the last boundary before the starting position. 750 */ 751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 752 // if we have cached break positions and offset is in the range 753 // covered by them, use them 754 if (fCachedBreakPositions != NULL) { 755 // TODO: binary search? 756 // TODO: What if offset is outside range, but break is not? 757 if (offset > fCachedBreakPositions[0] 758 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 759 fPositionInCache = 0; 760 while (fPositionInCache < fNumCachedBreakPositions 761 && offset > fCachedBreakPositions[fPositionInCache]) 762 ++fPositionInCache; 763 --fPositionInCache; 764 // If we're at the beginning of the cache, need to reevaluate the 765 // rule status 766 if (fPositionInCache <= 0) { 767 fLastStatusIndexValid = FALSE; 768 } 769 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 770 return fCachedBreakPositions[fPositionInCache]; 771 } 772 else { 773 reset(); 774 } 775 } 776 777 // if the offset passed in is already past the end of the text, 778 // just return DONE; if it's before the beginning, return the 779 // text's starting offset 780 if (fText == NULL || offset > utext_nativeLength(fText)) { 781 // return BreakIterator::DONE; 782 return last(); 783 } 784 else if (offset < 0) { 785 return first(); 786 } 787 788 // if we start by updating the current iteration position to the 789 // position specified by the caller, we can just use previous() 790 // to carry out this operation 791 792 if (fData->fSafeFwdTable != NULL) { 793 // new rule syntax 794 utext_setNativeIndex(fText, offset); 795 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 796 if (newOffset != offset) { 797 // Will come here if specified offset was not a code point boundary AND 798 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 799 // indices to the containing code point. 800 // For breakitereator::preceding only, these non-code-point indices need to be moved 801 // up to refer to the following codepoint. 802 UTEXT_NEXT32(fText); 803 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 804 } 805 806 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 807 // rather than adjusting the position unconditionally? 808 // (Change would interact with safe rules.) 809 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 810 // affects only preceding(), seems cleaner, but is slightly different. 811 UTEXT_PREVIOUS32(fText); 812 handleNext(fData->fSafeFwdTable); 813 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 814 while (result >= offset) { 815 result = previous(); 816 } 817 return result; 818 } 819 if (fData->fSafeRevTable != NULL) { 820 // backup plan if forward safe table is not available 821 // TODO: check whether this path can be discarded 822 // It's probably OK to say that rules must supply both safe tables 823 // if they use safe tables at all. We have certainly never described 824 // to anyone how to work with just one safe table. 825 utext_setNativeIndex(fText, offset); 826 UTEXT_NEXT32(fText); 827 828 // handle previous will give result <= offset 829 handlePrevious(fData->fSafeRevTable); 830 831 // next will give result 0 or 1 boundary away from offset, 832 // most of the time 833 // we have to 834 int32_t oldresult = next(); 835 while (oldresult < offset) { 836 int32_t result = next(); 837 if (result >= offset) { 838 return oldresult; 839 } 840 oldresult = result; 841 } 842 int32_t result = previous(); 843 if (result >= offset) { 844 return previous(); 845 } 846 return result; 847 } 848 849 // old rule syntax 850 utext_setNativeIndex(fText, offset); 851 return previous(); 852 } 853 854 /** 855 * Returns true if the specfied position is a boundary position. As a side 856 * effect, leaves the iterator pointing to the first boundary position at 857 * or after "offset". 858 * @param offset the offset to check. 859 * @return True if "offset" is a boundary position. 860 */ 861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 862 // the beginning index of the iterator is always a boundary position by definition 863 if (offset == 0) { 864 first(); // For side effects on current position, tag values. 865 return TRUE; 866 } 867 868 if (offset == (int32_t)utext_nativeLength(fText)) { 869 last(); // For side effects on current position, tag values. 870 return TRUE; 871 } 872 873 // out-of-range indexes are never boundary positions 874 if (offset < 0) { 875 first(); // For side effects on current position, tag values. 876 return FALSE; 877 } 878 879 if (offset > utext_nativeLength(fText)) { 880 last(); // For side effects on current position, tag values. 881 return FALSE; 882 } 883 884 // otherwise, we can use following() on the position before the specified 885 // one and return true if the position we get back is the one the user 886 // specified 887 utext_previous32From(fText, offset); 888 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 889 UBool result = following(backOne) == offset; 890 return result; 891 } 892 893 /** 894 * Returns the current iteration position. 895 * @return The current iteration position. 896 */ 897 int32_t RuleBasedBreakIterator::current(void) const { 898 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 899 return pos; 900 } 901 902 //======================================================================= 903 // implementation 904 //======================================================================= 905 906 // 907 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 908 // of user text. A variable with this enum type keeps track of where we 909 // are. The state machine only fetches user input while in the RUN mode. 910 // 911 enum RBBIRunMode { 912 RBBI_START, // state machine processing is before first char of input 913 RBBI_RUN, // state machine processing is in the user text 914 RBBI_END // state machine processing is after end of user text. 915 }; 916 917 918 //----------------------------------------------------------------------------------- 919 // 920 // handleNext(stateTable) 921 // This method is the actual implementation of the rbbi next() method. 922 // This method initializes the state machine to state 1 923 // and advances through the text character by character until we reach the end 924 // of the text or the state machine transitions to state 0. We update our return 925 // value every time the state machine passes through an accepting state. 926 // 927 //----------------------------------------------------------------------------------- 928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 929 int32_t state; 930 int16_t category = 0; 931 RBBIRunMode mode; 932 933 RBBIStateTableRow *row; 934 UChar32 c; 935 int32_t lookaheadStatus = 0; 936 int32_t lookaheadTagIdx = 0; 937 int32_t result = 0; 938 int32_t initialPosition = 0; 939 int32_t lookaheadResult = 0; 940 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 941 const char *tableData = statetable->fTableData; 942 uint32_t tableRowLen = statetable->fRowLen; 943 944 #ifdef RBBI_DEBUG 945 if (fTrace) { 946 RBBIDebugPuts("Handle Next pos char state category"); 947 } 948 #endif 949 950 // No matter what, handleNext alway correctly sets the break tag value. 951 fLastStatusIndexValid = TRUE; 952 fLastRuleStatusIndex = 0; 953 954 // if we're already at the end of the text, return DONE. 955 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 956 result = initialPosition; 957 c = UTEXT_NEXT32(fText); 958 if (fData == NULL || c==U_SENTINEL) { 959 return BreakIterator::DONE; 960 } 961 962 // Set the initial state for the state machine 963 state = START_STATE; 964 row = (RBBIStateTableRow *) 965 //(statetable->fTableData + (statetable->fRowLen * state)); 966 (tableData + tableRowLen * state); 967 968 969 mode = RBBI_RUN; 970 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 971 category = 2; 972 mode = RBBI_START; 973 } 974 975 976 // loop until we reach the end of the text or transition to state 0 977 // 978 for (;;) { 979 if (c == U_SENTINEL) { 980 // Reached end of input string. 981 if (mode == RBBI_END) { 982 // We have already run the loop one last time with the 983 // character set to the psueudo {eof} value. Now it is time 984 // to unconditionally bail out. 985 if (lookaheadResult > result) { 986 // We ran off the end of the string with a pending look-ahead match. 987 // Treat this as if the look-ahead condition had been met, and return 988 // the match at the / position from the look-ahead rule. 989 result = lookaheadResult; 990 fLastRuleStatusIndex = lookaheadTagIdx; 991 lookaheadStatus = 0; 992 } 993 break; 994 } 995 // Run the loop one last time with the fake end-of-input character category. 996 mode = RBBI_END; 997 category = 1; 998 } 999 1000 // 1001 // Get the char category. An incoming category of 1 or 2 means that 1002 // we are preset for doing the beginning or end of input, and 1003 // that we shouldn't get a category from an actual text input character. 1004 // 1005 if (mode == RBBI_RUN) { 1006 // look up the current character's character category, which tells us 1007 // which column in the state table to look at. 1008 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1009 // not the size of the character going in, which is a UChar32. 1010 // 1011 UTRIE_GET16(&fData->fTrie, c, category); 1012 1013 // Check the dictionary bit in the character's category. 1014 // Counter is only used by dictionary based iterators (subclasses). 1015 // Chars that need to be handled by a dictionary have a flag bit set 1016 // in their category values. 1017 // 1018 if ((category & 0x4000) != 0) { 1019 fDictionaryCharCount++; 1020 // And off the dictionary flag bit. 1021 category &= ~0x4000; 1022 } 1023 } 1024 1025 #ifdef RBBI_DEBUG 1026 if (fTrace) { 1027 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1028 if (0x20<=c && c<0x7f) { 1029 RBBIDebugPrintf("\"%c\" ", c); 1030 } else { 1031 RBBIDebugPrintf("%5x ", c); 1032 } 1033 RBBIDebugPrintf("%3d %3d\n", state, category); 1034 } 1035 #endif 1036 1037 // State Transition - move machine to its next state 1038 // 1039 state = row->fNextState[category]; 1040 row = (RBBIStateTableRow *) 1041 // (statetable->fTableData + (statetable->fRowLen * state)); 1042 (tableData + tableRowLen * state); 1043 1044 1045 if (row->fAccepting == -1) { 1046 // Match found, common case. 1047 if (mode != RBBI_START) { 1048 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1049 } 1050 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1051 } 1052 1053 if (row->fLookAhead != 0) { 1054 if (lookaheadStatus != 0 1055 && row->fAccepting == lookaheadStatus) { 1056 // Lookahead match is completed. 1057 result = lookaheadResult; 1058 fLastRuleStatusIndex = lookaheadTagIdx; 1059 lookaheadStatus = 0; 1060 // TODO: make a standalone hard break in a rule work. 1061 if (lookAheadHardBreak) { 1062 UTEXT_SETNATIVEINDEX(fText, result); 1063 return result; 1064 } 1065 // Look-ahead completed, but other rules may match further. Continue on 1066 // TODO: junk this feature? I don't think it's used anywhwere. 1067 goto continueOn; 1068 } 1069 1070 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1071 lookaheadResult = r; 1072 lookaheadStatus = row->fLookAhead; 1073 lookaheadTagIdx = row->fTagIdx; 1074 goto continueOn; 1075 } 1076 1077 1078 if (row->fAccepting != 0) { 1079 // Because this is an accepting state, any in-progress look-ahead match 1080 // is no longer relavant. Clear out the pending lookahead status. 1081 lookaheadStatus = 0; // clear out any pending look-ahead match. 1082 } 1083 1084 continueOn: 1085 if (state == STOP_STATE) { 1086 // This is the normal exit from the lookup state machine. 1087 // We have advanced through the string until it is certain that no 1088 // longer match is possible, no matter what characters follow. 1089 break; 1090 } 1091 1092 // Advance to the next character. 1093 // If this is a beginning-of-input loop iteration, don't advance 1094 // the input position. The next iteration will be processing the 1095 // first real input character. 1096 if (mode == RBBI_RUN) { 1097 c = UTEXT_NEXT32(fText); 1098 } else { 1099 if (mode == RBBI_START) { 1100 mode = RBBI_RUN; 1101 } 1102 } 1103 1104 1105 } 1106 1107 // The state machine is done. Check whether it found a match... 1108 1109 // If the iterator failed to advance in the match engine, force it ahead by one. 1110 // (This really indicates a defect in the break rules. They should always match 1111 // at least one character.) 1112 if (result == initialPosition) { 1113 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1114 UTEXT_NEXT32(fText); 1115 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1116 } 1117 1118 // Leave the iterator at our result position. 1119 UTEXT_SETNATIVEINDEX(fText, result); 1120 #ifdef RBBI_DEBUG 1121 if (fTrace) { 1122 RBBIDebugPrintf("result = %d\n\n", result); 1123 } 1124 #endif 1125 return result; 1126 } 1127 1128 1129 1130 //----------------------------------------------------------------------------------- 1131 // 1132 // handlePrevious() 1133 // 1134 // Iterate backwards, according to the logic of the reverse rules. 1135 // This version handles the exact style backwards rules. 1136 // 1137 // The logic of this function is very similar to handleNext(), above. 1138 // 1139 //----------------------------------------------------------------------------------- 1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1141 int32_t state; 1142 int16_t category = 0; 1143 RBBIRunMode mode; 1144 RBBIStateTableRow *row; 1145 UChar32 c; 1146 int32_t lookaheadStatus = 0; 1147 int32_t result = 0; 1148 int32_t initialPosition = 0; 1149 int32_t lookaheadResult = 0; 1150 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1151 1152 #ifdef RBBI_DEBUG 1153 if (fTrace) { 1154 RBBIDebugPuts("Handle Previous pos char state category"); 1155 } 1156 #endif 1157 1158 // handlePrevious() never gets the rule status. 1159 // Flag the status as invalid; if the user ever asks for status, we will need 1160 // to back up, then re-find the break position using handleNext(), which does 1161 // get the status value. 1162 fLastStatusIndexValid = FALSE; 1163 fLastRuleStatusIndex = 0; 1164 1165 // if we're already at the start of the text, return DONE. 1166 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1167 return BreakIterator::DONE; 1168 } 1169 1170 // Set up the starting char. 1171 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1172 result = initialPosition; 1173 c = UTEXT_PREVIOUS32(fText); 1174 1175 // Set the initial state for the state machine 1176 state = START_STATE; 1177 row = (RBBIStateTableRow *) 1178 (statetable->fTableData + (statetable->fRowLen * state)); 1179 category = 3; 1180 mode = RBBI_RUN; 1181 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1182 category = 2; 1183 mode = RBBI_START; 1184 } 1185 1186 1187 // loop until we reach the start of the text or transition to state 0 1188 // 1189 for (;;) { 1190 if (c == U_SENTINEL) { 1191 // Reached end of input string. 1192 if (mode == RBBI_END) { 1193 // We have already run the loop one last time with the 1194 // character set to the psueudo {eof} value. Now it is time 1195 // to unconditionally bail out. 1196 if (lookaheadResult < result) { 1197 // We ran off the end of the string with a pending look-ahead match. 1198 // Treat this as if the look-ahead condition had been met, and return 1199 // the match at the / position from the look-ahead rule. 1200 result = lookaheadResult; 1201 lookaheadStatus = 0; 1202 } else if (result == initialPosition) { 1203 // Ran off start, no match found. 1204 // move one index one (towards the start, since we are doing a previous()) 1205 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1206 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1207 } 1208 break; 1209 } 1210 // Run the loop one last time with the fake end-of-input character category. 1211 mode = RBBI_END; 1212 category = 1; 1213 } 1214 1215 // 1216 // Get the char category. An incoming category of 1 or 2 means that 1217 // we are preset for doing the beginning or end of input, and 1218 // that we shouldn't get a category from an actual text input character. 1219 // 1220 if (mode == RBBI_RUN) { 1221 // look up the current character's character category, which tells us 1222 // which column in the state table to look at. 1223 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1224 // not the size of the character going in, which is a UChar32. 1225 // 1226 UTRIE_GET16(&fData->fTrie, c, category); 1227 1228 // Check the dictionary bit in the character's category. 1229 // Counter is only used by dictionary based iterators (subclasses). 1230 // Chars that need to be handled by a dictionary have a flag bit set 1231 // in their category values. 1232 // 1233 if ((category & 0x4000) != 0) { 1234 fDictionaryCharCount++; 1235 // And off the dictionary flag bit. 1236 category &= ~0x4000; 1237 } 1238 } 1239 1240 #ifdef RBBI_DEBUG 1241 if (fTrace) { 1242 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1243 if (0x20<=c && c<0x7f) { 1244 RBBIDebugPrintf("\"%c\" ", c); 1245 } else { 1246 RBBIDebugPrintf("%5x ", c); 1247 } 1248 RBBIDebugPrintf("%3d %3d\n", state, category); 1249 } 1250 #endif 1251 1252 // State Transition - move machine to its next state 1253 // 1254 state = row->fNextState[category]; 1255 row = (RBBIStateTableRow *) 1256 (statetable->fTableData + (statetable->fRowLen * state)); 1257 1258 if (row->fAccepting == -1) { 1259 // Match found, common case. 1260 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1261 } 1262 1263 if (row->fLookAhead != 0) { 1264 if (lookaheadStatus != 0 1265 && row->fAccepting == lookaheadStatus) { 1266 // Lookahead match is completed. 1267 result = lookaheadResult; 1268 lookaheadStatus = 0; 1269 // TODO: make a standalone hard break in a rule work. 1270 if (lookAheadHardBreak) { 1271 UTEXT_SETNATIVEINDEX(fText, result); 1272 return result; 1273 } 1274 // Look-ahead completed, but other rules may match further. Continue on 1275 // TODO: junk this feature? I don't think it's used anywhwere. 1276 goto continueOn; 1277 } 1278 1279 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1280 lookaheadResult = r; 1281 lookaheadStatus = row->fLookAhead; 1282 goto continueOn; 1283 } 1284 1285 1286 if (row->fAccepting != 0) { 1287 // Because this is an accepting state, any in-progress look-ahead match 1288 // is no longer relavant. Clear out the pending lookahead status. 1289 lookaheadStatus = 0; 1290 } 1291 1292 continueOn: 1293 if (state == STOP_STATE) { 1294 // This is the normal exit from the lookup state machine. 1295 // We have advanced through the string until it is certain that no 1296 // longer match is possible, no matter what characters follow. 1297 break; 1298 } 1299 1300 // Move (backwards) to the next character to process. 1301 // If this is a beginning-of-input loop iteration, don't advance 1302 // the input position. The next iteration will be processing the 1303 // first real input character. 1304 if (mode == RBBI_RUN) { 1305 c = UTEXT_PREVIOUS32(fText); 1306 } else { 1307 if (mode == RBBI_START) { 1308 mode = RBBI_RUN; 1309 } 1310 } 1311 } 1312 1313 // The state machine is done. Check whether it found a match... 1314 1315 // If the iterator failed to advance in the match engine, force it ahead by one. 1316 // (This really indicates a defect in the break rules. They should always match 1317 // at least one character.) 1318 if (result == initialPosition) { 1319 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1320 UTEXT_PREVIOUS32(fText); 1321 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1322 } 1323 1324 // Leave the iterator at our result position. 1325 UTEXT_SETNATIVEINDEX(fText, result); 1326 #ifdef RBBI_DEBUG 1327 if (fTrace) { 1328 RBBIDebugPrintf("result = %d\n\n", result); 1329 } 1330 #endif 1331 return result; 1332 } 1333 1334 1335 void 1336 RuleBasedBreakIterator::reset() 1337 { 1338 if (fCachedBreakPositions) { 1339 uprv_free(fCachedBreakPositions); 1340 } 1341 fCachedBreakPositions = NULL; 1342 fNumCachedBreakPositions = 0; 1343 fDictionaryCharCount = 0; 1344 fPositionInCache = 0; 1345 } 1346 1347 1348 1349 //------------------------------------------------------------------------------- 1350 // 1351 // getRuleStatus() Return the break rule tag associated with the current 1352 // iterator position. If the iterator arrived at its current 1353 // position by iterating forwards, the value will have been 1354 // cached by the handleNext() function. 1355 // 1356 // If no cached status value is available, the status is 1357 // found by doing a previous() followed by a next(), which 1358 // leaves the iterator where it started, and computes the 1359 // status while doing the next(). 1360 // 1361 //------------------------------------------------------------------------------- 1362 void RuleBasedBreakIterator::makeRuleStatusValid() { 1363 if (fLastStatusIndexValid == FALSE) { 1364 // No cached status is available. 1365 if (fText == NULL || current() == 0) { 1366 // At start of text, or there is no text. Status is always zero. 1367 fLastRuleStatusIndex = 0; 1368 fLastStatusIndexValid = TRUE; 1369 } else { 1370 // Not at start of text. Find status the tedious way. 1371 int32_t pa = current(); 1372 previous(); 1373 if (fNumCachedBreakPositions > 0) { 1374 reset(); // Blow off the dictionary cache 1375 } 1376 int32_t pb = next(); 1377 if (pa != pb) { 1378 // note: the if (pa != pb) test is here only to eliminate warnings for 1379 // unused local variables on gcc. Logically, it isn't needed. 1380 U_ASSERT(pa == pb); 1381 } 1382 } 1383 } 1384 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1385 } 1386 1387 1388 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1389 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1390 nonConstThis->makeRuleStatusValid(); 1391 1392 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1393 // (the number of status values.) 1394 // This function returns the last (largest) of the array of status values. 1395 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1396 int32_t tagVal = fData->fRuleStatusTable[idx]; 1397 1398 return tagVal; 1399 } 1400 1401 1402 1403 1404 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1405 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1406 { 1407 if (U_FAILURE(status)) { 1408 return 0; 1409 } 1410 1411 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1412 nonConstThis->makeRuleStatusValid(); 1413 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1414 int32_t numValsToCopy = numVals; 1415 if (numVals > capacity) { 1416 status = U_BUFFER_OVERFLOW_ERROR; 1417 numValsToCopy = capacity; 1418 } 1419 int i; 1420 for (i=0; i<numValsToCopy; i++) { 1421 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1422 } 1423 return numVals; 1424 } 1425 1426 1427 1428 //------------------------------------------------------------------------------- 1429 // 1430 // getBinaryRules Access to the compiled form of the rules, 1431 // for use by build system tools that save the data 1432 // for standard iterator types. 1433 // 1434 //------------------------------------------------------------------------------- 1435 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1436 const uint8_t *retPtr = NULL; 1437 length = 0; 1438 1439 if (fData != NULL) { 1440 retPtr = (const uint8_t *)fData->fHeader; 1441 length = fData->fHeader->fLength; 1442 } 1443 return retPtr; 1444 } 1445 1446 1447 1448 1449 //------------------------------------------------------------------------------- 1450 // 1451 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated. 1452 // Saving one heap allocation isn't worth the trouble. 1453 // Cloning shouldn't be done in tight loops, and 1454 // making the clone copy involves other heap operations anyway. 1455 // And the application code for correctly dealing with buffer 1456 // size problems and the eventual object destruction is ugly. 1457 // 1458 //------------------------------------------------------------------------------- 1459 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, 1460 int32_t &bufferSize, 1461 UErrorCode &status) 1462 { 1463 if (U_FAILURE(status)){ 1464 return NULL; 1465 } 1466 1467 // 1468 // If user buffer size is zero this is a preflight operation to 1469 // obtain the needed buffer size, allowing for worst case misalignment. 1470 // 1471 if (bufferSize == 0) { 1472 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); 1473 return NULL; 1474 } 1475 1476 1477 // 1478 // Check the alignment and size of the user supplied buffer. 1479 // Allocate heap memory if the user supplied memory is insufficient. 1480 // 1481 char *buf = (char *)stackBuffer; 1482 uint32_t s = bufferSize; 1483 1484 if (stackBuffer == NULL) { 1485 s = 0; // Ignore size, force allocation if user didn't give us a buffer. 1486 } 1487 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 1488 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); 1489 s -= offsetUp; 1490 buf += offsetUp; 1491 } 1492 if (s < sizeof(RuleBasedBreakIterator)) { 1493 // Not enough room in the caller-supplied buffer. 1494 // Do a plain-vanilla heap based clone and return that, along with 1495 // a warning that the clone was allocated. 1496 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); 1497 if (clonedBI == 0) { 1498 status = U_MEMORY_ALLOCATION_ERROR; 1499 } else { 1500 status = U_SAFECLONE_ALLOCATED_WARNING; 1501 } 1502 return clonedBI; 1503 } 1504 1505 // 1506 // Clone the source BI into the caller-supplied buffer. 1507 // TODO: using an overloaded operator new to directly initialize the 1508 // copy in the user's buffer would be better, but it doesn't seem 1509 // to get along with namespaces. Investigate why. 1510 // 1511 // The memcpy is only safe with an empty (default constructed) 1512 // break iterator. Use on others can screw up reference counts 1513 // to data. memcpy-ing objects is not really a good idea... 1514 // 1515 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy 1516 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; 1517 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part 1518 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) 1519 *clone = *this; // clone = the real BI we want. 1520 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) 1521 1522 return clone; 1523 } 1524 1525 1526 //------------------------------------------------------------------------------- 1527 // 1528 // isDictionaryChar Return true if the category lookup for this char 1529 // indicates that it is in the set of dictionary lookup 1530 // chars. 1531 // 1532 // This function is intended for use by dictionary based 1533 // break iterators. 1534 // 1535 //------------------------------------------------------------------------------- 1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1537 if (fData == NULL) { 1538 return FALSE; 1539 } 1540 uint16_t category; 1541 UTRIE_GET16(&fData->fTrie, c, category); 1542 return (category & 0x4000) != 0; 1543 }*/ 1544 1545 1546 //------------------------------------------------------------------------------- 1547 // 1548 // checkDictionary This function handles all processing of characters in 1549 // the "dictionary" set. It will determine the appropriate 1550 // course of action, and possibly set up a cache in the 1551 // process. 1552 // 1553 //------------------------------------------------------------------------------- 1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1555 int32_t endPos, 1556 UBool reverse) { 1557 // Reset the old break cache first. 1558 uint32_t dictionaryCount = fDictionaryCharCount; 1559 reset(); 1560 1561 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 1562 return (reverse ? startPos : endPos); 1563 } 1564 1565 // Bug 5532. The dictionary code will crash if the input text is UTF-8 1566 // because native indexes are different from UTF-16 indexes. 1567 // Temporary hack: skip dictionary lookup for UTF-8 encoded text. 1568 // It wont give the right breaks, but it's better than a crash. 1569 // 1570 // Check the type of the UText by checking its pFuncs field, which 1571 // is UText's function dispatch table. It will be the same for all 1572 // UTF-8 UTexts and different for any other UText type. 1573 // 1574 // We have no other type of UText available with non-UTF-16 native indexing. 1575 // This whole check will go away once the dictionary code is fixed. 1576 static const void *utext_utf8Funcs; 1577 if (utext_utf8Funcs == NULL) { 1578 // Cache the UTF-8 UText function pointer value. 1579 UErrorCode status = U_ZERO_ERROR; 1580 UText tempUText = UTEXT_INITIALIZER; 1581 utext_openUTF8(&tempUText, NULL, 0, &status); 1582 utext_utf8Funcs = tempUText.pFuncs; 1583 utext_close(&tempUText); 1584 } 1585 if (fText->pFuncs == utext_utf8Funcs) { 1586 return (reverse ? startPos : endPos); 1587 } 1588 1589 // Starting from the starting point, scan towards the proposed result, 1590 // looking for the first dictionary character (which may be the one 1591 // we're on, if we're starting in the middle of a range). 1592 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1593 if (reverse) { 1594 UTEXT_PREVIOUS32(fText); 1595 } 1596 1597 int32_t rangeStart = startPos; 1598 int32_t rangeEnd = endPos; 1599 1600 uint16_t category; 1601 int32_t current; 1602 UErrorCode status = U_ZERO_ERROR; 1603 UStack breaks(status); 1604 int32_t foundBreakCount = 0; 1605 UChar32 c = utext_current32(fText); 1606 1607 UTRIE_GET16(&fData->fTrie, c, category); 1608 1609 // Is the character we're starting on a dictionary character? If so, we 1610 // need to back up to include the entire run; otherwise the results of 1611 // the break algorithm will differ depending on where we start. Since 1612 // the result is cached and there is typically a non-dictionary break 1613 // within a small number of words, there should be little performance impact. 1614 if (category & 0x4000) { 1615 if (reverse) { 1616 do { 1617 utext_next32(fText); // TODO: recast to work directly with postincrement. 1618 c = utext_current32(fText); 1619 UTRIE_GET16(&fData->fTrie, c, category); 1620 } while (c != U_SENTINEL && (category & 0x4000)); 1621 // Back up to the last dictionary character 1622 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1623 if (c == U_SENTINEL) { 1624 // c = fText->last32(); 1625 // TODO: why was this if needed? 1626 c = UTEXT_PREVIOUS32(fText); 1627 } 1628 else { 1629 c = UTEXT_PREVIOUS32(fText); 1630 } 1631 } 1632 else { 1633 do { 1634 c = UTEXT_PREVIOUS32(fText); 1635 UTRIE_GET16(&fData->fTrie, c, category); 1636 } 1637 while (c != U_SENTINEL && (category & 0x4000)); 1638 // Back up to the last dictionary character 1639 if (c == U_SENTINEL) { 1640 // c = fText->first32(); 1641 c = utext_current32(fText); 1642 } 1643 else { 1644 utext_next32(fText); 1645 c = utext_current32(fText); 1646 } 1647 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1648 } 1649 UTRIE_GET16(&fData->fTrie, c, category); 1650 } 1651 1652 // Loop through the text, looking for ranges of dictionary characters. 1653 // For each span, find the appropriate break engine, and ask it to find 1654 // any breaks within the span. 1655 // Note: we always do this in the forward direction, so that the break 1656 // cache is built in the right order. 1657 if (reverse) { 1658 utext_setNativeIndex(fText, rangeStart); 1659 c = utext_current32(fText); 1660 UTRIE_GET16(&fData->fTrie, c, category); 1661 } 1662 while(U_SUCCESS(status)) { 1663 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1664 utext_next32(fText); // TODO: tweak for post-increment operation 1665 c = utext_current32(fText); 1666 UTRIE_GET16(&fData->fTrie, c, category); 1667 } 1668 if (current >= rangeEnd) { 1669 break; 1670 } 1671 1672 // We now have a dictionary character. Get the appropriate language object 1673 // to deal with it. 1674 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1675 1676 // Ask the language object if there are any breaks. It will leave the text 1677 // pointer on the other side of its range, ready to search for the next one. 1678 if (lbe != NULL) { 1679 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1680 } 1681 1682 // Reload the loop variables for the next go-round 1683 c = utext_current32(fText); 1684 UTRIE_GET16(&fData->fTrie, c, category); 1685 } 1686 1687 // If we found breaks, build a new break cache. The first and last entries must 1688 // be the original starting and ending position. 1689 if (foundBreakCount > 0) { 1690 int32_t totalBreaks = foundBreakCount; 1691 if (startPos < breaks.elementAti(0)) { 1692 totalBreaks += 1; 1693 } 1694 if (endPos > breaks.peeki()) { 1695 totalBreaks += 1; 1696 } 1697 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1698 if (fCachedBreakPositions != NULL) { 1699 int32_t out = 0; 1700 fNumCachedBreakPositions = totalBreaks; 1701 if (startPos < breaks.elementAti(0)) { 1702 fCachedBreakPositions[out++] = startPos; 1703 } 1704 for (int32_t i = 0; i < foundBreakCount; ++i) { 1705 fCachedBreakPositions[out++] = breaks.elementAti(i); 1706 } 1707 if (endPos > fCachedBreakPositions[out-1]) { 1708 fCachedBreakPositions[out] = endPos; 1709 } 1710 // If there are breaks, then by definition, we are replacing the original 1711 // proposed break by one of the breaks we found. Use following() and 1712 // preceding() to do the work. They should never recurse in this case. 1713 if (reverse) { 1714 return preceding(endPos - 1); 1715 } 1716 else { 1717 return following(startPos); 1718 } 1719 } 1720 // If the allocation failed, just fall through to the "no breaks found" case. 1721 } 1722 1723 // If we get here, there were no language-based breaks. Set the text pointer 1724 // to the original proposed break. 1725 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1726 return (reverse ? startPos : endPos); 1727 } 1728 1729 U_NAMESPACE_END 1730 1731 // defined in ucln_cmn.h 1732 1733 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; 1734 1735 /** 1736 * Release all static memory held by breakiterator. 1737 */ 1738 U_CDECL_BEGIN 1739 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1740 if (gLanguageBreakFactories) { 1741 delete gLanguageBreakFactories; 1742 gLanguageBreakFactories = NULL; 1743 } 1744 return TRUE; 1745 } 1746 U_CDECL_END 1747 1748 U_CDECL_BEGIN 1749 static void U_CALLCONV _deleteFactory(void *obj) { 1750 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; 1751 } 1752 U_CDECL_END 1753 U_NAMESPACE_BEGIN 1754 1755 static const LanguageBreakEngine* 1756 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1757 { 1758 UBool needsInit; 1759 UErrorCode status = U_ZERO_ERROR; 1760 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); 1761 1762 if (needsInit) { 1763 UStack *factories = new UStack(_deleteFactory, NULL, status); 1764 if (factories != NULL && U_SUCCESS(status)) { 1765 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1766 factories->push(builtIn, status); 1767 #ifdef U_LOCAL_SERVICE_HOOK 1768 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1769 if (extra != NULL) { 1770 factories->push(extra, status); 1771 } 1772 #endif 1773 } 1774 umtx_lock(NULL); 1775 if (gLanguageBreakFactories == NULL) { 1776 gLanguageBreakFactories = factories; 1777 factories = NULL; 1778 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1779 } 1780 umtx_unlock(NULL); 1781 delete factories; 1782 } 1783 1784 if (gLanguageBreakFactories == NULL) { 1785 return NULL; 1786 } 1787 1788 int32_t i = gLanguageBreakFactories->size(); 1789 const LanguageBreakEngine *lbe = NULL; 1790 while (--i >= 0) { 1791 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1792 lbe = factory->getEngineFor(c, breakType); 1793 if (lbe != NULL) { 1794 break; 1795 } 1796 } 1797 return lbe; 1798 } 1799 1800 1801 //------------------------------------------------------------------------------- 1802 // 1803 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1804 // the characer c. 1805 // 1806 //------------------------------------------------------------------------------- 1807 const LanguageBreakEngine * 1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1809 const LanguageBreakEngine *lbe = NULL; 1810 UErrorCode status = U_ZERO_ERROR; 1811 1812 if (fLanguageBreakEngines == NULL) { 1813 fLanguageBreakEngines = new UStack(status); 1814 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1815 delete fLanguageBreakEngines; 1816 fLanguageBreakEngines = 0; 1817 return NULL; 1818 } 1819 } 1820 1821 int32_t i = fLanguageBreakEngines->size(); 1822 while (--i >= 0) { 1823 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1824 if (lbe->handles(c, fBreakType)) { 1825 return lbe; 1826 } 1827 } 1828 1829 // No existing dictionary took the character. See if a factory wants to 1830 // give us a new LanguageBreakEngine for this character. 1831 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1832 1833 // If we got one, use it and push it on our stack. 1834 if (lbe != NULL) { 1835 fLanguageBreakEngines->push((void *)lbe, status); 1836 // Even if we can't remember it, we can keep looking it up, so 1837 // return it even if the push fails. 1838 return lbe; 1839 } 1840 1841 // No engine is forthcoming for this character. Add it to the 1842 // reject set. Create the reject break engine if needed. 1843 if (fUnhandledBreakEngine == NULL) { 1844 fUnhandledBreakEngine = new UnhandledEngine(status); 1845 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1846 status = U_MEMORY_ALLOCATION_ERROR; 1847 } 1848 // Put it last so that scripts for which we have an engine get tried 1849 // first. 1850 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1851 // If we can't insert it, or creation failed, get rid of it 1852 if (U_FAILURE(status)) { 1853 delete fUnhandledBreakEngine; 1854 fUnhandledBreakEngine = 0; 1855 return NULL; 1856 } 1857 } 1858 1859 // Tell the reject engine about the character; at its discretion, it may 1860 // add more than just the one character. 1861 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1862 1863 return fUnhandledBreakEngine; 1864 } 1865 1866 1867 1868 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1869 return fBreakType; 1870 }*/ 1871 1872 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1873 fBreakType = type; 1874 reset(); 1875 } 1876 1877 U_NAMESPACE_END 1878 1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1880