1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2008 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 *************************************************************************** 6 */ 7 // 8 // file: rbbi.c Contains the implementation of the rule based break iterator 9 // runtime engine and the API implementation for 10 // class RuleBasedBreakIterator 11 // 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_BREAK_ITERATION 16 17 #include "unicode/rbbi.h" 18 #include "unicode/schriter.h" 19 #include "unicode/uchriter.h" 20 #include "unicode/udata.h" 21 #include "unicode/uclean.h" 22 #include "rbbidata.h" 23 #include "rbbirb.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "umutex.h" 27 #include "ucln_cmn.h" 28 #include "brkeng.h" 29 30 #include "uassert.h" 31 #include "uvector.h" 32 #include <stdio.h> 33 34 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 35 #if U_LOCAL_SERVICE_HOOK 36 #include "localsvc.h" 37 #endif 38 39 #ifdef RBBI_DEBUG 40 static UBool fTrace = FALSE; 41 #endif 42 43 U_NAMESPACE_BEGIN 44 45 // The state number of the starting state 46 #define START_STATE 1 47 48 // The state-transition value indicating "stop" 49 #define STOP_STATE 0 50 51 52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 53 54 55 //======================================================================= 56 // constructors 57 //======================================================================= 58 59 /** 60 * Constructs a RuleBasedBreakIterator that uses the already-created 61 * tables object that is passed in as a parameter. 62 */ 63 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 64 { 65 init(); 66 fData = new RBBIDataWrapper(data, status); // status checked in constructor 67 if (U_FAILURE(status)) {return;} 68 if(fData == 0) { 69 status = U_MEMORY_ALLOCATION_ERROR; 70 return; 71 } 72 } 73 74 /** 75 * Same as above but does not adopt memory 76 */ 77 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 78 { 79 init(); 80 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 81 if (U_FAILURE(status)) {return;} 82 if(fData == 0) { 83 status = U_MEMORY_ALLOCATION_ERROR; 84 return; 85 } 86 } 87 88 //------------------------------------------------------------------------------- 89 // 90 // Constructor from a UDataMemory handle to precompiled break rules 91 // stored in an ICU data file. 92 // 93 //------------------------------------------------------------------------------- 94 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 95 { 96 init(); 97 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 98 if (U_FAILURE(status)) {return;} 99 if(fData == 0) { 100 status = U_MEMORY_ALLOCATION_ERROR; 101 return; 102 } 103 } 104 105 106 107 //------------------------------------------------------------------------------- 108 // 109 // Constructor from a set of rules supplied as a string. 110 // 111 //------------------------------------------------------------------------------- 112 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 113 UParseError &parseError, 114 UErrorCode &status) 115 { 116 init(); 117 if (U_FAILURE(status)) {return;} 118 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 119 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 120 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 121 // creates and returns a complete RBBI. From here, in a constructor, we 122 // can't just return the object created by the builder factory, hence 123 // the assignment of the factory created object to "this". 124 if (U_SUCCESS(status)) { 125 *this = *bi; 126 delete bi; 127 } 128 } 129 130 131 //------------------------------------------------------------------------------- 132 // 133 // Default Constructor. Create an empty shell that can be set up later. 134 // Used when creating a RuleBasedBreakIterator from a set 135 // of rules. 136 //------------------------------------------------------------------------------- 137 RuleBasedBreakIterator::RuleBasedBreakIterator() { 138 init(); 139 } 140 141 142 //------------------------------------------------------------------------------- 143 // 144 // Copy constructor. Will produce a break iterator with the same behavior, 145 // and which iterates over the same text, as the one passed in. 146 // 147 //------------------------------------------------------------------------------- 148 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 149 : BreakIterator(other) 150 { 151 this->init(); 152 *this = other; 153 } 154 155 156 /** 157 * Destructor 158 */ 159 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 160 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 161 // fCharIter was adopted from the outside. 162 delete fCharIter; 163 } 164 fCharIter = NULL; 165 delete fSCharIter; 166 fCharIter = NULL; 167 delete fDCharIter; 168 fDCharIter = NULL; 169 170 utext_close(fText); 171 172 if (fData != NULL) { 173 fData->removeReference(); 174 fData = NULL; 175 } 176 if (fCachedBreakPositions) { 177 uprv_free(fCachedBreakPositions); 178 fCachedBreakPositions = NULL; 179 } 180 if (fLanguageBreakEngines) { 181 delete fLanguageBreakEngines; 182 fLanguageBreakEngines = NULL; 183 } 184 if (fUnhandledBreakEngine) { 185 delete fUnhandledBreakEngine; 186 fUnhandledBreakEngine = NULL; 187 } 188 } 189 190 /** 191 * Assignment operator. Sets this iterator to have the same behavior, 192 * and iterate over the same text, as the one passed in. 193 */ 194 RuleBasedBreakIterator& 195 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 196 if (this == &that) { 197 return *this; 198 } 199 reset(); // Delete break cache information 200 fBreakType = that.fBreakType; 201 if (fLanguageBreakEngines != NULL) { 202 delete fLanguageBreakEngines; 203 fLanguageBreakEngines = NULL; // Just rebuild for now 204 } 205 // TODO: clone fLanguageBreakEngines from "that" 206 UErrorCode status = U_ZERO_ERROR; 207 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 208 209 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 210 delete fCharIter; 211 } 212 fCharIter = NULL; 213 214 if (that.fCharIter != NULL ) { 215 // This is a little bit tricky - it will intially appear that 216 // this->fCharIter is adopted, even if that->fCharIter was 217 // not adopted. That's ok. 218 fCharIter = that.fCharIter->clone(); 219 } 220 221 if (fData != NULL) { 222 fData->removeReference(); 223 fData = NULL; 224 } 225 if (that.fData != NULL) { 226 fData = that.fData->addReference(); 227 } 228 229 return *this; 230 } 231 232 233 234 //----------------------------------------------------------------------------- 235 // 236 // init() Shared initialization routine. Used by all the constructors. 237 // Initializes all fields, leaving the object in a consistent state. 238 // 239 //----------------------------------------------------------------------------- 240 void RuleBasedBreakIterator::init() { 241 UErrorCode status = U_ZERO_ERROR; 242 fBufferClone = FALSE; 243 fText = utext_openUChars(NULL, NULL, 0, &status); 244 fCharIter = NULL; 245 fSCharIter = NULL; 246 fDCharIter = NULL; 247 fData = NULL; 248 fLastRuleStatusIndex = 0; 249 fLastStatusIndexValid = TRUE; 250 fDictionaryCharCount = 0; 251 fBreakType = -1; 252 253 fCachedBreakPositions = NULL; 254 fLanguageBreakEngines = NULL; 255 fUnhandledBreakEngine = NULL; 256 fNumCachedBreakPositions = 0; 257 fPositionInCache = 0; 258 259 #ifdef RBBI_DEBUG 260 static UBool debugInitDone = FALSE; 261 if (debugInitDone == FALSE) { 262 char *debugEnv = getenv("U_RBBIDEBUG"); 263 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 264 fTrace = TRUE; 265 } 266 debugInitDone = TRUE; 267 } 268 #endif 269 } 270 271 272 273 //----------------------------------------------------------------------------- 274 // 275 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 276 // behavior, and iterating over the same text, as this one. 277 // Virtual function: does the right thing with subclasses. 278 // 279 //----------------------------------------------------------------------------- 280 BreakIterator* 281 RuleBasedBreakIterator::clone(void) const { 282 return new RuleBasedBreakIterator(*this); 283 } 284 285 /** 286 * Equality operator. Returns TRUE if both BreakIterators are of the 287 * same class, have the same behavior, and iterate over the same text. 288 */ 289 UBool 290 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 291 if (that.getDynamicClassID() != getDynamicClassID()) { 292 return FALSE; 293 } 294 295 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 296 297 if (!utext_equals(fText, that2.fText)) { 298 // The two break iterators are operating on different text, 299 // or have a different interation position. 300 return FALSE; 301 }; 302 303 // TODO: need a check for when in a dictionary region at different offsets. 304 305 if (that2.fData == fData || 306 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 307 // The two break iterators are using the same rules. 308 return TRUE; 309 } 310 return FALSE; 311 } 312 313 /** 314 * Compute a hash code for this BreakIterator 315 * @return A hash code 316 */ 317 int32_t 318 RuleBasedBreakIterator::hashCode(void) const { 319 int32_t hash = 0; 320 if (fData != NULL) { 321 hash = fData->hashCode(); 322 } 323 return hash; 324 } 325 326 327 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 328 if (U_FAILURE(status)) { 329 return; 330 } 331 reset(); 332 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 333 334 // Set up a dummy CharacterIterator to be returned if anyone 335 // calls getText(). With input from UText, there is no reasonable 336 // way to return a characterIterator over the actual input text. 337 // Return one over an empty string instead - this is the closest 338 // we can come to signaling a failure. 339 // (GetText() is obsolete, this failure is sort of OK) 340 if (fDCharIter == NULL) { 341 static const UChar c = 0; 342 fDCharIter = new UCharCharacterIterator(&c, 0); 343 if (fDCharIter == NULL) { 344 status = U_MEMORY_ALLOCATION_ERROR; 345 return; 346 } 347 } 348 349 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 350 // existing fCharIter was adopted from the outside. Delete it now. 351 delete fCharIter; 352 } 353 fCharIter = fDCharIter; 354 355 this->first(); 356 } 357 358 359 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 360 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 361 return result; 362 } 363 364 365 366 /** 367 * Returns the description used to create this iterator 368 */ 369 const UnicodeString& 370 RuleBasedBreakIterator::getRules() const { 371 if (fData != NULL) { 372 return fData->getRuleSourceString(); 373 } else { 374 static const UnicodeString *s; 375 if (s == NULL) { 376 // TODO: something more elegant here. 377 // perhaps API should return the string by value. 378 // Note: thread unsafe init & leak are semi-ok, better than 379 // what was before. Sould be cleaned up, though. 380 s = new UnicodeString; 381 } 382 return *s; 383 } 384 } 385 386 //======================================================================= 387 // BreakIterator overrides 388 //======================================================================= 389 390 /** 391 * Return a CharacterIterator over the text being analyzed. 392 */ 393 CharacterIterator& 394 RuleBasedBreakIterator::getText() const { 395 return *fCharIter; 396 } 397 398 /** 399 * Set the iterator to analyze a new piece of text. This function resets 400 * the current iteration position to the beginning of the text. 401 * @param newText An iterator over the text to analyze. 402 */ 403 void 404 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 405 // If we are holding a CharacterIterator adopted from a 406 // previous call to this function, delete it now. 407 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 408 delete fCharIter; 409 } 410 411 fCharIter = newText; 412 UErrorCode status = U_ZERO_ERROR; 413 reset(); 414 if (newText==NULL || newText->startIndex() != 0) { 415 // startIndex !=0 wants to be an error, but there's no way to report it. 416 // Make the iterator text be an empty string. 417 fText = utext_openUChars(fText, NULL, 0, &status); 418 } else { 419 fText = utext_openCharacterIterator(fText, newText, &status); 420 } 421 this->first(); 422 } 423 424 /** 425 * Set the iterator to analyze a new piece of text. This function resets 426 * the current iteration position to the beginning of the text. 427 * @param newText An iterator over the text to analyze. 428 */ 429 void 430 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 431 UErrorCode status = U_ZERO_ERROR; 432 reset(); 433 fText = utext_openConstUnicodeString(fText, &newText, &status); 434 435 // Set up a character iterator on the string. 436 // Needed in case someone calls getText(). 437 // Can not, unfortunately, do this lazily on the (probably never) 438 // call to getText(), because getText is const. 439 if (fSCharIter == NULL) { 440 fSCharIter = new StringCharacterIterator(newText); 441 } else { 442 fSCharIter->setText(newText); 443 } 444 445 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 446 // old fCharIter was adopted from the outside. Delete it. 447 delete fCharIter; 448 } 449 fCharIter = fSCharIter; 450 451 this->first(); 452 } 453 454 455 456 /** 457 * Sets the current iteration position to the beginning of the text. 458 * @return The offset of the beginning of the text. 459 */ 460 int32_t RuleBasedBreakIterator::first(void) { 461 reset(); 462 fLastRuleStatusIndex = 0; 463 fLastStatusIndexValid = TRUE; 464 //if (fText == NULL) 465 // return BreakIterator::DONE; 466 467 utext_setNativeIndex(fText, 0); 468 return 0; 469 } 470 471 /** 472 * Sets the current iteration position to the end of the text. 473 * @return The text's past-the-end offset. 474 */ 475 int32_t RuleBasedBreakIterator::last(void) { 476 reset(); 477 if (fText == NULL) { 478 fLastRuleStatusIndex = 0; 479 fLastStatusIndexValid = TRUE; 480 return BreakIterator::DONE; 481 } 482 483 fLastStatusIndexValid = FALSE; 484 int32_t pos = (int32_t)utext_nativeLength(fText); 485 utext_setNativeIndex(fText, pos); 486 return pos; 487 } 488 489 /** 490 * Advances the iterator either forward or backward the specified number of steps. 491 * Negative values move backward, and positive values move forward. This is 492 * equivalent to repeatedly calling next() or previous(). 493 * @param n The number of steps to move. The sign indicates the direction 494 * (negative is backwards, and positive is forwards). 495 * @return The character offset of the boundary position n boundaries away from 496 * the current one. 497 */ 498 int32_t RuleBasedBreakIterator::next(int32_t n) { 499 int32_t result = current(); 500 while (n > 0) { 501 result = next(); 502 --n; 503 } 504 while (n < 0) { 505 result = previous(); 506 ++n; 507 } 508 return result; 509 } 510 511 /** 512 * Advances the iterator to the next boundary position. 513 * @return The position of the first boundary after this one. 514 */ 515 int32_t RuleBasedBreakIterator::next(void) { 516 // if we have cached break positions and we're still in the range 517 // covered by them, just move one step forward in the cache 518 if (fCachedBreakPositions != NULL) { 519 if (fPositionInCache < fNumCachedBreakPositions - 1) { 520 ++fPositionInCache; 521 int32_t pos = fCachedBreakPositions[fPositionInCache]; 522 utext_setNativeIndex(fText, pos); 523 return pos; 524 } 525 else { 526 reset(); 527 } 528 } 529 530 int32_t startPos = current(); 531 int32_t result = handleNext(fData->fForwardTable); 532 if (fDictionaryCharCount > 0) { 533 result = checkDictionary(startPos, result, FALSE); 534 } 535 return result; 536 } 537 538 /** 539 * Advances the iterator backwards, to the last boundary preceding this one. 540 * @return The position of the last boundary position preceding this one. 541 */ 542 int32_t RuleBasedBreakIterator::previous(void) { 543 int32_t result; 544 int32_t startPos; 545 546 // if we have cached break positions and we're still in the range 547 // covered by them, just move one step backward in the cache 548 if (fCachedBreakPositions != NULL) { 549 if (fPositionInCache > 0) { 550 --fPositionInCache; 551 // If we're at the beginning of the cache, need to reevaluate the 552 // rule status 553 if (fPositionInCache <= 0) { 554 fLastStatusIndexValid = FALSE; 555 } 556 int32_t pos = fCachedBreakPositions[fPositionInCache]; 557 utext_setNativeIndex(fText, pos); 558 return pos; 559 } 560 else { 561 reset(); 562 } 563 } 564 565 // if we're already sitting at the beginning of the text, return DONE 566 if (fText == NULL || (startPos = current()) == 0) { 567 fLastRuleStatusIndex = 0; 568 fLastStatusIndexValid = TRUE; 569 return BreakIterator::DONE; 570 } 571 572 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 573 result = handlePrevious(fData->fReverseTable); 574 if (fDictionaryCharCount > 0) { 575 result = checkDictionary(result, startPos, TRUE); 576 } 577 return result; 578 } 579 580 // old rule syntax 581 // set things up. handlePrevious() will back us up to some valid 582 // break position before the current position (we back our internal 583 // iterator up one step to prevent handlePrevious() from returning 584 // the current position), but not necessarily the last one before 585 586 // where we started 587 588 int32_t start = current(); 589 590 UTEXT_PREVIOUS32(fText); 591 int32_t lastResult = handlePrevious(fData->fReverseTable); 592 if (lastResult == UBRK_DONE) { 593 lastResult = 0; 594 utext_setNativeIndex(fText, 0); 595 } 596 result = lastResult; 597 int32_t lastTag = 0; 598 UBool breakTagValid = FALSE; 599 600 // iterate forward from the known break position until we pass our 601 // starting point. The last break position before the starting 602 // point is our return value 603 604 for (;;) { 605 result = next(); 606 if (result == BreakIterator::DONE || result >= start) { 607 break; 608 } 609 lastResult = result; 610 lastTag = fLastRuleStatusIndex; 611 breakTagValid = TRUE; 612 } 613 614 // fLastBreakTag wants to have the value for section of text preceding 615 // the result position that we are to return (in lastResult.) If 616 // the backwards rules overshot and the above loop had to do two or more 617 // next()s to move up to the desired return position, we will have a valid 618 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 619 // we wont have a tag value for that position, which is only set by handleNext(). 620 621 // set the current iteration position to be the last break position 622 // before where we started, and then return that value 623 utext_setNativeIndex(fText, lastResult); 624 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 625 fLastStatusIndexValid = breakTagValid; 626 627 // No need to check the dictionary; it will have been handled by 628 // next() 629 630 return lastResult; 631 } 632 633 /** 634 * Sets the iterator to refer to the first boundary position following 635 * the specified position. 636 * @offset The position from which to begin searching for a break position. 637 * @return The position of the first break after the current position. 638 */ 639 int32_t RuleBasedBreakIterator::following(int32_t offset) { 640 // if we have cached break positions and offset is in the range 641 // covered by them, use them 642 // TODO: could use binary search 643 // TODO: what if offset is outside range, but break is not? 644 if (fCachedBreakPositions != NULL) { 645 if (offset >= fCachedBreakPositions[0] 646 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 647 fPositionInCache = 0; 648 // We are guaranteed not to leave the array due to range test above 649 while (offset >= fCachedBreakPositions[fPositionInCache]) { 650 ++fPositionInCache; 651 } 652 int32_t pos = fCachedBreakPositions[fPositionInCache]; 653 utext_setNativeIndex(fText, pos); 654 return pos; 655 } 656 else { 657 reset(); 658 } 659 } 660 661 // if the offset passed in is already past the end of the text, 662 // just return DONE; if it's before the beginning, return the 663 // text's starting offset 664 fLastRuleStatusIndex = 0; 665 fLastStatusIndexValid = TRUE; 666 if (fText == NULL || offset >= utext_nativeLength(fText)) { 667 last(); 668 return next(); 669 } 670 else if (offset < 0) { 671 return first(); 672 } 673 674 // otherwise, set our internal iteration position (temporarily) 675 // to the position passed in. If this is the _beginning_ position, 676 // then we can just use next() to get our return value 677 678 int32_t result = 0; 679 680 if (fData->fSafeRevTable != NULL) { 681 // new rule syntax 682 utext_setNativeIndex(fText, offset); 683 // move forward one codepoint to prepare for moving back to a 684 // safe point. 685 // this handles offset being between a supplementary character 686 UTEXT_NEXT32(fText); 687 // handlePrevious will move most of the time to < 1 boundary away 688 handlePrevious(fData->fSafeRevTable); 689 int32_t result = next(); 690 while (result <= offset) { 691 result = next(); 692 } 693 return result; 694 } 695 if (fData->fSafeFwdTable != NULL) { 696 // backup plan if forward safe table is not available 697 utext_setNativeIndex(fText, offset); 698 UTEXT_PREVIOUS32(fText); 699 // handle next will give result >= offset 700 handleNext(fData->fSafeFwdTable); 701 // previous will give result 0 or 1 boundary away from offset, 702 // most of the time 703 // we have to 704 int32_t oldresult = previous(); 705 while (oldresult > offset) { 706 int32_t result = previous(); 707 if (result <= offset) { 708 return oldresult; 709 } 710 oldresult = result; 711 } 712 int32_t result = next(); 713 if (result <= offset) { 714 return next(); 715 } 716 return result; 717 } 718 // otherwise, we have to sync up first. Use handlePrevious() to back 719 // up to a known break position before the specified position (if 720 // we can determine that the specified position is a break position, 721 // we don't back up at all). This may or may not be the last break 722 // position at or before our starting position. Advance forward 723 // from here until we've passed the starting position. The position 724 // we stop on will be the first break position after the specified one. 725 // old rule syntax 726 727 utext_setNativeIndex(fText, offset); 728 if (offset==0 || 729 offset==1 && utext_getNativeIndex(fText)==0) { 730 return next(); 731 } 732 result = previous(); 733 734 while (result != BreakIterator::DONE && result <= offset) { 735 result = next(); 736 } 737 738 return result; 739 } 740 741 /** 742 * Sets the iterator to refer to the last boundary position before the 743 * specified position. 744 * @offset The position to begin searching for a break from. 745 * @return The position of the last boundary before the starting position. 746 */ 747 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 748 // if we have cached break positions and offset is in the range 749 // covered by them, use them 750 if (fCachedBreakPositions != NULL) { 751 // TODO: binary search? 752 // TODO: What if offset is outside range, but break is not? 753 if (offset > fCachedBreakPositions[0] 754 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 755 fPositionInCache = 0; 756 while (fPositionInCache < fNumCachedBreakPositions 757 && offset > fCachedBreakPositions[fPositionInCache]) 758 ++fPositionInCache; 759 --fPositionInCache; 760 // If we're at the beginning of the cache, need to reevaluate the 761 // rule status 762 if (fPositionInCache <= 0) { 763 fLastStatusIndexValid = FALSE; 764 } 765 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 766 return fCachedBreakPositions[fPositionInCache]; 767 } 768 else { 769 reset(); 770 } 771 } 772 773 // if the offset passed in is already past the end of the text, 774 // just return DONE; if it's before the beginning, return the 775 // text's starting offset 776 if (fText == NULL || offset > utext_nativeLength(fText)) { 777 // return BreakIterator::DONE; 778 return last(); 779 } 780 else if (offset < 0) { 781 return first(); 782 } 783 784 // if we start by updating the current iteration position to the 785 // position specified by the caller, we can just use previous() 786 // to carry out this operation 787 788 if (fData->fSafeFwdTable != NULL) { 789 // new rule syntax 790 utext_setNativeIndex(fText, offset); 791 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 792 if (newOffset != offset) { 793 // Will come here if specified offset was not a code point boundary AND 794 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 795 // indices to the containing code point. 796 // For breakitereator::preceding only, these non-code-point indices need to be moved 797 // up to refer to the following codepoint. 798 UTEXT_NEXT32(fText); 799 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 800 } 801 802 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 803 // rather than adjusting the position unconditionally? 804 // (Change would interact with safe rules.) 805 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 806 // affects only preceding(), seems cleaner, but is slightly different. 807 UTEXT_PREVIOUS32(fText); 808 handleNext(fData->fSafeFwdTable); 809 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 810 while (result >= offset) { 811 result = previous(); 812 } 813 return result; 814 } 815 if (fData->fSafeRevTable != NULL) { 816 // backup plan if forward safe table is not available 817 // TODO: check whether this path can be discarded 818 // It's probably OK to say that rules must supply both safe tables 819 // if they use safe tables at all. We have certainly never described 820 // to anyone how to work with just one safe table. 821 utext_setNativeIndex(fText, offset); 822 UTEXT_NEXT32(fText); 823 824 // handle previous will give result <= offset 825 handlePrevious(fData->fSafeRevTable); 826 827 // next will give result 0 or 1 boundary away from offset, 828 // most of the time 829 // we have to 830 int32_t oldresult = next(); 831 while (oldresult < offset) { 832 int32_t result = next(); 833 if (result >= offset) { 834 return oldresult; 835 } 836 oldresult = result; 837 } 838 int32_t result = previous(); 839 if (result >= offset) { 840 return previous(); 841 } 842 return result; 843 } 844 845 // old rule syntax 846 utext_setNativeIndex(fText, offset); 847 return previous(); 848 } 849 850 /** 851 * Returns true if the specfied position is a boundary position. As a side 852 * effect, leaves the iterator pointing to the first boundary position at 853 * or after "offset". 854 * @param offset the offset to check. 855 * @return True if "offset" is a boundary position. 856 */ 857 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 858 // the beginning index of the iterator is always a boundary position by definition 859 if (offset == 0) { 860 first(); // For side effects on current position, tag values. 861 return TRUE; 862 } 863 864 if (offset == (int32_t)utext_nativeLength(fText)) { 865 last(); // For side effects on current position, tag values. 866 return TRUE; 867 } 868 869 // out-of-range indexes are never boundary positions 870 if (offset < 0) { 871 first(); // For side effects on current position, tag values. 872 return FALSE; 873 } 874 875 if (offset > utext_nativeLength(fText)) { 876 last(); // For side effects on current position, tag values. 877 return FALSE; 878 } 879 880 // otherwise, we can use following() on the position before the specified 881 // one and return true if the position we get back is the one the user 882 // specified 883 utext_previous32From(fText, offset); 884 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 885 UBool result = following(backOne) == offset; 886 return result; 887 } 888 889 /** 890 * Returns the current iteration position. 891 * @return The current iteration position. 892 */ 893 int32_t RuleBasedBreakIterator::current(void) const { 894 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 895 return pos; 896 } 897 898 //======================================================================= 899 // implementation 900 //======================================================================= 901 902 // 903 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 904 // of user text. A variable with this enum type keeps track of where we 905 // are. The state machine only fetches user input while in the RUN mode. 906 // 907 enum RBBIRunMode { 908 RBBI_START, // state machine processing is before first char of input 909 RBBI_RUN, // state machine processing is in the user text 910 RBBI_END // state machine processing is after end of user text. 911 }; 912 913 914 //----------------------------------------------------------------------------------- 915 // 916 // handleNext(stateTable) 917 // This method is the actual implementation of the rbbi next() method. 918 // This method initializes the state machine to state 1 919 // and advances through the text character by character until we reach the end 920 // of the text or the state machine transitions to state 0. We update our return 921 // value every time the state machine passes through an accepting state. 922 // 923 //----------------------------------------------------------------------------------- 924 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 925 int32_t state; 926 int16_t category = 0; 927 RBBIRunMode mode; 928 929 RBBIStateTableRow *row; 930 UChar32 c; 931 int32_t lookaheadStatus = 0; 932 int32_t lookaheadTagIdx = 0; 933 int32_t result = 0; 934 int32_t initialPosition = 0; 935 int32_t lookaheadResult = 0; 936 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 937 const char *tableData = statetable->fTableData; 938 uint32_t tableRowLen = statetable->fRowLen; 939 940 #ifdef RBBI_DEBUG 941 if (fTrace) { 942 RBBIDebugPuts("Handle Next pos char state category"); 943 } 944 #endif 945 946 // No matter what, handleNext alway correctly sets the break tag value. 947 fLastStatusIndexValid = TRUE; 948 fLastRuleStatusIndex = 0; 949 950 // if we're already at the end of the text, return DONE. 951 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 952 result = initialPosition; 953 c = UTEXT_NEXT32(fText); 954 if (fData == NULL || c==U_SENTINEL) { 955 return BreakIterator::DONE; 956 } 957 958 // Set the initial state for the state machine 959 state = START_STATE; 960 row = (RBBIStateTableRow *) 961 //(statetable->fTableData + (statetable->fRowLen * state)); 962 (tableData + tableRowLen * state); 963 964 965 mode = RBBI_RUN; 966 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 967 category = 2; 968 mode = RBBI_START; 969 } 970 971 972 // loop until we reach the end of the text or transition to state 0 973 // 974 for (;;) { 975 if (c == U_SENTINEL) { 976 // Reached end of input string. 977 if (mode == RBBI_END) { 978 // We have already run the loop one last time with the 979 // character set to the psueudo {eof} value. Now it is time 980 // to unconditionally bail out. 981 if (lookaheadResult > result) { 982 // We ran off the end of the string with a pending look-ahead match. 983 // Treat this as if the look-ahead condition had been met, and return 984 // the match at the / position from the look-ahead rule. 985 result = lookaheadResult; 986 fLastRuleStatusIndex = lookaheadTagIdx; 987 lookaheadStatus = 0; 988 } 989 break; 990 } 991 // Run the loop one last time with the fake end-of-input character category. 992 mode = RBBI_END; 993 category = 1; 994 } 995 996 // 997 // Get the char category. An incoming category of 1 or 2 means that 998 // we are preset for doing the beginning or end of input, and 999 // that we shouldn't get a category from an actual text input character. 1000 // 1001 if (mode == RBBI_RUN) { 1002 // look up the current character's character category, which tells us 1003 // which column in the state table to look at. 1004 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1005 // not the size of the character going in, which is a UChar32. 1006 // 1007 UTRIE_GET16(&fData->fTrie, c, category); 1008 1009 // Check the dictionary bit in the character's category. 1010 // Counter is only used by dictionary based iterators (subclasses). 1011 // Chars that need to be handled by a dictionary have a flag bit set 1012 // in their category values. 1013 // 1014 if ((category & 0x4000) != 0) { 1015 fDictionaryCharCount++; 1016 // And off the dictionary flag bit. 1017 category &= ~0x4000; 1018 } 1019 } 1020 1021 #ifdef RBBI_DEBUG 1022 if (fTrace) { 1023 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); 1024 if (0x20<=c && c<0x7f) { 1025 RBBIDebugPrintf("\"%c\" ", c); 1026 } else { 1027 RBBIDebugPrintf("%5x ", c); 1028 } 1029 RBBIDebugPrintf("%3d %3d\n", state, category); 1030 } 1031 #endif 1032 1033 // State Transition - move machine to its next state 1034 // 1035 state = row->fNextState[category]; 1036 row = (RBBIStateTableRow *) 1037 // (statetable->fTableData + (statetable->fRowLen * state)); 1038 (tableData + tableRowLen * state); 1039 1040 1041 if (row->fAccepting == -1) { 1042 // Match found, common case. 1043 if (mode != RBBI_START) { 1044 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1045 } 1046 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1047 } 1048 1049 if (row->fLookAhead != 0) { 1050 if (lookaheadStatus != 0 1051 && row->fAccepting == lookaheadStatus) { 1052 // Lookahead match is completed. 1053 result = lookaheadResult; 1054 fLastRuleStatusIndex = lookaheadTagIdx; 1055 lookaheadStatus = 0; 1056 // TODO: make a standalone hard break in a rule work. 1057 if (lookAheadHardBreak) { 1058 UTEXT_SETNATIVEINDEX(fText, result); 1059 return result; 1060 } 1061 // Look-ahead completed, but other rules may match further. Continue on 1062 // TODO: junk this feature? I don't think it's used anywhwere. 1063 goto continueOn; 1064 } 1065 1066 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1067 lookaheadResult = r; 1068 lookaheadStatus = row->fLookAhead; 1069 lookaheadTagIdx = row->fTagIdx; 1070 goto continueOn; 1071 } 1072 1073 1074 if (row->fAccepting != 0) { 1075 // Because this is an accepting state, any in-progress look-ahead match 1076 // is no longer relavant. Clear out the pending lookahead status. 1077 lookaheadStatus = 0; // clear out any pending look-ahead match. 1078 } 1079 1080 continueOn: 1081 if (state == STOP_STATE) { 1082 // This is the normal exit from the lookup state machine. 1083 // We have advanced through the string until it is certain that no 1084 // longer match is possible, no matter what characters follow. 1085 break; 1086 } 1087 1088 // Advance to the next character. 1089 // If this is a beginning-of-input loop iteration, don't advance 1090 // the input position. The next iteration will be processing the 1091 // first real input character. 1092 if (mode == RBBI_RUN) { 1093 c = UTEXT_NEXT32(fText); 1094 } else { 1095 if (mode == RBBI_START) { 1096 mode = RBBI_RUN; 1097 } 1098 } 1099 1100 1101 } 1102 1103 // The state machine is done. Check whether it found a match... 1104 1105 // If the iterator failed to advance in the match engine, force it ahead by one. 1106 // (This really indicates a defect in the break rules. They should always match 1107 // at least one character.) 1108 if (result == initialPosition) { 1109 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1110 UTEXT_NEXT32(fText); 1111 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1112 } 1113 1114 // Leave the iterator at our result position. 1115 UTEXT_SETNATIVEINDEX(fText, result); 1116 #ifdef RBBI_DEBUG 1117 if (fTrace) { 1118 RBBIDebugPrintf("result = %d\n\n", result); 1119 } 1120 #endif 1121 return result; 1122 } 1123 1124 1125 1126 //----------------------------------------------------------------------------------- 1127 // 1128 // handlePrevious() 1129 // 1130 // Iterate backwards, according to the logic of the reverse rules. 1131 // This version handles the exact style backwards rules. 1132 // 1133 // The logic of this function is very similar to handleNext(), above. 1134 // 1135 //----------------------------------------------------------------------------------- 1136 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1137 int32_t state; 1138 int16_t category = 0; 1139 RBBIRunMode mode; 1140 RBBIStateTableRow *row; 1141 UChar32 c; 1142 int32_t lookaheadStatus = 0; 1143 int32_t result = 0; 1144 int32_t initialPosition = 0; 1145 int32_t lookaheadResult = 0; 1146 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1147 1148 #ifdef RBBI_DEBUG 1149 if (fTrace) { 1150 RBBIDebugPuts("Handle Previous pos char state category"); 1151 } 1152 #endif 1153 1154 // handlePrevious() never gets the rule status. 1155 // Flag the status as invalid; if the user ever asks for status, we will need 1156 // to back up, then re-find the break position using handleNext(), which does 1157 // get the status value. 1158 fLastStatusIndexValid = FALSE; 1159 fLastRuleStatusIndex = 0; 1160 1161 // if we're already at the start of the text, return DONE. 1162 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1163 return BreakIterator::DONE; 1164 } 1165 1166 // Set up the starting char. 1167 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1168 result = initialPosition; 1169 c = UTEXT_PREVIOUS32(fText); 1170 1171 // Set the initial state for the state machine 1172 state = START_STATE; 1173 row = (RBBIStateTableRow *) 1174 (statetable->fTableData + (statetable->fRowLen * state)); 1175 category = 3; 1176 mode = RBBI_RUN; 1177 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1178 category = 2; 1179 mode = RBBI_START; 1180 } 1181 1182 1183 // loop until we reach the start of the text or transition to state 0 1184 // 1185 for (;;) { 1186 if (c == U_SENTINEL) { 1187 // Reached end of input string. 1188 if (mode == RBBI_END || 1189 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { 1190 // We have already run the loop one last time with the 1191 // character set to the psueudo {eof} value. Now it is time 1192 // to unconditionally bail out. 1193 // (Or we have an old format binary rule file that does not support {eof}.) 1194 if (lookaheadResult < result) { 1195 // We ran off the end of the string with a pending look-ahead match. 1196 // Treat this as if the look-ahead condition had been met, and return 1197 // the match at the / position from the look-ahead rule. 1198 result = lookaheadResult; 1199 lookaheadStatus = 0; 1200 } else if (result == initialPosition) { 1201 // Ran off start, no match found. 1202 // move one index one (towards the start, since we are doing a previous()) 1203 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1204 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1205 } 1206 break; 1207 } 1208 // Run the loop one last time with the fake end-of-input character category. 1209 mode = RBBI_END; 1210 category = 1; 1211 } 1212 1213 // 1214 // Get the char category. An incoming category of 1 or 2 means that 1215 // we are preset for doing the beginning or end of input, and 1216 // that we shouldn't get a category from an actual text input character. 1217 // 1218 if (mode == RBBI_RUN) { 1219 // look up the current character's character category, which tells us 1220 // which column in the state table to look at. 1221 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1222 // not the size of the character going in, which is a UChar32. 1223 // 1224 UTRIE_GET16(&fData->fTrie, c, category); 1225 1226 // Check the dictionary bit in the character's category. 1227 // Counter is only used by dictionary based iterators (subclasses). 1228 // Chars that need to be handled by a dictionary have a flag bit set 1229 // in their category values. 1230 // 1231 if ((category & 0x4000) != 0) { 1232 fDictionaryCharCount++; 1233 // And off the dictionary flag bit. 1234 category &= ~0x4000; 1235 } 1236 } 1237 1238 #ifdef RBBI_DEBUG 1239 if (fTrace) { 1240 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1241 if (0x20<=c && c<0x7f) { 1242 RBBIDebugPrintf("\"%c\" ", c); 1243 } else { 1244 RBBIDebugPrintf("%5x ", c); 1245 } 1246 RBBIDebugPrintf("%3d %3d\n", state, category); 1247 } 1248 #endif 1249 1250 // State Transition - move machine to its next state 1251 // 1252 state = row->fNextState[category]; 1253 row = (RBBIStateTableRow *) 1254 (statetable->fTableData + (statetable->fRowLen * state)); 1255 1256 if (row->fAccepting == -1) { 1257 // Match found, common case. 1258 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1259 } 1260 1261 if (row->fLookAhead != 0) { 1262 if (lookaheadStatus != 0 1263 && row->fAccepting == lookaheadStatus) { 1264 // Lookahead match is completed. 1265 result = lookaheadResult; 1266 lookaheadStatus = 0; 1267 // TODO: make a standalone hard break in a rule work. 1268 if (lookAheadHardBreak) { 1269 UTEXT_SETNATIVEINDEX(fText, result); 1270 return result; 1271 } 1272 // Look-ahead completed, but other rules may match further. Continue on 1273 // TODO: junk this feature? I don't think it's used anywhwere. 1274 goto continueOn; 1275 } 1276 1277 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1278 lookaheadResult = r; 1279 lookaheadStatus = row->fLookAhead; 1280 goto continueOn; 1281 } 1282 1283 1284 if (row->fAccepting != 0) { 1285 // Because this is an accepting state, any in-progress look-ahead match 1286 // is no longer relavant. Clear out the pending lookahead status. 1287 lookaheadStatus = 0; 1288 } 1289 1290 continueOn: 1291 if (state == STOP_STATE) { 1292 // This is the normal exit from the lookup state machine. 1293 // We have advanced through the string until it is certain that no 1294 // longer match is possible, no matter what characters follow. 1295 break; 1296 } 1297 1298 // Move (backwards) to the next character to process. 1299 // If this is a beginning-of-input loop iteration, don't advance 1300 // the input position. The next iteration will be processing the 1301 // first real input character. 1302 if (mode == RBBI_RUN) { 1303 c = UTEXT_PREVIOUS32(fText); 1304 } else { 1305 if (mode == RBBI_START) { 1306 mode = RBBI_RUN; 1307 } 1308 } 1309 } 1310 1311 // The state machine is done. Check whether it found a match... 1312 1313 // If the iterator failed to advance in the match engine, force it ahead by one. 1314 // (This really indicates a defect in the break rules. They should always match 1315 // at least one character.) 1316 if (result == initialPosition) { 1317 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1318 UTEXT_PREVIOUS32(fText); 1319 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1320 } 1321 1322 // Leave the iterator at our result position. 1323 UTEXT_SETNATIVEINDEX(fText, result); 1324 #ifdef RBBI_DEBUG 1325 if (fTrace) { 1326 RBBIDebugPrintf("result = %d\n\n", result); 1327 } 1328 #endif 1329 return result; 1330 } 1331 1332 1333 void 1334 RuleBasedBreakIterator::reset() 1335 { 1336 if (fCachedBreakPositions) { 1337 uprv_free(fCachedBreakPositions); 1338 } 1339 fCachedBreakPositions = NULL; 1340 fNumCachedBreakPositions = 0; 1341 fDictionaryCharCount = 0; 1342 fPositionInCache = 0; 1343 } 1344 1345 1346 1347 //------------------------------------------------------------------------------- 1348 // 1349 // getRuleStatus() Return the break rule tag associated with the current 1350 // iterator position. If the iterator arrived at its current 1351 // position by iterating forwards, the value will have been 1352 // cached by the handleNext() function. 1353 // 1354 // If no cached status value is available, the status is 1355 // found by doing a previous() followed by a next(), which 1356 // leaves the iterator where it started, and computes the 1357 // status while doing the next(). 1358 // 1359 //------------------------------------------------------------------------------- 1360 void RuleBasedBreakIterator::makeRuleStatusValid() { 1361 if (fLastStatusIndexValid == FALSE) { 1362 // No cached status is available. 1363 if (fText == NULL || current() == 0) { 1364 // At start of text, or there is no text. Status is always zero. 1365 fLastRuleStatusIndex = 0; 1366 fLastStatusIndexValid = TRUE; 1367 } else { 1368 // Not at start of text. Find status the tedious way. 1369 int32_t pa = current(); 1370 previous(); 1371 if (fNumCachedBreakPositions > 0) { 1372 reset(); // Blow off the dictionary cache 1373 } 1374 int32_t pb = next(); 1375 if (pa != pb) { 1376 // note: the if (pa != pb) test is here only to eliminate warnings for 1377 // unused local variables on gcc. Logically, it isn't needed. 1378 U_ASSERT(pa == pb); 1379 } 1380 } 1381 } 1382 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1383 } 1384 1385 1386 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1387 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1388 nonConstThis->makeRuleStatusValid(); 1389 1390 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1391 // (the number of status values.) 1392 // This function returns the last (largest) of the array of status values. 1393 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1394 int32_t tagVal = fData->fRuleStatusTable[idx]; 1395 1396 return tagVal; 1397 } 1398 1399 1400 1401 1402 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1403 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1404 { 1405 if (U_FAILURE(status)) { 1406 return 0; 1407 } 1408 1409 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1410 nonConstThis->makeRuleStatusValid(); 1411 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1412 int32_t numValsToCopy = numVals; 1413 if (numVals > capacity) { 1414 status = U_BUFFER_OVERFLOW_ERROR; 1415 numValsToCopy = capacity; 1416 } 1417 int i; 1418 for (i=0; i<numValsToCopy; i++) { 1419 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1420 } 1421 return numVals; 1422 } 1423 1424 1425 1426 //------------------------------------------------------------------------------- 1427 // 1428 // getBinaryRules Access to the compiled form of the rules, 1429 // for use by build system tools that save the data 1430 // for standard iterator types. 1431 // 1432 //------------------------------------------------------------------------------- 1433 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1434 const uint8_t *retPtr = NULL; 1435 length = 0; 1436 1437 if (fData != NULL) { 1438 retPtr = (const uint8_t *)fData->fHeader; 1439 length = fData->fHeader->fLength; 1440 } 1441 return retPtr; 1442 } 1443 1444 1445 1446 1447 //------------------------------------------------------------------------------- 1448 // 1449 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated. 1450 // Saving one heap allocation isn't worth the trouble. 1451 // Cloning shouldn't be done in tight loops, and 1452 // making the clone copy involves other heap operations anyway. 1453 // And the application code for correctly dealing with buffer 1454 // size problems and the eventual object destruction is ugly. 1455 // 1456 //------------------------------------------------------------------------------- 1457 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, 1458 int32_t &bufferSize, 1459 UErrorCode &status) 1460 { 1461 if (U_FAILURE(status)){ 1462 return NULL; 1463 } 1464 1465 // 1466 // If user buffer size is zero this is a preflight operation to 1467 // obtain the needed buffer size, allowing for worst case misalignment. 1468 // 1469 if (bufferSize == 0) { 1470 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); 1471 return NULL; 1472 } 1473 1474 1475 // 1476 // Check the alignment and size of the user supplied buffer. 1477 // Allocate heap memory if the user supplied memory is insufficient. 1478 // 1479 char *buf = (char *)stackBuffer; 1480 uint32_t s = bufferSize; 1481 1482 if (stackBuffer == NULL) { 1483 s = 0; // Ignore size, force allocation if user didn't give us a buffer. 1484 } 1485 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 1486 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); 1487 s -= offsetUp; 1488 buf += offsetUp; 1489 } 1490 if (s < sizeof(RuleBasedBreakIterator)) { 1491 // Not enough room in the caller-supplied buffer. 1492 // Do a plain-vanilla heap based clone and return that, along with 1493 // a warning that the clone was allocated. 1494 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); 1495 if (clonedBI == 0) { 1496 status = U_MEMORY_ALLOCATION_ERROR; 1497 } else { 1498 status = U_SAFECLONE_ALLOCATED_WARNING; 1499 } 1500 return clonedBI; 1501 } 1502 1503 // 1504 // Clone the source BI into the caller-supplied buffer. 1505 // TODO: using an overloaded operator new to directly initialize the 1506 // copy in the user's buffer would be better, but it doesn't seem 1507 // to get along with namespaces. Investigate why. 1508 // 1509 // The memcpy is only safe with an empty (default constructed) 1510 // break iterator. Use on others can screw up reference counts 1511 // to data. memcpy-ing objects is not really a good idea... 1512 // 1513 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy 1514 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; 1515 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part 1516 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) 1517 *clone = *this; // clone = the real BI we want. 1518 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) 1519 1520 return clone; 1521 } 1522 1523 1524 //------------------------------------------------------------------------------- 1525 // 1526 // isDictionaryChar Return true if the category lookup for this char 1527 // indicates that it is in the set of dictionary lookup 1528 // chars. 1529 // 1530 // This function is intended for use by dictionary based 1531 // break iterators. 1532 // 1533 //------------------------------------------------------------------------------- 1534 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1535 if (fData == NULL) { 1536 return FALSE; 1537 } 1538 uint16_t category; 1539 UTRIE_GET16(&fData->fTrie, c, category); 1540 return (category & 0x4000) != 0; 1541 }*/ 1542 1543 1544 //------------------------------------------------------------------------------- 1545 // 1546 // checkDictionary This function handles all processing of characters in 1547 // the "dictionary" set. It will determine the appropriate 1548 // course of action, and possibly set up a cache in the 1549 // process. 1550 // 1551 //------------------------------------------------------------------------------- 1552 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1553 int32_t endPos, 1554 UBool reverse) { 1555 // Reset the old break cache first. 1556 // uint32_t dictionaryCount = fDictionaryCharCount; 1557 reset(); 1558 1559 // note: code segment below assumes that dictionary chars are in the 1560 // startPos-endPos range 1561 // value returned should be next character in sequence 1562 // if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 1563 if ((endPos - startPos) <= 1) { 1564 return (reverse ? startPos : endPos); 1565 } 1566 1567 // Starting from the starting point, scan towards the proposed result, 1568 // looking for the first dictionary character (which may be the one 1569 // we're on, if we're starting in the middle of a range). 1570 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1571 if (reverse) { 1572 UTEXT_PREVIOUS32(fText); 1573 } 1574 1575 int32_t rangeStart = startPos; 1576 int32_t rangeEnd = endPos; 1577 1578 uint16_t category; 1579 int32_t current; 1580 UErrorCode status = U_ZERO_ERROR; 1581 UStack breaks(status); 1582 int32_t foundBreakCount = 0; 1583 UChar32 c = utext_current32(fText); 1584 1585 UTRIE_GET16(&fData->fTrie, c, category); 1586 1587 // Is the character we're starting on a dictionary character? If so, we 1588 // need to back up to include the entire run; otherwise the results of 1589 // the break algorithm will differ depending on where we start. Since 1590 // the result is cached and there is typically a non-dictionary break 1591 // within a small number of words, there should be little performance impact. 1592 if (category & 0x4000) { 1593 if (reverse) { 1594 do { 1595 utext_next32(fText); // TODO: recast to work directly with postincrement. 1596 c = utext_current32(fText); 1597 UTRIE_GET16(&fData->fTrie, c, category); 1598 } while (c != U_SENTINEL && (category & 0x4000)); 1599 // Back up to the last dictionary character 1600 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1601 if (c == U_SENTINEL) { 1602 // c = fText->last32(); 1603 // TODO: why was this if needed? 1604 c = UTEXT_PREVIOUS32(fText); 1605 } 1606 else { 1607 c = UTEXT_PREVIOUS32(fText); 1608 } 1609 } 1610 else { 1611 do { 1612 c = UTEXT_PREVIOUS32(fText); 1613 UTRIE_GET16(&fData->fTrie, c, category); 1614 } 1615 while (c != U_SENTINEL && (category & 0x4000)); 1616 // Back up to the last dictionary character 1617 if (c == U_SENTINEL) { 1618 // c = fText->first32(); 1619 c = utext_current32(fText); 1620 } 1621 else { 1622 utext_next32(fText); 1623 c = utext_current32(fText); 1624 } 1625 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1626 } 1627 UTRIE_GET16(&fData->fTrie, c, category); 1628 } 1629 1630 // Loop through the text, looking for ranges of dictionary characters. 1631 // For each span, find the appropriate break engine, and ask it to find 1632 // any breaks within the span. 1633 // Note: we always do this in the forward direction, so that the break 1634 // cache is built in the right order. 1635 if (reverse) { 1636 utext_setNativeIndex(fText, rangeStart); 1637 c = utext_current32(fText); 1638 UTRIE_GET16(&fData->fTrie, c, category); 1639 } 1640 while(U_SUCCESS(status)) { 1641 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1642 utext_next32(fText); // TODO: tweak for post-increment operation 1643 c = utext_current32(fText); 1644 UTRIE_GET16(&fData->fTrie, c, category); 1645 } 1646 if (current >= rangeEnd) { 1647 break; 1648 } 1649 1650 // We now have a dictionary character. Get the appropriate language object 1651 // to deal with it. 1652 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1653 1654 // Ask the language object if there are any breaks. It will leave the text 1655 // pointer on the other side of its range, ready to search for the next one. 1656 if (lbe != NULL) { 1657 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1658 } 1659 1660 // Reload the loop variables for the next go-round 1661 c = utext_current32(fText); 1662 UTRIE_GET16(&fData->fTrie, c, category); 1663 } 1664 1665 // If we found breaks, build a new break cache. The first and last entries must 1666 // be the original starting and ending position. 1667 if (foundBreakCount > 0) { 1668 int32_t totalBreaks = foundBreakCount; 1669 if (startPos < breaks.elementAti(0)) { 1670 totalBreaks += 1; 1671 } 1672 if (endPos > breaks.peeki()) { 1673 totalBreaks += 1; 1674 } 1675 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1676 if (fCachedBreakPositions != NULL) { 1677 int32_t out = 0; 1678 fNumCachedBreakPositions = totalBreaks; 1679 if (startPos < breaks.elementAti(0)) { 1680 fCachedBreakPositions[out++] = startPos; 1681 } 1682 for (int32_t i = 0; i < foundBreakCount; ++i) { 1683 fCachedBreakPositions[out++] = breaks.elementAti(i); 1684 } 1685 if (endPos > fCachedBreakPositions[out-1]) { 1686 fCachedBreakPositions[out] = endPos; 1687 } 1688 // If there are breaks, then by definition, we are replacing the original 1689 // proposed break by one of the breaks we found. Use following() and 1690 // preceding() to do the work. They should never recurse in this case. 1691 if (reverse) { 1692 return preceding(endPos); 1693 } 1694 else { 1695 return following(startPos); 1696 } 1697 } 1698 // If the allocation failed, just fall through to the "no breaks found" case. 1699 } 1700 1701 // If we get here, there were no language-based breaks. Set the text pointer 1702 // to the original proposed break. 1703 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1704 return (reverse ? startPos : endPos); 1705 } 1706 1707 U_NAMESPACE_END 1708 1709 // defined in ucln_cmn.h 1710 1711 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; 1712 1713 /** 1714 * Release all static memory held by breakiterator. 1715 */ 1716 U_CDECL_BEGIN 1717 static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1718 if (gLanguageBreakFactories) { 1719 delete gLanguageBreakFactories; 1720 gLanguageBreakFactories = NULL; 1721 } 1722 return TRUE; 1723 } 1724 U_CDECL_END 1725 1726 U_CDECL_BEGIN 1727 static void U_CALLCONV _deleteFactory(void *obj) { 1728 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; 1729 } 1730 U_CDECL_END 1731 U_NAMESPACE_BEGIN 1732 1733 static const LanguageBreakEngine* 1734 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1735 { 1736 UBool needsInit; 1737 UErrorCode status = U_ZERO_ERROR; 1738 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); 1739 1740 if (needsInit) { 1741 UStack *factories = new UStack(_deleteFactory, NULL, status); 1742 if (factories != NULL && U_SUCCESS(status)) { 1743 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1744 factories->push(builtIn, status); 1745 #ifdef U_LOCAL_SERVICE_HOOK 1746 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1747 if (extra != NULL) { 1748 factories->push(extra, status); 1749 } 1750 #endif 1751 } 1752 umtx_lock(NULL); 1753 if (gLanguageBreakFactories == NULL) { 1754 gLanguageBreakFactories = factories; 1755 factories = NULL; 1756 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1757 } 1758 umtx_unlock(NULL); 1759 delete factories; 1760 } 1761 1762 if (gLanguageBreakFactories == NULL) { 1763 return NULL; 1764 } 1765 1766 int32_t i = gLanguageBreakFactories->size(); 1767 const LanguageBreakEngine *lbe = NULL; 1768 while (--i >= 0) { 1769 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1770 lbe = factory->getEngineFor(c, breakType); 1771 if (lbe != NULL) { 1772 break; 1773 } 1774 } 1775 return lbe; 1776 } 1777 1778 1779 //------------------------------------------------------------------------------- 1780 // 1781 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1782 // the characer c. 1783 // 1784 //------------------------------------------------------------------------------- 1785 const LanguageBreakEngine * 1786 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1787 const LanguageBreakEngine *lbe = NULL; 1788 UErrorCode status = U_ZERO_ERROR; 1789 1790 if (fLanguageBreakEngines == NULL) { 1791 fLanguageBreakEngines = new UStack(status); 1792 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1793 delete fLanguageBreakEngines; 1794 fLanguageBreakEngines = 0; 1795 return NULL; 1796 } 1797 } 1798 1799 int32_t i = fLanguageBreakEngines->size(); 1800 while (--i >= 0) { 1801 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1802 if (lbe->handles(c, fBreakType)) { 1803 return lbe; 1804 } 1805 } 1806 1807 // No existing dictionary took the character. See if a factory wants to 1808 // give us a new LanguageBreakEngine for this character. 1809 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1810 1811 // If we got one, use it and push it on our stack. 1812 if (lbe != NULL) { 1813 fLanguageBreakEngines->push((void *)lbe, status); 1814 // Even if we can't remember it, we can keep looking it up, so 1815 // return it even if the push fails. 1816 return lbe; 1817 } 1818 1819 // No engine is forthcoming for this character. Add it to the 1820 // reject set. Create the reject break engine if needed. 1821 if (fUnhandledBreakEngine == NULL) { 1822 fUnhandledBreakEngine = new UnhandledEngine(status); 1823 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1824 status = U_MEMORY_ALLOCATION_ERROR; 1825 } 1826 // Put it last so that scripts for which we have an engine get tried 1827 // first. 1828 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1829 // If we can't insert it, or creation failed, get rid of it 1830 if (U_FAILURE(status)) { 1831 delete fUnhandledBreakEngine; 1832 fUnhandledBreakEngine = 0; 1833 return NULL; 1834 } 1835 } 1836 1837 // Tell the reject engine about the character; at its discretion, it may 1838 // add more than just the one character. 1839 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1840 1841 return fUnhandledBreakEngine; 1842 } 1843 1844 1845 1846 /*int32_t RuleBasedBreakIterator::getBreakType() const { 1847 return fBreakType; 1848 }*/ 1849 1850 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1851 fBreakType = type; 1852 reset(); 1853 } 1854 1855 U_NAMESPACE_END 1856 1857 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1858