1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include "unicode/utypes.h" 11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION 12 13 #include "cmemory.h" 14 15 #include "unicode/filteredbrk.h" 16 #include "unicode/ucharstriebuilder.h" 17 #include "unicode/ures.h" 18 19 #include "uresimp.h" // ures_getByKeyWithFallback 20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR 21 #include "uvector.h" 22 #include "cmemory.h" 23 24 U_NAMESPACE_BEGIN 25 26 #ifndef FB_DEBUG 27 #define FB_DEBUG 0 28 #endif 29 30 #if FB_DEBUG 31 #include <stdio.h> 32 static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { 33 char buf[2048]; 34 if(s) { 35 s->extract(0,s->length(),buf,2048); 36 } else { 37 strcpy(buf,"NULL"); 38 } 39 fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n", 40 f, l, m, buf, (const void*)s, b?'T':'F',(int)d); 41 } 42 43 #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) 44 #else 45 #define FB_TRACE(m,s,b,d) 46 #endif 47 48 /** 49 * Used with sortedInsert() 50 */ 51 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 52 const UnicodeString &a = *(const UnicodeString*)t1.pointer; 53 const UnicodeString &b = *(const UnicodeString*)t2.pointer; 54 return a.compare(b); 55 } 56 57 /** 58 * A UVector which implements a set of strings. 59 */ 60 class U_COMMON_API UStringSet : public UVector { 61 public: 62 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, 63 uhash_compareUnicodeString, 64 1, 65 status) {} 66 virtual ~UStringSet(); 67 /** 68 * Is this UnicodeSet contained? 69 */ 70 inline UBool contains(const UnicodeString& s) { 71 return contains((void*) &s); 72 } 73 using UVector::contains; 74 /** 75 * Return the ith UnicodeString alias 76 */ 77 inline const UnicodeString* getStringAt(int32_t i) const { 78 return (const UnicodeString*)elementAt(i); 79 } 80 /** 81 * Adopt the UnicodeString if not already contained. 82 * Caller no longer owns the pointer in any case. 83 * @return true if adopted successfully, false otherwise (error, or else duplicate) 84 */ 85 inline UBool adopt(UnicodeString *str, UErrorCode &status) { 86 if(U_FAILURE(status) || contains(*str)) { 87 delete str; 88 return false; 89 } else { 90 sortedInsert(str, compareUnicodeString, status); 91 if(U_FAILURE(status)) { 92 delete str; 93 return false; 94 } 95 return true; 96 } 97 } 98 /** 99 * Add by value. 100 * @return true if successfully adopted. 101 */ 102 inline UBool add(const UnicodeString& str, UErrorCode &status) { 103 if(U_FAILURE(status)) return false; 104 UnicodeString *t = new UnicodeString(str); 105 if(t==NULL) { 106 status = U_MEMORY_ALLOCATION_ERROR; return false; 107 } 108 return adopt(t, status); 109 } 110 /** 111 * Remove this string. 112 * @return true if successfully removed, false otherwise (error, or else it wasn't there) 113 */ 114 inline UBool remove(const UnicodeString &s, UErrorCode &status) { 115 if(U_FAILURE(status)) return false; 116 return removeElement((void*) &s); 117 } 118 }; 119 120 /** 121 * Virtual, won't be inlined 122 */ 123 UStringSet::~UStringSet() {} 124 125 /* ----------------------------------------------------------- */ 126 127 128 /* Filtered Break constants */ 129 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie 130 static const int32_t kMATCH = (1<<1); //< exact match - skip this one. 131 static const int32_t kSuppressInReverse = (1<<0); 132 static const int32_t kAddToForward = (1<<1); 133 static const UChar kFULLSTOP = 0x002E; // '.' 134 135 /** 136 * Shared data for SimpleFilteredSentenceBreakIterator 137 */ 138 class SimpleFilteredSentenceBreakData : public UMemory { 139 public: 140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) 141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } 142 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } 143 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } 144 virtual ~SimpleFilteredSentenceBreakData(); 145 146 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." 147 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. 148 int32_t refcount; 149 }; 150 151 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} 152 153 /** 154 * Concrete implementation 155 */ 156 class SimpleFilteredSentenceBreakIterator : public BreakIterator { 157 public: 158 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); 159 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); 160 virtual ~SimpleFilteredSentenceBreakIterator(); 161 private: 162 SimpleFilteredSentenceBreakData *fData; 163 LocalPointer<BreakIterator> fDelegate; 164 LocalUTextPointer fText; 165 166 /* -- subclass interface -- */ 167 public: 168 /* -- cloning and other subclass stuff -- */ 169 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, 170 int32_t &/*BufferSize*/, 171 UErrorCode &status) { 172 // for now - always deep clone 173 status = U_SAFECLONE_ALLOCATED_WARNING; 174 return clone(); 175 } 176 virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); } 177 virtual UClassID getDynamicClassID(void) const { return NULL; } 178 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; } 179 180 /* -- text modifying -- */ 181 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } 182 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } 183 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } 184 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } 185 186 /* -- other functions that are just delegated -- */ 187 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } 188 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } 189 190 /* -- ITERATION -- */ 191 virtual int32_t first(void); 192 virtual int32_t preceding(int32_t offset); 193 virtual int32_t previous(void); 194 virtual UBool isBoundary(int32_t offset); 195 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. 196 197 virtual int32_t next(void); 198 199 virtual int32_t next(int32_t n); 200 virtual int32_t following(int32_t offset); 201 virtual int32_t last(void); 202 203 private: 204 /** 205 * Given that the fDelegate has already given its "initial" answer, 206 * find the NEXT actual (non-excepted) break. 207 * @param n initial position from delegate 208 * @return new break position or UBRK_DONE 209 */ 210 int32_t internalNext(int32_t n); 211 /** 212 * Given that the fDelegate has already given its "initial" answer, 213 * find the PREV actual (non-excepted) break. 214 * @param n initial position from delegate 215 * @return new break position or UBRK_DONE 216 */ 217 int32_t internalPrev(int32_t n); 218 /** 219 * set up the UText with the value of the fDelegate. 220 * Call this before calling breakExceptionAt. 221 * May be able to avoid excess calls 222 */ 223 void resetState(UErrorCode &status); 224 /** 225 * Is there a match (exception) at this spot? 226 */ 227 enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; 228 /** 229 * Determine if there is an exception at this spot 230 * @param n spot to check 231 * @return kNoExceptionHere or kExceptionHere 232 **/ 233 enum EFBMatchResult breakExceptionAt(int32_t n); 234 }; 235 236 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) 237 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) 238 { 239 } 240 241 242 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : 243 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), 244 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), 245 fDelegate(adopt) 246 { 247 // all set.. 248 } 249 250 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { 251 fData = fData->decr(); 252 } 253 254 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { 255 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); 256 } 257 258 SimpleFilteredSentenceBreakIterator::EFBMatchResult 259 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { 260 int64_t bestPosn = -1; 261 int32_t bestValue = -1; 262 // loops while 'n' points to an exception. 263 utext_setNativeIndex(fText.getAlias(), n); // from n.. 264 fData->fBackwardsTrie->reset(); 265 UChar32 uch; 266 267 //if(debug2) u_printf(" n@ %d\n", n); 268 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 269 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? 270 // TODO only do this the 1st time? 271 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); 272 } else { 273 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); 274 uch = utext_next32(fText.getAlias()); 275 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); 276 } 277 278 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; 279 280 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. 281 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie 282 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far 283 bestPosn = utext_getNativeIndex(fText.getAlias()); 284 bestValue = fData->fBackwardsTrie->getValue(); 285 } 286 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); 287 } 288 289 if(USTRINGTRIE_MATCHES(r)) { // exact match? 290 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 291 bestValue = fData->fBackwardsTrie->getValue(); 292 bestPosn = utext_getNativeIndex(fText.getAlias()); 293 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 294 } 295 296 if(bestPosn>=0) { 297 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 298 299 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? 300 //int32_t bestValue = fBackwardsTrie->getValue(); 301 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); 302 303 if(bestValue == kMATCH) { // exact match! 304 //if(debug2) u_printf(" exact backward match\n"); 305 return kExceptionHere; // See if the next is another exception. 306 } else if(bestValue == kPARTIAL 307 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie 308 //if(debug2) u_printf(" partial backward match\n"); 309 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 310 // to see if it matches something going forward. 311 fData->fForwardsPartialTrie->reset(); 312 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; 313 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. 314 //if(debug2) u_printf("Retrying at %d\n", bestPosn); 315 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && 316 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { 317 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); 318 } 319 if(USTRINGTRIE_MATCHES(rfwd)) { 320 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); 321 // only full matches here, nothing to check 322 // skip the next: 323 return kExceptionHere; 324 } else { 325 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); 326 // no match (no exception) -return the 'underlying' break 327 return kNoExceptionHere; 328 } 329 } else { 330 return kNoExceptionHere; // internal error and/or no forwards trie 331 } 332 } else { 333 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match 334 return kNoExceptionHere; // No match - so exit. Not an exception. 335 } 336 } 337 338 // the workhorse single next. 339 int32_t 340 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { 341 if(n == UBRK_DONE || // at end or 342 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 343 return n; 344 } 345 // OK, do we need to break here? 346 UErrorCode status = U_ZERO_ERROR; 347 // refresh text 348 resetState(status); 349 if(U_FAILURE(status)) return UBRK_DONE; // bail out 350 int64_t utextLen = utext_nativeLength(fText.getAlias()); 351 352 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 353 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). 354 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 355 356 switch(m) { 357 case kExceptionHere: 358 n = fDelegate->next(); // skip this one. Find the next lowerlevel break. 359 continue; 360 361 default: 362 case kNoExceptionHere: 363 return n; 364 } 365 } 366 return n; 367 } 368 369 int32_t 370 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { 371 if(n == 0 || n == UBRK_DONE || // at end or 372 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 373 return n; 374 } 375 // OK, do we need to break here? 376 UErrorCode status = U_ZERO_ERROR; 377 // refresh text 378 resetState(status); 379 if(U_FAILURE(status)) return UBRK_DONE; // bail out 380 381 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 382 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). 383 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 384 385 switch(m) { 386 case kExceptionHere: 387 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. 388 continue; 389 390 default: 391 case kNoExceptionHere: 392 return n; 393 } 394 } 395 return n; 396 } 397 398 399 int32_t 400 SimpleFilteredSentenceBreakIterator::next() { 401 return internalNext(fDelegate->next()); 402 } 403 404 int32_t 405 SimpleFilteredSentenceBreakIterator::first(void) { 406 // Don't suppress a break opportunity at the beginning of text. 407 return fDelegate->first(); 408 } 409 410 int32_t 411 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { 412 return internalPrev(fDelegate->preceding(offset)); 413 } 414 415 int32_t 416 SimpleFilteredSentenceBreakIterator::previous(void) { 417 return internalPrev(fDelegate->previous()); 418 } 419 420 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { 421 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress 422 423 if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions 424 425 UErrorCode status = U_ZERO_ERROR; 426 resetState(status); 427 428 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); 429 430 switch(m) { 431 case kExceptionHere: 432 return false; 433 default: 434 case kNoExceptionHere: 435 return true; 436 } 437 } 438 439 int32_t 440 SimpleFilteredSentenceBreakIterator::next(int32_t offset) { 441 return internalNext(fDelegate->next(offset)); 442 } 443 444 int32_t 445 SimpleFilteredSentenceBreakIterator::following(int32_t offset) { 446 return internalNext(fDelegate->following(offset)); 447 } 448 449 int32_t 450 SimpleFilteredSentenceBreakIterator::last(void) { 451 // Don't suppress a break opportunity at the end of text. 452 return fDelegate->last(); 453 } 454 455 456 /** 457 * Concrete implementation of builder class. 458 */ 459 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { 460 public: 461 virtual ~SimpleFilteredBreakIteratorBuilder(); 462 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); 463 SimpleFilteredBreakIteratorBuilder(UErrorCode &status); 464 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 465 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 466 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); 467 private: 468 UStringSet fSet; 469 }; 470 471 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() 472 { 473 } 474 475 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) 476 : fSet(status) 477 { 478 } 479 480 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) 481 : fSet(status) 482 { 483 if(U_SUCCESS(status)) { 484 UErrorCode subStatus = U_ZERO_ERROR; 485 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); 486 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 487 status = subStatus; // copy the failing status 488 #if FB_DEBUG 489 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 490 #endif 491 return; // leaves the builder empty, if you try to use it. 492 } 493 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus)); 494 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 495 status = subStatus; // copy the failing status 496 #if FB_DEBUG 497 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 498 #endif 499 return; // leaves the builder empty, if you try to use it. 500 } 501 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus)); 502 503 #if FB_DEBUG 504 { 505 UErrorCode subsub = subStatus; 506 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); 507 } 508 #endif 509 510 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 511 status = subStatus; // copy the failing status 512 #if FB_DEBUG 513 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 514 #endif 515 return; // leaves the builder empty, if you try to use it. 516 } 517 518 LocalUResourceBundlePointer strs; 519 subStatus = status; // Pick up inherited warning status now 520 do { 521 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); 522 if(strs.isValid() && U_SUCCESS(subStatus)) { 523 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); 524 suppressBreakAfter(str, status); // load the string 525 } 526 } while (strs.isValid() && U_SUCCESS(subStatus)); 527 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { 528 status = subStatus; 529 } 530 } 531 } 532 533 UBool 534 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 535 { 536 UBool r = fSet.add(exception, status); 537 FB_TRACE("suppressBreakAfter",&exception,r,0); 538 return r; 539 } 540 541 UBool 542 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 543 { 544 UBool r = fSet.remove(exception, status); 545 FB_TRACE("unsuppressBreakAfter",&exception,r,0); 546 return r; 547 } 548 549 /** 550 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. 551 * Work around this. 552 * 553 * Note: "new UnicodeString[subCount]" ends up calling global operator new 554 * on MSVC2012 for some reason. 555 */ 556 static inline UnicodeString* newUnicodeStringArray(size_t count) { 557 return new UnicodeString[count ? count : 1]; 558 } 559 560 BreakIterator * 561 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { 562 LocalPointer<BreakIterator> adopt(adoptBreakIterator); 563 564 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); 565 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); 566 if(U_FAILURE(status)) { 567 return NULL; 568 } 569 570 int32_t revCount = 0; 571 int32_t fwdCount = 0; 572 573 int32_t subCount = fSet.size(); 574 575 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); 576 577 LocalArray<UnicodeString> ustrs(ustrs_ptr); 578 579 LocalMemory<int> partials; 580 partials.allocateInsteadAndReset(subCount); 581 582 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. 583 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." 584 585 int n=0; 586 for ( int32_t i = 0; 587 i<fSet.size(); 588 i++) { 589 const UnicodeString *abbr = fSet.getStringAt(i); 590 if(abbr) { 591 FB_TRACE("build",abbr,TRUE,i); 592 ustrs[n] = *abbr; // copy by value 593 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i); 594 } else { 595 FB_TRACE("build",abbr,FALSE,i); 596 status = U_MEMORY_ALLOCATION_ERROR; 597 return NULL; 598 } 599 partials[n] = 0; // default: not partial 600 n++; 601 } 602 // first pass - find partials. 603 for(int i=0;i<subCount;i++) { 604 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations 605 if(nn>-1 && (nn+1)!=ustrs[i].length()) { 606 FB_TRACE("partial",&ustrs[i],FALSE,i); 607 // is partial. 608 // is it unique? 609 int sameAs = -1; 610 for(int j=0;j<subCount;j++) { 611 if(j==i) continue; 612 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { 613 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1); 614 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn 615 if(partials[j]==0) { // hasn't been processed yet 616 partials[j] = kSuppressInReverse | kAddToForward; 617 FB_TRACE("suppressing",&ustrs[j],FALSE,j); 618 } else if(partials[j] & kSuppressInReverse) { 619 sameAs = j; // the other entry is already in the reverse table. 620 } 621 } 622 } 623 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs); 624 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]); 625 UnicodeString prefix(ustrs[i], 0, nn+1); 626 if(sameAs == -1 && partials[i] == 0) { 627 // first one - add the prefix to the reverse table. 628 prefix.reverse(); 629 builder->add(prefix, kPARTIAL, status); 630 revCount++; 631 FB_TRACE("Added partial",&prefix,FALSE, i); 632 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); 633 partials[i] = kSuppressInReverse | kAddToForward; 634 } else { 635 FB_TRACE("NOT adding partial",&prefix,FALSE, i); 636 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); 637 } 638 } 639 } 640 for(int i=0;i<subCount;i++) { 641 if(partials[i]==0) { 642 ustrs[i].reverse(); 643 builder->add(ustrs[i], kMATCH, status); 644 revCount++; 645 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); 646 } else { 647 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i); 648 649 // an optimization would be to only add the portion after the '.' 650 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, 651 // instead of "Ph.D." since we already know the "Ph." part is a match. 652 // would need the trie to be able to hold 0-length strings, though. 653 builder2->add(ustrs[i], kMATCH, status); // forward 654 fwdCount++; 655 //ustrs[i].reverse(); 656 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); 657 } 658 } 659 FB_TRACE("AbbrCount",NULL,FALSE, subCount); 660 661 if(revCount>0) { 662 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); 663 if(U_FAILURE(status)) { 664 FB_TRACE(u_errorName(status),NULL,FALSE, -1); 665 return NULL; 666 } 667 } 668 669 if(fwdCount>0) { 670 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); 671 if(U_FAILURE(status)) { 672 FB_TRACE(u_errorName(status),NULL,FALSE, -1); 673 return NULL; 674 } 675 } 676 677 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); 678 } 679 680 681 // ----------- Base class implementation 682 683 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { 684 } 685 686 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { 687 } 688 689 FilteredBreakIteratorBuilder * 690 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { 691 if(U_FAILURE(status)) return NULL; 692 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); 693 return (U_SUCCESS(status))? ret.orphan(): NULL; 694 } 695 696 FilteredBreakIteratorBuilder * 697 FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { 698 if(U_FAILURE(status)) return NULL; 699 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); 700 return (U_SUCCESS(status))? ret.orphan(): NULL; 701 } 702 703 U_NAMESPACE_END 704 705 #endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION 706