Home | History | Annotate | Download | only in common
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2014-2015, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
     12 
     13 #include "cmemory.h"
     14 
     15 #include "unicode/filteredbrk.h"
     16 #include "unicode/ucharstriebuilder.h"
     17 #include "unicode/ures.h"
     18 
     19 #include "uresimp.h" // ures_getByKeyWithFallback
     20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
     21 #include "uvector.h"
     22 #include "cmemory.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 #ifndef FB_DEBUG
     27 #define FB_DEBUG 0
     28 #endif
     29 
     30 #if FB_DEBUG
     31 #include <stdio.h>
     32 static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
     33   char buf[2048];
     34   if(s) {
     35     s->extract(0,s->length(),buf,2048);
     36   } else {
     37     strcpy(buf,"NULL");
     38   }
     39   fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
     40           f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
     41 }
     42 
     43 #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
     44 #else
     45 #define FB_TRACE(m,s,b,d)
     46 #endif
     47 
     48 /**
     49  * Used with sortedInsert()
     50  */
     51 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
     52     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
     53     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
     54     return a.compare(b);
     55 }
     56 
     57 /**
     58  * A UVector which implements a set of strings.
     59  */
     60 class U_COMMON_API UStringSet : public UVector {
     61  public:
     62   UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
     63                                            uhash_compareUnicodeString,
     64                                            1,
     65                                            status) {}
     66   virtual ~UStringSet();
     67   /**
     68    * Is this UnicodeSet contained?
     69    */
     70   inline UBool contains(const UnicodeString& s) {
     71     return contains((void*) &s);
     72   }
     73   using UVector::contains;
     74   /**
     75    * Return the ith UnicodeString alias
     76    */
     77   inline const UnicodeString* getStringAt(int32_t i) const {
     78     return (const UnicodeString*)elementAt(i);
     79   }
     80   /**
     81    * Adopt the UnicodeString if not already contained.
     82    * Caller no longer owns the pointer in any case.
     83    * @return true if adopted successfully, false otherwise (error, or else duplicate)
     84    */
     85   inline UBool adopt(UnicodeString *str, UErrorCode &status) {
     86     if(U_FAILURE(status) || contains(*str)) {
     87       delete str;
     88       return false;
     89     } else {
     90       sortedInsert(str, compareUnicodeString, status);
     91       if(U_FAILURE(status)) {
     92         delete str;
     93         return false;
     94       }
     95       return true;
     96     }
     97   }
     98   /**
     99    * Add by value.
    100    * @return true if successfully adopted.
    101    */
    102   inline UBool add(const UnicodeString& str, UErrorCode &status) {
    103     if(U_FAILURE(status)) return false;
    104     UnicodeString *t = new UnicodeString(str);
    105     if(t==NULL) {
    106       status = U_MEMORY_ALLOCATION_ERROR; return false;
    107     }
    108     return adopt(t, status);
    109   }
    110   /**
    111    * Remove this string.
    112    * @return true if successfully removed, false otherwise (error, or else it wasn't there)
    113    */
    114   inline UBool remove(const UnicodeString &s, UErrorCode &status) {
    115     if(U_FAILURE(status)) return false;
    116     return removeElement((void*) &s);
    117   }
    118 };
    119 
    120 /**
    121  * Virtual, won't be inlined
    122  */
    123 UStringSet::~UStringSet() {}
    124 
    125 /* ----------------------------------------------------------- */
    126 
    127 
    128 /* Filtered Break constants */
    129 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
    130 static const int32_t kMATCH   = (1<<1); //< exact match - skip this one.
    131 static const int32_t kSuppressInReverse = (1<<0);
    132 static const int32_t kAddToForward = (1<<1);
    133 static const UChar   kFULLSTOP = 0x002E; // '.'
    134 
    135 /**
    136  * Shared data for SimpleFilteredSentenceBreakIterator
    137  */
    138 class SimpleFilteredSentenceBreakData : public UMemory {
    139 public:
    140   SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
    141       : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
    142   SimpleFilteredSentenceBreakData *incr() { refcount++;  return this; }
    143   SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
    144   virtual ~SimpleFilteredSentenceBreakData();
    145 
    146   LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
    147   LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
    148   int32_t                     refcount;
    149 };
    150 
    151 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
    152 
    153 /**
    154  * Concrete implementation
    155  */
    156 class SimpleFilteredSentenceBreakIterator : public BreakIterator {
    157 public:
    158   SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
    159   SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
    160   virtual ~SimpleFilteredSentenceBreakIterator();
    161 private:
    162   SimpleFilteredSentenceBreakData *fData;
    163   LocalPointer<BreakIterator> fDelegate;
    164   LocalUTextPointer           fText;
    165 
    166   /* -- subclass interface -- */
    167 public:
    168   /* -- cloning and other subclass stuff -- */
    169   virtual BreakIterator *  createBufferClone(void * /*stackBuffer*/,
    170                                              int32_t &/*BufferSize*/,
    171                                              UErrorCode &status) {
    172     // for now - always deep clone
    173     status = U_SAFECLONE_ALLOCATED_WARNING;
    174     return clone();
    175   }
    176   virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
    177   virtual UClassID getDynamicClassID(void) const { return NULL; }
    178   virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
    179 
    180   /* -- text modifying -- */
    181   virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
    182   virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
    183   virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
    184   virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
    185 
    186   /* -- other functions that are just delegated -- */
    187   virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
    188   virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
    189 
    190   /* -- ITERATION -- */
    191   virtual int32_t first(void);
    192   virtual int32_t preceding(int32_t offset);
    193   virtual int32_t previous(void);
    194   virtual UBool isBoundary(int32_t offset);
    195   virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
    196 
    197   virtual int32_t next(void);
    198 
    199   virtual int32_t next(int32_t n);
    200   virtual int32_t following(int32_t offset);
    201   virtual int32_t last(void);
    202 
    203 private:
    204     /**
    205      * Given that the fDelegate has already given its "initial" answer,
    206      * find the NEXT actual (non-excepted) break.
    207      * @param n initial position from delegate
    208      * @return new break position or UBRK_DONE
    209      */
    210     int32_t internalNext(int32_t n);
    211     /**
    212      * Given that the fDelegate has already given its "initial" answer,
    213      * find the PREV actual (non-excepted) break.
    214      * @param n initial position from delegate
    215      * @return new break position or UBRK_DONE
    216      */
    217     int32_t internalPrev(int32_t n);
    218     /**
    219      * set up the UText with the value of the fDelegate.
    220      * Call this before calling breakExceptionAt.
    221      * May be able to avoid excess calls
    222      */
    223     void resetState(UErrorCode &status);
    224     /**
    225      * Is there a match  (exception) at this spot?
    226      */
    227     enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
    228     /**
    229      * Determine if there is an exception at this spot
    230      * @param n spot to check
    231      * @return kNoExceptionHere or kExceptionHere
    232      **/
    233     enum EFBMatchResult breakExceptionAt(int32_t n);
    234 };
    235 
    236 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
    237   : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
    238 {
    239 }
    240 
    241 
    242 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
    243   BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
    244   fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
    245   fDelegate(adopt)
    246 {
    247   // all set..
    248 }
    249 
    250 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
    251     fData = fData->decr();
    252 }
    253 
    254 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
    255   fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
    256 }
    257 
    258 SimpleFilteredSentenceBreakIterator::EFBMatchResult
    259 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
    260     int64_t bestPosn = -1;
    261     int32_t bestValue = -1;
    262     // loops while 'n' points to an exception.
    263     utext_setNativeIndex(fText.getAlias(), n); // from n..
    264     fData->fBackwardsTrie->reset();
    265     UChar32 uch;
    266 
    267     //if(debug2) u_printf(" n@ %d\n", n);
    268     // Assume a space is following the '.'  (so we handle the case:  "Mr. /Brown")
    269     if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) {  // TODO: skip a class of chars here??
    270       // TODO only do this the 1st time?
    271       //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
    272     } else {
    273       //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
    274       uch = utext_next32(fText.getAlias());
    275       //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
    276     }
    277 
    278     UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
    279 
    280     while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL  &&   // more to consume backwards and..
    281           USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
    282       if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
    283         bestPosn = utext_getNativeIndex(fText.getAlias());
    284         bestValue = fData->fBackwardsTrie->getValue();
    285       }
    286       //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
    287     }
    288 
    289     if(USTRINGTRIE_MATCHES(r)) { // exact match?
    290       //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
    291       bestValue = fData->fBackwardsTrie->getValue();
    292       bestPosn = utext_getNativeIndex(fText.getAlias());
    293       //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
    294     }
    295 
    296     if(bestPosn>=0) {
    297       //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
    298 
    299       //if(USTRINGTRIE_MATCHES(r)) {  // matched - so, now what?
    300       //int32_t bestValue = fBackwardsTrie->getValue();
    301       ////if(debug2) u_printf("rev< /%C/ matched, skip..%d  bestValue=%d\n", (UChar)uch, r, bestValue);
    302 
    303       if(bestValue == kMATCH) { // exact match!
    304         //if(debug2) u_printf(" exact backward match\n");
    305         return kExceptionHere; // See if the next is another exception.
    306       } else if(bestValue == kPARTIAL
    307                 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
    308         //if(debug2) u_printf(" partial backward match\n");
    309         // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
    310         // to see if it matches something going forward.
    311         fData->fForwardsPartialTrie->reset();
    312         UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
    313         utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
    314         //if(debug2) u_printf("Retrying at %d\n", bestPosn);
    315         while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
    316               USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
    317           //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
    318         }
    319         if(USTRINGTRIE_MATCHES(rfwd)) {
    320           //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
    321           // only full matches here, nothing to check
    322           // skip the next:
    323             return kExceptionHere;
    324         } else {
    325           //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
    326           // no match (no exception) -return the 'underlying' break
    327           return kNoExceptionHere;
    328         }
    329       } else {
    330         return kNoExceptionHere; // internal error and/or no forwards trie
    331       }
    332     } else {
    333       //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r);  // no best match
    334       return kNoExceptionHere; // No match - so exit. Not an exception.
    335     }
    336 }
    337 
    338 // the workhorse single next.
    339 int32_t
    340 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
    341   if(n == UBRK_DONE || // at end  or
    342     fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
    343       return n;
    344   }
    345   // OK, do we need to break here?
    346   UErrorCode status = U_ZERO_ERROR;
    347   // refresh text
    348   resetState(status);
    349   if(U_FAILURE(status)) return UBRK_DONE; // bail out
    350   int64_t utextLen = utext_nativeLength(fText.getAlias());
    351 
    352   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
    353   while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
    354     SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
    355 
    356     switch(m) {
    357     case kExceptionHere:
    358       n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
    359       continue;
    360 
    361     default:
    362     case kNoExceptionHere:
    363       return n;
    364     }
    365   }
    366   return n;
    367 }
    368 
    369 int32_t
    370 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
    371   if(n == 0 || n == UBRK_DONE || // at end  or
    372     fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
    373       return n;
    374   }
    375   // OK, do we need to break here?
    376   UErrorCode status = U_ZERO_ERROR;
    377   // refresh text
    378   resetState(status);
    379   if(U_FAILURE(status)) return UBRK_DONE; // bail out
    380 
    381   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
    382   while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
    383     SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
    384 
    385     switch(m) {
    386     case kExceptionHere:
    387       n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
    388       continue;
    389 
    390     default:
    391     case kNoExceptionHere:
    392       return n;
    393     }
    394   }
    395   return n;
    396 }
    397 
    398 
    399 int32_t
    400 SimpleFilteredSentenceBreakIterator::next() {
    401   return internalNext(fDelegate->next());
    402 }
    403 
    404 int32_t
    405 SimpleFilteredSentenceBreakIterator::first(void) {
    406   // Don't suppress a break opportunity at the beginning of text.
    407   return fDelegate->first();
    408 }
    409 
    410 int32_t
    411 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
    412   return internalPrev(fDelegate->preceding(offset));
    413 }
    414 
    415 int32_t
    416 SimpleFilteredSentenceBreakIterator::previous(void) {
    417   return internalPrev(fDelegate->previous());
    418 }
    419 
    420 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
    421   if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
    422 
    423   if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
    424 
    425   UErrorCode status = U_ZERO_ERROR;
    426   resetState(status);
    427 
    428   SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
    429 
    430   switch(m) {
    431   case kExceptionHere:
    432     return false;
    433   default:
    434   case kNoExceptionHere:
    435     return true;
    436   }
    437 }
    438 
    439 int32_t
    440 SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
    441   return internalNext(fDelegate->next(offset));
    442 }
    443 
    444 int32_t
    445 SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
    446   return internalNext(fDelegate->following(offset));
    447 }
    448 
    449 int32_t
    450 SimpleFilteredSentenceBreakIterator::last(void) {
    451   // Don't suppress a break opportunity at the end of text.
    452   return fDelegate->last();
    453 }
    454 
    455 
    456 /**
    457  * Concrete implementation of builder class.
    458  */
    459 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
    460 public:
    461   virtual ~SimpleFilteredBreakIteratorBuilder();
    462   SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
    463   SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
    464   virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
    465   virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
    466   virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
    467 private:
    468   UStringSet fSet;
    469 };
    470 
    471 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
    472 {
    473 }
    474 
    475 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
    476   : fSet(status)
    477 {
    478 }
    479 
    480 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
    481   : fSet(status)
    482 {
    483   if(U_SUCCESS(status)) {
    484     UErrorCode subStatus = U_ZERO_ERROR;
    485     LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
    486     if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
    487       status = subStatus; // copy the failing status
    488 #if FB_DEBUG
    489       fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
    490 #endif
    491       return;  // leaves the builder empty, if you try to use it.
    492     }
    493     LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
    494     if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
    495       status = subStatus; // copy the failing status
    496 #if FB_DEBUG
    497       fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
    498 #endif
    499       return;  // leaves the builder empty, if you try to use it.
    500     }
    501     LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
    502 
    503 #if FB_DEBUG
    504     {
    505       UErrorCode subsub = subStatus;
    506       fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
    507     }
    508 #endif
    509 
    510     if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
    511       status = subStatus; // copy the failing status
    512 #if FB_DEBUG
    513       fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
    514 #endif
    515       return;  // leaves the builder empty, if you try to use it.
    516     }
    517 
    518     LocalUResourceBundlePointer strs;
    519     subStatus = status; // Pick up inherited warning status now
    520     do {
    521       strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
    522       if(strs.isValid() && U_SUCCESS(subStatus)) {
    523         UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
    524         suppressBreakAfter(str, status); // load the string
    525       }
    526     } while (strs.isValid() && U_SUCCESS(subStatus));
    527     if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
    528       status = subStatus;
    529     }
    530   }
    531 }
    532 
    533 UBool
    534 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
    535 {
    536   UBool r = fSet.add(exception, status);
    537   FB_TRACE("suppressBreakAfter",&exception,r,0);
    538   return r;
    539 }
    540 
    541 UBool
    542 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
    543 {
    544   UBool r = fSet.remove(exception, status);
    545   FB_TRACE("unsuppressBreakAfter",&exception,r,0);
    546   return r;
    547 }
    548 
    549 /**
    550  * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
    551  * Work around this.
    552  *
    553  * Note: "new UnicodeString[subCount]" ends up calling global operator new
    554  * on MSVC2012 for some reason.
    555  */
    556 static inline UnicodeString* newUnicodeStringArray(size_t count) {
    557     return new UnicodeString[count ? count : 1];
    558 }
    559 
    560 BreakIterator *
    561 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
    562   LocalPointer<BreakIterator> adopt(adoptBreakIterator);
    563 
    564   LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
    565   LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
    566   if(U_FAILURE(status)) {
    567     return NULL;
    568   }
    569 
    570   int32_t revCount = 0;
    571   int32_t fwdCount = 0;
    572 
    573   int32_t subCount = fSet.size();
    574 
    575   UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
    576 
    577   LocalArray<UnicodeString> ustrs(ustrs_ptr);
    578 
    579   LocalMemory<int> partials;
    580   partials.allocateInsteadAndReset(subCount);
    581 
    582   LocalPointer<UCharsTrie>    backwardsTrie; //  i.e. ".srM" for Mrs.
    583   LocalPointer<UCharsTrie>    forwardsPartialTrie; //  Has ".a" for "a.M."
    584 
    585   int n=0;
    586   for ( int32_t i = 0;
    587         i<fSet.size();
    588         i++) {
    589     const UnicodeString *abbr = fSet.getStringAt(i);
    590     if(abbr) {
    591       FB_TRACE("build",abbr,TRUE,i);
    592       ustrs[n] = *abbr; // copy by value
    593       FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
    594     } else {
    595       FB_TRACE("build",abbr,FALSE,i);
    596       status = U_MEMORY_ALLOCATION_ERROR;
    597       return NULL;
    598     }
    599     partials[n] = 0; // default: not partial
    600     n++;
    601   }
    602   // first pass - find partials.
    603   for(int i=0;i<subCount;i++) {
    604     int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
    605     if(nn>-1 && (nn+1)!=ustrs[i].length()) {
    606       FB_TRACE("partial",&ustrs[i],FALSE,i);
    607       // is partial.
    608       // is it unique?
    609       int sameAs = -1;
    610       for(int j=0;j<subCount;j++) {
    611         if(j==i) continue;
    612         if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
    613           FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
    614           //UBool otherIsPartial = ((nn+1)!=ustrs[j].length());  // true if ustrs[j] doesn't end at nn
    615           if(partials[j]==0) { // hasn't been processed yet
    616             partials[j] = kSuppressInReverse | kAddToForward;
    617             FB_TRACE("suppressing",&ustrs[j],FALSE,j);
    618           } else if(partials[j] & kSuppressInReverse) {
    619             sameAs = j; // the other entry is already in the reverse table.
    620           }
    621         }
    622       }
    623       FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
    624       FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
    625       UnicodeString prefix(ustrs[i], 0, nn+1);
    626       if(sameAs == -1 && partials[i] == 0) {
    627         // first one - add the prefix to the reverse table.
    628         prefix.reverse();
    629         builder->add(prefix, kPARTIAL, status);
    630         revCount++;
    631         FB_TRACE("Added partial",&prefix,FALSE, i);
    632         FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
    633         partials[i] = kSuppressInReverse | kAddToForward;
    634       } else {
    635         FB_TRACE("NOT adding partial",&prefix,FALSE, i);
    636         FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
    637       }
    638     }
    639   }
    640   for(int i=0;i<subCount;i++) {
    641     if(partials[i]==0) {
    642       ustrs[i].reverse();
    643       builder->add(ustrs[i], kMATCH, status);
    644       revCount++;
    645       FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
    646     } else {
    647       FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
    648 
    649       // an optimization would be to only add the portion after the '.'
    650       // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
    651       // instead of "Ph.D." since we already know the "Ph." part is a match.
    652       // would need the trie to be able to hold 0-length strings, though.
    653       builder2->add(ustrs[i], kMATCH, status); // forward
    654       fwdCount++;
    655       //ustrs[i].reverse();
    656       ////if(debug2) u_printf("SUPPRESS- not Added(%d):  /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
    657     }
    658   }
    659   FB_TRACE("AbbrCount",NULL,FALSE, subCount);
    660 
    661   if(revCount>0) {
    662     backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
    663     if(U_FAILURE(status)) {
    664       FB_TRACE(u_errorName(status),NULL,FALSE, -1);
    665       return NULL;
    666     }
    667   }
    668 
    669   if(fwdCount>0) {
    670     forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
    671     if(U_FAILURE(status)) {
    672       FB_TRACE(u_errorName(status),NULL,FALSE, -1);
    673       return NULL;
    674     }
    675   }
    676 
    677   return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
    678 }
    679 
    680 
    681 // ----------- Base class implementation
    682 
    683 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
    684 }
    685 
    686 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
    687 }
    688 
    689 FilteredBreakIteratorBuilder *
    690 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
    691   if(U_FAILURE(status)) return NULL;
    692   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
    693   return (U_SUCCESS(status))? ret.orphan(): NULL;
    694 }
    695 
    696 FilteredBreakIteratorBuilder *
    697 FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
    698   if(U_FAILURE(status)) return NULL;
    699   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
    700   return (U_SUCCESS(status))? ret.orphan(): NULL;
    701 }
    702 
    703 U_NAMESPACE_END
    704 
    705 #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
    706