Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2014, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 *******************************************************************************
      6 */
      7 
      8 #include "unicode/filteredbrk.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION
     11 
     12 #include <unicode/ucharstriebuilder.h>
     13 
     14 #include <set>
     15 #include <string>
     16 #include <functional>
     17 #include "uresimp.h"
     18 #include "ubrkimpl.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 using namespace std;
     23 
     24 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
     25 static const int32_t kMATCH   = (1<<1); //< exact match - skip this one.
     26 static const int32_t kSuppressInReverse = (1<<0);
     27 static const int32_t kAddToForward = (1<<1);
     28 static const UChar kFULLSTOP = 0x002E; // '.'
     29 
     30 class ULISentenceBreakIterator : public BreakIterator {
     31 public:
     32   ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
     33   virtual ~ULISentenceBreakIterator() {}
     34   ULISentenceBreakIterator(const ULISentenceBreakIterator& other);
     35 private:
     36   LocalPointer<BreakIterator> fDelegate;
     37   LocalUTextPointer           fText;
     38   LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
     39   LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
     40 
     41   /* -- subclass interface -- */
     42 public:
     43   /* -- cloning and other subclass stuff -- */
     44   virtual BreakIterator *  createBufferClone(void * /*stackBuffer*/,
     45                                              int32_t &/*BufferSize*/,
     46                                              UErrorCode &status) {
     47     // for now - always deep clone
     48     status = U_SAFECLONE_ALLOCATED_WARNING;
     49     return clone();
     50   }
     51   virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); }
     52   virtual UClassID getDynamicClassID(void) const { return NULL; }
     53   virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; }
     54 
     55   /* -- text modifying -- */
     56   virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
     57   virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
     58   virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
     59   virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
     60 
     61   /* -- other functions that are just delegated -- */
     62   virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
     63   virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
     64 
     65   /* -- ITERATION -- */
     66   virtual int32_t first(void) { return fDelegate->first(); }
     67   virtual int32_t preceding(int32_t offset) { return fDelegate->preceding(offset); }
     68   virtual int32_t previous(void) { return fDelegate->previous(); }
     69   virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); }
     70   virtual int32_t current(void) const { return fDelegate->current(); }
     71 
     72   virtual int32_t next(void);
     73 
     74   virtual int32_t next(int32_t n) { return fDelegate->next(n); }
     75   virtual int32_t following(int32_t offset) { return fDelegate->following(offset); }
     76   virtual int32_t last(void) { return fDelegate->last(); }
     77 
     78 };
     79 
     80 ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other)
     81   : BreakIterator(other), fDelegate(other.fDelegate->clone())
     82 {
     83   /*
     84     TODO: not able to clone Tries. Should be a refcounted hidden master instead.
     85   if(other.fBackwardsTrie.isValid()) {
     86     fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone());
     87   }
     88   if(other.fForwardsPartialTrie.isValid()) {
     89     fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone());
     90   }
     91   */
     92 }
     93 
     94 
     95 ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
     96   BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
     97   fDelegate(adopt),
     98   fBackwardsTrie(backwards),
     99   fForwardsPartialTrie(forwards)
    100 {
    101   // all set..
    102 }
    103 
    104 int32_t ULISentenceBreakIterator::next() {
    105   int32_t n = fDelegate->next();
    106   if(n == UBRK_DONE || // at end  or
    107      fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
    108     return n;
    109   }
    110   // OK, do we need to break here?
    111   UErrorCode status = U_ZERO_ERROR;
    112   // refresh text
    113   fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
    114   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
    115   do { // outer loop runs once per underlying break (from fDelegate).
    116     // loops while 'n' points to an exception.
    117     utext_setNativeIndex(fText.getAlias(), n); // from n..
    118     fBackwardsTrie->reset();
    119     UChar32 uch;
    120     //if(debug2) u_printf(" n@ %d\n", n);
    121     // Assume a space is following the '.'  (so we handle the case:  "Mr. /Brown")
    122     if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) {  // TODO: skip a class of chars here??
    123       // TODO only do this the 1st time?
    124       //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
    125     } else {
    126       //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
    127       uch = utext_next32(fText.getAlias());
    128       //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
    129     }
    130     UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
    131 
    132     int32_t bestPosn = -1;
    133     int32_t bestValue = -1;
    134 
    135     while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL  &&   // more to consume backwards and..
    136           USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
    137       if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
    138         bestPosn = utext_getNativeIndex(fText.getAlias());
    139         bestValue = fBackwardsTrie->getValue();
    140       }
    141       //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
    142     }
    143 
    144     if(USTRINGTRIE_MATCHES(r)) { // exact match?
    145       //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
    146       bestValue = fBackwardsTrie->getValue();
    147       bestPosn = utext_getNativeIndex(fText.getAlias());
    148       //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
    149     }
    150 
    151     if(bestPosn>=0) {
    152       //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
    153 
    154       //if(USTRINGTRIE_MATCHES(r)) {  // matched - so, now what?
    155       //int32_t bestValue = fBackwardsTrie->getValue();
    156       ////if(debug2) u_printf("rev< /%C/ matched, skip..%d  bestValue=%d\n", (UChar)uch, r, bestValue);
    157 
    158       if(bestValue == kMATCH) { // exact match!
    159         //if(debug2) u_printf(" exact backward match\n");
    160         n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
    161         if(n==UBRK_DONE) return n;
    162         continue; // See if the next is another exception.
    163       } else if(bestValue == kPARTIAL
    164                 && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
    165         //if(debug2) u_printf(" partial backward match\n");
    166         // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
    167         // to see if it matches something going forward.
    168         fForwardsPartialTrie->reset();
    169         UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
    170         utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
    171         //if(debug2) u_printf("Retrying at %d\n", bestPosn);
    172         while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
    173               USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) {
    174           //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
    175         }
    176         if(USTRINGTRIE_MATCHES(rfwd)) {
    177           //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
    178           // only full matches here, nothing to check
    179           // skip the next:
    180           n = fDelegate->next();
    181           if(n==UBRK_DONE) return n;
    182           continue;
    183         } else {
    184           //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
    185           // no match (no exception) -return the 'underlying' break
    186           return n;
    187         }
    188       } else {
    189         return n; // internal error and/or no forwards trie
    190       }
    191     } else {
    192       //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r);  // no best match
    193       return n; // No match - so exit. Not an exception.
    194     }
    195   } while(n != UBRK_DONE);
    196   return n;
    197 }
    198 
    199 U_NAMESPACE_END
    200 
    201 #if 0
    202 // Would improve performance - but, platform issues.
    203 // for the 'set'
    204 namespace std {
    205   template <> struct hash<icu::UnicodeString> {
    206     size_t operator()( const UnicodeString& str ) const {
    207       return (size_t)str.hashCode();
    208     }
    209   };
    210 }
    211 #endif
    212 
    213 U_NAMESPACE_BEGIN
    214 
    215 class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
    216 public:
    217   virtual ~SimpleFilteredBreakIteratorBuilder();
    218   SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
    219   SimpleFilteredBreakIteratorBuilder();
    220   virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
    221   virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
    222   virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
    223 private:
    224   set<UnicodeString> fSet;
    225 };
    226 
    227 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
    228 {
    229 }
    230 
    231 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
    232   : fSet()
    233 {
    234   if(U_SUCCESS(status)) {
    235     LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status));
    236     LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status));
    237     LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status));
    238     if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it.
    239 
    240     LocalUResourceBundlePointer strs;
    241     UErrorCode subStatus = status;
    242     do {
    243       strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
    244       if(strs.isValid() && U_SUCCESS(subStatus)) {
    245         UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
    246         suppressBreakAfter(str, status); // load the string
    247       }
    248     } while (strs.isValid() && U_SUCCESS(subStatus));
    249     if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
    250       status = subStatus;
    251     }
    252   }
    253 }
    254 
    255 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder()
    256   : fSet()
    257 {
    258 }
    259 
    260 UBool
    261 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
    262 {
    263   if( U_FAILURE(status) ) return FALSE;
    264   return fSet.insert(exception).second;
    265 }
    266 
    267 UBool
    268 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
    269 {
    270   if( U_FAILURE(status) ) return FALSE;
    271   return ((fSet.erase(exception)) != 0);
    272 }
    273 
    274 BreakIterator *
    275 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
    276   LocalPointer<BreakIterator> adopt(adoptBreakIterator);
    277 
    278   if(U_FAILURE(status)) {
    279     return NULL;
    280   }
    281 
    282   LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status));
    283   LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status));
    284 
    285   int32_t revCount = 0;
    286   int32_t fwdCount = 0;
    287 
    288   int32_t subCount = fSet.size();
    289   LocalArray<UnicodeString> ustrs(new UnicodeString[subCount]);
    290   LocalArray<int> partials(new int[subCount]);
    291 
    292   LocalPointer<UCharsTrie>    backwardsTrie; //  i.e. ".srM" for Mrs.
    293   LocalPointer<UCharsTrie>    forwardsPartialTrie; //  Has ".a" for "a.M."
    294 
    295   int n=0;
    296   for ( set<UnicodeString>::iterator i = fSet.begin();
    297         i != fSet.end();
    298         i++) {
    299     const UnicodeString &abbr = *i;
    300     ustrs[n] = abbr;
    301     partials[n] = 0; // default: not partial
    302     n++;
    303   }
    304   // first pass - find partials.
    305   for(int i=0;i<subCount;i++) {
    306     int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
    307     if(nn>-1 && (nn+1)!=ustrs[i].length()) {
    308       //if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer());
    309       // is partial.
    310       // is it unique?
    311       int sameAs = -1;
    312       for(int j=0;j<subCount;j++) {
    313         if(j==i) continue;
    314         if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
    315           //if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1);
    316           //UBool otherIsPartial = ((nn+1)!=ustrs[j].length());  // true if ustrs[j] doesn't end at nn
    317           if(partials[j]==0) { // hasn't been processed yet
    318             partials[j] = kSuppressInReverse | kAddToForward;
    319             //if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer());
    320           } else if(partials[j] & kSuppressInReverse) {
    321             sameAs = j; // the other entry is already in the reverse table.
    322           }
    323         }
    324       }
    325       //if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n",      ustrs[i].getTerminatedBuffer(), sameAs, partials[i]);
    326       UnicodeString prefix(ustrs[i], 0, nn+1);
    327       if(sameAs == -1 && partials[i] == 0) {
    328         // first one - add the prefix to the reverse table.
    329         prefix.reverse();
    330         builder->add(prefix, kPARTIAL, status);
    331         revCount++;
    332         //if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status));
    333         partials[i] = kSuppressInReverse | kAddToForward;
    334       } else {
    335         //if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer());
    336       }
    337     }
    338   }
    339   for(int i=0;i<subCount;i++) {
    340     if(partials[i]==0) {
    341       ustrs[i].reverse();
    342       builder->add(ustrs[i], kMATCH, status);
    343       revCount++;
    344       //if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status));
    345     } else {
    346       //if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer());
    347 
    348       // an optimization would be to only add the portion after the '.'
    349       // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
    350       // instead of "Ph.D." since we already know the "Ph." part is a match.
    351       // would need the trie to be able to hold 0-length strings, though.
    352       builder2->add(ustrs[i], kMATCH, status); // forward
    353       fwdCount++;
    354       //ustrs[i].reverse();
    355       ////if(debug2) u_printf("SUPPRESS- not Added(%d):  /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
    356     }
    357   }
    358   //if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount);
    359 
    360   if(revCount>0) {
    361     backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
    362     if(U_FAILURE(status)) {
    363       //printf("Error %s building backwards\n", u_errorName(status));
    364       return NULL;
    365     }
    366   }
    367 
    368   if(fwdCount>0) {
    369     forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
    370     if(U_FAILURE(status)) {
    371       //printf("Error %s building forwards\n", u_errorName(status));
    372       return NULL;
    373     }
    374   }
    375 
    376   return new ULISentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
    377 }
    378 
    379 
    380 // -----------
    381 
    382 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
    383 }
    384 
    385 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
    386 }
    387 
    388 FilteredBreakIteratorBuilder *
    389 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
    390   if(U_FAILURE(status)) return NULL;
    391 
    392   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status));
    393   if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
    394   return ret.orphan();
    395 }
    396 
    397 FilteredBreakIteratorBuilder *
    398 FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
    399   if(U_FAILURE(status)) return NULL;
    400 
    401   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder());
    402   if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
    403   return ret.orphan();
    404 }
    405 
    406 U_NAMESPACE_END
    407 
    408 #endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION
    409