Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2014-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.impl;
     10 
     11 import java.text.CharacterIterator;
     12 import java.util.HashSet;
     13 import java.util.Locale;
     14 
     15 import com.ibm.icu.impl.ICUResourceBundle.OpenType;
     16 import com.ibm.icu.text.BreakIterator;
     17 import com.ibm.icu.text.FilteredBreakIteratorBuilder;
     18 import com.ibm.icu.text.UCharacterIterator;
     19 import com.ibm.icu.util.BytesTrie;
     20 import com.ibm.icu.util.CharsTrie;
     21 import com.ibm.icu.util.CharsTrieBuilder;
     22 import com.ibm.icu.util.StringTrieBuilder;
     23 import com.ibm.icu.util.ULocale;
     24 
     25 /**
     26  * @author tomzhang
     27  */
     28 public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
     29 
     30     private BreakIterator delegate;
     31     private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ?
     32     private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs.
     33     private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M."
     34 
     35     /**
     36      * @param adoptBreakIterator
     37      *            break iterator to adopt
     38      * @param forwardsPartialTrie
     39      *            forward & partial char trie to adopt
     40      * @param backwardsTrie
     41      *            backward trie to adopt
     42      */
     43     public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie,
     44             CharsTrie backwardsTrie) {
     45         this.delegate = adoptBreakIterator;
     46         this.forwardsPartialTrie = forwardsPartialTrie;
     47         this.backwardsTrie = backwardsTrie;
     48     }
     49 
     50 
     51     /**
     52      * Reset the filter from the delegate.
     53      */
     54     private final void resetState() {
     55         text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
     56     }
     57 
     58     /**
     59      * Is there an exception at this point?
     60      *
     61      * @param n the location of the possible break
     62      * @return
     63      */
     64     private final boolean breakExceptionAt(int n) {
     65         // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
     66 
     67         int bestPosn = -1;
     68         int bestValue = -1;
     69 
     70         // loops while 'n' points to an exception
     71         text.setIndex(n);
     72         backwardsTrie.reset();
     73         int uch;
     74 
     75 
     76 
     77         // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
     78         if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
     79             // TODO only do this the 1st time?
     80         } else {
     81             uch = text.nextCodePoint();
     82         }
     83 
     84         BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
     85 
     86         while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
     87                 ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
     88             if (r.hasValue()) { // remember the best match so far
     89                 bestPosn = text.getIndex();
     90                 bestValue = backwardsTrie.getValue();
     91             }
     92         }
     93 
     94         if (r.matches()) { // exact match?
     95             bestValue = backwardsTrie.getValue();
     96             bestPosn = text.getIndex();
     97         }
     98 
     99         if (bestPosn >= 0) {
    100             if (bestValue == Builder.MATCH) { // exact match!
    101                 return true; // Exception here.
    102             } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
    103                 // make sure there's a forward trie
    104                 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
    105                 // to see if it matches something going forward.
    106                 forwardsPartialTrie.reset();
    107 
    108                 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
    109                 text.setIndex(bestPosn); // hope that's close ..
    110                 while ((uch = text.nextCodePoint()) != BreakIterator.DONE
    111                         && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
    112                 }
    113                 if (rfwd.matches()) {
    114                     // Exception here
    115                     return true;
    116                 } // else fall through
    117             } // else fall through
    118         } // else fall through
    119         return false; // No exception here.
    120     }
    121 
    122     /**
    123      * Given that the delegate has already given its "initial" answer,
    124      * find the NEXT actual (non-suppressed) break.
    125      * @param n initial position from delegate
    126      * @return new break position or BreakIterator.DONE
    127      */
    128     private final int internalNext(int n) {
    129         if (n == BreakIterator.DONE || // at end or
    130                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
    131             return n;
    132         }
    133         resetState();
    134 
    135         final int textLen = text.getLength();
    136 
    137         while (n != BreakIterator.DONE && n != textLen) {
    138             // outer loop runs once per underlying break (from fDelegate).
    139             // loops while 'n' points to an exception.
    140 
    141             if (breakExceptionAt(n)) {
    142                 // n points to a break exception
    143                 n = delegate.next();
    144             } else {
    145                 // no exception at this spot
    146                 return n;
    147             }
    148         }
    149         return n; //hit underlying DONE or break at end of text
    150     }
    151 
    152     /**
    153      * Given that the delegate has already given its "initial" answer,
    154      * find the PREV actual (non-suppressed) break.
    155      * @param n initial position from delegate
    156      * @return new break position or BreakIterator.DONE
    157      */
    158     private final int internalPrev(int n) {
    159         if (n == 0 || n == BreakIterator.DONE || // at end or
    160                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
    161             return n;
    162         }
    163         resetState();
    164 
    165         while (n != BreakIterator.DONE && n != 0) {
    166             // outer loop runs once per underlying break (from fDelegate).
    167             // loops while 'n' points to an exception.
    168 
    169             if (breakExceptionAt(n)) {
    170                 // n points to a break exception
    171                 n = delegate.previous();
    172             } else {
    173                 // no exception at this spot
    174                 return n;
    175             }
    176         }
    177         return n; //hit underlying DONE or break at end of text
    178     }
    179 
    180     @Override
    181     public boolean equals(Object obj) {
    182         if (obj == null)
    183             return false;
    184         if (this == obj)
    185             return true;
    186         if (getClass() != obj.getClass())
    187             return false;
    188         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
    189         return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie)
    190                 && forwardsPartialTrie.equals(other.forwardsPartialTrie);
    191     }
    192 
    193     @Override
    194     public int hashCode() {
    195         return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode();
    196     }
    197 
    198     @Override
    199     public Object clone() {
    200         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
    201         return other;
    202     }
    203 
    204 
    205     @Override
    206     public int first() {
    207         // Don't suppress a break opportunity at the beginning of text.
    208         return delegate.first();
    209     }
    210 
    211     @Override
    212     public int preceding(int offset) {
    213         return internalPrev(delegate.preceding(offset));
    214     }
    215 
    216     @Override
    217     public int previous() {
    218         return internalPrev(delegate.previous());
    219     }
    220 
    221     @Override
    222     public int current() {
    223         return delegate.current();
    224     }
    225 
    226     @Override
    227     public boolean isBoundary(int offset) {
    228         if(!delegate.isBoundary(offset)) {
    229             return false; // No underlying break to suppress?
    230         }
    231 
    232         // delegate thinks there's a break
    233         if(backwardsTrie == null) {
    234             return true; // no data
    235         }
    236 
    237         resetState();
    238         return !breakExceptionAt(offset); // if there's an exception: no break.
    239     }
    240 
    241     @Override
    242     public int next() {
    243         return internalNext(delegate.next());
    244     }
    245 
    246     @Override
    247     public int next(int n) {
    248         return internalNext(delegate.next(n));
    249     }
    250 
    251     @Override
    252     public int following(int offset) {
    253         return internalNext(delegate.following(offset));
    254     }
    255 
    256     @Override
    257     public int last() {
    258         // Don't suppress a break opportunity at the end of text.
    259         return delegate.last();
    260     }
    261 
    262     @Override
    263     public CharacterIterator getText() {
    264         return delegate.getText();
    265     }
    266 
    267     @Override
    268     public void setText(CharacterIterator newText) {
    269         delegate.setText(newText);
    270     }
    271 
    272     public static class Builder extends FilteredBreakIteratorBuilder {
    273         /**
    274          * filter set to store all exceptions
    275          */
    276         private HashSet<CharSequence> filterSet = new HashSet<CharSequence>();
    277 
    278         static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
    279         static final int MATCH = (1 << 1); // < exact match - skip this one.
    280         static final int SuppressInReverse = (1 << 0);
    281         static final int AddToForward = (1 << 1);
    282 
    283         public Builder(Locale loc) {
    284             this(ULocale.forLocale(loc));
    285         }
    286         /**
    287          * Create SimpleFilteredBreakIteratorBuilder using given locale
    288          * @param loc the locale to get filtered iterators
    289          */
    290         public Builder(ULocale loc) {
    291             ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
    292                     ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
    293 
    294             ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
    295 
    296             if (breaks != null) {
    297                 for (int index = 0, size = breaks.getSize(); index < size; ++index) {
    298                     ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
    299                     String br = b.getString();
    300                     filterSet.add(br);
    301                 }
    302             }
    303         }
    304 
    305         /**
    306          * Create SimpleFilteredBreakIteratorBuilder with no exception
    307          */
    308         public Builder() {
    309         }
    310 
    311         @Override
    312         public boolean suppressBreakAfter(CharSequence str) {
    313             return filterSet.add(str);
    314         }
    315 
    316         @Override
    317         public boolean unsuppressBreakAfter(CharSequence str) {
    318             return filterSet.remove(str);
    319         }
    320 
    321         @Override
    322         public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) {
    323             if( filterSet.isEmpty() ) {
    324                 // Short circuit - nothing to except.
    325                 return adoptBreakIterator;
    326             }
    327 
    328             CharsTrieBuilder builder = new CharsTrieBuilder();
    329             CharsTrieBuilder builder2 = new CharsTrieBuilder();
    330 
    331             int revCount = 0;
    332             int fwdCount = 0;
    333 
    334             int subCount = filterSet.size();
    335             CharSequence[] ustrs = new CharSequence[subCount];
    336             int[] partials = new int[subCount];
    337 
    338             CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs.
    339             CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M."
    340 
    341             int i = 0;
    342             for (CharSequence s : filterSet) {
    343                 ustrs[i] = s; // copy by value?
    344                 partials[i] = 0; // default: no partial
    345                 i++;
    346             }
    347 
    348             for (i = 0; i < subCount; i++) {
    349                 String thisStr = ustrs[i].toString(); // TODO: don't cast to String?
    350                 int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations
    351                 if (nn > -1 && (nn + 1) != thisStr.length()) {
    352                     // is partial.
    353                     // is it unique?
    354                     int sameAs = -1;
    355                     for (int j = 0; j < subCount; j++) {
    356                         if (j == i)
    357                             continue;
    358                         if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) {
    359                             if (partials[j] == 0) { // hasn't been processed yet
    360                                 partials[j] = SuppressInReverse | AddToForward;
    361                             } else if ((partials[j] & SuppressInReverse) != 0) {
    362                                 sameAs = j; // the other entry is already in the reverse table.
    363                             }
    364                         }
    365                     }
    366 
    367                     if ((sameAs == -1) && (partials[i] == 0)) {
    368                         StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1));
    369                         // first one - add the prefix to the reverse table.
    370                         prefix.reverse();
    371                         builder.add(prefix, PARTIAL);
    372                         revCount++;
    373                         partials[i] = SuppressInReverse | AddToForward;
    374                     }
    375                 }
    376             }
    377 
    378             for (i = 0; i < subCount; i++) {
    379                 final String thisStr = ustrs[i].toString(); // TODO
    380                 if (partials[i] == 0) {
    381                     StringBuilder reversed = new StringBuilder(thisStr).reverse();
    382                     builder.add(reversed, MATCH);
    383                     revCount++;
    384                 } else {
    385                     // an optimization would be to only add the portion after the '.'
    386                     // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the
    387                     // forward,
    388                     // instead of "Ph.D." since we already know the "Ph." part is a match.
    389                     // would need the trie to be able to hold 0-length strings, though.
    390                     builder2.add(thisStr, MATCH); // forward
    391                     fwdCount++;
    392                 }
    393             }
    394 
    395             if (revCount > 0) {
    396                 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST);
    397             }
    398 
    399             if (fwdCount > 0) {
    400                 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST);
    401             }
    402             return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie);
    403         }
    404     }
    405 }
    406