Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2014-2016, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 package android.icu.impl;
     11 
     12 import java.text.CharacterIterator;
     13 import java.util.HashSet;
     14 import java.util.Locale;
     15 
     16 import android.icu.impl.ICUResourceBundle.OpenType;
     17 import android.icu.text.BreakIterator;
     18 import android.icu.text.FilteredBreakIteratorBuilder;
     19 import android.icu.text.UCharacterIterator;
     20 import android.icu.util.BytesTrie;
     21 import android.icu.util.CharsTrie;
     22 import android.icu.util.CharsTrieBuilder;
     23 import android.icu.util.StringTrieBuilder;
     24 import android.icu.util.ULocale;
     25 
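/**
 * A {@link BreakIterator} wrapper that takes candidate breaks from a delegate iterator and
 * suppresses those that directly follow a registered exception string (for example "Mr.").
 *
 * @author tomzhang
 * @hide Only a subset of ICU is exposed in Android
 */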
     30 public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
     31 
    /** Underlying break iterator; it supplies the candidate breaks that this class filters. */
    private BreakIterator delegate;
    /**
     * Scratch iterator over a clone of the delegate's text. It is (re)created by resetState()
     * before exception checks and stays null until the first such call.
     */
    private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ?
    /** Trie of reversed exception strings, i.e. ".srM" for "Mrs."; null means no exceptions are loaded. */
    private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs.
    /**
     * Forward trie consulted after a PARTIAL hit in the backwards trie (e.g. to confirm "Ph.D."
     * after matching "Ph."); may be null.
     */
    private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M."
     36 
     37     /**
     38      * @param adoptBreakIterator
     39      *            break iterator to adopt
     40      * @param forwardsPartialTrie
     41      *            forward & partial char trie to adopt
     42      * @param backwardsTrie
     43      *            backward trie to adopt
     44      */
     45     public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie,
     46             CharsTrie backwardsTrie) {
     47         this.delegate = adoptBreakIterator;
     48         this.forwardsPartialTrie = forwardsPartialTrie;
     49         this.backwardsTrie = backwardsTrie;
     50     }
     51 
     52 
     53     /**
     54      * Reset the filter from the delegate.
     55      */
     56     private final void resetState() {
     57         text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
     58     }
     59 
     60     /**
     61      * Is there an exception at this point?
     62      *
     63      * @param n the location of the possible break
     64      * @return
     65      */
     66     private final boolean breakExceptionAt(int n) {
     67         // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
     68 
     69         int bestPosn = -1;
     70         int bestValue = -1;
     71 
     72         // loops while 'n' points to an exception
     73         text.setIndex(n);
     74         backwardsTrie.reset();
     75         int uch;
     76 
     77 
     78 
     79         // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
     80         if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
     81             // TODO only do this the 1st time?
     82         } else {
     83             uch = text.nextCodePoint();
     84         }
     85 
     86         BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
     87 
     88         while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
     89                 ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
     90             if (r.hasValue()) { // remember the best match so far
     91                 bestPosn = text.getIndex();
     92                 bestValue = backwardsTrie.getValue();
     93             }
     94         }
     95 
     96         if (r.matches()) { // exact match?
     97             bestValue = backwardsTrie.getValue();
     98             bestPosn = text.getIndex();
     99         }
    100 
    101         if (bestPosn >= 0) {
    102             if (bestValue == Builder.MATCH) { // exact match!
    103                 return true; // Exception here.
    104             } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
    105                 // make sure there's a forward trie
    106                 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
    107                 // to see if it matches something going forward.
    108                 forwardsPartialTrie.reset();
    109 
    110                 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
    111                 text.setIndex(bestPosn); // hope that's close ..
    112                 while ((uch = text.nextCodePoint()) != BreakIterator.DONE
    113                         && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
    114                 }
    115                 if (rfwd.matches()) {
    116                     // Exception here
    117                     return true;
    118                 } // else fall through
    119             } // else fall through
    120         } // else fall through
    121         return false; // No exception here.
    122     }
    123 
    124     /**
    125      * Given that the delegate has already given its "initial" answer,
    126      * find the NEXT actual (non-suppressed) break.
    127      * @param n initial position from delegate
    128      * @return new break position or BreakIterator.DONE
    129      */
    130     private final int internalNext(int n) {
    131         if (n == BreakIterator.DONE || // at end or
    132                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
    133             return n;
    134         }
    135         resetState();
    136 
    137         final int textLen = text.getLength();
    138 
    139         while (n != BreakIterator.DONE && n != textLen) {
    140             // outer loop runs once per underlying break (from fDelegate).
    141             // loops while 'n' points to an exception.
    142 
    143             if (breakExceptionAt(n)) {
    144                 // n points to a break exception
    145                 n = delegate.next();
    146             } else {
    147                 // no exception at this spot
    148                 return n;
    149             }
    150         }
    151         return n; //hit underlying DONE or break at end of text
    152     }
    153 
    154     /**
    155      * Given that the delegate has already given its "initial" answer,
    156      * find the PREV actual (non-suppressed) break.
    157      * @param n initial position from delegate
    158      * @return new break position or BreakIterator.DONE
    159      */
    160     private final int internalPrev(int n) {
    161         if (n == 0 || n == BreakIterator.DONE || // at end or
    162                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
    163             return n;
    164         }
    165         resetState();
    166 
    167         while (n != BreakIterator.DONE && n != 0) {
    168             // outer loop runs once per underlying break (from fDelegate).
    169             // loops while 'n' points to an exception.
    170 
    171             if (breakExceptionAt(n)) {
    172                 // n points to a break exception
    173                 n = delegate.previous();
    174             } else {
    175                 // no exception at this spot
    176                 return n;
    177             }
    178         }
    179         return n; //hit underlying DONE or break at end of text
    180     }
    181 
    182     @Override
    183     public boolean equals(Object obj) {
    184         if (obj == null)
    185             return false;
    186         if (this == obj)
    187             return true;
    188         if (getClass() != obj.getClass())
    189             return false;
    190         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
    191         return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie)
    192                 && forwardsPartialTrie.equals(other.forwardsPartialTrie);
    193     }
    194 
    195     @Override
    196     public int hashCode() {
    197         return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode();
    198     }
    199 
    200     @Override
    201     public Object clone() {
    202         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
    203         return other;
    204     }
    205 
    206 
    207     @Override
    208     public int first() {
    209         // Don't suppress a break opportunity at the beginning of text.
    210         return delegate.first();
    211     }
    212 
    213     @Override
    214     public int preceding(int offset) {
    215         return internalPrev(delegate.preceding(offset));
    216     }
    217 
    218     @Override
    219     public int previous() {
    220         return internalPrev(delegate.previous());
    221     }
    222 
    223     @Override
    224     public int current() {
    225         return delegate.current();
    226     }
    227 
    228     @Override
    229     public boolean isBoundary(int offset) {
    230         if(!delegate.isBoundary(offset)) {
    231             return false; // No underlying break to suppress?
    232         }
    233 
    234         // delegate thinks there's a break
    235         if(backwardsTrie == null) {
    236             return true; // no data
    237         }
    238 
    239         resetState();
    240         return !breakExceptionAt(offset); // if there's an exception: no break.
    241     }
    242 
    243     @Override
    244     public int next() {
    245         return internalNext(delegate.next());
    246     }
    247 
    248     @Override
    249     public int next(int n) {
    250         return internalNext(delegate.next(n));
    251     }
    252 
    253     @Override
    254     public int following(int offset) {
    255         return internalNext(delegate.following(offset));
    256     }
    257 
    258     @Override
    259     public int last() {
    260         // Don't suppress a break opportunity at the end of text.
    261         return delegate.last();
    262     }
    263 
    264     @Override
    265     public CharacterIterator getText() {
    266         return delegate.getText();
    267     }
    268 
    269     @Override
    270     public void setText(CharacterIterator newText) {
    271         delegate.setText(newText);
    272     }
    273 
    274     public static class Builder extends FilteredBreakIteratorBuilder {
        /**
         * Filter set storing all exception strings, i.e. the strings after which a break
         * is to be suppressed.
         */
        private HashSet<CharSequence> filterSet = new HashSet<CharSequence>();

        // Values stored in the tries and read back in breakExceptionAt().
        static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
        static final int MATCH = (1 << 1); // < exact match - skip this one.
        // Per-string flags used while building the tries in wrapIteratorWithFilter();
        // there they are only ever set together and tested against zero or SuppressInReverse.
        static final int SuppressInReverse = (1 << 0);
        static final int AddToForward = (1 << 1);
    284 
        /**
         * Create SimpleFilteredBreakIteratorBuilder using given locale.
         * The locale is converted with ULocale.forLocale and passed to the ULocale constructor.
         *
         * @param loc the locale to get filtered iterators
         */
        public Builder(Locale loc) {
            this(ULocale.forLocale(loc));
        }
    288         /**
    289          * Create SimpleFilteredBreakIteratorBuilder using given locale
    290          * @param loc the locale to get filtered iterators
    291          */
    292         public Builder(ULocale loc) {
    293             ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
    294                     ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
    295 
    296             ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
    297 
    298             if (breaks != null) {
    299                 for (int index = 0, size = breaks.getSize(); index < size; ++index) {
    300                     ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
    301                     String br = b.getString();
    302                     filterSet.add(br);
    303                 }
    304             }
    305         }
    306 
        /**
         * Create SimpleFilteredBreakIteratorBuilder with no exceptions.
         * The filter set starts empty; entries can be added with suppressBreakAfter.
         */
        public Builder() {
        }
    312 
    313         @Override
    314         public boolean suppressBreakAfter(CharSequence str) {
    315             return filterSet.add(str);
    316         }
    317 
    318         @Override
    319         public boolean unsuppressBreakAfter(CharSequence str) {
    320             return filterSet.remove(str);
    321         }
    322 
    323         @Override
    324         public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) {
    325             if( filterSet.isEmpty() ) {
    326                 // Short circuit - nothing to except.
    327                 return adoptBreakIterator;
    328             }
    329 
    330             CharsTrieBuilder builder = new CharsTrieBuilder();
    331             CharsTrieBuilder builder2 = new CharsTrieBuilder();
    332 
    333             int revCount = 0;
    334             int fwdCount = 0;
    335 
    336             int subCount = filterSet.size();
    337             CharSequence[] ustrs = new CharSequence[subCount];
    338             int[] partials = new int[subCount];
    339 
    340             CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs.
    341             CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M."
    342 
    343             int i = 0;
    344             for (CharSequence s : filterSet) {
    345                 ustrs[i] = s; // copy by value?
    346                 partials[i] = 0; // default: no partial
    347                 i++;
    348             }
    349 
    350             for (i = 0; i < subCount; i++) {
    351                 String thisStr = ustrs[i].toString(); // TODO: don't cast to String?
    352                 int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations
    353                 if (nn > -1 && (nn + 1) != thisStr.length()) {
    354                     // is partial.
    355                     // is it unique?
    356                     int sameAs = -1;
    357                     for (int j = 0; j < subCount; j++) {
    358                         if (j == i)
    359                             continue;
    360                         if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) {
    361                             if (partials[j] == 0) { // hasn't been processed yet
    362                                 partials[j] = SuppressInReverse | AddToForward;
    363                             } else if ((partials[j] & SuppressInReverse) != 0) {
    364                                 sameAs = j; // the other entry is already in the reverse table.
    365                             }
    366                         }
    367                     }
    368 
    369                     if ((sameAs == -1) && (partials[i] == 0)) {
    370                         StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1));
    371                         // first one - add the prefix to the reverse table.
    372                         prefix.reverse();
    373                         builder.add(prefix, PARTIAL);
    374                         revCount++;
    375                         partials[i] = SuppressInReverse | AddToForward;
    376                     }
    377                 }
    378             }
    379 
    380             for (i = 0; i < subCount; i++) {
    381                 final String thisStr = ustrs[i].toString(); // TODO
    382                 if (partials[i] == 0) {
    383                     StringBuilder reversed = new StringBuilder(thisStr).reverse();
    384                     builder.add(reversed, MATCH);
    385                     revCount++;
    386                 } else {
    387                     // an optimization would be to only add the portion after the '.'
    388                     // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the
    389                     // forward,
    390                     // instead of "Ph.D." since we already know the "Ph." part is a match.
    391                     // would need the trie to be able to hold 0-length strings, though.
    392                     builder2.add(thisStr, MATCH); // forward
    393                     fwdCount++;
    394                 }
    395             }
    396 
    397             if (revCount > 0) {
    398                 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST);
    399             }
    400 
    401             if (fwdCount > 0) {
    402                 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST);
    403             }
    404             return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie);
    405         }
    406     }
    407 }
    408