Home | History | Annotate | Download | only in text
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2016, International Business Machines Corporation and         *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import static com.ibm.icu.impl.CharacterIteration.DONE32;
     12 
     13 import java.text.CharacterIterator;
     14 import java.util.concurrent.atomic.AtomicReferenceArray;
     15 
     16 import com.ibm.icu.impl.CharacterIteration;
     17 import com.ibm.icu.lang.UCharacter;
     18 import com.ibm.icu.lang.UProperty;
     19 
     20 final class UnhandledBreakEngine implements LanguageBreakEngine {
     21     // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
     22     // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
     23 
     24     // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
     25     // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls.
     26     // If handleChar() is updating the set of unhandled characters at the same time
     27     // findBreaks() or handles() is referencing it, the referencing functions must see
     28     // a consistent set. It doesn't matter whether they see it before or after the update,
     29     // but they should not see an inconsistent, changing set.
     30     //
     31     // To do this, an update is made by cloning the old set, updating the clone, then
     32     // replacing the old with the new. Once made visible, each set remains constant.
     33 
     34     // TODO: it's odd that findBreaks() can produce different results, depending
     35     // on which scripts have been previously seen by handleChar(). (This is not a
     36     // threading specific issue). Possibly stop on script boundaries?
     37 
     38     final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
     39     public UnhandledBreakEngine() {
     40         for (int i = 0; i < fHandled.length(); i++) {
     41             fHandled.set(i, new UnicodeSet());
     42         }
     43     }
     44 
     45     @Override
     46     public boolean handles(int c, int breakType) {
     47         return (breakType >= 0 && breakType < fHandled.length()) &&
     48                 (fHandled.get(breakType).contains(c));
     49     }
     50 
     51     @Override
     52     public int findBreaks(CharacterIterator text, int startPos, int endPos,
     53             int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
     54         if (breakType >= 0 && breakType < fHandled.length()) {
     55             UnicodeSet uniset = fHandled.get(breakType);
     56             int c = CharacterIteration.current32(text);
     57             while (text.getIndex() < endPos && uniset.contains(c)) {
     58                 CharacterIteration.next32(text);
     59                 c = CharacterIteration.current32(text);
     60             }
     61         }
     62         return 0;
     63     }
     64 
     65     /**
     66      * Update the set of unhandled characters for the specified breakType to include
     67      * all that have the same script as c.
     68      * May be called concurrently with handles() or findBreaks().
     69      * Must not be called concurrently with itself.
     70      */
     71     public void handleChar(int c, int breakType) {
     72         if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
     73             UnicodeSet originalSet = fHandled.get(breakType);
     74             if (!originalSet.contains(c)) {
     75                 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
     76                 UnicodeSet newSet = new UnicodeSet();
     77                 newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
     78                 newSet.addAll(originalSet);
     79                 fHandled.set(breakType, newSet);
     80             }
     81         }
     82     }
     83 }
     84