1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2016, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import static com.ibm.icu.impl.CharacterIteration.DONE32; 12 13 import java.text.CharacterIterator; 14 import java.util.concurrent.atomic.AtomicReferenceArray; 15 16 import com.ibm.icu.impl.CharacterIteration; 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.lang.UProperty; 19 20 final class UnhandledBreakEngine implements LanguageBreakEngine { 21 // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen. 22 // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one. 23 24 // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all 25 // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls. 26 // If handleChar() is updating the set of unhandled characters at the same time 27 // findBreaks() or handles() is referencing it, the referencing functions must see 28 // a consistent set. It doesn't matter whether they see it before or after the update, 29 // but they should not see an inconsistent, changing set. 30 // 31 // To do this, an update is made by cloning the old set, updating the clone, then 32 // replacing the old with the new. Once made visible, each set remains constant. 33 34 // TODO: it's odd that findBreaks() can produce different results, depending 35 // on which scripts have been previously seen by handleChar(). (This is not a 36 // threading specific issue). Possibly stop on script boundaries? 37 38 final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1); 39 public UnhandledBreakEngine() { 40 for (int i = 0; i < fHandled.length(); i++) { 41 fHandled.set(i, new UnicodeSet()); 42 } 43 } 44 45 @Override 46 public boolean handles(int c, int breakType) { 47 return (breakType >= 0 && breakType < fHandled.length()) && 48 (fHandled.get(breakType).contains(c)); 49 } 50 51 @Override 52 public int findBreaks(CharacterIterator text, int startPos, int endPos, 53 int breakType, DictionaryBreakEngine.DequeI foundBreaks) { 54 if (breakType >= 0 && breakType < fHandled.length()) { 55 UnicodeSet uniset = fHandled.get(breakType); 56 int c = CharacterIteration.current32(text); 57 while (text.getIndex() < endPos && uniset.contains(c)) { 58 CharacterIteration.next32(text); 59 c = CharacterIteration.current32(text); 60 } 61 } 62 return 0; 63 } 64 65 /** 66 * Update the set of unhandled characters for the specified breakType to include 67 * all that have the same script as c. 68 * May be called concurrently with handles() or findBreaks(). 69 * Must not be called concurrently with itself. 70 */ 71 public void handleChar(int c, int breakType) { 72 if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) { 73 UnicodeSet originalSet = fHandled.get(breakType); 74 if (!originalSet.contains(c)) { 75 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 76 UnicodeSet newSet = new UnicodeSet(); 77 newSet.applyIntPropertyValue(UProperty.SCRIPT, script); 78 newSet.addAll(originalSet); 79 fHandled.set(breakType, newSet); 80 } 81 } 82 } 83 } 84