Home | History | Annotate | Download | only in impl
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  *   Copyright (C) 2009-2015, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.io.IOException;
     13 import java.nio.ByteBuffer;
     14 import java.util.ArrayList;
     15 import java.util.Iterator;
     16 
     17 import com.ibm.icu.text.UTF16;
     18 import com.ibm.icu.text.UnicodeSet;
     19 import com.ibm.icu.util.ICUUncheckedIOException;
     20 import com.ibm.icu.util.VersionInfo;
     21 
     22 /**
     23  * Low-level implementation of the Unicode Normalization Algorithm.
     24  * For the data structure and details see the documentation at the end of
     25  * C++ normalizer2impl.h and in the design doc at
     26  * http://site.icu-project.org/design/normalization/custom
     27  */
     28 public final class Normalizer2Impl {
     29     public static final class Hangul {
     30         /* Korean Hangul and Jamo constants */
     31         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
     32         public static final int JAMO_L_END=0x1112;
     33         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
     34         public static final int JAMO_V_END=0x1175;
     35         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
     36         public static final int JAMO_T_END=0x11c2;
     37 
     38         public static final int HANGUL_BASE=0xac00;
     39         public static final int HANGUL_END=0xd7a3;
     40 
     41         public static final int JAMO_L_COUNT=19;
     42         public static final int JAMO_V_COUNT=21;
     43         public static final int JAMO_T_COUNT=28;
     44 
     45         public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
     46         public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
     47 
     48         public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
     49 
     50         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
     51         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
     52 
     53         public static boolean isHangul(int c) {
     54             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
     55         }
     56         public static boolean isHangulLV(int c) {
     57             c-=HANGUL_BASE;
     58             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
     59         }
     60         public static boolean isJamoL(int c) {
     61             return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
     62         }
     63         public static boolean isJamoV(int c) {
     64             return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
     65         }
     66         public static boolean isJamoT(int c) {
     67             int t=c-JAMO_T_BASE;
     68             return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself
     69         }
     70         public static boolean isJamo(int c) {
     71             return JAMO_L_BASE<=c && c<=JAMO_T_END &&
     72                 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
     73         }
     74 
     75         /**
     76          * Decomposes c, which must be a Hangul syllable, into buffer
     77          * and returns the length of the decomposition (2 or 3).
     78          */
     79         public static int decompose(int c, Appendable buffer) {
     80             try {
     81                 c-=HANGUL_BASE;
     82                 int c2=c%JAMO_T_COUNT;
     83                 c/=JAMO_T_COUNT;
     84                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
     85                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
     86                 if(c2==0) {
     87                     return 2;
     88                 } else {
     89                     buffer.append((char)(JAMO_T_BASE+c2));
     90                     return 3;
     91                 }
     92             } catch(IOException e) {
     93                 // Will not occur because we do not write to I/O.
     94                 throw new ICUUncheckedIOException(e);
     95             }
     96         }
     97 
     98         /**
     99          * Decomposes c, which must be a Hangul syllable, into buffer.
    100          * This is the raw, not recursive, decomposition. Its length is always 2.
    101          */
    102         public static void getRawDecomposition(int c, Appendable buffer) {
    103             try {
    104                 int orig=c;
    105                 c-=HANGUL_BASE;
    106                 int c2=c%JAMO_T_COUNT;
    107                 if(c2==0) {
    108                     c/=JAMO_T_COUNT;
    109                     buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
    110                     buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
    111                 } else {
    112                     buffer.append((char)(orig-c2));  // LV syllable
    113                     buffer.append((char)(JAMO_T_BASE+c2));
    114                 }
    115             } catch(IOException e) {
    116                 // Will not occur because we do not write to I/O.
    117                 throw new ICUUncheckedIOException(e);
    118             }
    119         }
    120     }
    121 
    122     /**
    123      * Writable buffer that takes care of canonical ordering.
    124      * Its Appendable methods behave like the C++ implementation's
    125      * appendZeroCC() methods.
    126      * <p>
    127      * If dest is a StringBuilder, then the buffer writes directly to it.
    128      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
    129      * until no further changes are necessary and whole segments are appended.
    130      * append() methods that take combining-class values always write to the StringBuilder.
    131      * Other append() methods flush and append to the Appendable.
    132      */
    public static final class ReorderingBuffer implements Appendable {
        /**
         * Creates a buffer that writes to dest.
         * If dest is a StringBuilder, it is used directly as the working buffer:
         * its capacity is ensured, and lastCC and reorderStart are initialized
         * from the text that it already contains.
         * Otherwise a private StringBuilder is allocated for intermediate text.
         *
         * @param ni the normalization data used to look up combining classes
         * @param dest the destination to write to
         * @param destCapacity capacity to ensure when dest is a StringBuilder; not used otherwise
         */
        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
            impl=ni;
            app=dest;
            if(app instanceof StringBuilder) {
                appIsStringBuilder=true;
                str=(StringBuilder)dest;
                // In Java, the constructor subsumes public void init(int destCapacity) {
                str.ensureCapacity(destCapacity);
                // reorderStart must be 0 while previousCC() scans backward below,
                // so that the scan is not cut short.
                reorderStart=0;
                if(str.length()==0) {
                    lastCC=0;
                } else {
                    setIterator();
                    lastCC=previousCC();
                    // Set reorderStart after the last code point with cc<=1 if there is one.
                    if(lastCC>1) {
                        while(previousCC()>1) {}
                    }
                    reorderStart=codePointLimit;
                }
            } else {
                appIsStringBuilder=false;
                str=new StringBuilder();
                reorderStart=0;
                lastCC=0;
            }
        }

        /** Returns true if the working StringBuilder currently holds no text. */
        public boolean isEmpty() { return str.length()==0; }
        /** Returns the number of chars currently in the working StringBuilder. */
        public int length() { return str.length(); }
        /** Returns the combining class recorded for the last appended character. */
        public int getLastCC() { return lastCC; }

        /** Returns the working StringBuilder itself (not a copy). */
        public StringBuilder getStringBuilder() { return str; }

        /**
         * Compares the whole working StringBuilder with s[start, limit[ for binary equality.
         * Note: this is an overload, not an override of Object.equals(Object).
         */
        public boolean equals(CharSequence s, int start, int limit) {
            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
        }

        /**
         * Appends code point c with combining class cc.
         * If cc is 0 or not lower than lastCC, c goes at the end;
         * otherwise it is inserted before the trailing characters with a higher cc.
         */
        public void append(int c, int cc) {
            if(lastCC<=cc || cc==0) {
                str.appendCodePoint(c);
                lastCC=cc;
                if(cc<=1) {
                    reorderStart=str.length();
                }
            } else {
                insert(c, cc);
            }
        }
        // s must be in NFD, otherwise change the implementation.
        /**
         * Appends s[start, limit[ whose first code point has combining class leadCC
         * and whose last code point has combining class trailCC.
         * If the text can simply follow the current contents it is appended as a whole;
         * otherwise its code points are inserted/appended one at a time.
         */
        public void append(CharSequence s, int start, int limit,
                           int leadCC, int trailCC) {
            if(start==limit) {
                return;
            }
            if(lastCC<=leadCC || leadCC==0) {
                // Fast path: the whole sequence goes at the end.
                if(trailCC<=1) {
                    reorderStart=str.length()+(limit-start);
                } else if(leadCC<=1) {
                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
                }
                str.append(s, start, limit);
                lastCC=trailCC;
            } else {
                // Slow path: the first code point must be inserted before existing text.
                int c=Character.codePointAt(s, start);
                start+=Character.charCount(c);
                insert(c, leadCC);  // insert first code point
                while(start<limit) {
                    c=Character.codePointAt(s, start);
                    start+=Character.charCount(c);
                    if(start<limit) {
                        // s must be in NFD, otherwise we need to use getCC().
                        leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
                    } else {
                        // Last code point: its cc is the caller-supplied trailCC.
                        leadCC=trailCC;
                    }
                    append(c, leadCC);
                }
            }
        }
        // The following append() methods work like C++ appendZeroCC().
        // They assume that the cc or trailCC of their input is 0.
        // Most of them implement Appendable interface methods.
        /** Appends c to the working StringBuilder and resets lastCC to 0. */
        @Override
        public ReorderingBuffer append(char c) {
            str.append(c);
            lastCC=0;
            reorderStart=str.length();
            return this;
        }
        /** Appends code point c to the working StringBuilder and resets lastCC to 0. */
        public void appendZeroCC(int c) {
            str.appendCodePoint(c);
            lastCC=0;
            reorderStart=str.length();
        }
        /** Appends all of s; does nothing (and leaves lastCC unchanged) if s is empty. */
        @Override
        public ReorderingBuffer append(CharSequence s) {
            if(s.length()!=0) {
                str.append(s);
                lastCC=0;
                reorderStart=str.length();
            }
            return this;
        }
        /** Appends s[start, limit[; does nothing (and leaves lastCC unchanged) if start==limit. */
        @Override
        public ReorderingBuffer append(CharSequence s, int start, int limit) {
            if(start!=limit) {
                str.append(s, start, limit);
                lastCC=0;
                reorderStart=str.length();
            }
            return this;
        }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Used after recomposition.
         * Must be called at the end when writing to a non-StringBuilder Appendable.
         *
         * @throws ICUUncheckedIOException if the Appendable throws an IOException
         */
        public void flush() {
            if(appIsStringBuilder) {
                // Nothing to copy; just mark all current text as final.
                reorderStart=str.length();
            } else {
                try {
                    app.append(str);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
        }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Then appends the new text to the Appendable or StringBuilder.
         * Normally used after quick check loops find a non-empty sequence.
         *
         * @return this
         * @throws ICUUncheckedIOException if the Appendable throws an IOException
         */
        public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
            if(appIsStringBuilder) {
                str.append(s, start, limit);
                reorderStart=str.length();
            } else {
                try {
                    // Write pending text and the new text straight to the Appendable,
                    // bypassing the intermediate StringBuilder for the new text.
                    app.append(str).append(s, start, limit);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
            return this;
        }
        /**
         * Empties the working StringBuilder and resets lastCC and reorderStart.
         * When the destination is a StringBuilder, that is the destination itself.
         */
        public void remove() {
            str.setLength(0);
            lastCC=0;
            reorderStart=0;
        }
        /** Deletes the last suffixLength chars from the working StringBuilder and resets lastCC to 0. */
        public void removeSuffix(int suffixLength) {
            int oldLength=str.length();
            str.delete(oldLength-suffixLength, oldLength);
            lastCC=0;
            reorderStart=str.length();
        }

        /*
         * TODO: Revisit whether it makes sense to track reorderStart.
         * It is set to after the last known character with cc<=1,
         * which stops previousCC() before it reads that character and looks up its cc.
         * previousCC() is normally only called from insert().
         * In other words, reorderStart speeds up the insertion of a combining mark
         * into a multi-combining mark sequence where it does not belong at the end.
         * This might not be worth the trouble.
         * On the other hand, it's not a huge amount of trouble.
         *
         * We probably need it for UNORM_SIMPLE_APPEND.
         */

        // Inserts c somewhere before the last character.
        // Requires 0<cc<lastCC which implies reorderStart<limit.
        private void insert(int c, int cc) {
            // Start at the end, step over the last character (its cc is lastCC>cc),
            // then walk backward while the preceding character's cc is still greater than cc.
            for(setIterator(), skipPrevious(); previousCC()>cc;) {}
            // insert c at codePointLimit, after the character with prevCC<=cc
            if(c<=0xffff) {
                str.insert(codePointLimit, (char)c);
                if(cc<=1) {
                    reorderStart=codePointLimit+1;
                }
            } else {
                // Supplementary code point: two chars are inserted.
                str.insert(codePointLimit, Character.toChars(c));
                if(cc<=1) {
                    reorderStart=codePointLimit+2;
                }
            }
        }

        // Normalization data used for combining-class lookups.
        private final Normalizer2Impl impl;
        // The destination passed to the constructor.
        private final Appendable app;
        // Working buffer: the destination itself if it is a StringBuilder, otherwise a private one.
        private final StringBuilder str;
        // True if app and str are the same StringBuilder object.
        private final boolean appIsStringBuilder;
        // Index in str below which previousCC() does not look; see the TODO comment above.
        private int reorderStart;
        // Combining class recorded for the most recently appended character.
        private int lastCC;

        // private backward iterator
        // Positions the iterator at the end of str.
        private void setIterator() { codePointStart=str.length(); }
        // Moves back over one code point without looking up its combining class.
        private void skipPrevious() {  // Requires 0<codePointStart.
            codePointLimit=codePointStart;
            codePointStart=str.offsetByCodePoints(codePointStart, -1);
        }
        // Moves back over one code point and returns its combining class;
        // stops (returning 0 without moving) once reorderStart is reached.
        private int previousCC() {  // Returns 0 if there is no previous character.
            codePointLimit=codePointStart;
            if(reorderStart>=codePointStart) {
                return 0;
            }
            int c=str.codePointBefore(codePointStart);
            codePointStart-=Character.charCount(c);
            return impl.getCCFromYesOrMaybeCP(c);
        }

        // Iterator state: [codePointStart, codePointLimit[ spans the code point last stepped over.
        private int codePointStart, codePointLimit;
    }
    357 
    358     // TODO: Propose as public API on the UTF16 class.
    359     // TODO: Propose widening UTF16 methods that take char to take int.
    360     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
    361     public static final class UTF16Plus {
    362         /**
    363          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
    364          * is it a lead surrogate?
    365          * @param c code unit or code point
    366          * @return true or false
    367          */
    368         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
    369         /**
    370          * Compares two CharSequence objects for binary equality.
    371          * @param s1 first sequence
    372          * @param s2 second sequence
    373          * @return true if s1 contains the same text as s2
    374          */
    375         public static boolean equal(CharSequence s1,  CharSequence s2) {
    376             if(s1==s2) {
    377                 return true;
    378             }
    379             int length=s1.length();
    380             if(length!=s2.length()) {
    381                 return false;
    382             }
    383             for(int i=0; i<length; ++i) {
    384                 if(s1.charAt(i)!=s2.charAt(i)) {
    385                     return false;
    386                 }
    387             }
    388             return true;
    389         }
    390         /**
    391          * Compares two CharSequence subsequences for binary equality.
    392          * @param s1 first sequence
    393          * @param start1 start offset in first sequence
    394          * @param limit1 limit offset in first sequence
    395          * @param s2 second sequence
    396          * @param start2 start offset in second sequence
    397          * @param limit2 limit offset in second sequence
    398          * @return true if s1.subSequence(start1, limit1) contains the same text
    399          *              as s2.subSequence(start2, limit2)
    400          */
    401         public static boolean equal(CharSequence s1, int start1, int limit1,
    402                                     CharSequence s2, int start2, int limit2) {
    403             if((limit1-start1)!=(limit2-start2)) {
    404                 return false;
    405             }
    406             if(s1==s2 && start1==start2) {
    407                 return true;
    408             }
    409             while(start1<limit1) {
    410                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
    411                     return false;
    412                 }
    413             }
    414             return true;
    415         }
    416     }
    417 
    418     public Normalizer2Impl() {}
    419 
    420     private static final class IsAcceptable implements ICUBinary.Authenticate {
    421         @Override
    422         public boolean isDataVersionAcceptable(byte version[]) {
    423             return version[0]==3;
    424         }
    425     }
    // Shared, stateless version checker passed to ICUBinary when load() reads the data header.
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    // Data format identifier passed to ICUBinary when load() reads the data header:
    // the four ASCII bytes 0x4e 0x72 0x6d 0x32.
    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
    428 
    429     public Normalizer2Impl load(ByteBuffer bytes) {
    430         try {
    431             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
    432             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
    433             if(indexesLength<=IX_MIN_LCCC_CP) {
    434                 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
    435             }
    436             int[] inIndexes=new int[indexesLength];
    437             inIndexes[0]=indexesLength*4;
    438             for(int i=1; i<indexesLength; ++i) {
    439                 inIndexes[i]=bytes.getInt();
    440             }
    441 
    442             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
    443             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
    444             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
    445 
    446             minYesNo=inIndexes[IX_MIN_YES_NO];
    447             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
    448             minNoNo=inIndexes[IX_MIN_NO_NO];
    449             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
    450             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
    451             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
    452             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
    453             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
    454             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
    455             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
    456 
    457             // Read the normTrie.
    458             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
    459             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
    460             normTrie=Trie2_16.createFromSerialized(bytes);
    461             int trieLength=normTrie.getSerializedLength();
    462             if(trieLength>(nextOffset-offset)) {
    463                 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
    464             }
    465             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
    466 
    467             // Read the composition and mapping data.
    468             offset=nextOffset;
    469             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
    470             int numChars=(nextOffset-offset)/2;
    471             if(numChars!=0) {
    472                 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
    473                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
    474             }
    475 
    476             // smallFCD: new in formatVersion 2
    477             offset=nextOffset;
    478             smallFCD=new byte[0x100];
    479             bytes.get(smallFCD);
    480 
    481             return this;
    482         } catch(IOException e) {
    483             throw new ICUUncheckedIOException(e);
    484         }
    485     }
    486     public Normalizer2Impl load(String name) {
    487         return load(ICUBinary.getRequiredData(name));
    488     }
    489 
    490     private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
    491         if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) {
    492             set.add(start, end);
    493         } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
    494             int fcd16=getFCD16(start);
    495             if(fcd16>0xff) { set.add(start, end); }
    496         }
    497     }
    498 
    499     private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    500         /* add the start code point to the USet */
    501         set.add(start);
    502         if(start!=end && isAlgorithmicNoNo(value) && (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) {
    503             // Range of code points with same-norm16-value algorithmic decompositions.
    504             // They might have different non-zero FCD16 values.
    505             int prevFCD16=getFCD16(start);
    506             while(++start<=end) {
    507                 int fcd16=getFCD16(start);
    508                 if(fcd16!=prevFCD16) {
    509                     set.add(start);
    510                     prevFCD16=fcd16;
    511                 }
    512             }
    513         }
    514     }
    515 
    516     public void addLcccChars(UnicodeSet set) {
    517         Iterator<Trie2.Range> trieIterator=normTrie.iterator();
    518         Trie2.Range range;
    519         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
    520             enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
    521         }
    522     }
    523 
    524     public void addPropertyStarts(UnicodeSet set) {
    525         /* add the start code point of each same-value range of each trie */
    526         Iterator<Trie2.Range> trieIterator=normTrie.iterator();
    527         Trie2.Range range;
    528         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
    529             enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
    530         }
    531 
    532         /* add Hangul LV syllables and LV+1 because of skippables */
    533         for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
    534             set.add(c);
    535             set.add(c+1);
    536         }
    537         set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
    538     }
    539 
    540     public void addCanonIterPropertyStarts(UnicodeSet set) {
    541         /* add the start code point of each same-value range of the canonical iterator data trie */
    542         ensureCanonIterData();
    543         // currently only used for the SEGMENT_STARTER property
    544         Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
    545         Trie2.Range range;
    546         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
    547             /* add the start code point to the USet */
    548             set.add(range.startCodePoint);
    549         }
    550     }
    551     private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
    552         @Override
    553         public int map(int in) {
    554             return in&CANON_NOT_SEGMENT_STARTER;
    555         }
    556     };
    557 
    558     // low-level properties ------------------------------------------------ ***
    559 
    560     // Note: Normalizer2Impl.java r30983 (2011-nov-27)
    561     // still had getFCDTrie() which built and cached an FCD trie.
    562     // That provided faster access to FCD data than getFCD16FromNormData()
    563     // but required synchronization and consumed some 10kB of heap memory
    564     // in any process that uses FCD (e.g., via collation).
    565     // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
    566     // at least for ASCII & CJK.
    567 
    /**
     * Builds the canonical-iterator data for this instance.
     * This is required before any of {@link #isCanonSegmentStarter(int)} or
     * {@link #getCanonStartSet(int, UnicodeSet)} are called,
     * or else they crash.
     * <p>
     * The method is synchronized and does nothing if canonIterData is already set.
     * Otherwise it walks the normTrie ranges up to the first lead-surrogate range,
     * collects per-code point flags and start-set entries in a writable trie,
     * and finally stores the result in canonIterData.
     * @return this
     */
    public synchronized Normalizer2Impl ensureCanonIterData() {
        if(canonIterData==null) {
            Trie2Writable newData=new Trie2Writable(0, 0);
            canonStartSets=new ArrayList<UnicodeSet>();
            Iterator<Trie2.Range> trieIterator=normTrie.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                final int norm16=range.value;
                if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
                    // Inert, or 2-way mapping (including Hangul syllable).
                    // We do not write a canonStartSet for any yesNo character.
                    // Composites from 2-way mappings are added at runtime from the
                    // starter's compositions list, and the other characters in
                    // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
                    // "maybe" characters.
                    continue;
                }
                // All code points in the range share norm16, but each one gets its own
                // entry in newData because earlier iterations may already have set bits
                // for some of them.
                for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
                    final int oldValue=newData.get(c);
                    int newValue=oldValue;
                    if(isMaybeOrNonZeroCC(norm16)) {
                        // not a segment starter if it occurs in a decomposition or has cc!=0
                        newValue|=CANON_NOT_SEGMENT_STARTER;
                        if(norm16<MIN_NORMAL_MAYBE_YES) {
                            newValue|=CANON_HAS_COMPOSITIONS;
                        }
                    } else if(norm16<minYesNo) {
                        newValue|=CANON_HAS_COMPOSITIONS;
                    } else {
                        // c has a one-way decomposition
                        int c2=c;
                        // Do not modify the whole-range norm16 value.
                        int norm16_2=norm16;
                        if (isDecompNoAlgorithmic(norm16_2)) {
                            // Maps to an isCompYesAndZeroCC.
                            c2 = mapAlgorithmic(c2, norm16_2);
                            norm16_2 = getNorm16(c2);
                            // No compatibility mappings for the CanonicalIterator.
                            assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
                        }
                        if (norm16_2 > minYesNo) {
                            // c decomposes, get everything from the variable-length extra data
                            int mapping=norm16_2>>OFFSET_SHIFT;
                            int firstUnit=extraData.charAt(mapping);
                            int length=firstUnit&MAPPING_LENGTH_MASK;
                            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                                // The unit before the mapping holds the ccc/lccc word;
                                // its low byte is checked only when c was not mapped algorithmically.
                                if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) {
                                    newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                                }
                            }
                            // Skip empty mappings (no characters in the decomposition).
                            if(length!=0) {
                                ++mapping;  // skip over the firstUnit
                                // add c to first code point's start set
                                int limit=mapping+length;
                                c2=extraData.codePointAt(mapping);
                                addToStartSet(newData, c, c2);
                                // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                                // one-way mapping. A 2-way mapping is possible here after
                                // intermediate algorithmic mapping.
                                if(norm16_2>=minNoNo) {
                                    while((mapping+=Character.charCount(c2))<limit) {
                                        c2=extraData.codePointAt(mapping);
                                        int c2Value=newData.get(c2);
                                        if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                            newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
                                        }
                                    }
                                }
                            }
                        } else {
                            // c decomposed to c2 algorithmically; c has cc==0
                            addToStartSet(newData, c, c2);
                        }
                    }
                    // Write back only if a flag was added for c in this iteration.
                    if(newValue!=oldValue) {
                        newData.set(c, newValue);
                    }
                }
            }
            // Store the finished data; this also marks the data as built for later calls.
            canonIterData=newData.toTrie2_32();
        }
        return this;
    }
    659 
    660     public int getNorm16(int c) { return normTrie.get(c); }
    661 
    662     public int getCompQuickCheck(int norm16) {
    663         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
    664             return 1;  // yes
    665         } else if(minMaybeYes<=norm16) {
    666             return 2;  // maybe
    667         } else {
    668             return 0;  // no
    669         }
    670     }
    671     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
    672     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
    673     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
    674 
    675     public int getCC(int norm16) {
    676         if(norm16>=MIN_NORMAL_MAYBE_YES) {
    677             return getCCFromNormalYesOrMaybe(norm16);
    678         }
    679         if(norm16<minNoNo || limitNoNo<=norm16) {
    680             return 0;
    681         }
    682         return getCCFromNoNo(norm16);
    683     }
    684     public static int getCCFromNormalYesOrMaybe(int norm16) {
    685         return (norm16 >> OFFSET_SHIFT) & 0xff;
    686     }
    687     public static int getCCFromYesOrMaybe(int norm16) {
    688         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
    689     }
    690     public int getCCFromYesOrMaybeCP(int c) {
    691         if (c < minCompNoMaybeCP) { return 0; }
    692         return getCCFromYesOrMaybe(getNorm16(c));
    693     }
    694 
    695     /**
    696      * Returns the FCD data for code point c.
    697      * @param c A Unicode code point.
    698      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
    699      */
    700     public int getFCD16(int c) {
    701         if(c<minDecompNoCP) {
    702             return 0;
    703         } else if(c<=0xffff) {
    704             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
    705         }
    706         return getFCD16FromNormData(c);
    707     }
    708     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
    709     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
    710         // 0<=lead<=0xffff
    711         byte bits=smallFCD[lead>>8];
    712         if(bits==0) { return false; }
    713         return ((bits>>((lead>>5)&7))&1)!=0;
    714     }
    715 
    716     /** Gets the FCD value from the regular normalization data. */
    717     public int getFCD16FromNormData(int c) {
    718         int norm16=getNorm16(c);
    719         if (norm16 >= limitNoNo) {
    720             if(norm16>=MIN_NORMAL_MAYBE_YES) {
    721                 // combining mark
    722                 norm16=getCCFromNormalYesOrMaybe(norm16);
    723                 return norm16|(norm16<<8);
    724             } else if(norm16>=minMaybeYes) {
    725                 return 0;
    726             } else {  // isDecompNoAlgorithmic(norm16)
    727                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
    728                 if (deltaTrailCC <= DELTA_TCCC_1) {
    729                     return deltaTrailCC >> OFFSET_SHIFT;
    730                 }
    731                 // Maps to an isCompYesAndZeroCC.
    732                 c=mapAlgorithmic(c, norm16);
    733                 norm16=getNorm16(c);
    734             }
    735         }
    736         if(norm16<=minYesNo || isHangulLVT(norm16)) {
    737             // no decomposition or Hangul syllable, all zeros
    738             return 0;
    739         }
    740         // c decomposes, get everything from the variable-length extra data
    741         int mapping=norm16>>OFFSET_SHIFT;
    742         int firstUnit=extraData.charAt(mapping);
    743         int fcd16=firstUnit>>8;  // tccc
    744         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
    745             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
    746         }
    747         return fcd16;
    748     }
    749 
    750     /**
    751      * Gets the decomposition for one code point.
    752      * @param c code point
    753      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
    754      */
    755     public String getDecomposition(int c) {
    756         int norm16;
    757         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
    758             // c does not decompose
    759             return null;
    760         }
    761         int decomp = -1;
    762         if(isDecompNoAlgorithmic(norm16)) {
    763             // Maps to an isCompYesAndZeroCC.
    764             decomp=c=mapAlgorithmic(c, norm16);
    765             // The mapping might decompose further.
    766             norm16 = getNorm16(c);
    767         }
    768         if (norm16 < minYesNo) {
    769             if(decomp<0) {
    770                 return null;
    771             } else {
    772                 return UTF16.valueOf(decomp);
    773             }
    774         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
    775             // Hangul syllable: decompose algorithmically
    776             StringBuilder buffer=new StringBuilder();
    777             Hangul.decompose(c, buffer);
    778             return buffer.toString();
    779         }
    780         // c decomposes, get everything from the variable-length extra data
    781         int mapping=norm16>>OFFSET_SHIFT;
    782         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
    783         return extraData.substring(mapping, mapping+length);
    784     }
    785 
    786     /**
    787      * Gets the raw decomposition for one code point.
    788      * @param c code point
    789      * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
    790      */
    791     public String getRawDecomposition(int c) {
    792         int norm16;
    793         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
    794             // c does not decompose
    795             return null;
    796         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
    797             // Hangul syllable: decompose algorithmically
    798             StringBuilder buffer=new StringBuilder();
    799             Hangul.getRawDecomposition(c, buffer);
    800             return buffer.toString();
    801         } else if(isDecompNoAlgorithmic(norm16)) {
    802             return UTF16.valueOf(mapAlgorithmic(c, norm16));
    803         }
    804         // c decomposes, get everything from the variable-length extra data
    805         int mapping=norm16>>OFFSET_SHIFT;
    806         int firstUnit=extraData.charAt(mapping);
    807         int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
    808         if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
    809             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
    810             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
    811             int rawMapping=mapping-((firstUnit>>7)&1)-1;
    812             char rm0=extraData.charAt(rawMapping);
    813             if(rm0<=MAPPING_LENGTH_MASK) {
    814                 return extraData.substring(rawMapping-rm0, rawMapping);
    815             } else {
    816                 // Copy the normal mapping and replace its first two code units with rm0.
    817                 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
    818                 mapping+=1+2;  // skip over the firstUnit and the first two mapping code units
    819                 return buffer.append(extraData, mapping, mapping+mLength-2).toString();
    820             }
    821         } else {
    822             mapping+=1;  // skip over the firstUnit
    823             return extraData.substring(mapping, mapping+mLength);
    824         }
    825     }
    826 
    827     /**
    828      * Returns true if code point c starts a canonical-iterator string segment.
    829      * <b>{@link #ensureCanonIterData()} must have been called before this method,
    830      * or else this method will crash.</b>
    831      * @param c A Unicode code point.
    832      * @return true if c starts a canonical-iterator string segment.
    833      */
    834     public boolean isCanonSegmentStarter(int c) {
    835         return canonIterData.get(c)>=0;
    836     }
    837     /**
    838      * Returns true if there are characters whose decomposition starts with c.
    839      * If so, then the set is cleared and then filled with those characters.
    840      * <b>{@link #ensureCanonIterData()} must have been called before this method,
    841      * or else this method will crash.</b>
    842      * @param c A Unicode code point.
    843      * @param set A UnicodeSet to receive the characters whose decompositions
    844      *        start with c, if there are any.
    845      * @return true if there are characters whose decomposition starts with c.
    846      */
    847     public boolean getCanonStartSet(int c, UnicodeSet set) {
    848         int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
    849         if(canonValue==0) {
    850             return false;
    851         }
    852         set.clear();
    853         int value=canonValue&CANON_VALUE_MASK;
    854         if((canonValue&CANON_HAS_SET)!=0) {
    855             set.addAll(canonStartSets.get(value));
    856         } else if(value!=0) {
    857             set.add(value);
    858         }
    859         if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
    860             int norm16=getNorm16(c);
    861             if(norm16==JAMO_L) {
    862                 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
    863                 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
    864             } else {
    865                 addComposites(getCompositionsList(norm16), set);
    866             }
    867         }
    868         return true;
    869     }
    870 
    871     // Fixed norm16 values.
    872     public static final int MIN_YES_YES_WITH_CC=0xfe02;  // getCompQuickCheck() answers "yes" from here up; compose() reads a ccc!=0 from such values
    873     public static final int JAMO_VT=0xfe00;  // compose() treats norm16 above this as >= MIN_YES_YES_WITH_CC
    874     public static final int MIN_NORMAL_MAYBE_YES=0xfc00;  // from here up, getCC() reads the ccc directly from the norm16 bits
    875     public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
    876     public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE
    877 
    878     // norm16 bit 0 is comp-boundary-after.
    879     public static final int HAS_COMP_BOUNDARY_AFTER=1;
    880     public static final int OFFSET_SHIFT=1;  // norm16>>OFFSET_SHIFT drops bit 0, leaving the extraData offset (or the inline ccc)
    881 
    882     // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
    883     // tccc (0, 1, >1) for quick FCC boundary-after tests.
    884     public static final int DELTA_TCCC_0=0;
    885     public static final int DELTA_TCCC_1=2;
    886     public static final int DELTA_TCCC_GT_1=4;
    887     public static final int DELTA_TCCC_MASK=6;  // selects bits 2..1
    888     public static final int DELTA_SHIFT=3;  // NOTE(review): presumably the delta starts at bit 3; consumer (mapAlgorithmic) not shown here -- confirm
    889 
    890     public static final int MAX_DELTA=0x40;
    891 
    892     // Byte offsets from the start of the data, after the generic header.
    893     public static final int IX_NORM_TRIE_OFFSET=0;
    894     public static final int IX_EXTRA_DATA_OFFSET=1;
    895     public static final int IX_SMALL_FCD_OFFSET=2;
    896     public static final int IX_RESERVED3_OFFSET=3;
    897     public static final int IX_TOTAL_SIZE=7;
    898 
    899     // Code point thresholds for quick check codes.
    900     public static final int IX_MIN_DECOMP_NO_CP=8;
    901     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
    902 
    903     // Norm16 value thresholds for quick check combinations and types of extra data.
    904 
    905     /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
    906     public static final int IX_MIN_YES_NO=10;
    907     /** Mappings are comp-normalized. */
    908     public static final int IX_MIN_NO_NO=11;
    909     public static final int IX_LIMIT_NO_NO=12;
    910     public static final int IX_MIN_MAYBE_YES=13;
    911 
    912     /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
    913     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
    914     /** Mappings are not comp-normalized but have a comp boundary before. */
    915     public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
    916     /** Mappings do not have a comp boundary before. */
    917     public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
    918     /** Mappings to the empty string. */
    919     public static final int IX_MIN_NO_NO_EMPTY=17;
    920 
    921     public static final int IX_MIN_LCCC_CP=18;
    922     public static final int IX_COUNT=20;
    923     // Flags and length field in the first unit ("firstUnit") of a mapping in extraData.
    924     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;  // bit 7: the unit before firstUnit holds lccc (high byte) and ccc (low byte)
    925     public static final int MAPPING_HAS_RAW_MAPPING=0x40;  // a raw mapping is stored before firstUnit (and before the ccc/lccc word, if any)
    926     // unused bit 0x20;
    927     public static final int MAPPING_LENGTH_MASK=0x1f;  // low 5 bits of firstUnit: mapping length in code units
    928     // NOTE(review): the COMP_* values presumably describe the composition-list encoding; their consumers are not shown here -- confirm.
    929     public static final int COMP_1_LAST_TUPLE=0x8000;
    930     public static final int COMP_1_TRIPLE=1;
    931     public static final int COMP_1_TRAIL_LIMIT=0x3400;
    932     public static final int COMP_1_TRAIL_MASK=0x7ffe;
    933     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    934     public static final int COMP_2_TRAIL_SHIFT=6;
    935     public static final int COMP_2_TRAIL_MASK=0xffc0;
    936 
    937     // higher-level functionality ------------------------------------------ ***
    938 
    939     // NFD without an NFD Normalizer2 instance.
    940     public Appendable decompose(CharSequence s, StringBuilder dest) {
    941         decompose(s, 0, s.length(), dest, s.length());
    942         return dest;
    943     }
    944     /**
    945      * Decomposes s[src, limit[ and writes the result to dest.
    946      * limit can be NULL if src is NUL-terminated.
    947      * destLengthEstimate is the initial dest buffer capacity and can be -1.
    948      */
    949     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
    950                    int destLengthEstimate) {
    951         if(destLengthEstimate<0) {
    952             destLengthEstimate=limit-src;
    953         }
    954         dest.setLength(0);
    955         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
    956         decompose(s, src, limit, buffer);
    957     }
    958 
    959     // Dual functionality, selected by whether a ReorderingBuffer is supplied:
    960     // buffer!=null: normalize s[src, limit[ into buffer; the loop only ends at src==limit, so limit is returned.
    961     // buffer==null: isNormalized/quickCheck/spanQuickCheckYes; returns limit if all of the text passes, else the last recorded boundary.
    962     public int decompose(CharSequence s, int src, int limit,
    963                          ReorderingBuffer buffer) {
    964         int minNoCP=minDecompNoCP;  // local copy of the field for the fast scan below
    965 
    966         int prevSrc;
    967         int c=0;
    968         int norm16=0;
    969 
    970         // only for quick check (buffer==null)
    971         int prevBoundary=src;
    972         int prevCC=0;
    973 
    974         for(;;) {
    975             // count code units below the minimum or with irrelevant data for the quick check
    976             for(prevSrc=src; src!=limit;) {
    977                 if( (c=s.charAt(src))<minNoCP ||
    978                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
    979                 ) {
    980                     ++src;
    981                 } else if(!UTF16.isSurrogate((char)c)) {
    982                     break;  // BMP code unit that needs a closer look; c and norm16 are set
    983                 } else {
    984                     char c2;
    985                     if(UTF16Plus.isSurrogateLead(c)) {
    986                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
    987                             c=Character.toCodePoint((char)c, c2);
    988                         }
    989                     } else /* trail surrogate */ {
    990                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
    991                             --src;  // back up to the lead surrogate so that src points at the start of the pair
    992                             c=Character.toCodePoint(c2, (char)c);
    993                         }
    994                     }
    995                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {  // look up again with the full code point (or the unpaired surrogate)
    996                         src+=Character.charCount(c);
    997                     } else {
    998                         break;
    999                     }
   1000                 }
   1001             }
   1002             // copy these code units all at once
   1003             if(src!=prevSrc) {
   1004                 if(buffer!=null) {
   1005                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
   1006                 } else {
   1007                     prevCC=0;  // quick check: the skipped run resets the combining-class tracking
   1008                     prevBoundary=src;  // and src becomes the latest known boundary
   1009                 }
   1010             }
   1011             if(src==limit) {
   1012                 break;
   1013             }
   1014 
   1015             // Check one above-minimum, relevant code point.
   1016             src+=Character.charCount(c);  // c and norm16 were set by the scan loop above
   1017             if(buffer!=null) {
   1018                 decompose(c, norm16, buffer);
   1019             } else {
   1020                 if(isDecompYes(norm16)) {
   1021                     int cc=getCCFromYesOrMaybe(norm16);
   1022                     if(prevCC<=cc || cc==0) {  // combining classes are still in order, or cc==0
   1023                         prevCC=cc;
   1024                         if(cc<=1) {
   1025                             prevBoundary=src;  // the boundary only advances past characters with cc<=1
   1026                         }
   1027                         continue;
   1028                     }
   1029                 }
   1030                 return prevBoundary;  // "no" or cc out of order
   1031             }
   1032         }
   1033         return src;  // reached only via the break at src==limit
   1034     }
   1035     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
   1036         int limit=s.length();
   1037         if(limit==0) {
   1038             return;
   1039         }
   1040         if(doDecompose) {
   1041             decompose(s, 0, limit, buffer);
   1042             return;
   1043         }
   1044         // Just merge the strings at the boundary.
   1045         int c=Character.codePointAt(s, 0);
   1046         int src=0;
   1047         int firstCC, prevCC, cc;
   1048         firstCC=prevCC=cc=getCC(getNorm16(c));
   1049         while(cc!=0) {
   1050             prevCC=cc;
   1051             src+=Character.charCount(c);
   1052             if(src>=limit) {
   1053                 break;
   1054             }
   1055             c=Character.codePointAt(s, src);
   1056             cc=getCC(getNorm16(c));
   1057         };
   1058         buffer.append(s, 0, src, firstCC, prevCC);
   1059         buffer.append(s, src, limit);
   1060     }
   1061 
   1062     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
   1063     // doCompose: normalize
   1064     // !doCompose: isNormalized (buffer must be empty and initialized)
   1065     public boolean compose(CharSequence s, int src, int limit,
   1066                            boolean onlyContiguous,
   1067                            boolean doCompose,
   1068                            ReorderingBuffer buffer) {
   1069         int prevBoundary=src;
   1070         int minNoMaybeCP=minCompNoMaybeCP;
   1071 
   1072         for (;;) {
   1073             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
   1074             // or with (compYes && ccc==0) properties.
   1075             int prevSrc;
   1076             int c = 0;
   1077             int norm16 = 0;
   1078             for (;;) {
   1079                 if (src == limit) {
   1080                     if (prevBoundary != limit && doCompose) {
   1081                         buffer.append(s, prevBoundary, limit);
   1082                     }
   1083                     return true;
   1084                 }
   1085                 if( (c=s.charAt(src))<minNoMaybeCP ||
   1086                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
   1087                 ) {
   1088                     ++src;
   1089                 } else {
   1090                     prevSrc = src++;
   1091                     if(!UTF16.isSurrogate((char)c)) {
   1092                         break;
   1093                     } else {
   1094                         char c2;
   1095                         if(UTF16Plus.isSurrogateLead(c)) {
   1096                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
   1097                                 ++src;
   1098                                 c=Character.toCodePoint((char)c, c2);
   1099                             }
   1100                         } else /* trail surrogate */ {
   1101                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
   1102                                 --prevSrc;
   1103                                 c=Character.toCodePoint(c2, (char)c);
   1104                             }
   1105                         }
   1106                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1107                             break;
   1108                         }
   1109                     }
   1110                 }
   1111             }
   1112             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1113             // The current character is either a "noNo" (has a mapping)
   1114             // or a "maybeYes" (combines backward)
   1115             // or a "yesYes" with ccc!=0.
   1116             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
   1117 
   1118             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
   1119             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
   1120                 if (!doCompose) {
   1121                     return false;
   1122                 }
   1123                 // Fast path for mapping a character that is immediately surrounded by boundaries.
   1124                 // In this case, we need not decompose around the current character.
   1125                 if (isDecompNoAlgorithmic(norm16)) {
   1126                     // Maps to a single isCompYesAndZeroCC character
   1127                     // which also implies hasCompBoundaryBefore.
   1128                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
   1129                             hasCompBoundaryBefore(s, src, limit)) {
   1130                         if (prevBoundary != prevSrc) {
   1131                             buffer.append(s, prevBoundary, prevSrc);
   1132                         }
   1133                         buffer.append(mapAlgorithmic(c, norm16), 0);
   1134                         prevBoundary = src;
   1135                         continue;
   1136                     }
   1137                 } else if (norm16 < minNoNoCompBoundaryBefore) {
   1138                     // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
   1139                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
   1140                             hasCompBoundaryBefore(s, src, limit)) {
   1141                         if (prevBoundary != prevSrc) {
   1142                             buffer.append(s, prevBoundary, prevSrc);
   1143                         }
   1144                         int mapping = norm16 >> OFFSET_SHIFT;
   1145                         int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
   1146                         buffer.append(extraData, mapping, mapping + length);
   1147                         prevBoundary = src;
   1148                         continue;
   1149                     }
   1150                 } else if (norm16 >= minNoNoEmpty) {
   1151                     // The current character maps to nothing.
   1152                     // Simply omit it from the output if there is a boundary before _or_ after it.
   1153                     // The character itself implies no boundaries.
   1154                     if (hasCompBoundaryBefore(s, src, limit) ||
   1155                             hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
   1156                         if (prevBoundary != prevSrc) {
   1157                             buffer.append(s, prevBoundary, prevSrc);
   1158                         }
   1159                         prevBoundary = src;
   1160                         continue;
   1161                     }
   1162                 }
   1163                 // Other "noNo" type, or need to examine more text around this character:
   1164                 // Fall through to the slow path.
   1165             } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
   1166                 char prev=s.charAt(prevSrc-1);
   1167                 if(c<Hangul.JAMO_T_BASE) {
   1168                     // The current character is a Jamo Vowel,
   1169                     // compose with previous Jamo L and following Jamo T.
   1170                     char l = (char)(prev-Hangul.JAMO_L_BASE);
   1171                     if(l<Hangul.JAMO_L_COUNT) {
   1172                         if (!doCompose) {
   1173                             return false;
   1174                         }
   1175                         int t;
   1176                         if (src != limit &&
   1177                                 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
   1178                                 t < Hangul.JAMO_T_COUNT) {
   1179                             // The next character is a Jamo T.
   1180                             ++src;
   1181                         } else if (hasCompBoundaryBefore(s, src, limit)) {
   1182                             // No Jamo T follows, not even via decomposition.
   1183                             t = 0;
   1184                         } else {
   1185                             t = -1;
   1186                         }
   1187                         if (t >= 0) {
   1188                             int syllable = Hangul.HANGUL_BASE +
   1189                                 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
   1190                                 Hangul.JAMO_T_COUNT + t;
   1191                             --prevSrc;  // Replace the Jamo L as well.
   1192                             if (prevBoundary != prevSrc) {
   1193                                 buffer.append(s, prevBoundary, prevSrc);
   1194                             }
   1195                             buffer.append((char)syllable);
   1196                             prevBoundary = src;
   1197                             continue;
   1198                         }
   1199                         // If we see L+V+x where x!=T then we drop to the slow path,
   1200                         // decompose and recompose.
   1201                         // This is to deal with NFKC finding normal L and V but a
   1202                         // compatibility variant of a T.
   1203                         // We need to either fully compose that combination here
   1204                         // (which would complicate the code and may not work with strange custom data)
   1205                         // or use the slow path.
   1206                     }
   1207                 } else if (Hangul.isHangulLV(prev)) {
   1208                     // The current character is a Jamo Trailing consonant,
   1209                     // compose with previous Hangul LV that does not contain a Jamo T.
   1210                     if (!doCompose) {
   1211                         return false;
   1212                     }
   1213                     int syllable = prev + c - Hangul.JAMO_T_BASE;
   1214                     --prevSrc;  // Replace the Hangul LV as well.
   1215                     if (prevBoundary != prevSrc) {
   1216                         buffer.append(s, prevBoundary, prevSrc);
   1217                     }
   1218                     buffer.append((char)syllable);
   1219                     prevBoundary = src;
   1220                     continue;
   1221                 }
   1222                 // No matching context, or may need to decompose surrounding text first:
   1223                 // Fall through to the slow path.
   1224             } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
   1225                 // One or more combining marks that do not combine-back:
   1226                 // Check for canonical order, copy unchanged if ok and
   1227                 // if followed by a character with a boundary-before.
   1228                 int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
   1229                 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
   1230                     // Fails FCD test, need to decompose and contiguously recompose.
   1231                     if (!doCompose) {
   1232                         return false;
   1233                     }
   1234                 } else {
   1235                     // If !onlyContiguous (not FCC), then we ignore the tccc of
   1236                     // the previous character which passed the quick check "yes && ccc==0" test.
   1237                     int n16;
   1238                     for (;;) {
   1239                         if (src == limit) {
   1240                             if (doCompose) {
   1241                                 buffer.append(s, prevBoundary, limit);
   1242                             }
   1243                             return true;
   1244                         }
   1245                         int prevCC = cc;
   1246                         c = Character.codePointAt(s, src);
   1247                         n16 = normTrie.get(c);
   1248                         if (n16 >= MIN_YES_YES_WITH_CC) {
   1249                             cc = getCCFromNormalYesOrMaybe(n16);
   1250                             if (prevCC > cc) {
   1251                                 if (!doCompose) {
   1252                                     return false;
   1253                                 }
   1254                                 break;
   1255                             }
   1256                         } else {
   1257                             break;
   1258                         }
   1259                         src += Character.charCount(c);
   1260                     }
   1261                     // p is after the last in-order combining mark.
   1262                     // If there is a boundary here, then we continue with no change.
   1263                     if (norm16HasCompBoundaryBefore(n16)) {
   1264                         if (isCompYesAndZeroCC(n16)) {
   1265                             src += Character.charCount(c);
   1266                         }
   1267                         continue;
   1268                     }
   1269                     // Use the slow path. There is no boundary in [prevSrc, src[.
   1270                 }
   1271             }
   1272 
   1273             // Slow path: Find the nearest boundaries around the current character,
   1274             // decompose and recompose.
   1275             if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
   1276                 c = Character.codePointBefore(s, prevSrc);
   1277                 norm16 = normTrie.get(c);
   1278                 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
   1279                     prevSrc -= Character.charCount(c);
   1280                 }
   1281             }
   1282             if (doCompose && prevBoundary != prevSrc) {
   1283                 buffer.append(s, prevBoundary, prevSrc);
   1284             }
   1285             int recomposeStartIndex=buffer.length();
   1286             // We know there is not a boundary here.
   1287             decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
   1288                            buffer);
   1289             // Decompose until the next boundary.
   1290             src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
   1291                                  buffer);
   1292             recompose(buffer, recomposeStartIndex, onlyContiguous);
   1293             if(!doCompose) {
   1294                 if(!buffer.equals(s, prevSrc, src)) {
   1295                     return false;
   1296                 }
   1297                 buffer.remove();
   1298             }
   1299             prevBoundary=src;
   1300         }
   1301     }
   1302 
   1303     /**
   1304      * Very similar to compose(): Make the same changes in both places if relevant.
   1305      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
   1306      * !doSpan: quickCheck
             *
             * <p>Only reads s; no output is produced. The scan alternates between a fast path
             * over characters that pass the "compYes and ccc==0" test and a check of runs of
             * "maybe" or non-zero-ccc characters for ascending combining-class order.
             *
             * @param s the text to check
             * @param src index in s at which scanning starts
             * @param limit index in s at which scanning stops (exclusive)
             * @param onlyContiguous true for the FCC variant, which also compares the trailing
             *        combining class of the preceding character with the current ccc
             * @param doSpan true to return at the first "maybe" character instead of recording it
   1307      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
   1308      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
   1309      *         then the quick check result is "no"
   1310      */
   1311     public int composeQuickCheck(CharSequence s, int src, int limit,
   1312                                  boolean onlyContiguous, boolean doSpan) {
                // Becomes 1 once a "maybe" character has been seen (only set when !doSpan).
   1313         int qcResult=0;
                // Index that is returned (shifted left by 1) when the result is "no".
   1314         int prevBoundary=src;
   1315         int minNoMaybeCP=minCompNoMaybeCP;
   1316 
   1317         for(;;) {
   1318             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
   1319             // or with (compYes && ccc==0) properties.
   1320             int prevSrc;
   1321             int c = 0;
   1322             int norm16 = 0;
   1323             for (;;) {
   1324                 if(src==limit) {
   1325                     return (src<<1)|qcResult;  // "yes" or "maybe"
   1326                 }
   1327                 if( (c=s.charAt(src))<minNoMaybeCP ||
   1328                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
   1329                 ) {
   1330                     ++src;
   1331                 } else {
                            // prevSrc marks the start of the character that stopped the fast path.
   1332                     prevSrc = src++;
   1333                     if(!UTF16.isSurrogate((char)c)) {
   1334                         break;
   1335                     } else {
                                // Assemble the supplementary code point if the surrogate is paired,
                                // then repeat the test with the norm16 of the whole code point.
   1336                         char c2;
   1337                         if(UTF16Plus.isSurrogateLead(c)) {
   1338                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
   1339                                 ++src;
   1340                                 c=Character.toCodePoint((char)c, c2);
   1341                             }
   1342                         } else /* trail surrogate */ {
   1343                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
   1344                                 --prevSrc;
   1345                                 c=Character.toCodePoint(c2, (char)c);
   1346                             }
   1347                         }
   1348                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1349                             break;
   1350                         }
   1351                     }
   1352                 }
   1353             }
   1354             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1355             // The current character is either a "noNo" (has a mapping)
   1356             // or a "maybeYes" (combines backward)
   1357             // or a "yesYes" with ccc!=0.
   1358             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
   1359 
                    // Move prevBoundary up to the current character, or to the start of the
                    // preceding code point when there is no boundary between the two.
                    // prevNorm16 stays INERT unless that preceding code point is included.
   1360             int prevNorm16 = INERT;
   1361             if (prevBoundary != prevSrc) {
   1362                 prevBoundary = prevSrc;
   1363                 if (!norm16HasCompBoundaryBefore(norm16)) {
   1364                     c = Character.codePointBefore(s, prevSrc);
   1365                     int n16 = getNorm16(c);
   1366                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
   1367                         prevBoundary -= Character.charCount(c);
   1368                         prevNorm16 = n16;
   1369                     }
   1370                 }
   1371             }
   1372 
   1373             if(isMaybeOrNonZeroCC(norm16)) {
   1374                 int cc=getCCFromYesOrMaybe(norm16);
   1375                 if (onlyContiguous /* FCC */ && cc != 0 &&
   1376                         getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
   1377                     // The [prevBoundary..prevSrc[ character
   1378                     // passed the quick check "yes && ccc==0" test
   1379                     // but is out of canonical order with the current combining mark.
                            // This branch is intentionally empty: control falls through to the
                            // "no" return at the end of the outer loop body.
   1380                 } else {
   1381                     // If !onlyContiguous (not FCC), then we ignore the tccc of
   1382                     // the previous character which passed the quick check "yes && ccc==0" test.
   1383                     for (;;) {
                                // norm16 below MIN_YES_YES_WITH_CC here is treated as "maybe".
   1384                         if (norm16 < MIN_YES_YES_WITH_CC) {
   1385                             if (!doSpan) {
   1386                                 qcResult = 1;
   1387                             } else {
   1388                                 return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
   1389                             }
   1390                         }
   1391                         if (src == limit) {
   1392                             return (src<<1) | qcResult;  // "yes" or "maybe"
   1393                         }
   1394                         int prevCC = cc;
   1395                         c = Character.codePointAt(s, src);
   1396                         norm16 = getNorm16(c);
   1397                         if (isMaybeOrNonZeroCC(norm16)) {
   1398                             cc = getCCFromYesOrMaybe(norm16);
                                    // Stop when a non-zero cc is lower than the previous one.
   1399                             if (!(prevCC <= cc || cc == 0)) {
   1400                                 break;
   1401                             }
   1402                         } else {
   1403                             break;
   1404                         }
   1405                         src += Character.charCount(c);
   1406                     }
   1407                     // src is after the last in-order combining mark.
                            // If the run ended on a "compYes and ccc==0" character, record the
                            // boundary before it and resume the fast path after it.
   1408                     if (isCompYesAndZeroCC(norm16)) {
   1409                         prevBoundary = src;
   1410                         src += Character.charCount(c);
   1411                         continue;
   1412                     }
   1413                 }
   1414             }
   1415             return prevBoundary<<1;  // "no"
   1416         }
   1417     }
   1418     public void composeAndAppend(CharSequence s,
   1419                                  boolean doCompose,
   1420                                  boolean onlyContiguous,
   1421                                  ReorderingBuffer buffer) {
   1422         int src=0, limit=s.length();
   1423         if(!buffer.isEmpty()) {
   1424             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
   1425             if(0!=firstStarterInSrc) {
   1426                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
   1427                                                                buffer.length(), onlyContiguous);
   1428                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
   1429                                                        firstStarterInSrc+16);
   1430                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
   1431                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
   1432                 middle.append(s, 0, firstStarterInSrc);
   1433                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
   1434                 src=firstStarterInSrc;
   1435             }
   1436         }
   1437         if(doCompose) {
   1438             compose(s, src, limit, onlyContiguous, true, buffer);
   1439         } else {
   1440             buffer.append(s, src, limit);
   1441         }
   1442     }
   1443     // Dual functionality:
   1444     // buffer!=null: normalize
   1445     // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
            //
            // Processes s from src up to (not including) limit.
            // Returns src once the loop has reached limit; when buffer==null and a character is
            // found out of order (previous tccc > current lccc), returns prevBoundary instead.
   1446     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
   1447         // Note: In this function we use buffer.appendZeroCC() because we track
   1448         // the lead and trail combining classes here, rather than leaving it to
   1449         // the ReorderingBuffer.
   1450         // The exception is the call to decomposeShort() which uses the buffer
   1451         // in the normal way.
   1452 
   1453         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
   1454         // Similar to the prevBoundary in the compose() implementation.
   1455         int prevBoundary=src;
   1456         int prevSrc;
   1457         int c=0;
                // fcd16 values are used below as lccc in the upper byte (fcd16>>8)
                // and tccc in the lower byte (fcd16&0xff).
   1458         int prevFCD16=0;
   1459         int fcd16=0;
   1460 
   1461         for(;;) {
   1462             // count code units with lccc==0
   1463             for(prevSrc=src; src!=limit;) {
   1464                 if((c=s.charAt(src))<minLcccCP) {
                            // Defer the data lookup: remember the code unit as a negative value.
   1465                     prevFCD16=~c;
   1466                     ++src;
   1467                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
   1468                     prevFCD16=0;
   1469                     ++src;
   1470                 } else {
   1471                     if(UTF16.isSurrogate((char)c)) {
   1472                         char c2;
   1473                         if(UTF16Plus.isSurrogateLead(c)) {
   1474                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
   1475                                 c=Character.toCodePoint((char)c, c2);
   1476                             }
   1477                         } else /* trail surrogate */ {
   1478                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
   1479                                 --src;
   1480                                 c=Character.toCodePoint(c2, (char)c);
   1481                             }
   1482                         }
   1483                     }
                            // fcd16<=0xff: the upper byte (lccc) is 0, keep scanning.
   1484                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
   1485                         prevFCD16=fcd16;
   1486                         src+=Character.charCount(c);
   1487                     } else {
   1488                         break;
   1489                     }
   1490                 }
   1491             }
   1492             // copy these code units all at once
   1493             if(src!=prevSrc) {
   1494                 if(src==limit) {
   1495                     if(buffer!=null) {
   1496                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
   1497                     }
   1498                     break;
   1499                 }
   1500                 prevBoundary=src;
   1501                 // We know that the previous character's lccc==0.
   1502                 if(prevFCD16<0) {
   1503                     // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
   1504                     int prev=~prevFCD16;
   1505                     if(prev<minDecompNoCP) {
   1506                         prevFCD16=0;
   1507                     } else {
   1508                         prevFCD16=getFCD16FromNormData(prev);
                                // tccc>1: the boundary is before the previous code unit, not after it.
   1509                         if(prevFCD16>1) {
   1510                             --prevBoundary;
   1511                         }
   1512                     }
   1513                 } else {
   1514                     int p=src-1;
   1515                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
   1516                         Character.isHighSurrogate(s.charAt(p-1))
   1517                     ) {
   1518                         --p;
   1519                         // Need to fetch the previous character's FCD value because
   1520                         // prevFCD16 was just for the trail surrogate code point.
   1521                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
   1522                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
   1523                     }
   1524                     if(prevFCD16>1) {
   1525                         prevBoundary=p;
   1526                     }
   1527                 }
   1528                 if(buffer!=null) {
   1529                     // The last lccc==0 character is excluded from the
   1530                     // flush-and-append call in case it needs to be modified.
   1531                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
   1532                     buffer.append(s, prevBoundary, src);
   1533                 }
   1534                 // The start of the current character (c).
   1535                 prevSrc=src;
   1536             } else if(src==limit) {
   1537                 break;
   1538             }
   1539 
   1540             src+=Character.charCount(c);
   1541             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
   1542             // Check for proper order, and decompose locally if necessary.
   1543             if((prevFCD16&0xff)<=(fcd16>>8)) {
   1544                 // proper order: prev tccc <= current lccc
   1545                 if((fcd16&0xff)<=1) {
   1546                     prevBoundary=src;
   1547                 }
   1548                 if(buffer!=null) {
   1549                     buffer.appendZeroCC(c);
   1550                 }
   1551                 prevFCD16=fcd16;
   1552                 continue;
   1553             } else if(buffer==null) {
   1554                 return prevBoundary;  // quick check "no"
   1555             } else {
   1556                 /*
   1557                  * Back out the part of the source that we copied or appended
   1558                  * already but is now going to be decomposed.
   1559                  * prevSrc is set to after what was copied/appended.
   1560                  */
   1561                 buffer.removeSuffix(prevSrc-prevBoundary);
   1562                 /*
   1563                  * Find the part of the source that needs to be decomposed,
   1564                  * up to the next safe boundary.
   1565                  */
   1566                 src=findNextFCDBoundary(s, src, limit);
   1567                 /*
   1568                  * The source text does not fulfill the conditions for FCD.
   1569                  * Decompose and reorder a limited piece of the text.
   1570                  */
   1571                 decomposeShort(s, prevBoundary, src, false, false, buffer);
                        // Restart tracking from the boundary just found.
   1572                 prevBoundary=src;
   1573                 prevFCD16=0;
   1574             }
   1575         }
   1576         return src;
   1577     }
   1578     public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
   1579         int src=0, limit=s.length();
   1580         if(!buffer.isEmpty()) {
   1581             int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
   1582             if(0!=firstBoundaryInSrc) {
   1583                 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
   1584                                                                buffer.length());
   1585                 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
   1586                                                        firstBoundaryInSrc+16);
   1587                 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
   1588                 buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
   1589                 middle.append(s, 0, firstBoundaryInSrc);
   1590                 makeFCD(middle, 0, middle.length(), buffer);
   1591                 src=firstBoundaryInSrc;
   1592             }
   1593         }
   1594         if(doMakeFCD) {
   1595             makeFCD(s, src, limit, buffer);
   1596         } else {
   1597             buffer.append(s, src, limit);
   1598         }
   1599     }
   1600 
   1601     public boolean hasDecompBoundaryBefore(int c) {
   1602         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
   1603             norm16HasDecompBoundaryBefore(getNorm16(c));
   1604     }
   1605     public boolean norm16HasDecompBoundaryBefore(int norm16) {
   1606         if (norm16 < minNoNoCompNoMaybeCC) {
   1607             return true;
   1608         }
   1609         if (norm16 >= limitNoNo) {
   1610             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
   1611         }
   1612         // c decomposes, get everything from the variable-length extra data
   1613         int mapping=norm16>>OFFSET_SHIFT;
   1614         int firstUnit=extraData.charAt(mapping);
   1615         // true if leadCC==0 (hasFCDBoundaryBefore())
   1616         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
   1617     }
   1618     public boolean hasDecompBoundaryAfter(int c) {
   1619         if (c < minDecompNoCP) {
   1620             return true;
   1621         }
   1622         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
   1623             return true;
   1624         }
   1625         return norm16HasDecompBoundaryAfter(getNorm16(c));
   1626     }
   1627     public boolean norm16HasDecompBoundaryAfter(int norm16) {
   1628         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
   1629             return true;
   1630         }
   1631         if (norm16 >= limitNoNo) {
   1632             if (isMaybeOrNonZeroCC(norm16)) {
   1633                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
   1634             }
   1635             // Maps to an isCompYesAndZeroCC.
   1636             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
   1637         }
   1638         // c decomposes, get everything from the variable-length extra data
   1639         int mapping=norm16>>OFFSET_SHIFT;
   1640         int firstUnit=extraData.charAt(mapping);
   1641         // decomp after-boundary: same as hasFCDBoundaryAfter(),
   1642         // fcd16<=1 || trailCC==0
   1643         if(firstUnit>0x1ff) {
   1644             return false;  // trailCC>1
   1645         }
   1646         if(firstUnit<=0xff) {
   1647             return true;  // trailCC==0
   1648         }
   1649         // if(trailCC==1) test leadCC==0, same as checking for before-boundary
   1650         // true if leadCC==0 (hasFCDBoundaryBefore())
   1651         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
   1652     }
   1653     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
   1654 
   1655     public boolean hasCompBoundaryBefore(int c) {
   1656         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
   1657     }
   1658     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
   1659         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
   1660     }
   1661     public boolean isCompInert(int c, boolean onlyContiguous) {
   1662         int norm16=getNorm16(c);
   1663         return isCompYesAndZeroCC(norm16) &&
   1664             (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
   1665             (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff);
   1666     }
   1667 
   1668     public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); }
   1669     public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); }
   1670     public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
   1671 
   1672     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
   1673     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
   1674     private static boolean isInert(int norm16) { return norm16==INERT; }
   1675     private static boolean isJamoL(int norm16) { return norm16==JAMO_L; }
   1676     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
   1677     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
   1678     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
   1679     private boolean isHangulLVT(int norm16) {
   1680         return norm16==hangulLVT();
   1681     }
   1682     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
   1683     // UBool isCompYes(uint16_t norm16) const {
   1684     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
   1685     // }
   1686     // UBool isCompYesOrMaybe(uint16_t norm16) const {
   1687     //     return norm16<minNoNo || minMaybeYes<=norm16;
   1688     // }
   1689     // private boolean hasZeroCCFromDecompYes(int norm16) {
   1690     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
   1691     // }
   1692     private boolean isDecompYesAndZeroCC(int norm16) {
   1693         return norm16<minYesNo ||
   1694                norm16==JAMO_VT ||
   1695                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
   1696     }
   1697     /**
   1698      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
   1699      * the MaybeYes which combine-forward and have ccc=0.
   1700      * (Standard Unicode 10 normalization does not have such characters.)
   1701      */
   1702     private boolean isMostDecompYesAndZeroCC(int norm16) {
   1703         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
   1704     }
   1705     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
   1706 
   1707     // For use with isCompYes().
   1708     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
   1709     // static uint8_t getCCFromYes(uint16_t norm16) {
   1710     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
   1711     // }
   1712     private int getCCFromNoNo(int norm16) {
   1713         int mapping=norm16>>OFFSET_SHIFT;
   1714         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
   1715             return extraData.charAt(mapping-1)&0xff;
   1716         } else {
   1717             return 0;
   1718         }
   1719     }
   1720     int getTrailCCFromCompYesAndZeroCC(int norm16) {
   1721         if(norm16<=minYesNo) {
   1722             return 0;  // yesYes and Hangul LV have ccc=tccc=0
   1723         } else {
   1724             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
   1725             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
   1726         }
   1727     }
   1728 
   1729     // Requires algorithmic-NoNo.
   1730     private int mapAlgorithmic(int c, int norm16) {
   1731         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
   1732     }
   1733 
   1734     // Requires minYesNo<norm16<limitNoNo.
   1735     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
   1736 
   1737     /**
   1738      * @return index into maybeYesCompositions, or -1
   1739      */
   1740     private int getCompositionsListForDecompYes(int norm16) {
   1741         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
   1742             return -1;
   1743         } else {
   1744             if((norm16-=minMaybeYes)<0) {
   1745                 // norm16<minMaybeYes: index into extraData which is a substring at
   1746                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
   1747                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
   1748                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
   1749             }
   1750             return norm16>>OFFSET_SHIFT;
   1751         }
   1752     }
   1753     /**
   1754      * @return index into maybeYesCompositions
   1755      */
   1756     private int getCompositionsListForComposite(int norm16) {
   1757         // A composite has both mapping & compositions list.
   1758         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
   1759         int firstUnit=maybeYesCompositions.charAt(list);
   1760         return list+  // mapping in maybeYesCompositions
   1761             1+  // +1 to skip the first unit with the mapping length
   1762             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
   1763     }
   1764     private int getCompositionsListForMaybe(int norm16) {
   1765         // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
   1766         return (norm16-minMaybeYes)>>OFFSET_SHIFT;
   1767     }
   1768     /**
   1769      * @param c code point must have compositions
   1770      * @return index into maybeYesCompositions
   1771      */
   1772     private int getCompositionsList(int norm16) {
   1773         return isDecompYes(norm16) ?
   1774                 getCompositionsListForDecompYes(norm16) :
   1775                 getCompositionsListForComposite(norm16);
   1776     }
   1777 
   1778     // Decompose a short piece of text which is likely to contain characters that
   1779     // fail the quick check loop and/or where the quick check loop's overhead
   1780     // is unlikely to be amortized.
   1781     // Called by the compose() and makeFCD() implementations.
   1782     // Public in Java for collation implementation code.
   1783     private int decomposeShort(
   1784             CharSequence s, int src, int limit,
   1785             boolean stopAtCompBoundary, boolean onlyContiguous,
   1786             ReorderingBuffer buffer) {
   1787         while(src<limit) {
   1788             int c=Character.codePointAt(s, src);
   1789             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
   1790                 return src;
   1791             }
   1792             int norm16 = getNorm16(c);
   1793             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
   1794                 return src;
   1795             }
   1796             src+=Character.charCount(c);
   1797             decompose(c, norm16, buffer);
   1798             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
   1799                 return src;
   1800             }
   1801         }
   1802         return src;
   1803     }
   1804     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
   1805         // get the decomposition and the lead and trail cc's
   1806         if (norm16 >= limitNoNo) {
   1807             if (isMaybeOrNonZeroCC(norm16)) {
   1808                 buffer.append(c, getCCFromYesOrMaybe(norm16));
   1809                 return;
   1810             }
   1811             // Maps to an isCompYesAndZeroCC.
   1812             c=mapAlgorithmic(c, norm16);
   1813             norm16=getNorm16(c);
   1814         }
   1815         if (norm16 < minYesNo) {
   1816             // c does not decompose
   1817             buffer.append(c, 0);
   1818         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
   1819             // Hangul syllable: decompose algorithmically
   1820             Hangul.decompose(c, buffer);
   1821         } else {
   1822             // c decomposes, get everything from the variable-length extra data
   1823             int mapping=norm16>>OFFSET_SHIFT;
   1824             int firstUnit=extraData.charAt(mapping);
   1825             int length=firstUnit&MAPPING_LENGTH_MASK;
   1826             int leadCC, trailCC;
   1827             trailCC=firstUnit>>8;
   1828             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
   1829                 leadCC=extraData.charAt(mapping-1)>>8;
   1830             } else {
   1831                 leadCC=0;
   1832             }
   1833             ++mapping;  // skip over the firstUnit
   1834             buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
   1835         }
   1836     }
   1837 
   1838     /**
   1839      * Finds the recomposition result for
   1840      * a forward-combining "lead" character,
   1841      * specified with a pointer to its compositions list,
   1842      * and a backward-combining "trail" character.
   1843      *
   1844      * <p>If the lead and trail characters combine, then this function returns
   1845      * the following "compositeAndFwd" value:
   1846      * <pre>
   1847      * Bits 21..1  composite character
   1848      * Bit      0  set if the composite is a forward-combining starter
   1849      * </pre>
   1850      * otherwise it returns -1.
   1851      *
   1852      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
   1853      * encoded as either pairs or triples of 16-bit units.
   1854      * The last entry has the high bit of its first unit set.
   1855      *
   1856      * <p>The list is sorted by ascending trail characters (there are no duplicates).
   1857      * A linear search is used.
   1858      *
   1859      * <p>See normalizer2impl.h for a more detailed description
   1860      * of the compositions list format.
             *
             * @param compositions the string that contains the compositions list
             * @param list index in compositions of the first unit of the lead character's list
             * @param trail the trail code point to look up
             * @return the compositeAndFwd value, or -1 if no entry for trail is found
   1861      */
   1862     private static int combine(String compositions, int list, int trail) {
   1863         int key1, firstUnit;
   1864         if(trail<COMP_1_TRAIL_LIMIT) {
   1865             // trail character is 0..33FF
   1866             // result entry may have 2 or 3 units
   1867             key1=(trail<<1);
                    // Skip entries whose first unit is smaller than the key; each step moves
                    // past 2 units plus (firstUnit&COMP_1_TRIPLE) more.
   1868             while(key1>(firstUnit=compositions.charAt(list))) {
   1869                 list+=2+(firstUnit&COMP_1_TRIPLE);
   1870             }
   1871             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                        // Found the entry: a triple stores the result in two units, a pair in one.
   1872                 if((firstUnit&COMP_1_TRIPLE)!=0) {
   1873                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
   1874                 } else {
   1875                     return compositions.charAt(list+1);
   1876                 }
   1877             }
   1878         } else {
   1879             // trail character is 3400..10FFFF
   1880             // result entry has 3 units
                    // The trail is split into two keys: key1 is compared with the first unit
                    // and key2 with the trail bits of the second unit.
   1881             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
   1882             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
   1883             int secondUnit;
   1884             for(;;) {
   1885                 if(key1>(firstUnit=compositions.charAt(list))) {
   1886                     list+=2+(firstUnit&COMP_1_TRIPLE);
   1887                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
   1888                     if(key2>(secondUnit=compositions.charAt(list+1))) {
                                // Same key1 but a smaller second unit: try the next triple
                                // unless this was the last entry of the list.
   1889                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
   1890                             break;
   1891                         } else {
   1892                             list+=3;
   1893                         }
   1894                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                                // Found: the non-trail bits of the second unit are the high
                                // part of the result, the third unit is the low part.
   1895                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
   1896                     } else {
   1897                         break;
   1898                     }
   1899                 } else {
   1900                     break;
   1901                 }
   1902             }
   1903         }
   1904         return -1;
   1905     }
   1906     /**
   1907      * @param list some character's compositions list
   1908      * @param set recursively receives the composites from these compositions
   1909      */
   1910     private void addComposites(int list, UnicodeSet set) {
   1911         int firstUnit, compositeAndFwd;
   1912         do {
   1913             firstUnit=maybeYesCompositions.charAt(list);
   1914             if((firstUnit&COMP_1_TRIPLE)==0) {
   1915                 compositeAndFwd=maybeYesCompositions.charAt(list+1);
   1916                 list+=2;
   1917             } else {
   1918                 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
   1919                                 maybeYesCompositions.charAt(list+2);
   1920                 list+=3;
   1921             }
   1922             int composite=compositeAndFwd>>1;
   1923             if((compositeAndFwd&1)!=0) {
   1924                 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
   1925             }
   1926             set.add(composite);
   1927         } while((firstUnit&COMP_1_LAST_TUPLE)==0);
   1928     }
   1929     /*
   1930      * Recomposes the buffer text starting at recomposeStartIndex
   1931      * (which is in NFD - decomposed and canonically ordered),
   1932      * and truncates the buffer contents.
   1933      *
   1934      * Note that recomposition never lengthens the text:
   1935      * Any character consists of either one or two code units;
   1936      * a composition may contain at most one more code unit than the original starter,
   1937      * while the combining mark that is removed has at least one code unit.
   1938      */
   1939     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
   1940                            boolean onlyContiguous) {
   1941         StringBuilder sb=buffer.getStringBuilder();
   1942         int p=recomposeStartIndex;
   1943         if(p==sb.length()) {
   1944             return;
   1945         }
   1946 
   1947         int starter, pRemove;
   1948         int compositionsList;
   1949         int c, compositeAndFwd;
   1950         int norm16;
   1951         int cc, prevCC;
   1952         boolean starterIsSupplementary;
   1953 
   1954         // Some of the following variables are not used until we have a forward-combining starter
   1955         // and are only initialized now to avoid compiler warnings.
   1956         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
   1957         starter=-1;
   1958         starterIsSupplementary=false;
   1959         prevCC=0;
   1960 
   1961         for(;;) {
   1962             c=sb.codePointAt(p);
   1963             p+=Character.charCount(c);
   1964             norm16=getNorm16(c);
   1965             cc=getCCFromYesOrMaybe(norm16);
   1966             if( // this character combines backward and
   1967                 isMaybe(norm16) &&
   1968                 // we have seen a starter that combines forward and
   1969                 compositionsList>=0 &&
   1970                 // the backward-combining character is not blocked
   1971                 (prevCC<cc || prevCC==0)
   1972             ) {
   1973                 if(isJamoVT(norm16)) {
   1974                     // c is a Jamo V/T, see if we can compose it with the previous character.
   1975                     if(c<Hangul.JAMO_T_BASE) {
   1976                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
   1977                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
   1978                         if(prev<Hangul.JAMO_L_COUNT) {
   1979                             pRemove=p-1;
   1980                             char syllable=(char)
   1981                                 (Hangul.HANGUL_BASE+
   1982                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
   1983                                  Hangul.JAMO_T_COUNT);
   1984                             char t;
   1985                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
   1986                                 ++p;
   1987                                 syllable+=t;  // The next character was a Jamo T.
   1988                             }
   1989                             sb.setCharAt(starter, syllable);
   1990                             // remove the Jamo V/T
   1991                             sb.delete(pRemove, p);
   1992                             p=pRemove;
   1993                         }
   1994                     }
   1995                     /*
   1996                      * No "else" for Jamo T:
   1997                      * Since the input is in NFD, there are no Hangul LV syllables that
   1998                      * a Jamo T could combine with.
   1999                      * All Jamo Ts are combined above when handling Jamo Vs.
   2000                      */
   2001                     if(p==sb.length()) {
   2002                         break;
   2003                     }
   2004                     compositionsList=-1;
   2005                     continue;
   2006                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
   2007                     // The starter and the combining mark (c) do combine.
   2008                     int composite=compositeAndFwd>>1;
   2009 
   2010                     // Remove the combining mark.
   2011                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
   2012                     sb.delete(pRemove, p);
   2013                     p=pRemove;
   2014                     // Replace the starter with the composite.
   2015                     if(starterIsSupplementary) {
   2016                         if(composite>0xffff) {
   2017                             // both are supplementary
   2018                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
   2019                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
   2020                         } else {
   2021                             sb.setCharAt(starter, (char)c);
   2022                             sb.deleteCharAt(starter+1);
   2023                             // The composite is shorter than the starter,
   2024                             // move the intermediate characters forward one.
   2025                             starterIsSupplementary=false;
   2026                             --p;
   2027                         }
   2028                     } else if(composite>0xffff) {
   2029                         // The composite is longer than the starter,
   2030                         // move the intermediate characters back one.
   2031                         starterIsSupplementary=true;
   2032                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
   2033                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
   2034                         ++p;
   2035                     } else {
   2036                         // both are on the BMP
   2037                         sb.setCharAt(starter, (char)composite);
   2038                     }
   2039 
   2040                     // Keep prevCC because we removed the combining mark.
   2041 
   2042                     if(p==sb.length()) {
   2043                         break;
   2044                     }
   2045                     // Is the composite a starter that combines forward?
   2046                     if((compositeAndFwd&1)!=0) {
   2047                         compositionsList=
   2048                             getCompositionsListForComposite(getNorm16(composite));
   2049                     } else {
   2050                         compositionsList=-1;
   2051                     }
   2052 
   2053                     // We combined; continue with looking for compositions.
   2054                     continue;
   2055                 }
   2056             }
   2057 
   2058             // no combination this time
   2059             prevCC=cc;
   2060             if(p==sb.length()) {
   2061                 break;
   2062             }
   2063 
   2064             // If c did not combine, then check if it is a starter.
   2065             if(cc==0) {
   2066                 // Found a new starter.
   2067                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
   2068                     // It may combine with something, prepare for it.
   2069                     if(c<=0xffff) {
   2070                         starterIsSupplementary=false;
   2071                         starter=p-1;
   2072                     } else {
   2073                         starterIsSupplementary=true;
   2074                         starter=p-2;
   2075                     }
   2076                 }
   2077             } else if(onlyContiguous) {
   2078                 // FCC: no discontiguous compositions; any intervening character blocks.
   2079                 compositionsList=-1;
   2080             }
   2081         }
   2082         buffer.flush();
   2083     }
   2084 
   2085     public int composePair(int a, int b) {
   2086         int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
   2087         int list;
   2088         if(isInert(norm16)) {
   2089             return -1;
   2090         } else if(norm16<minYesNoMappingsOnly) {
   2091             // a combines forward.
   2092             if(isJamoL(norm16)) {
   2093                 b-=Hangul.JAMO_V_BASE;
   2094                 if(0<=b && b<Hangul.JAMO_V_COUNT) {
   2095                     return
   2096                         (Hangul.HANGUL_BASE+
   2097                          ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
   2098                          Hangul.JAMO_T_COUNT);
   2099                 } else {
   2100                     return -1;
   2101                 }
   2102             } else if(isHangulLV(norm16)) {
   2103                 b-=Hangul.JAMO_T_BASE;
   2104                 if(0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
   2105                     return a+b;
   2106                 } else {
   2107                     return -1;
   2108                 }
   2109             } else {
   2110                 // 'a' has a compositions list in extraData
   2111                 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
   2112                 if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
   2113                     list+=  // mapping pointer
   2114                         1+  // +1 to skip the first unit with the mapping length
   2115                         (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
   2116                 }
   2117             }
   2118         } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
   2119             return -1;
   2120         } else {
   2121             list=getCompositionsListForMaybe(norm16);  // offset into maybeYesCompositions
   2122         }
   2123         if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
   2124             return -1;
   2125         }
   2126         return combine(maybeYesCompositions, list, b)>>1;
   2127     }
   2128 
   2129     /**
   2130      * Does c have a composition boundary before it?
   2131      * True if its decomposition begins with a character that has
   2132      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
   2133      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
   2134      * (isCompYesAndZeroCC()) so we need not decompose.
   2135      */
   2136     private boolean hasCompBoundaryBefore(int c, int norm16) {
   2137         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
   2138     }
   2139     private boolean norm16HasCompBoundaryBefore(int norm16) {
   2140         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
   2141     }
   2142     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
   2143         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
   2144     }
   2145     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
   2146         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
   2147             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
   2148     }
   2149     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
   2150         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
   2151     }
   2152     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
   2153     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
   2154         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
   2155             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
   2156     }
   2157 
   2158     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
   2159         while(p>0) {
   2160             int c=Character.codePointBefore(s, p);
   2161             int norm16 = getNorm16(c);
   2162             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
   2163                 break;
   2164             }
   2165             p-=Character.charCount(c);
   2166             if(hasCompBoundaryBefore(c, norm16)) {
   2167                 break;
   2168             }
   2169         }
   2170         return p;
   2171     }
   2172     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
   2173         while(p<limit) {
   2174             int c=Character.codePointAt(s, p);
   2175             int norm16=normTrie.get(c);
   2176             if(hasCompBoundaryBefore(c, norm16)) {
   2177                 break;
   2178             }
   2179             p+=Character.charCount(c);
   2180             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
   2181                 break;
   2182             }
   2183         }
   2184         return p;
   2185     }
   2186 
   2187     private int findPreviousFCDBoundary(CharSequence s, int p) {
   2188         while(p>0) {
   2189             int c=Character.codePointBefore(s, p);
   2190             int norm16;
   2191             if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) {
   2192                 break;
   2193             }
   2194             p-=Character.charCount(c);
   2195             if (norm16HasDecompBoundaryBefore(norm16)) {
   2196                 break;
   2197             }
   2198         }
   2199         return p;
   2200     }
   2201     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
   2202         while(p<limit) {
   2203             int c=Character.codePointAt(s, p);
   2204             int norm16;
   2205             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
   2206                 break;
   2207             }
   2208             p+=Character.charCount(c);
   2209             if (norm16HasDecompBoundaryAfter(norm16)) {
   2210                 break;
   2211             }
   2212         }
   2213         return p;
   2214     }
   2215 
   2216     private int getPreviousTrailCC(CharSequence s, int start, int p) {
   2217         if (start == p) {
   2218             return 0;
   2219         }
   2220         return getFCD16(Character.codePointBefore(s, p));
   2221     }
   2222 
   2223     private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
   2224         int canonValue=newData.get(decompLead);
   2225         if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
   2226             // origin is the first character whose decomposition starts with
   2227             // the character for which we are setting the value.
   2228             newData.set(decompLead, canonValue|origin);
   2229         } else {
   2230             // origin is not the first character, or it is U+0000.
   2231             UnicodeSet set;
   2232             if((canonValue&CANON_HAS_SET)==0) {
   2233                 int firstOrigin=canonValue&CANON_VALUE_MASK;
   2234                 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
   2235                 newData.set(decompLead, canonValue);
   2236                 canonStartSets.add(set=new UnicodeSet());
   2237                 if(firstOrigin!=0) {
   2238                     set.add(firstOrigin);
   2239                 }
   2240             } else {
   2241                 set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
   2242             }
   2243             set.add(origin);
   2244         }
   2245     }
   2246 
    // Version of the normalization data; annotated as unused.
    @SuppressWarnings("unused")
    private VersionInfo dataVersion;

    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
    // findPreviousFCDBoundary() stops at code points below minDecompNoCP.
    private int minDecompNoCP;
    // Code points below minCompNoMaybeCP always have a composition boundary before them.
    private int minCompNoMaybeCP;
    // findNextFCDBoundary() stops at code points below minLcccCP.
    private int minLcccCP;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    // composePair(): a norm16 above minYesNo (and below minYesNoMappingsOnly) has
    // both a mapping and a compositions list.
    private int minYesNo;
    // composePair(): a non-inert norm16 below this value combines forward.
    private int minYesNoMappingsOnly;
    private int minNoNo;
    private int minNoNoCompBoundaryBefore;
    // A norm16 below this value has a composition boundary before it.
    private int minNoNoCompNoMaybeCC;
    private int minNoNoEmpty;
    private int limitNoNo;
    private int centerNoNoDelta;
    // composePair(): norm16 values from minMaybeYes up to MIN_NORMAL_MAYBE_YES
    // have a compositions list obtained via getCompositionsListForMaybe().
    private int minMaybeYes;

    // Maps a code point to its norm16 value.
    private Trie2_16 normTrie;
    // Holds the compositions lists that combine() and addComposites() read.
    private String maybeYesCompositions;
    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0

    private Trie2_32 canonIterData;
    // Sets of origin characters, indexed by the value bits of a canonIterData
    // entry that has CANON_HAS_SET (see addToStartSet()).
    private ArrayList<UnicodeSet> canonStartSets;

    // bits in canonIterData
    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
    // Set when the value bits are an index into canonStartSets rather than a single origin.
    private static final int CANON_HAS_SET = 0x200000;
    // Low 21 bits: a single origin code point, or a canonStartSets index.
    private static final int CANON_VALUE_MASK = 0x1fffff;
   2279 }
   2280