Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  *
      6  *   Copyright (C) 2004-2015, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *
      9  *******************************************************************************
     10  *   file name:  UCaseProps.java
     11  *   encoding:   US-ASCII
     12  *   tab size:   8 (not used)
     13  *   indentation:4
     14  *
     15  *   created on: 2005jan29
     16  *   created by: Markus W. Scherer
     17  *
     18  *   Low-level Unicode character/string case mapping code.
     19  *   Java port of ucase.h/.c.
     20  */
     21 
     22 package com.ibm.icu.impl;
     23 
     24 import java.io.IOException;
     25 import java.nio.ByteBuffer;
     26 import java.util.Iterator;
     27 import java.util.Locale;
     28 
     29 import com.ibm.icu.lang.UCharacter;
     30 import com.ibm.icu.lang.UProperty;
     31 import com.ibm.icu.text.UTF16;
     32 import com.ibm.icu.text.UnicodeSet;
     33 import com.ibm.icu.util.ICUUncheckedIOException;
     34 import com.ibm.icu.util.ULocale;
     35 
     36 public final class UCaseProps {
     37 
     38     // constructors etc. --------------------------------------------------- ***
     39 
     40     // port of ucase_openProps()
     41     private UCaseProps() throws IOException {
     42         ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
     43         readData(bytes);
     44     }
     45 
     46     private final void readData(ByteBuffer bytes) throws IOException {
     47         // read the header
     48         ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
     49 
     50         // read indexes[]
     51         int count=bytes.getInt();
     52         if(count<IX_TOP) {
     53             throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
     54         }
     55         indexes=new int[count];
     56 
     57         indexes[0]=count;
     58         for(int i=1; i<count; ++i) {
     59             indexes[i]=bytes.getInt();
     60         }
     61 
     62         // read the trie
     63         trie=Trie2_16.createFromSerialized(bytes);
     64         int expectedTrieLength=indexes[IX_TRIE_SIZE];
     65         int trieLength=trie.getSerializedLength();
     66         if(trieLength>expectedTrieLength) {
     67             throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
     68         }
     69         // skip padding after trie bytes
     70         ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);
     71 
     72         // read exceptions[]
     73         count=indexes[IX_EXC_LENGTH];
     74         if(count>0) {
     75             exceptions=ICUBinary.getString(bytes, count, 0);
     76         }
     77 
     78         // read unfold[]
     79         count=indexes[IX_UNFOLD_LENGTH];
     80         if(count>0) {
     81             unfold=ICUBinary.getChars(bytes, count, 0);
     82         }
     83     }
     84 
     85     // implement ICUBinary.Authenticate
     86     private final static class IsAcceptable implements ICUBinary.Authenticate {
     87         @Override
     88         public boolean isDataVersionAcceptable(byte version[]) {
     89             return version[0]==3;
     90         }
     91     }
     92 
     93     // set of property starts for UnicodeSet ------------------------------- ***
     94 
     95     public final void addPropertyStarts(UnicodeSet set) {
     96         /* add the start code point of each same-value range of the trie */
     97         Iterator<Trie2.Range> trieIterator=trie.iterator();
     98         Trie2.Range range;
     99         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
    100             set.add(range.startCodePoint);
    101         }
    102 
    103         /* add code points with hardcoded properties, plus the ones following them */
    104 
    105         /* (none right now, see comment below) */
    106 
    107         /*
    108          * Omit code points with hardcoded specialcasing properties
    109          * because we do not build property UnicodeSets for them right now.
    110          */
    111     }
    112 
    113     // data access primitives ---------------------------------------------- ***
    114     private static final int getExceptionsOffset(int props) {
    115         return props>>EXC_SHIFT;
    116     }
    117 
    118     private static final boolean propsHasException(int props) {
    119         return (props&EXCEPTION)!=0;
    120     }
    121 
    122     /* number of bits in an 8-bit integer value */
    private static final byte flagsOffset[/*256*/]=new byte[256];
    static {
        // Entry i holds the number of 1-bits in i (0..255).
        for(int i=0; i<flagsOffset.length; ++i) {
            flagsOffset[i]=(byte)Integer.bitCount(i);
        }
    }
    141 
    142     private static final boolean hasSlot(int flags, int index) {
    143         return (flags&(1<<index))!=0;
    144     }
    145     private static final byte slotOffset(int flags, int index) {
    146         return flagsOffset[flags&((1<<index)-1)];
    147     }
    148 
    149     /*
    150      * Get the value of an optional-value slot where hasSlot(excWord, index).
    151      *
    152      * @param excWord (in) initial exceptions word
    153      * @param index (in) desired slot index
    154      * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++);
    155      * @return bits 31..0: slot value
    156      *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
    157      */
    158     private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {
    159         long value;
    160         if((excWord&EXC_DOUBLE_SLOTS)==0) {
    161             excOffset+=slotOffset(excWord, index);
    162             value=exceptions.charAt(excOffset);
    163         } else {
    164             excOffset+=2*slotOffset(excWord, index);
    165             value=exceptions.charAt(excOffset++);
    166             value=(value<<16)|exceptions.charAt(excOffset);
    167         }
    168         return value |((long)excOffset<<32);
    169     }
    170 
    171     /* same as getSlotValueAndOffset() but does not return the slot offset */
    172     private final int getSlotValue(int excWord, int index, int excOffset) {
    173         int value;
    174         if((excWord&EXC_DOUBLE_SLOTS)==0) {
    175             excOffset+=slotOffset(excWord, index);
    176             value=exceptions.charAt(excOffset);
    177         } else {
    178             excOffset+=2*slotOffset(excWord, index);
    179             value=exceptions.charAt(excOffset++);
    180             value=(value<<16)|exceptions.charAt(excOffset);
    181         }
    182         return value;
    183     }
    184 
    185     // simple case mappings ------------------------------------------------ ***
    186 
    187     public final int tolower(int c) {
    188         int props=trie.get(c);
    189         if(!propsHasException(props)) {
    190             if(getTypeFromProps(props)>=UPPER) {
    191                 c+=getDelta(props);
    192             }
    193         } else {
    194             int excOffset=getExceptionsOffset(props);
    195             int excWord=exceptions.charAt(excOffset++);
    196             if(hasSlot(excWord, EXC_LOWER)) {
    197                 c=getSlotValue(excWord, EXC_LOWER, excOffset);
    198             }
    199         }
    200         return c;
    201     }
    202 
    203     public final int toupper(int c) {
    204         int props=trie.get(c);
    205         if(!propsHasException(props)) {
    206             if(getTypeFromProps(props)==LOWER) {
    207                 c+=getDelta(props);
    208             }
    209         } else {
    210             int excOffset=getExceptionsOffset(props);
    211             int excWord=exceptions.charAt(excOffset++);
    212             if(hasSlot(excWord, EXC_UPPER)) {
    213                 c=getSlotValue(excWord, EXC_UPPER, excOffset);
    214             }
    215         }
    216         return c;
    217     }
    218 
    219     public final int totitle(int c) {
    220         int props=trie.get(c);
    221         if(!propsHasException(props)) {
    222             if(getTypeFromProps(props)==LOWER) {
    223                 c+=getDelta(props);
    224             }
    225         } else {
    226             int excOffset=getExceptionsOffset(props);
    227             int excWord=exceptions.charAt(excOffset++);
    228             int index;
    229             if(hasSlot(excWord, EXC_TITLE)) {
    230                 index=EXC_TITLE;
    231             } else if(hasSlot(excWord, EXC_UPPER)) {
    232                 index=EXC_UPPER;
    233             } else {
    234                 return c;
    235             }
    236             c=getSlotValue(excWord, index, excOffset);
    237         }
    238         return c;
    239     }
    240 
    241     /**
    242      * Adds all simple case mappings and the full case folding for c to sa,
    243      * and also adds special case closure mappings.
    244      * c itself is not added.
    245      * For example, the mappings
    246      * - for s include long s
    247      * - for sharp s include ss
    248      * - for k include the Kelvin sign
    249      */
    250     public final void addCaseClosure(int c, UnicodeSet set) {
    251         /*
    252          * Hardcode the case closure of i and its relatives and ignore the
    253          * data file data for these characters.
    254          * The Turkic dotless i and dotted I with their case mapping conditions
    255          * and case folding option make the related characters behave specially.
    256          * This code matches their closure behavior to their case folding behavior.
    257          */
    258 
    259         switch(c) {
    260         case 0x49:
    261             /* regular i and I are in one equivalence class */
    262             set.add(0x69);
    263             return;
    264         case 0x69:
    265             set.add(0x49);
    266             return;
    267         case 0x130:
    268             /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
    269             set.add(iDot);
    270             return;
    271         case 0x131:
    272             /* dotless i is in a class by itself */
    273             return;
    274         default:
    275             /* otherwise use the data file data */
    276             break;
    277         }
    278 
    279         int props=trie.get(c);
    280         if(!propsHasException(props)) {
    281             if(getTypeFromProps(props)!=NONE) {
    282                 /* add the one simple case mapping, no matter what type it is */
    283                 int delta=getDelta(props);
    284                 if(delta!=0) {
    285                     set.add(c+delta);
    286                 }
    287             }
    288         } else {
    289             /*
    290              * c has exceptions, so there may be multiple simple and/or
    291              * full case mappings. Add them all.
    292              */
    293             int excOffset0, excOffset=getExceptionsOffset(props);
    294             int closureOffset;
    295             int excWord=exceptions.charAt(excOffset++);
    296             int index, closureLength, fullLength, length;
    297 
    298             excOffset0=excOffset;
    299 
    300             /* add all simple case mappings */
    301             for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
    302                 if(hasSlot(excWord, index)) {
    303                     excOffset=excOffset0;
    304                     c=getSlotValue(excWord, index, excOffset);
    305                     set.add(c);
    306                 }
    307             }
    308 
    309             /* get the closure string pointer & length */
    310             if(hasSlot(excWord, EXC_CLOSURE)) {
    311                 excOffset=excOffset0;
    312                 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
    313                 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    314                 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
    315             } else {
    316                 closureLength=0;
    317                 closureOffset=0;
    318             }
    319 
    320             /* add the full case folding */
    321             if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
    322                 excOffset=excOffset0;
    323                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
    324                 fullLength=(int)value;
    325 
    326                 /* start of full case mapping strings */
    327                 excOffset=(int)(value>>32)+1;
    328 
    329                 fullLength&=0xffff; /* bits 16 and higher are reserved */
    330 
    331                 /* skip the lowercase result string */
    332                 excOffset+=fullLength&FULL_LOWER;
    333                 fullLength>>=4;
    334 
    335                 /* add the full case folding string */
    336                 length=fullLength&0xf;
    337                 if(length!=0) {
    338                     set.add(exceptions.substring(excOffset, excOffset+length));
    339                     excOffset+=length;
    340                 }
    341 
    342                 /* skip the uppercase and titlecase strings */
    343                 fullLength>>=4;
    344                 excOffset+=fullLength&0xf;
    345                 fullLength>>=4;
    346                 excOffset+=fullLength;
    347 
    348                 closureOffset=excOffset; /* behind full case mappings */
    349             }
    350 
    351             /* add each code point in the closure string */
    352             int limit=closureOffset+closureLength;
    353             for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
    354                 c=exceptions.codePointAt(index);
    355                 set.add(c);
    356             }
    357         }
    358     }
    359 
    360     /*
    361      * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
    362      * must be s.length()>0 and max>0 and s.length()<=max
    363      */
    364     private final int strcmpMax(String s, int unfoldOffset, int max) {
    365         int i1, length, c1, c2;
    366 
    367         length=s.length();
    368         max-=length; /* we require length<=max, so no need to decrement max in the loop */
    369         i1=0;
    370         do {
    371             c1=s.charAt(i1++);
    372             c2=unfold[unfoldOffset++];
    373             if(c2==0) {
    374                 return 1; /* reached the end of t but not of s */
    375             }
    376             c1-=c2;
    377             if(c1!=0) {
    378                 return c1; /* return difference result */
    379             }
    380         } while(--length>0);
    381         /* ends with length==0 */
    382 
    383         if(max==0 || unfold[unfoldOffset]==0) {
    384             return 0; /* equal to length of both strings */
    385         } else {
    386             return -max; /* return lengh difference */
    387         }
    388     }
    389 
    390     /**
    391      * Maps the string to single code points and adds the associated case closure
    392      * mappings.
    393      * The string is mapped to code points if it is their full case folding string.
    394      * In other words, this performs a reverse full case folding and then
    395      * adds the case closure items of the resulting code points.
    396      * If the string is found and its closure applied, then
    397      * the string itself is added as well as part of its code points' closure.
    398      *
    399      * @return true if the string was found
    400      */
    401     public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    402         int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    403 
    404         if(unfold==null || s==null) {
    405             return false; /* no reverse case folding data, or no string */
    406         }
    407         length=s.length();
    408         if(length<=1) {
    409             /* the string is too short to find any match */
    410             /*
    411              * more precise would be:
    412              * if(!u_strHasMoreChar32Than(s, length, 1))
    413              * but this does not make much practical difference because
    414              * a single supplementary code point would just not be found
    415              */
    416             return false;
    417         }
    418 
    419         unfoldRows=unfold[UNFOLD_ROWS];
    420         unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];
    421         unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];
    422         //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
    423 
    424         if(length>unfoldStringWidth) {
    425             /* the string is too long to find any match */
    426             return false;
    427         }
    428 
    429         /* do a binary search for the string */
    430         start=0;
    431         limit=unfoldRows;
    432         while(start<limit) {
    433             i=(start+limit)/2;
    434             unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above
    435             result=strcmpMax(s, unfoldOffset, unfoldStringWidth);
    436 
    437             if(result==0) {
    438                 /* found the string: add each code point, and its case closure */
    439                 int c;
    440 
    441                 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {
    442                     c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);
    443                     set.add(c);
    444                     addCaseClosure(c, set);
    445                 }
    446                 return true;
    447             } else if(result<0) {
    448                 limit=i;
    449             } else /* result>0 */ {
    450                 start=i+1;
    451             }
    452         }
    453 
    454         return false; /* string not found */
    455     }
    456 
    457     /** @return NONE, LOWER, UPPER, TITLE */
    458     public final int getType(int c) {
    459         return getTypeFromProps(trie.get(c));
    460     }
    461 
    462     /** @return like getType() but also sets IGNORABLE if c is case-ignorable */
    463     public final int getTypeOrIgnorable(int c) {
    464         return getTypeAndIgnorableFromProps(trie.get(c));
    465     }
    466 
    467     /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
    468     public final int getDotType(int c) {
    469         int props=trie.get(c);
    470         if(!propsHasException(props)) {
    471             return props&DOT_MASK;
    472         } else {
    473             return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK;
    474         }
    475     }
    476 
    477     public final boolean isSoftDotted(int c) {
    478         return getDotType(c)==SOFT_DOTTED;
    479     }
    480 
    481     public final boolean isCaseSensitive(int c) {
    482         return (trie.get(c)&SENSITIVE)!=0;
    483     }
    484 
    485     // string casing ------------------------------------------------------- ***
    486 
    487     /*
    488      * These internal functions form the core of string case mappings.
    489      * They map single code points to result code points or strings and take
    490      * all necessary conditions (context, locale ID, options) into account.
    491      *
    492      * They do not iterate over the source or write to the destination
    493      * so that the same functions are useful for non-standard string storage,
    494      * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    495      * For the same reason, the "surrounding text" context is passed in as a
    496      * ContextIterator which does not make any assumptions about
    497      * the underlying storage.
    498      *
    499      * This section contains helper functions that check for conditions
    500      * in the input text surrounding the current code point
    501      * according to SpecialCasing.txt.
    502      *
    503      * Each helper function gets the index
    504      * - after the current code point if it looks at following text
    505      * - before the current code point if it looks at preceding text
    506      *
    507      * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    508      *
    509      * Final_Sigma
    510      *   C is preceded by a sequence consisting of
    511      *     a cased letter and a case-ignorable sequence,
    512      *   and C is not followed by a sequence consisting of
    513      *     an ignorable sequence and then a cased letter.
    514      *
    515      * More_Above
    516      *   C is followed by one or more characters of combining class 230 (ABOVE)
    517      *   in the combining character sequence.
    518      *
    519      * After_Soft_Dotted
    520      *   The last preceding character with combining class of zero before C
    521      *   was Soft_Dotted,
    522      *   and there is no intervening combining character class 230 (ABOVE).
    523      *
    524      * Before_Dot
    525      *   C is followed by combining dot above (U+0307).
    526      *   Any sequence of characters with a combining class that is neither 0 nor 230
    527      *   may intervene between the current character and the combining dot above.
    528      *
    529      * The erratum from 2002-10-31 adds the condition
    530      *
    531      * After_I
    532      *   The last preceding base character was an uppercase I, and there is no
    533      *   intervening combining character class 230 (ABOVE).
    534      *
    535      *   (See Jitterbug 2344 and the comments on After_I below.)
    536      *
    537      * Helper definitions in Unicode 3.2 UAX 21:
    538      *
    539      * D1. A character C is defined to be cased
    540      *     if it meets any of the following criteria:
    541      *
    542      *   - The general category of C is Titlecase Letter (Lt)
    543      *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    544      *   - Given D = NFD(C), then it is not the case that:
    545      *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
    546      *     (This third criterion does not add any characters to the list
    547      *      for Unicode 3.2. Ignored.)
    548      *
    549      * D2. A character C is defined to be case-ignorable
    550      *     if it meets either of the following criteria:
    551      *
    552      *   - The general category of C is
    553      *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    554      *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    555      *   - C is one of the following characters
    556      *     U+0027 APOSTROPHE
    557      *     U+00AD SOFT HYPHEN (SHY)
    558      *     U+2019 RIGHT SINGLE QUOTATION MARK
    559      *            (the preferred character for apostrophe)
    560      *
    561      * D3. A case-ignorable sequence is a sequence of
    562      *     zero or more case-ignorable characters.
    563      */
    564 
    565     /**
    566      * Iterator for string case mappings, which need to look at the
    567      * context (surrounding text) of a given character for conditional mappings.
    568      *
    569      * The iterator only needs to go backward or forward away from the
    570      * character in question. It does not use any indexes on this interface.
    571      * It does not support random access or an arbitrary change of
    572      * iteration direction.
    573      *
    574      * The code point being case-mapped itself is never returned by
    575      * this iterator.
    576      */
    public interface ContextIterator {
        /**
         * Prepares the iterator for a pass in one direction.
         * @param dir >0: start forward iteration at the first code point
         *                after the one that is being case-mapped.
         *            <0: start backward iteration at the first code point
         *                before the one that is being case-mapped.
         */
        void reset(int dir);

        /**
         * Advances in the direction chosen by the last reset() call
         * and returns the code point found there.
         * @return Next code point, or <0 when the iteration is done.
         */
        int next();
    }
    593 
    594     /**
    595      * For string case mappings, a single character (a code point) is mapped
    596      * either to itself (in which case in-place mapping functions do nothing),
    597      * or to another single code point, or to a string.
    598      * Aside from the string contents, these are indicated with a single int
    599      * value as follows:
    600      *
    601      * Mapping to self: Negative values (~self instead of -self to support U+0000)
    602      *
    603      * Mapping to another code point: Positive values >MAX_STRING_LENGTH
    604      *
    605      * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
    606      * returned. Note that the string result may indeed have zero length.
    607      */
    608     public static final int MAX_STRING_LENGTH=0x1f;
    609 
    610     //private static final int LOC_UNKNOWN=0;
    611     public static final int LOC_ROOT=1;
    612     private static final int LOC_TURKISH=2;
    613     private static final int LOC_LITHUANIAN=3;
    614     static final int LOC_GREEK=4;
    615     public static final int LOC_DUTCH=5;
    616 
    617     public static final int getCaseLocale(Locale locale) {
    618         return getCaseLocale(locale.getLanguage());
    619     }
    620     public static final int getCaseLocale(ULocale locale) {
    621         return getCaseLocale(locale.getLanguage());
    622     }
    623     /** Accepts both 2- and 3-letter language subtags. */
    624     private static final int getCaseLocale(String language) {
    625         // Check the subtag length to reduce the number of comparisons
    626         // for locales without special behavior.
    627         // Fastpath for English "en" which is often used for default (=root locale) case mappings,
    628         // and for Chinese "zh": Very common but no special case mapping behavior.
    629         if(language.length()==2) {
    630             if(language.equals("en") || language.charAt(0)>'t') {
    631                 return LOC_ROOT;
    632             } else if(language.equals("tr") || language.equals("az")) {
    633                 return LOC_TURKISH;
    634             } else if(language.equals("el")) {
    635                 return LOC_GREEK;
    636             } else if(language.equals("lt")) {
    637                 return LOC_LITHUANIAN;
    638             } else if(language.equals("nl")) {
    639                 return LOC_DUTCH;
    640             }
    641         } else if(language.length()==3) {
    642             if(language.equals("tur") || language.equals("aze")) {
    643                 return LOC_TURKISH;
    644             } else if(language.equals("ell")) {
    645                 return LOC_GREEK;
    646             } else if(language.equals("lit")) {
    647                 return LOC_LITHUANIAN;
    648             } else if(language.equals("nld")) {
    649                 return LOC_DUTCH;
    650             }
    651         }
    652         return LOC_ROOT;
    653     }
    654 
    655     /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
    656     private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {
    657         int c;
    658 
    659         if(iter==null) {
    660             return false;
    661         }
    662 
    663         for(iter.reset(dir); (c=iter.next())>=0;) {
    664             int type=getTypeOrIgnorable(c);
    665             if((type&4)!=0) {
    666                 /* case-ignorable, continue with the loop */
    667             } else if(type!=NONE) {
    668                 return true; /* followed by cased letter */
    669             } else {
    670                 return false; /* uncased and not case-ignorable */
    671             }
    672         }
    673 
    674         return false; /* not followed by cased letter */
    675     }
    676 
    677     /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
    678     private final boolean isPrecededBySoftDotted(ContextIterator iter) {
    679         int c;
    680         int dotType;
    681 
    682         if(iter==null) {
    683             return false;
    684         }
    685 
    686         for(iter.reset(-1); (c=iter.next())>=0;) {
    687             dotType=getDotType(c);
    688             if(dotType==SOFT_DOTTED) {
    689                 return true; /* preceded by TYPE_i */
    690             } else if(dotType!=OTHER_ACCENT) {
    691                 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
    692             }
    693         }
    694 
    695         return false; /* not preceded by TYPE_i */
    696     }
    697 
    698     /*
    699      * See Jitterbug 2344:
    700      * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
    701      * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
    702      * we made those releases compatible with Unicode 3.2 which had not fixed
    703      * a related bug in SpecialCasing.txt.
    704      *
    705      * From the Jitterbug 2344 text:
    706      * ... this bug is listed as a Unicode erratum
    707      * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
    708      * <quote>
    709      * There are two errors in SpecialCasing.txt.
    710      * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
    711      * 2. An incorrect context definition. Correct as follows:
    712      * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
    713      * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
    714      * ---
    715      * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    716      * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    717      * where the context After_I is defined as:
    718      * The last preceding base character was an uppercase I, and there is no
    719      * intervening combining character class 230 (ABOVE).
    720      * </quote>
    721      *
    722      * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
    723      *
    724      * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    725      * # This matches the behavior of the canonically equivalent I-dot_above
    726      *
    727      * See also the description in this place in older versions of uchar.c (revision 1.100).
    728      *
    729      * Markus W. Scherer 2003-feb-15
    730      */
    731 
    732     /* Is preceded by base character 'I' with no intervening cc=230 ? */
    733     private final boolean isPrecededBy_I(ContextIterator iter) {
    734         int c;
    735         int dotType;
    736 
    737         if(iter==null) {
    738             return false;
    739         }
    740 
    741         for(iter.reset(-1); (c=iter.next())>=0;) {
    742             if(c==0x49) {
    743                 return true; /* preceded by I */
    744             }
    745             dotType=getDotType(c);
    746             if(dotType!=OTHER_ACCENT) {
    747                 return false; /* preceded by different base character (not I), or intervening cc==230 */
    748             }
    749         }
    750 
    751         return false; /* not preceded by I */
    752     }
    753 
    754     /* Is followed by one or more cc==230 ? */
    755     private final boolean isFollowedByMoreAbove(ContextIterator iter) {
    756         int c;
    757         int dotType;
    758 
    759         if(iter==null) {
    760             return false;
    761         }
    762 
    763         for(iter.reset(1); (c=iter.next())>=0;) {
    764             dotType=getDotType(c);
    765             if(dotType==ABOVE) {
    766                 return true; /* at least one cc==230 following */
    767             } else if(dotType!=OTHER_ACCENT) {
    768                 return false; /* next base character, no more cc==230 following */
    769             }
    770         }
    771 
    772         return false; /* no more cc==230 following */
    773     }
    774 
    775     /* Is followed by a dot above (without cc==230 in between) ? */
    776     private final boolean isFollowedByDotAbove(ContextIterator iter) {
    777         int c;
    778         int dotType;
    779 
    780         if(iter==null) {
    781             return false;
    782         }
    783 
    784         for(iter.reset(1); (c=iter.next())>=0; ) {
    785             if(c==0x307) {
    786                 return true;
    787             }
    788             dotType=getDotType(c);
    789             if(dotType!=OTHER_ACCENT) {
    790                 return false; /* next base character or cc==230 in between */
    791             }
    792         }
    793 
    794         return false; /* no dot above following */
    795     }
    796 
    797     private static final String
    798         iDot=       "i\u0307",
    799         jDot=       "j\u0307",
    800         iOgonekDot= "\u012f\u0307",
    801         iDotGrave=  "i\u0307\u0300",
    802         iDotAcute=  "i\u0307\u0301",
    803         iDotTilde=  "i\u0307\u0303";
    804 
    805     /**
    806      * Get the full lowercase mapping for c.
    807      *
    808      * @param c Character to be mapped.
    809      * @param iter Character iterator, used for context-sensitive mappings.
    810      *             See ContextIterator for details.
    811      *             If iter==null then a context-independent result is returned.
    812      * @param out If the mapping result is a string, then it is appended to out.
    813      * @param caseLocale Case locale value from ucase_getCaseLocale().
    814      * @return Output code point or string length, see MAX_STRING_LENGTH.
    815      *
    816      * @see ContextIterator
    817      * @see #MAX_STRING_LENGTH
    818      * @internal
    819      */
    820     public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) {
    821         int result, props;
    822 
    823         result=c;
    824         props=trie.get(c);
    825         if(!propsHasException(props)) {
    826             if(getTypeFromProps(props)>=UPPER) {
    827                 result=c+getDelta(props);
    828             }
    829         } else {
    830             int excOffset=getExceptionsOffset(props), excOffset2;
    831             int excWord=exceptions.charAt(excOffset++);
    832             int full;
    833 
    834             excOffset2=excOffset;
    835 
    836             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
    837                 /* use hardcoded conditions and mappings */
    838                 /*
    839                  * Test for conditional mappings first
    840                  *   (otherwise the unconditional default mappings are always taken),
    841                  * then test for characters that have unconditional mappings in SpecialCasing.txt,
    842                  * then get the UnicodeData.txt mappings.
    843                  */
    844                 if( caseLocale==LOC_LITHUANIAN &&
    845                         /* base characters, find accents above */
    846                         (((c==0x49 || c==0x4a || c==0x12e) &&
    847                             isFollowedByMoreAbove(iter)) ||
    848                         /* precomposed with accent above, no need to find one */
    849                         (c==0xcc || c==0xcd || c==0x128))
    850                 ) {
    851                     /*
    852                         # Lithuanian
    853 
    854                         # Lithuanian retains the dot in a lowercase i when followed by accents.
    855 
    856                         # Introduce an explicit dot above when lowercasing capital I's and J's
    857                         # whenever there are more accents above.
    858                         # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
    859 
    860                         0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
    861                         004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
    862                         012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
    863                         00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
    864                         00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
    865                         0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
    866                      */
    867                     try {
    868                         switch(c) {
    869                         case 0x49:  /* LATIN CAPITAL LETTER I */
    870                             out.append(iDot);
    871                             return 2;
    872                         case 0x4a:  /* LATIN CAPITAL LETTER J */
    873                             out.append(jDot);
    874                             return 2;
    875                         case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
    876                             out.append(iOgonekDot);
    877                             return 2;
    878                         case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
    879                             out.append(iDotGrave);
    880                             return 3;
    881                         case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
    882                             out.append(iDotAcute);
    883                             return 3;
    884                         case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
    885                             out.append(iDotTilde);
    886                             return 3;
    887                         default:
    888                             return 0; /* will not occur */
    889                         }
    890                     } catch (IOException e) {
    891                         throw new ICUUncheckedIOException(e);
    892                     }
    893                 /* # Turkish and Azeri */
    894                 } else if(caseLocale==LOC_TURKISH && c==0x130) {
    895                     /*
    896                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    897                         # The following rules handle those cases.
    898 
    899                         0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
    900                         0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
    901                      */
    902                     return 0x69;
    903                 } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
    904                     /*
    905                         # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    906                         # This matches the behavior of the canonically equivalent I-dot_above
    907 
    908                         0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    909                         0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    910                      */
    911                     return 0; /* remove the dot (continue without output) */
    912                 } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
    913                     /*
    914                         # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
    915 
    916                         0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
    917                         0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
    918                      */
    919                     return 0x131;
    920                 } else if(c==0x130) {
    921                     /*
    922                         # Preserve canonical equivalence for I with dot. Turkic is handled below.
    923 
    924                         0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    925                      */
    926                     try {
    927                         out.append(iDot);
    928                         return 2;
    929                     } catch (IOException e) {
    930                         throw new ICUUncheckedIOException(e);
    931                     }
    932                 } else if(  c==0x3a3 &&
    933                             !isFollowedByCasedLetter(iter, 1) &&
    934                             isFollowedByCasedLetter(iter, -1) /* -1=preceded */
    935                 ) {
    936                     /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
    937                     /*
    938                         # Special case for final form of sigma
    939 
    940                         03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    941                      */
    942                     return 0x3c2; /* greek small final sigma */
    943                 } else {
    944                     /* no known conditional special case mapping, use a normal mapping */
    945                 }
    946             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
    947                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
    948                 full=(int)value&FULL_LOWER;
    949                 if(full!=0) {
    950                     /* start of full case mapping strings */
    951                     excOffset=(int)(value>>32)+1;
    952 
    953                     try {
    954                         // append the lowercase mapping
    955                         out.append(exceptions, excOffset, excOffset+full);
    956 
    957                         /* return the string length */
    958                         return full;
    959                     } catch (IOException e) {
    960                         throw new ICUUncheckedIOException(e);
    961                     }
    962                 }
    963             }
    964 
    965             if(hasSlot(excWord, EXC_LOWER)) {
    966                 result=getSlotValue(excWord, EXC_LOWER, excOffset2);
    967             }
    968         }
    969 
    970         return (result==c) ? ~result : result;
    971     }
    972 
    973     /* internal */
    974     private final int toUpperOrTitle(int c, ContextIterator iter,
    975                                      Appendable out,
    976                                      int loc,
    977                                      boolean upperNotTitle) {
    978         int result;
    979         int props;
    980 
    981         result=c;
    982         props=trie.get(c);
    983         if(!propsHasException(props)) {
    984             if(getTypeFromProps(props)==LOWER) {
    985                 result=c+getDelta(props);
    986             }
    987         } else {
    988             int excOffset=getExceptionsOffset(props), excOffset2;
    989             int excWord=exceptions.charAt(excOffset++);
    990             int full, index;
    991 
    992             excOffset2=excOffset;
    993 
    994             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
    995                 /* use hardcoded conditions and mappings */
    996                 if(loc==LOC_TURKISH && c==0x69) {
    997                     /*
    998                         # Turkish and Azeri
    999 
   1000                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   1001                         # The following rules handle those cases.
   1002 
   1003                         # When uppercasing, i turns into a dotted capital I
   1004 
   1005                         0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
   1006                         0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
   1007                     */
   1008                     return 0x130;
   1009                 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
   1010                     /*
   1011                         # Lithuanian
   1012 
   1013                         # Lithuanian retains the dot in a lowercase i when followed by accents.
   1014 
   1015                         # Remove DOT ABOVE after "i" with upper or titlecase
   1016 
   1017                         0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   1018                      */
   1019                     return 0; /* remove the dot (continue without output) */
   1020                 } else {
   1021                     /* no known conditional special case mapping, use a normal mapping */
   1022                 }
   1023             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
   1024                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
   1025                 full=(int)value&0xffff;
   1026 
   1027                 /* start of full case mapping strings */
   1028                 excOffset=(int)(value>>32)+1;
   1029 
   1030                 /* skip the lowercase and case-folding result strings */
   1031                 excOffset+=full&FULL_LOWER;
   1032                 full>>=4;
   1033                 excOffset+=full&0xf;
   1034                 full>>=4;
   1035 
   1036                 if(upperNotTitle) {
   1037                     full&=0xf;
   1038                 } else {
   1039                     /* skip the uppercase result string */
   1040                     excOffset+=full&0xf;
   1041                     full=(full>>4)&0xf;
   1042                 }
   1043 
   1044                 if(full!=0) {
   1045                     try {
   1046                         // append the result string
   1047                         out.append(exceptions, excOffset, excOffset+full);
   1048 
   1049                         /* return the string length */
   1050                         return full;
   1051                     } catch (IOException e) {
   1052                         throw new ICUUncheckedIOException(e);
   1053                     }
   1054                 }
   1055             }
   1056 
   1057             if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
   1058                 index=EXC_TITLE;
   1059             } else if(hasSlot(excWord, EXC_UPPER)) {
   1060                 /* here, titlecase is same as uppercase */
   1061                 index=EXC_UPPER;
   1062             } else {
   1063                 return ~c;
   1064             }
   1065             result=getSlotValue(excWord, index, excOffset2);
   1066         }
   1067 
   1068         return (result==c) ? ~result : result;
   1069     }
   1070 
   1071     public final int toFullUpper(int c, ContextIterator iter,
   1072                                  Appendable out,
   1073                                  int caseLocale) {
   1074         return toUpperOrTitle(c, iter, out, caseLocale, true);
   1075     }
   1076 
   1077     public final int toFullTitle(int c, ContextIterator iter,
   1078                                  Appendable out,
   1079                                  int caseLocale) {
   1080         return toUpperOrTitle(c, iter, out, caseLocale, false);
   1081     }
   1082 
   1083     /* case folding ------------------------------------------------------------- */
   1084 
   1085     /*
   1086      * Case folding is similar to lowercasing.
   1087      * The result may be a simple mapping, i.e., a single code point, or
   1088      * a full mapping, i.e., a string.
   1089      * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1090      * then only the lowercase mapping is stored.
   1091      *
   1092      * Some special cases are hardcoded because their conditions cannot be
   1093      * parsed and processed from CaseFolding.txt.
   1094      *
   1095      * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1096 
   1097     # C: common case folding, common mappings shared by both simple and full mappings.
   1098     # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1099     # S: simple case folding, mappings to single characters where different from F.
   1100     # T: special case for uppercase I and dotted uppercase I
   1101     #    - For non-Turkic languages, this mapping is normally not used.
   1102     #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1103     #
   1104     # Usage:
   1105     #  A. To do a simple case folding, use the mappings with status C + S.
   1106     #  B. To do a full case folding, use the mappings with status C + F.
   1107     #
   1108     #    The mappings with status T can be used or omitted depending on the desired case-folding
   1109     #    behavior. (The default option is to exclude them.)
   1110 
   1111      * Unicode 3.2 has 'T' mappings as follows:
   1112 
   1113     0049; T; 0131; # LATIN CAPITAL LETTER I
   1114     0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1115 
   1116      * while the default mappings for these code points are:
   1117 
   1118     0049; C; 0069; # LATIN CAPITAL LETTER I
   1119     0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1120 
   1121      * U+0130 has no simple case folding (simple-case-folds to itself).
   1122      */
   1123 
    /**
     * Bit mask for getting just the options from a string compare options word
     * that are relevant for case folding (of a single string or code point).
     * fold() and toFullFolding() compare the masked value with
     * UCharacter.FOLD_CASE_DEFAULT; any other masked value selects the
     * Turkic ("T") mappings for U+0049 and U+0130.
     * @internal
     */
    private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
   1130 
   1131     /* return the simple case folding mapping for c */
   1132     public final int fold(int c, int options) {
   1133         int props=trie.get(c);
   1134         if(!propsHasException(props)) {
   1135             if(getTypeFromProps(props)>=UPPER) {
   1136                 c+=getDelta(props);
   1137             }
   1138         } else {
   1139             int excOffset=getExceptionsOffset(props);
   1140             int excWord=exceptions.charAt(excOffset++);
   1141             int index;
   1142             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
   1143                 /* special case folding mappings, hardcoded */
   1144                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
   1145                     /* default mappings */
   1146                     if(c==0x49) {
   1147                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1148                         return 0x69;
   1149                     } else if(c==0x130) {
   1150                         /* no simple case folding for U+0130 */
   1151                         return c;
   1152                     }
   1153                 } else {
   1154                     /* Turkic mappings */
   1155                     if(c==0x49) {
   1156                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1157                         return 0x131;
   1158                     } else if(c==0x130) {
   1159                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1160                         return 0x69;
   1161                     }
   1162                 }
   1163             }
   1164             if(hasSlot(excWord, EXC_FOLD)) {
   1165                 index=EXC_FOLD;
   1166             } else if(hasSlot(excWord, EXC_LOWER)) {
   1167                 index=EXC_LOWER;
   1168             } else {
   1169                 return c;
   1170             }
   1171             c=getSlotValue(excWord, index, excOffset);
   1172         }
   1173         return c;
   1174     }
   1175 
   1176     /*
   1177      * Issue for canonical caseless match (UAX #21):
   1178      * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1179      * canonical equivalence, unlike default-option casefolding.
   1180      * For example, I-grave and I + grave fold to strings that are not canonically
   1181      * equivalent.
   1182      * For more details, see the comment in unorm_compare() in unorm.cpp
   1183      * and the intermediate prototype changes for Jitterbug 2021.
   1184      * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1185      *
   1186      * This did not get fixed because it appears that it is not possible to fix
   1187      * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1188      * together in a way that they still fold to common result strings.
   1189      */
   1190 
   1191     public final int toFullFolding(int c, Appendable out, int options) {
   1192         int result;
   1193         int props;
   1194 
   1195         result=c;
   1196         props=trie.get(c);
   1197         if(!propsHasException(props)) {
   1198             if(getTypeFromProps(props)>=UPPER) {
   1199                 result=c+getDelta(props);
   1200             }
   1201         } else {
   1202             int excOffset=getExceptionsOffset(props), excOffset2;
   1203             int excWord=exceptions.charAt(excOffset++);
   1204             int full, index;
   1205 
   1206             excOffset2=excOffset;
   1207 
   1208             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
   1209                 /* use hardcoded conditions and mappings */
   1210                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
   1211                     /* default mappings */
   1212                     if(c==0x49) {
   1213                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1214                         return 0x69;
   1215                     } else if(c==0x130) {
   1216                         /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1217                         try {
   1218                             out.append(iDot);
   1219                             return 2;
   1220                         } catch (IOException e) {
   1221                             throw new ICUUncheckedIOException(e);
   1222                         }
   1223                     }
   1224                 } else {
   1225                     /* Turkic mappings */
   1226                     if(c==0x49) {
   1227                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1228                         return 0x131;
   1229                     } else if(c==0x130) {
   1230                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1231                         return 0x69;
   1232                     }
   1233                 }
   1234             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
   1235                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
   1236                 full=(int)value&0xffff;
   1237 
   1238                 /* start of full case mapping strings */
   1239                 excOffset=(int)(value>>32)+1;
   1240 
   1241                 /* skip the lowercase result string */
   1242                 excOffset+=full&FULL_LOWER;
   1243                 full=(full>>4)&0xf;
   1244 
   1245                 if(full!=0) {
   1246                     try {
   1247                         // append the result string
   1248                         out.append(exceptions, excOffset, excOffset+full);
   1249 
   1250                         /* return the string length */
   1251                         return full;
   1252                     } catch (IOException e) {
   1253                         throw new ICUUncheckedIOException(e);
   1254                     }
   1255                 }
   1256             }
   1257 
   1258             if(hasSlot(excWord, EXC_FOLD)) {
   1259                 index=EXC_FOLD;
   1260             } else if(hasSlot(excWord, EXC_LOWER)) {
   1261                 index=EXC_LOWER;
   1262             } else {
   1263                 return ~c;
   1264             }
   1265             result=getSlotValue(excWord, index, excOffset2);
   1266         }
   1267 
   1268         return (result==c) ? ~result : result;
   1269     }
   1270 
   1271     /* case mapping properties API ---------------------------------------------- */
   1272 
    /*
     * We need a StringBuilder for multi-code point output from the
     * full case mapping functions. However, we do not actually use that output,
     * we just check whether the input character was mapped to anything else.
     * We use a shared StringBuilder to avoid allocating a new one in each call.
     * We remove its contents each time so that it does not grow large over time.
     *
     * hasBinaryProperty() resets it with setLength(0) and then passes it as the
     * "out" argument of toFullLower()/toFullUpper()/toFullTitle().
     *
     * NOTE(review): this one instance is shared by all callers and
     * java.lang.StringBuilder is documented as unsynchronized; confirm that
     * callers never use it from several threads at once.
     *
     * @internal
     */
    public static final StringBuilder dummyStringBuilder = new StringBuilder();
   1283 
   1284     public final boolean hasBinaryProperty(int c, int which) {
   1285         switch(which) {
   1286         case UProperty.LOWERCASE:
   1287             return LOWER==getType(c);
   1288         case UProperty.UPPERCASE:
   1289             return UPPER==getType(c);
   1290         case UProperty.SOFT_DOTTED:
   1291             return isSoftDotted(c);
   1292         case UProperty.CASE_SENSITIVE:
   1293             return isCaseSensitive(c);
   1294         case UProperty.CASED:
   1295             return NONE!=getType(c);
   1296         case UProperty.CASE_IGNORABLE:
   1297             return (getTypeOrIgnorable(c)>>2)!=0;
   1298         /*
   1299          * Note: The following Changes_When_Xyz are defined as testing whether
   1300          * the NFD form of the input changes when Xyz-case-mapped.
   1301          * However, this simpler implementation of these properties,
   1302          * ignoring NFD, passes the tests.
   1303          * The implementation needs to be changed if the tests start failing.
   1304          * When that happens, optimizations should be used to work with the
   1305          * per-single-code point ucase_toFullXyz() functions unless
   1306          * the NFD form has more than one code point,
   1307          * and the property starts set needs to be the union of the
   1308          * start sets for normalization and case mappings.
   1309          */
   1310         case UProperty.CHANGES_WHEN_LOWERCASED:
   1311             dummyStringBuilder.setLength(0);
   1312             return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1313         case UProperty.CHANGES_WHEN_UPPERCASED:
   1314             dummyStringBuilder.setLength(0);
   1315             return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1316         case UProperty.CHANGES_WHEN_TITLECASED:
   1317             dummyStringBuilder.setLength(0);
   1318             return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1319         /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
   1320         case UProperty.CHANGES_WHEN_CASEMAPPED:
   1321             dummyStringBuilder.setLength(0);
   1322             return
   1323                 toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
   1324                 toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
   1325                 toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1326         default:
   1327             return false;
   1328         }
   1329     }
   1330 
   1331     // data members -------------------------------------------------------- ***
    /** Index values read from the data file; the IX_ constants name the entries. */
    private int indexes[];
    /*
     * Exceptions data. For a properties word with an exception,
     * getExceptionsOffset(props) gives the position of the exception word in
     * this string; slot values and the full-mapping result strings are read
     * from this string as well.
     */
    private String exceptions;
    /* NOTE(review): not referenced in this part of the file; presumably the data sized by IX_UNFOLD_LENGTH -- confirm. */
    private char unfold[];

    /** Maps a code point to its case properties word, via trie.get(c). */
    private Trie2_16 trie;
   1337 
   1338     // data format constants ----------------------------------------------- ***
    /* name and type of the data file; together they form "ucase.icu" */
    private static final String DATA_NAME="ucase";
    private static final String DATA_TYPE="icu";
    /* file name passed to ICUBinary.getRequiredData() by the constructor */
    private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;

    /*
     * format "cAsE": the bytes 0x63 0x41 0x53 0x45 are the ASCII letters c, A, S, E;
     * passed to ICUBinary.readHeader() in readData()
     */
    private static final int FMT=0x63415345;

    /* indexes into indexes[] */
    //private static final int IX_INDEX_TOP=0;
    //private static final int IX_LENGTH=1;
    private static final int IX_TRIE_SIZE=2;
    private static final int IX_EXC_LENGTH=3;
    private static final int IX_UNFOLD_LENGTH=4;

    //private static final int IX_MAX_FULL_LENGTH=15;
    /* NOTE(review): presumably the number of entries in indexes[] -- confirm against readData() */
    private static final int IX_TOP=16;
   1355 
   1356     // definitions for 16-bit case properties word ------------------------- ***
   1357 
   1358     /* 2-bit constants for types of cased characters */
   1359     public static final int TYPE_MASK=3;
   1360     public static final int NONE=0;
   1361     public static final int LOWER=1;
   1362     public static final int UPPER=2;
   1363     public static final int TITLE=3;
   1364 
   1365     /**
             * Extracts the 2-bit case type from a case properties word by masking it with TYPE_MASK.
             * @param props the case properties word
             * @return NONE, LOWER, UPPER, TITLE
             */
   1366     private static final int getTypeFromProps(int props) {
   1367         return props&TYPE_MASK;
   1368     }
   1369 
   1370     /**
             * Keeps the low three bits of a case properties word (props&7): the 2-bit type
             * selected by TYPE_MASK (3) together with the IGNORABLE bit (4).
             * @param props the case properties word
             * @return like getTypeFromProps() but also sets IGNORABLE if props indicate case-ignorable
             */
   1371     private static final int getTypeAndIgnorableFromProps(int props) {
   1372         return props&7;
   1373     }
   1374 
   1375     static final int IGNORABLE=4;
   1376     private static final int SENSITIVE=     8;
   1377     private static final int EXCEPTION=     0x10;
   1378 
   1379     private static final int DOT_MASK=      0x60;
   1380     //private static final int NO_DOT=        0;      /* normal characters with cc=0 */
   1381     private static final int SOFT_DOTTED=   0x20;   /* soft-dotted characters with cc=0 */
   1382     private static final int ABOVE=         0x40;   /* "above" accents with cc=230 */
   1383     private static final int OTHER_ACCENT=  0x60;   /* other accent character (0<cc!=230) */
   1384 
   1385     /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
   1386     private static final int DELTA_SHIFT=   7;
   1387     //private static final int DELTA_MASK=    0xff80;
   1388     //private static final int MAX_DELTA=     0xff;
   1389     //private static final int MIN_DELTA=     (-MAX_DELTA-1);
   1390 
   1391     private static final int getDelta(int props) {
                // Returns the signed case mapping delta kept in bits 15..7 of a properties word
                // that has no exception. The cast binds tighter than the shift: narrowing to
                // short makes bit 15 the sign bit, so the arithmetic right shift by DELTA_SHIFT
                // produces a sign-extended result.
   1392         return (short)props>>DELTA_SHIFT;
   1393     }
   1394 
   1395     /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
   1396     private static final int EXC_SHIFT=     5;
   1397     //private static final int EXC_MASK=      0xffe0;
   1398     //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);
   1399 
   1400     /* definitions for 16-bit main exceptions word ------------------------------ */
   1401 
   1402     /* the low 8 bits (7..0) indicate which of the optional slots below have values */
   1403     private static final int EXC_LOWER=0;
   1404     private static final int EXC_FOLD=1;
   1405     private static final int EXC_UPPER=2;
   1406     private static final int EXC_TITLE=3;
   1407     //private static final int EXC_4=4;           /* reserved */
   1408     //private static final int EXC_5=5;           /* reserved */
   1409     private static final int EXC_CLOSURE=6;
   1410     private static final int EXC_FULL_MAPPINGS=7;
   1411     //private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */
   1412 
   1413     /* flag: when set, each slot is 2 16-bit units (uint16_t in the C code) instead of 1 */
   1414     private static final int EXC_DOUBLE_SLOTS=          0x100;
   1415 
   1416     /* reserved: exception bits 11..9 */
   1417 
   1418     /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
   1419     private static final int EXC_DOT_SHIFT=7;
   1420 
   1421     /* normally stored in the main word, but pushed out for larger exception indexes */
   1422     //private static final int EXC_DOT_MASK=              0x3000;
   1423     //private static final int EXC_NO_DOT=                0;
   1424     //private static final int EXC_SOFT_DOTTED=           0x1000;
   1425     //private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
   1426     //private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */
   1427 
   1428     /* complex/conditional mappings */
   1429     private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
   1430     private static final int EXC_CONDITIONAL_FOLD=      0x8000;
   1431 
   1432     /* definitions for lengths word for full case mappings */
   1433     private static final int FULL_LOWER=    0xf;
   1434     //private static final int FULL_FOLDING=  0xf0;
   1435     //private static final int FULL_UPPER=    0xf00;
   1436     //private static final int FULL_TITLE=    0xf000;
   1437 
   1438     /* maximum lengths */
   1439     //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
   1440     private static final int CLOSURE_MAX_LENGTH=0xf;
   1441 
   1442     /* constants for reverse case folding ("unfold") data */
   1443     private static final int UNFOLD_ROWS=0;
   1444     private static final int UNFOLD_ROW_WIDTH=1;
   1445     private static final int UNFOLD_STRING_WIDTH=2;
   1446 
   1447     /*
   1448      * public singleton instance
   1449      */
   1450     public static final UCaseProps INSTANCE;
   1451 
   1452     // This static initializer block must be placed after
   1453     // other static member initialization
            // because static initializers run in textual order: the constructor invoked here
            // would see only default values for any non-constant static field declared after it.
   1454     static {
   1455         try {
   1456             INSTANCE = new UCaseProps();
   1457         } catch (IOException e) {
                    // The constructor declares IOException; rethrow it unchecked, wrapping e.
   1458             throw new ICUUncheckedIOException(e);
   1459         }
   1460     }
   1461 }
   1462