Home | History | Annotate | Download | only in impl
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  *
      6  *   Copyright (C) 2004-2015, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *
      9  *******************************************************************************
     10  *   file name:  UCaseProps.java
     11  *   encoding:   US-ASCII
     12  *   tab size:   8 (not used)
     13  *   indentation:4
     14  *
     15  *   created on: 2005jan29
     16  *   created by: Markus W. Scherer
     17  *
     18  *   Low-level Unicode character/string case mapping code.
     19  *   Java port of ucase.h/.c.
     20  */
     21 
     22 package com.ibm.icu.impl;
     23 
     24 import java.io.IOException;
     25 import java.nio.ByteBuffer;
     26 import java.util.Iterator;
     27 import java.util.Locale;
     28 
     29 import com.ibm.icu.lang.UCharacter;
     30 import com.ibm.icu.lang.UProperty;
     31 import com.ibm.icu.text.UTF16;
     32 import com.ibm.icu.text.UnicodeSet;
     33 import com.ibm.icu.util.ICUUncheckedIOException;
     34 import com.ibm.icu.util.ULocale;
     35 
     36 public final class UCaseProps {
     37 
     38     // constructors etc. --------------------------------------------------- ***
     39 
     40     // port of ucase_openProps()
     41     private UCaseProps() throws IOException {
     42         ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
     43         readData(bytes);
     44     }
     45 
     46     private final void readData(ByteBuffer bytes) throws IOException {
     47         // read the header
     48         ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
     49 
     50         // read indexes[]
     51         int count=bytes.getInt();
     52         if(count<IX_TOP) {
     53             throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
     54         }
     55         indexes=new int[count];
     56 
     57         indexes[0]=count;
     58         for(int i=1; i<count; ++i) {
     59             indexes[i]=bytes.getInt();
     60         }
     61 
     62         // read the trie
     63         trie=Trie2_16.createFromSerialized(bytes);
     64         int expectedTrieLength=indexes[IX_TRIE_SIZE];
     65         int trieLength=trie.getSerializedLength();
     66         if(trieLength>expectedTrieLength) {
     67             throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
     68         }
     69         // skip padding after trie bytes
     70         ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);
     71 
     72         // read exceptions[]
     73         count=indexes[IX_EXC_LENGTH];
     74         if(count>0) {
     75             exceptions=ICUBinary.getString(bytes, count, 0);
     76         }
     77 
     78         // read unfold[]
     79         count=indexes[IX_UNFOLD_LENGTH];
     80         if(count>0) {
     81             unfold=ICUBinary.getChars(bytes, count, 0);
     82         }
     83     }
     84 
     85     // implement ICUBinary.Authenticate
     86     private final static class IsAcceptable implements ICUBinary.Authenticate {
     87         @Override
     88         public boolean isDataVersionAcceptable(byte version[]) {
     89             return version[0]==3;
     90         }
     91     }
     92 
     93     // set of property starts for UnicodeSet ------------------------------- ***
     94 
     95     public final void addPropertyStarts(UnicodeSet set) {
     96         /* add the start code point of each same-value range of the trie */
     97         Iterator<Trie2.Range> trieIterator=trie.iterator();
     98         Trie2.Range range;
     99         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
    100             set.add(range.startCodePoint);
    101         }
    102 
    103         /* add code points with hardcoded properties, plus the ones following them */
    104 
    105         /* (none right now, see comment below) */
    106 
    107         /*
    108          * Omit code points with hardcoded specialcasing properties
    109          * because we do not build property UnicodeSets for them right now.
    110          */
    111     }
    112 
    113     // data access primitives ---------------------------------------------- ***
    114     private static final int getExceptionsOffset(int props) {
    115         return props>>EXC_SHIFT;
    116     }
    117 
    118     private static final boolean propsHasException(int props) {
    119         return (props&EXCEPTION)!=0;
    120     }
    121 
    122     /* number of bits in an 8-bit integer value */
    123     private static final byte flagsOffset[/*256*/]={
    124         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    125         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    126         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    127         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    128         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    129         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    130         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    131         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    132         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    133         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    134         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    135         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    136         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    137         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    138         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    139         4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    140     };
    141 
    142     private static final boolean hasSlot(int flags, int index) {
    143         return (flags&(1<<index))!=0;
    144     }
    145     private static final byte slotOffset(int flags, int index) {
    146         return flagsOffset[flags&((1<<index)-1)];
    147     }
    148 
    149     /*
    150      * Get the value of an optional-value slot where hasSlot(excWord, index).
    151      *
    152      * @param excWord (in) initial exceptions word
    153      * @param index (in) desired slot index
    154      * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++);
    155      * @return bits 31..0: slot value
    156      *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
    157      */
    158     private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {
    159         long value;
    160         if((excWord&EXC_DOUBLE_SLOTS)==0) {
    161             excOffset+=slotOffset(excWord, index);
    162             value=exceptions.charAt(excOffset);
    163         } else {
    164             excOffset+=2*slotOffset(excWord, index);
    165             value=exceptions.charAt(excOffset++);
    166             value=(value<<16)|exceptions.charAt(excOffset);
    167         }
    168         return value |((long)excOffset<<32);
    169     }
    170 
    171     /* same as getSlotValueAndOffset() but does not return the slot offset */
    172     private final int getSlotValue(int excWord, int index, int excOffset) {
    173         int value;
    174         if((excWord&EXC_DOUBLE_SLOTS)==0) {
    175             excOffset+=slotOffset(excWord, index);
    176             value=exceptions.charAt(excOffset);
    177         } else {
    178             excOffset+=2*slotOffset(excWord, index);
    179             value=exceptions.charAt(excOffset++);
    180             value=(value<<16)|exceptions.charAt(excOffset);
    181         }
    182         return value;
    183     }
    184 
    185     // simple case mappings ------------------------------------------------ ***
    186 
    187     public final int tolower(int c) {
    188         int props=trie.get(c);
    189         if(!propsHasException(props)) {
    190             if(getTypeFromProps(props)>=UPPER) {
    191                 c+=getDelta(props);
    192             }
    193         } else {
    194             int excOffset=getExceptionsOffset(props);
    195             int excWord=exceptions.charAt(excOffset++);
    196             if(hasSlot(excWord, EXC_LOWER)) {
    197                 c=getSlotValue(excWord, EXC_LOWER, excOffset);
    198             }
    199         }
    200         return c;
    201     }
    202 
    203     public final int toupper(int c) {
    204         int props=trie.get(c);
    205         if(!propsHasException(props)) {
    206             if(getTypeFromProps(props)==LOWER) {
    207                 c+=getDelta(props);
    208             }
    209         } else {
    210             int excOffset=getExceptionsOffset(props);
    211             int excWord=exceptions.charAt(excOffset++);
    212             if(hasSlot(excWord, EXC_UPPER)) {
    213                 c=getSlotValue(excWord, EXC_UPPER, excOffset);
    214             }
    215         }
    216         return c;
    217     }
    218 
    219     public final int totitle(int c) {
    220         int props=trie.get(c);
    221         if(!propsHasException(props)) {
    222             if(getTypeFromProps(props)==LOWER) {
    223                 c+=getDelta(props);
    224             }
    225         } else {
    226             int excOffset=getExceptionsOffset(props);
    227             int excWord=exceptions.charAt(excOffset++);
    228             int index;
    229             if(hasSlot(excWord, EXC_TITLE)) {
    230                 index=EXC_TITLE;
    231             } else if(hasSlot(excWord, EXC_UPPER)) {
    232                 index=EXC_UPPER;
    233             } else {
    234                 return c;
    235             }
    236             c=getSlotValue(excWord, index, excOffset);
    237         }
    238         return c;
    239     }
    240 
    241     /**
    242      * Adds all simple case mappings and the full case folding for c to sa,
    243      * and also adds special case closure mappings.
    244      * c itself is not added.
    245      * For example, the mappings
    246      * - for s include long s
    247      * - for sharp s include ss
    248      * - for k include the Kelvin sign
    249      */
    250     public final void addCaseClosure(int c, UnicodeSet set) {
    251         /*
    252          * Hardcode the case closure of i and its relatives and ignore the
    253          * data file data for these characters.
    254          * The Turkic dotless i and dotted I with their case mapping conditions
    255          * and case folding option make the related characters behave specially.
    256          * This code matches their closure behavior to their case folding behavior.
    257          */
    258 
    259         switch(c) {
    260         case 0x49:
    261             /* regular i and I are in one equivalence class */
    262             set.add(0x69);
    263             return;
    264         case 0x69:
    265             set.add(0x49);
    266             return;
    267         case 0x130:
    268             /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
    269             set.add(iDot);
    270             return;
    271         case 0x131:
    272             /* dotless i is in a class by itself */
    273             return;
    274         default:
    275             /* otherwise use the data file data */
    276             break;
    277         }
    278 
    279         int props=trie.get(c);
    280         if(!propsHasException(props)) {
    281             if(getTypeFromProps(props)!=NONE) {
    282                 /* add the one simple case mapping, no matter what type it is */
    283                 int delta=getDelta(props);
    284                 if(delta!=0) {
    285                     set.add(c+delta);
    286                 }
    287             }
    288         } else {
    289             /*
    290              * c has exceptions, so there may be multiple simple and/or
    291              * full case mappings. Add them all.
    292              */
    293             int excOffset0, excOffset=getExceptionsOffset(props);
    294             int closureOffset;
    295             int excWord=exceptions.charAt(excOffset++);
    296             int index, closureLength, fullLength, length;
    297 
    298             excOffset0=excOffset;
    299 
    300             /* add all simple case mappings */
    301             for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
    302                 if(hasSlot(excWord, index)) {
    303                     excOffset=excOffset0;
    304                     c=getSlotValue(excWord, index, excOffset);
    305                     set.add(c);
    306                 }
    307             }
    308 
    309             /* get the closure string pointer & length */
    310             if(hasSlot(excWord, EXC_CLOSURE)) {
    311                 excOffset=excOffset0;
    312                 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
    313                 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    314                 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
    315             } else {
    316                 closureLength=0;
    317                 closureOffset=0;
    318             }
    319 
    320             /* add the full case folding */
    321             if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
    322                 excOffset=excOffset0;
    323                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
    324                 fullLength=(int)value;
    325 
    326                 /* start of full case mapping strings */
    327                 excOffset=(int)(value>>32)+1;
    328 
    329                 fullLength&=0xffff; /* bits 16 and higher are reserved */
    330 
    331                 /* skip the lowercase result string */
    332                 excOffset+=fullLength&FULL_LOWER;
    333                 fullLength>>=4;
    334 
    335                 /* add the full case folding string */
    336                 length=fullLength&0xf;
    337                 if(length!=0) {
    338                     set.add(exceptions.substring(excOffset, excOffset+length));
    339                     excOffset+=length;
    340                 }
    341 
    342                 /* skip the uppercase and titlecase strings */
    343                 fullLength>>=4;
    344                 excOffset+=fullLength&0xf;
    345                 fullLength>>=4;
    346                 excOffset+=fullLength;
    347 
    348                 closureOffset=excOffset; /* behind full case mappings */
    349             }
    350 
    351             /* add each code point in the closure string */
    352             int limit=closureOffset+closureLength;
    353             for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
    354                 c=exceptions.codePointAt(index);
    355                 set.add(c);
    356             }
    357         }
    358     }
    359 
    360     /*
    361      * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
    362      * must be s.length()>0 and max>0 and s.length()<=max
    363      */
    364     private final int strcmpMax(String s, int unfoldOffset, int max) {
    365         int i1, length, c1, c2;
    366 
    367         length=s.length();
    368         max-=length; /* we require length<=max, so no need to decrement max in the loop */
    369         i1=0;
    370         do {
    371             c1=s.charAt(i1++);
    372             c2=unfold[unfoldOffset++];
    373             if(c2==0) {
    374                 return 1; /* reached the end of t but not of s */
    375             }
    376             c1-=c2;
    377             if(c1!=0) {
    378                 return c1; /* return difference result */
    379             }
    380         } while(--length>0);
    381         /* ends with length==0 */
    382 
    383         if(max==0 || unfold[unfoldOffset]==0) {
    384             return 0; /* equal to length of both strings */
    385         } else {
    386             return -max; /* return lengh difference */
    387         }
    388     }
    389 
    390     /**
    391      * Maps the string to single code points and adds the associated case closure
    392      * mappings.
    393      * The string is mapped to code points if it is their full case folding string.
    394      * In other words, this performs a reverse full case folding and then
    395      * adds the case closure items of the resulting code points.
    396      * If the string is found and its closure applied, then
    397      * the string itself is added as well as part of its code points' closure.
    398      *
    399      * @return true if the string was found
    400      */
    401     public final boolean addStringCaseClosure(String s, UnicodeSet set) {
    402         int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    403 
    404         if(unfold==null || s==null) {
    405             return false; /* no reverse case folding data, or no string */
    406         }
    407         length=s.length();
    408         if(length<=1) {
    409             /* the string is too short to find any match */
    410             /*
    411              * more precise would be:
    412              * if(!u_strHasMoreChar32Than(s, length, 1))
    413              * but this does not make much practical difference because
    414              * a single supplementary code point would just not be found
    415              */
    416             return false;
    417         }
    418 
    419         unfoldRows=unfold[UNFOLD_ROWS];
    420         unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];
    421         unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];
    422         //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
    423 
    424         if(length>unfoldStringWidth) {
    425             /* the string is too long to find any match */
    426             return false;
    427         }
    428 
    429         /* do a binary search for the string */
    430         start=0;
    431         limit=unfoldRows;
    432         while(start<limit) {
    433             i=(start+limit)/2;
    434             unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above
    435             result=strcmpMax(s, unfoldOffset, unfoldStringWidth);
    436 
    437             if(result==0) {
    438                 /* found the string: add each code point, and its case closure */
    439                 int c;
    440 
    441                 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {
    442                     c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);
    443                     set.add(c);
    444                     addCaseClosure(c, set);
    445                 }
    446                 return true;
    447             } else if(result<0) {
    448                 limit=i;
    449             } else /* result>0 */ {
    450                 start=i+1;
    451             }
    452         }
    453 
    454         return false; /* string not found */
    455     }
    456 
    457     /** @return NONE, LOWER, UPPER, TITLE */
    458     public final int getType(int c) {
    459         return getTypeFromProps(trie.get(c));
    460     }
    461 
    462     /** @return like getType() but also sets IGNORABLE if c is case-ignorable */
    463     public final int getTypeOrIgnorable(int c) {
    464         return getTypeAndIgnorableFromProps(trie.get(c));
    465     }
    466 
    467     /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
    468     public final int getDotType(int c) {
    469         int props=trie.get(c);
    470         if(!propsHasException(props)) {
    471             return props&DOT_MASK;
    472         } else {
    473             return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK;
    474         }
    475     }
    476 
    477     public final boolean isSoftDotted(int c) {
    478         return getDotType(c)==SOFT_DOTTED;
    479     }
    480 
    481     public final boolean isCaseSensitive(int c) {
    482         return (trie.get(c)&SENSITIVE)!=0;
    483     }
    484 
    485     // string casing ------------------------------------------------------- ***
    486 
    487     /*
    488      * These internal functions form the core of string case mappings.
    489      * They map single code points to result code points or strings and take
    490      * all necessary conditions (context, locale ID, options) into account.
    491      *
    492      * They do not iterate over the source or write to the destination
    493      * so that the same functions are useful for non-standard string storage,
    494      * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    495      * For the same reason, the "surrounding text" context is passed in as a
    496      * ContextIterator which does not make any assumptions about
    497      * the underlying storage.
    498      *
    499      * This section contains helper functions that check for conditions
    500      * in the input text surrounding the current code point
    501      * according to SpecialCasing.txt.
    502      *
    503      * Each helper function gets the index
    504      * - after the current code point if it looks at following text
    505      * - before the current code point if it looks at preceding text
    506      *
    507      * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    508      *
    509      * Final_Sigma
    510      *   C is preceded by a sequence consisting of
    511      *     a cased letter and a case-ignorable sequence,
    512      *   and C is not followed by a sequence consisting of
    513      *     an ignorable sequence and then a cased letter.
    514      *
    515      * More_Above
    516      *   C is followed by one or more characters of combining class 230 (ABOVE)
    517      *   in the combining character sequence.
    518      *
    519      * After_Soft_Dotted
    520      *   The last preceding character with combining class of zero before C
    521      *   was Soft_Dotted,
    522      *   and there is no intervening combining character class 230 (ABOVE).
    523      *
    524      * Before_Dot
    525      *   C is followed by combining dot above (U+0307).
    526      *   Any sequence of characters with a combining class that is neither 0 nor 230
    527      *   may intervene between the current character and the combining dot above.
    528      *
    529      * The erratum from 2002-10-31 adds the condition
    530      *
    531      * After_I
    532      *   The last preceding base character was an uppercase I, and there is no
    533      *   intervening combining character class 230 (ABOVE).
    534      *
    535      *   (See Jitterbug 2344 and the comments on After_I below.)
    536      *
    537      * Helper definitions in Unicode 3.2 UAX 21:
    538      *
    539      * D1. A character C is defined to be cased
    540      *     if it meets any of the following criteria:
    541      *
    542      *   - The general category of C is Titlecase Letter (Lt)
    543      *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    544      *   - Given D = NFD(C), then it is not the case that:
    545      *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
     *     (This third criterion does not add any characters to the list
    547      *      for Unicode 3.2. Ignored.)
    548      *
    549      * D2. A character C is defined to be case-ignorable
    550      *     if it meets either of the following criteria:
    551      *
    552      *   - The general category of C is
    553      *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    554      *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    555      *   - C is one of the following characters
    556      *     U+0027 APOSTROPHE
    557      *     U+00AD SOFT HYPHEN (SHY)
    558      *     U+2019 RIGHT SINGLE QUOTATION MARK
    559      *            (the preferred character for apostrophe)
    560      *
    561      * D3. A case-ignorable sequence is a sequence of
    562      *     zero or more case-ignorable characters.
    563      */
    564 
    /**
     * Gives string case mapping code access to the context (surrounding text)
     * of the character being mapped, which conditional mappings must inspect.
     *
     * Iteration only ever moves away from the character in question,
     * either backward or forward. The interface exposes no indexes,
     * no random access, and no arbitrary change of iteration direction.
     *
     * The code point that is being case-mapped is itself never returned.
     */
    public interface ContextIterator {
        /**
         * Prepares the iterator for a forward or a backward pass.
         * @param dir >0: iterate forward, starting with the first code point
         * after the one that is being case-mapped.
         *            <0: iterate backward, starting with the first code point
         * before the one that is being case-mapped.
         */
        void reset(int dir);

        /**
         * Returns the next code point in the direction that was selected
         * by the reset() call, and advances.
         * @return Next code point, or <0 when the iteration is done.
         */
        int next();
    }
    593 
    594     /**
    595      * For string case mappings, a single character (a code point) is mapped
    596      * either to itself (in which case in-place mapping functions do nothing),
    597      * or to another single code point, or to a string.
    598      * Aside from the string contents, these are indicated with a single int
    599      * value as follows:
    600      *
    601      * Mapping to self: Negative values (~self instead of -self to support U+0000)
    602      *
    603      * Mapping to another code point: Positive values >MAX_STRING_LENGTH
    604      *
    605      * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
    606      * returned. Note that the string result may indeed have zero length.
    607      */
    public static final int MAX_STRING_LENGTH=0x1f;

    // Case-locale identifiers as returned by the getCaseLocale() methods.
    // LOC_UNKNOWN is not used and is kept only as a commented-out placeholder.
    // private static final int LOC_UNKNOWN=0;
    /** Returned by getCaseLocale() for every language that has no special value below. */
    public static final int LOC_ROOT=1;
    /** Returned by getCaseLocale() for "tr", "az", "tur" and "aze". */
    private static final int LOC_TURKISH=2;
    /** Returned by getCaseLocale() for "lt" and "lit". */
    private static final int LOC_LITHUANIAN=3;
    /** Returned by getCaseLocale() for "el" and "ell". */
    static final int LOC_GREEK=4;
    /** Returned by getCaseLocale() for "nl" and "nld". */
    public static final int LOC_DUTCH=5;
    616 
    617     public static final int getCaseLocale(Locale locale) {
    618         return getCaseLocale(locale.getLanguage());
    619     }
    620     public static final int getCaseLocale(ULocale locale) {
    621         return getCaseLocale(locale.getLanguage());
    622     }
    623     /** Accepts both 2- and 3-letter language subtags. */
    624     private static final int getCaseLocale(String language) {
    625         // Check the subtag length to reduce the number of comparisons
    626         // for locales without special behavior.
    627         // Fastpath for English "en" which is often used for default (=root locale) case mappings,
    628         // and for Chinese "zh": Very common but no special case mapping behavior.
    629         if(language.length()==2) {
    630             if(language.equals("en") || language.charAt(0)>'t') {
    631                 return LOC_ROOT;
    632             } else if(language.equals("tr") || language.equals("az")) {
    633                 return LOC_TURKISH;
    634             } else if(language.equals("el")) {
    635                 return LOC_GREEK;
    636             } else if(language.equals("lt")) {
    637                 return LOC_LITHUANIAN;
    638             } else if(language.equals("nl")) {
    639                 return LOC_DUTCH;
    640             }
    641         } else if(language.length()==3) {
    642             if(language.equals("tur") || language.equals("aze")) {
    643                 return LOC_TURKISH;
    644             } else if(language.equals("ell")) {
    645                 return LOC_GREEK;
    646             } else if(language.equals("lit")) {
    647                 return LOC_LITHUANIAN;
    648             } else if(language.equals("nld")) {
    649                 return LOC_DUTCH;
    650             }
    651         }
    652         return LOC_ROOT;
    653     }
    654 
    655     /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
    656     private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {
    657         int c;
    658 
    659         if(iter==null) {
    660             return false;
    661         }
    662 
    663         for(iter.reset(dir); (c=iter.next())>=0;) {
    664             int type=getTypeOrIgnorable(c);
    665             if((type&4)!=0) {
    666                 /* case-ignorable, continue with the loop */
    667             } else if(type!=NONE) {
    668                 return true; /* followed by cased letter */
    669             } else {
    670                 return false; /* uncased and not case-ignorable */
    671             }
    672         }
    673 
    674         return false; /* not followed by cased letter */
    675     }
    676 
    677     /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
    678     private final boolean isPrecededBySoftDotted(ContextIterator iter) {
    679         int c;
    680         int dotType;
    681 
    682         if(iter==null) {
    683             return false;
    684         }
    685 
    686         for(iter.reset(-1); (c=iter.next())>=0;) {
    687             dotType=getDotType(c);
    688             if(dotType==SOFT_DOTTED) {
    689                 return true; /* preceded by TYPE_i */
    690             } else if(dotType!=OTHER_ACCENT) {
    691                 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
    692             }
    693         }
    694 
    695         return false; /* not preceded by TYPE_i */
    696     }
    697 
    698     /*
    699      * See Jitterbug 2344:
    700      * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
    701      * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
    702      * we made those releases compatible with Unicode 3.2 which had not fixed
    703      * a related bug in SpecialCasing.txt.
    704      *
    705      * From the Jitterbug 2344 text:
    706      * ... this bug is listed as a Unicode erratum
    707      * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
    708      * <quote>
    709      * There are two errors in SpecialCasing.txt.
    710      * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
    711      * 2. An incorrect context definition. Correct as follows:
    712      * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
    713      * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
    714      * ---
    715      * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    716      * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    717      * where the context After_I is defined as:
    718      * The last preceding base character was an uppercase I, and there is no
    719      * intervening combining character class 230 (ABOVE).
    720      * </quote>
    721      *
    722      * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
    723      *
    724      * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    725      * # This matches the behavior of the canonically equivalent I-dot_above
    726      *
    727      * See also the description in this place in older versions of uchar.c (revision 1.100).
    728      *
    729      * Markus W. Scherer 2003-feb-15
    730      */
    731 
    732     /* Is preceded by base character 'I' with no intervening cc=230 ? */
    733     private final boolean isPrecededBy_I(ContextIterator iter) {
    734         int c;
    735         int dotType;
    736 
    737         if(iter==null) {
    738             return false;
    739         }
    740 
    741         for(iter.reset(-1); (c=iter.next())>=0;) {
    742             if(c==0x49) {
    743                 return true; /* preceded by I */
    744             }
    745             dotType=getDotType(c);
    746             if(dotType!=OTHER_ACCENT) {
    747                 return false; /* preceded by different base character (not I), or intervening cc==230 */
    748             }
    749         }
    750 
    751         return false; /* not preceded by I */
    752     }
    753 
    754     /* Is followed by one or more cc==230 ? */
    755     private final boolean isFollowedByMoreAbove(ContextIterator iter) {
    756         int c;
    757         int dotType;
    758 
    759         if(iter==null) {
    760             return false;
    761         }
    762 
    763         for(iter.reset(1); (c=iter.next())>=0;) {
    764             dotType=getDotType(c);
    765             if(dotType==ABOVE) {
    766                 return true; /* at least one cc==230 following */
    767             } else if(dotType!=OTHER_ACCENT) {
    768                 return false; /* next base character, no more cc==230 following */
    769             }
    770         }
    771 
    772         return false; /* no more cc==230 following */
    773     }
    774 
    775     /* Is followed by a dot above (without cc==230 in between) ? */
    776     private final boolean isFollowedByDotAbove(ContextIterator iter) {
    777         int c;
    778         int dotType;
    779 
    780         if(iter==null) {
    781             return false;
    782         }
    783 
    784         for(iter.reset(1); (c=iter.next())>=0; ) {
    785             if(c==0x307) {
    786                 return true;
    787             }
    788             dotType=getDotType(c);
    789             if(dotType!=OTHER_ACCENT) {
    790                 return false; /* next base character or cc==230 in between */
    791             }
    792         }
    793 
    794         return false; /* no dot above following */
    795     }
    796 
    797     private static final String
    798         iDot=       "i\u0307",
    799         jDot=       "j\u0307",
    800         iOgonekDot= "\u012f\u0307",
    801         iDotGrave=  "i\u0307\u0300",
    802         iDotAcute=  "i\u0307\u0301",
    803         iDotTilde=  "i\u0307\u0303";
    804 
    805     /**
    806      * Get the full lowercase mapping for c.
    807      *
    808      * @param c Character to be mapped.
    809      * @param iter Character iterator, used for context-sensitive mappings.
    810      *             See ContextIterator for details.
    811      *             If iter==null then a context-independent result is returned.
    812      * @param out If the mapping result is a string, then it is appended to out.
    813      * @param caseLocale Case locale value from ucase_getCaseLocale().
    814      * @return Output code point or string length, see MAX_STRING_LENGTH.
    815      *
    816      * @see ContextIterator
    817      * @see #MAX_STRING_LENGTH
    818      * @internal
    819      */
    820     public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) {
    821         int result, props;
    822 
    823         result=c;
    824         props=trie.get(c);
    825         if(!propsHasException(props)) {
    826             if(getTypeFromProps(props)>=UPPER) {
    827                 result=c+getDelta(props);
    828             }
    829         } else {
    830             int excOffset=getExceptionsOffset(props), excOffset2;
    831             int excWord=exceptions.charAt(excOffset++);
    832             int full;
    833 
    834             excOffset2=excOffset;
    835 
    836             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
    837                 /* use hardcoded conditions and mappings */
    838                 /*
    839                  * Test for conditional mappings first
    840                  *   (otherwise the unconditional default mappings are always taken),
    841                  * then test for characters that have unconditional mappings in SpecialCasing.txt,
    842                  * then get the UnicodeData.txt mappings.
    843                  */
    844                 if( caseLocale==LOC_LITHUANIAN &&
    845                         /* base characters, find accents above */
    846                         (((c==0x49 || c==0x4a || c==0x12e) &&
    847                             isFollowedByMoreAbove(iter)) ||
    848                         /* precomposed with accent above, no need to find one */
    849                         (c==0xcc || c==0xcd || c==0x128))
    850                 ) {
    851                     /*
    852                         # Lithuanian
    853 
    854                         # Lithuanian retains the dot in a lowercase i when followed by accents.
    855 
    856                         # Introduce an explicit dot above when lowercasing capital I's and J's
    857                         # whenever there are more accents above.
    858                         # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
    859 
    860                         0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
    861                         004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
    862                         012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
    863                         00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
    864                         00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
    865                         0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
    866                      */
    867                     try {
    868                         switch(c) {
    869                         case 0x49:  /* LATIN CAPITAL LETTER I */
    870                             out.append(iDot);
    871                             return 2;
    872                         case 0x4a:  /* LATIN CAPITAL LETTER J */
    873                             out.append(jDot);
    874                             return 2;
    875                         case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
    876                             out.append(iOgonekDot);
    877                             return 2;
    878                         case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
    879                             out.append(iDotGrave);
    880                             return 3;
    881                         case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
    882                             out.append(iDotAcute);
    883                             return 3;
    884                         case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
    885                             out.append(iDotTilde);
    886                             return 3;
    887                         default:
    888                             return 0; /* will not occur */
    889                         }
    890                     } catch (IOException e) {
    891                         throw new ICUUncheckedIOException(e);
    892                     }
    893                 /* # Turkish and Azeri */
    894                 } else if(caseLocale==LOC_TURKISH && c==0x130) {
    895                     /*
    896                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    897                         # The following rules handle those cases.
    898 
    899                         0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
    900                         0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
    901                      */
    902                     return 0x69;
    903                 } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
    904                     /*
    905                         # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    906                         # This matches the behavior of the canonically equivalent I-dot_above
    907 
    908                         0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    909                         0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    910                      */
    911                     return 0; /* remove the dot (continue without output) */
    912                 } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
    913                     /*
    914                         # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
    915 
    916                         0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
    917                         0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
    918                      */
    919                     return 0x131;
    920                 } else if(c==0x130) {
    921                     /*
    922                         # Preserve canonical equivalence for I with dot. Turkic is handled below.
    923 
    924                         0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    925                      */
    926                     try {
    927                         out.append(iDot);
    928                         return 2;
    929                     } catch (IOException e) {
    930                         throw new ICUUncheckedIOException(e);
    931                     }
    932                 } else if(  c==0x3a3 &&
    933                             !isFollowedByCasedLetter(iter, 1) &&
    934                             isFollowedByCasedLetter(iter, -1) /* -1=preceded */
    935                 ) {
    936                     /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
    937                     /*
    938                         # Special case for final form of sigma
    939 
    940                         03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    941                      */
    942                     return 0x3c2; /* greek small final sigma */
    943                 } else {
    944                     /* no known conditional special case mapping, use a normal mapping */
    945                 }
    946             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
    947                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
    948                 full=(int)value&FULL_LOWER;
    949                 if(full!=0) {
    950                     /* start of full case mapping strings */
    951                     excOffset=(int)(value>>32)+1;
    952 
    953                     try {
    954                         // append the lowercase mapping
    955                         out.append(exceptions, excOffset, excOffset+full);
    956 
    957                         /* return the string length */
    958                         return full;
    959                     } catch (IOException e) {
    960                         throw new ICUUncheckedIOException(e);
    961                     }
    962                 }
    963             }
    964 
    965             if(hasSlot(excWord, EXC_LOWER)) {
    966                 result=getSlotValue(excWord, EXC_LOWER, excOffset2);
    967             }
    968         }
    969 
    970         return (result==c) ? ~result : result;
    971     }
    972 
    973     /* internal */
    974     private final int toUpperOrTitle(int c, ContextIterator iter,
    975                                      Appendable out,
    976                                      int loc,
    977                                      boolean upperNotTitle) {
    978         int result;
    979         int props;
    980 
    981         result=c;
    982         props=trie.get(c);
    983         if(!propsHasException(props)) {
    984             if(getTypeFromProps(props)==LOWER) {
    985                 result=c+getDelta(props);
    986             }
    987         } else {
    988             int excOffset=getExceptionsOffset(props), excOffset2;
    989             int excWord=exceptions.charAt(excOffset++);
    990             int full, index;
    991 
    992             excOffset2=excOffset;
    993 
    994             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
    995                 /* use hardcoded conditions and mappings */
    996                 if(loc==LOC_TURKISH && c==0x69) {
    997                     /*
    998                         # Turkish and Azeri
    999 
   1000                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   1001                         # The following rules handle those cases.
   1002 
   1003                         # When uppercasing, i turns into a dotted capital I
   1004 
   1005                         0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
   1006                         0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
   1007                     */
   1008                     return 0x130;
   1009                 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
   1010                     /*
   1011                         # Lithuanian
   1012 
   1013                         # Lithuanian retains the dot in a lowercase i when followed by accents.
   1014 
   1015                         # Remove DOT ABOVE after "i" with upper or titlecase
   1016 
   1017                         0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   1018                      */
   1019                     return 0; /* remove the dot (continue without output) */
   1020                 } else {
   1021                     /* no known conditional special case mapping, use a normal mapping */
   1022                 }
   1023             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
   1024                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
   1025                 full=(int)value&0xffff;
   1026 
   1027                 /* start of full case mapping strings */
   1028                 excOffset=(int)(value>>32)+1;
   1029 
   1030                 /* skip the lowercase and case-folding result strings */
   1031                 excOffset+=full&FULL_LOWER;
   1032                 full>>=4;
   1033                 excOffset+=full&0xf;
   1034                 full>>=4;
   1035 
   1036                 if(upperNotTitle) {
   1037                     full&=0xf;
   1038                 } else {
   1039                     /* skip the uppercase result string */
   1040                     excOffset+=full&0xf;
   1041                     full=(full>>4)&0xf;
   1042                 }
   1043 
   1044                 if(full!=0) {
   1045                     try {
   1046                         // append the result string
   1047                         out.append(exceptions, excOffset, excOffset+full);
   1048 
   1049                         /* return the string length */
   1050                         return full;
   1051                     } catch (IOException e) {
   1052                         throw new ICUUncheckedIOException(e);
   1053                     }
   1054                 }
   1055             }
   1056 
   1057             if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
   1058                 index=EXC_TITLE;
   1059             } else if(hasSlot(excWord, EXC_UPPER)) {
   1060                 /* here, titlecase is same as uppercase */
   1061                 index=EXC_UPPER;
   1062             } else {
   1063                 return ~c;
   1064             }
   1065             result=getSlotValue(excWord, index, excOffset2);
   1066         }
   1067 
   1068         return (result==c) ? ~result : result;
   1069     }
   1070 
   1071     public final int toFullUpper(int c, ContextIterator iter,
   1072                                  Appendable out,
   1073                                  int caseLocale) {
   1074         return toUpperOrTitle(c, iter, out, caseLocale, true);
   1075     }
   1076 
   1077     public final int toFullTitle(int c, ContextIterator iter,
   1078                                  Appendable out,
   1079                                  int caseLocale) {
   1080         return toUpperOrTitle(c, iter, out, caseLocale, false);
   1081     }
   1082 
   1083     /* case folding ------------------------------------------------------------- */
   1084 
   1085     /*
   1086      * Case folding is similar to lowercasing.
   1087      * The result may be a simple mapping, i.e., a single code point, or
   1088      * a full mapping, i.e., a string.
   1089      * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1090      * then only the lowercase mapping is stored.
   1091      *
   1092      * Some special cases are hardcoded because their conditions cannot be
   1093      * parsed and processed from CaseFolding.txt.
   1094      *
   1095      * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1096 
   1097     # C: common case folding, common mappings shared by both simple and full mappings.
   1098     # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1099     # S: simple case folding, mappings to single characters where different from F.
   1100     # T: special case for uppercase I and dotted uppercase I
   1101     #    - For non-Turkic languages, this mapping is normally not used.
   1102     #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1103     #
   1104     # Usage:
   1105     #  A. To do a simple case folding, use the mappings with status C + S.
   1106     #  B. To do a full case folding, use the mappings with status C + F.
   1107     #
   1108     #    The mappings with status T can be used or omitted depending on the desired case-folding
   1109     #    behavior. (The default option is to exclude them.)
   1110 
   1111      * Unicode 3.2 has 'T' mappings as follows:
   1112 
   1113     0049; T; 0131; # LATIN CAPITAL LETTER I
   1114     0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1115 
   1116      * while the default mappings for these code points are:
   1117 
   1118     0049; C; 0069; # LATIN CAPITAL LETTER I
   1119     0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1120 
   1121      * U+0130 has no simple case folding (simple-case-folds to itself).
   1122      */
   1123 
   1124     /**
   1125      * Bit mask for getting just the options from a string compare options word
   1126      * that are relevant for case folding (of a single string or code point).
   1127      *
   1128      * Currently only bit 0 for FOLD_CASE_EXCLUDE_SPECIAL_I.
   1129      * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
   1130      * It is conceivable that at some point we might want the option to use only simple case foldings
   1131      * when operating on strings.
   1132      *
   1133      * @internal
   1134      */
   1135     private static final int FOLD_CASE_OPTIONS_MASK = 7;
   1136 
   1137     /* return the simple case folding mapping for c */
   1138     public final int fold(int c, int options) {
   1139         int props=trie.get(c);
   1140         if(!propsHasException(props)) {
   1141             if(getTypeFromProps(props)>=UPPER) {
   1142                 c+=getDelta(props);
   1143             }
   1144         } else {
   1145             int excOffset=getExceptionsOffset(props);
   1146             int excWord=exceptions.charAt(excOffset++);
   1147             int index;
   1148             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
   1149                 /* special case folding mappings, hardcoded */
   1150                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
   1151                     /* default mappings */
   1152                     if(c==0x49) {
   1153                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1154                         return 0x69;
   1155                     } else if(c==0x130) {
   1156                         /* no simple case folding for U+0130 */
   1157                         return c;
   1158                     }
   1159                 } else {
   1160                     /* Turkic mappings */
   1161                     if(c==0x49) {
   1162                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1163                         return 0x131;
   1164                     } else if(c==0x130) {
   1165                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1166                         return 0x69;
   1167                     }
   1168                 }
   1169             }
   1170             if(hasSlot(excWord, EXC_FOLD)) {
   1171                 index=EXC_FOLD;
   1172             } else if(hasSlot(excWord, EXC_LOWER)) {
   1173                 index=EXC_LOWER;
   1174             } else {
   1175                 return c;
   1176             }
   1177             c=getSlotValue(excWord, index, excOffset);
   1178         }
   1179         return c;
   1180     }
   1181 
   1182     /*
   1183      * Issue for canonical caseless match (UAX #21):
   1184      * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1185      * canonical equivalence, unlike default-option casefolding.
   1186      * For example, I-grave and I + grave fold to strings that are not canonically
   1187      * equivalent.
   1188      * For more details, see the comment in unorm_compare() in unorm.cpp
   1189      * and the intermediate prototype changes for Jitterbug 2021.
   1190      * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1191      *
   1192      * This did not get fixed because it appears that it is not possible to fix
   1193      * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1194      * together in a way that they still fold to common result strings.
   1195      */
   1196 
   1197     public final int toFullFolding(int c, Appendable out, int options) {
   1198         int result;
   1199         int props;
   1200 
   1201         result=c;
   1202         props=trie.get(c);
   1203         if(!propsHasException(props)) {
   1204             if(getTypeFromProps(props)>=UPPER) {
   1205                 result=c+getDelta(props);
   1206             }
   1207         } else {
   1208             int excOffset=getExceptionsOffset(props), excOffset2;
   1209             int excWord=exceptions.charAt(excOffset++);
   1210             int full, index;
   1211 
   1212             excOffset2=excOffset;
   1213 
   1214             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
   1215                 /* use hardcoded conditions and mappings */
   1216                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
   1217                     /* default mappings */
   1218                     if(c==0x49) {
   1219                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1220                         return 0x69;
   1221                     } else if(c==0x130) {
   1222                         /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1223                         try {
   1224                             out.append(iDot);
   1225                             return 2;
   1226                         } catch (IOException e) {
   1227                             throw new ICUUncheckedIOException(e);
   1228                         }
   1229                     }
   1230                 } else {
   1231                     /* Turkic mappings */
   1232                     if(c==0x49) {
   1233                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1234                         return 0x131;
   1235                     } else if(c==0x130) {
   1236                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1237                         return 0x69;
   1238                     }
   1239                 }
   1240             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
   1241                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
   1242                 full=(int)value&0xffff;
   1243 
   1244                 /* start of full case mapping strings */
   1245                 excOffset=(int)(value>>32)+1;
   1246 
   1247                 /* skip the lowercase result string */
   1248                 excOffset+=full&FULL_LOWER;
   1249                 full=(full>>4)&0xf;
   1250 
   1251                 if(full!=0) {
   1252                     try {
   1253                         // append the result string
   1254                         out.append(exceptions, excOffset, excOffset+full);
   1255 
   1256                         /* return the string length */
   1257                         return full;
   1258                     } catch (IOException e) {
   1259                         throw new ICUUncheckedIOException(e);
   1260                     }
   1261                 }
   1262             }
   1263 
   1264             if(hasSlot(excWord, EXC_FOLD)) {
   1265                 index=EXC_FOLD;
   1266             } else if(hasSlot(excWord, EXC_LOWER)) {
   1267                 index=EXC_LOWER;
   1268             } else {
   1269                 return ~c;
   1270             }
   1271             result=getSlotValue(excWord, index, excOffset2);
   1272         }
   1273 
   1274         return (result==c) ? ~result : result;
   1275     }
   1276 
   1277     /* case mapping properties API ---------------------------------------------- */
   1278 
   1279     /*
   1280      * We need a StringBuilder for multi-code point output from the
   1281      * full case mapping functions. However, we do not actually use that output,
   1282      * we just check whether the input character was mapped to anything else.
   1283      * We use a shared StringBuilder to avoid allocating a new one in each call.
   1284      * We remove its contents each time so that it does not grow large over time.
   1285      *
   1286      * @internal
   1287      */
   1288     public static final StringBuilder dummyStringBuilder = new StringBuilder();
   1289 
   1290     public final boolean hasBinaryProperty(int c, int which) {
   1291         switch(which) {
   1292         case UProperty.LOWERCASE:
   1293             return LOWER==getType(c);
   1294         case UProperty.UPPERCASE:
   1295             return UPPER==getType(c);
   1296         case UProperty.SOFT_DOTTED:
   1297             return isSoftDotted(c);
   1298         case UProperty.CASE_SENSITIVE:
   1299             return isCaseSensitive(c);
   1300         case UProperty.CASED:
   1301             return NONE!=getType(c);
   1302         case UProperty.CASE_IGNORABLE:
   1303             return (getTypeOrIgnorable(c)>>2)!=0;
   1304         /*
   1305          * Note: The following Changes_When_Xyz are defined as testing whether
   1306          * the NFD form of the input changes when Xyz-case-mapped.
   1307          * However, this simpler implementation of these properties,
   1308          * ignoring NFD, passes the tests.
   1309          * The implementation needs to be changed if the tests start failing.
   1310          * When that happens, optimizations should be used to work with the
   1311          * per-single-code point ucase_toFullXyz() functions unless
   1312          * the NFD form has more than one code point,
   1313          * and the property starts set needs to be the union of the
   1314          * start sets for normalization and case mappings.
   1315          */
   1316         case UProperty.CHANGES_WHEN_LOWERCASED:
   1317             dummyStringBuilder.setLength(0);
   1318             return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1319         case UProperty.CHANGES_WHEN_UPPERCASED:
   1320             dummyStringBuilder.setLength(0);
   1321             return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1322         case UProperty.CHANGES_WHEN_TITLECASED:
   1323             dummyStringBuilder.setLength(0);
   1324             return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1325         /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
   1326         case UProperty.CHANGES_WHEN_CASEMAPPED:
   1327             dummyStringBuilder.setLength(0);
   1328             return
   1329                 toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
   1330                 toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
   1331                 toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
   1332         default:
   1333             return false;
   1334         }
   1335     }
   1336 
   1337     // data members -------------------------------------------------------- ***
   1338     private int indexes[];
   1339     private String exceptions;
   1340     private char unfold[];
   1341 
   1342     private Trie2_16 trie;
   1343 
   1344     // data format constants ----------------------------------------------- ***
   1345     private static final String DATA_NAME="ucase";
   1346     private static final String DATA_TYPE="icu";
   1347     private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;
   1348 
   1349     /* format "cAsE" */
   1350     private static final int FMT=0x63415345;
   1351 
   1352     /* indexes into indexes[] */
   1353     //private static final int IX_INDEX_TOP=0;
   1354     //private static final int IX_LENGTH=1;
   1355     private static final int IX_TRIE_SIZE=2;
   1356     private static final int IX_EXC_LENGTH=3;
   1357     private static final int IX_UNFOLD_LENGTH=4;
   1358 
   1359     //private static final int IX_MAX_FULL_LENGTH=15;
   1360     private static final int IX_TOP=16;
   1361 
   1362     // definitions for 16-bit case properties word ------------------------- ***
   1363 
   1364     /* 2-bit constants for types of cased characters */
   1365     public static final int TYPE_MASK=3;
   1366     public static final int NONE=0;
   1367     public static final int LOWER=1;
   1368     public static final int UPPER=2;
   1369     public static final int TITLE=3;
   1370 
   1371     /** @return NONE, LOWER, UPPER, TITLE */
   1372     private static final int getTypeFromProps(int props) {
   1373         return props&TYPE_MASK;
   1374     }
   1375 
   1376     /** @return like getTypeFromProps() but also sets IGNORABLE if props indicate case-ignorable */
   1377     private static final int getTypeAndIgnorableFromProps(int props) {
   1378         return props&7;
   1379     }
   1380 
    /*
     * Single-bit flags of the properties word, directly above the 2-bit type.
     * IGNORABLE (bit 2) is the bit that getTypeAndIgnorableFromProps() returns
     * together with the type (mask 7).
     * NOTE(review): EXCEPTION (bit 4) presumably selects between the
     * "no exception" (delta) and "exception" (index) layouts of the upper bits
     * described further below -- confirm against the lookup code.
     */
    static final int IGNORABLE=4;
    private static final int SENSITIVE=     8;
    private static final int EXCEPTION=     0x10;

    /* 2-bit field in bits 6..5; the values below are the four possible settings */
    private static final int DOT_MASK=      0x60;
    //private static final int NO_DOT=        0;      /* normal characters with cc=0 */
    private static final int SOFT_DOTTED=   0x20;   /* soft-dotted characters with cc=0 */
    private static final int ABOVE=         0x40;   /* "above" accents with cc=230 */
    private static final int OTHER_ACCENT=  0x60;   /* other accent character (0<cc!=230) */
   1390 
   1391     /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
   1392     private static final int DELTA_SHIFT=   7;
   1393     //private static final int DELTA_MASK=    0xff80;
   1394     //private static final int MAX_DELTA=     0xff;
   1395     //private static final int MIN_DELTA=     (-MAX_DELTA-1);
   1396 
   1397     private static final int getDelta(int props) {
   1398         return (short)props>>DELTA_SHIFT;
   1399     }
   1400 
    /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
    private static final int EXC_SHIFT=     5;
    //private static final int EXC_MASK=      0xffe0;
    //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);

    /* definitions for 16-bit main exceptions word ------------------------------ */

    /*
     * first 8 bits indicate values in optional slots
     * The constants below are slot numbers 0..7, not bit masks.
     * NOTE(review): presumably slot i is present when bit i of the word is set -- confirm in the slot accessors.
     */
    private static final int EXC_LOWER=0;
    private static final int EXC_FOLD=1;
    private static final int EXC_UPPER=2;
    private static final int EXC_TITLE=3;
    //private static final int EXC_4=4;           /* reserved */
    //private static final int EXC_5=5;           /* reserved */
    private static final int EXC_CLOSURE=6;
    private static final int EXC_FULL_MAPPINGS=7;
    //private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */

    /* each slot is 2 uint16_t instead of 1 (flag is bit 8 of the exceptions word) */
    private static final int EXC_DOUBLE_SLOTS=          0x100;

    /* reserved: exception bits 11..9 */

    /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT, i.e. 0x60<<7 == 0x3000 (bits 13..12) */
    private static final int EXC_DOT_SHIFT=7;

    /* normally stored in the main word, but pushed out for larger exception indexes */
    //private static final int EXC_DOT_MASK=              0x3000;
    //private static final int EXC_NO_DOT=                0;
    //private static final int EXC_SOFT_DOTTED=           0x1000;
    //private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
    //private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */

    /* complex/conditional mappings (bits 14 and 15 of the exceptions word) */
    private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
    private static final int EXC_CONDITIONAL_FOLD=      0x8000;
   1437 
    /*
     * definitions for lengths word for full case mappings
     * Each mask covers one 4-bit field; only the lowest field (FULL_LOWER)
     * is an active constant here, the other three are kept for reference.
     */
    private static final int FULL_LOWER=    0xf;
    //private static final int FULL_FOLDING=  0xf0;
    //private static final int FULL_UPPER=    0xf00;
    //private static final int FULL_TITLE=    0xf000;

    /* maximum lengths */
    //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
    private static final int CLOSURE_MAX_LENGTH=0xf;
   1447 
    /*
     * constants for reverse case folding ("unfold") data
     * NOTE(review): presumably these are positions of header values inside
     * unfold[] (row count, row width, string width) -- confirm at the use sites.
     */
    private static final int UNFOLD_ROWS=0;
    private static final int UNFOLD_ROW_WIDTH=1;
    private static final int UNFOLD_STRING_WIDTH=2;
   1452 
   1453     /*
   1454      * public singleton instance
   1455      */
   1456     public static final UCaseProps INSTANCE;
   1457 
   1458     // This static initializer block must be placed after
   1459     // other static member initialization
   1460     static {
   1461         try {
   1462             INSTANCE = new UCaseProps();
   1463         } catch (IOException e) {
   1464             throw new ICUUncheckedIOException(e);
   1465         }
   1466     }
   1467 }
   1468