1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * 7 * Copyright (C) 2004-2015, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 * 10 ******************************************************************************* 11 * file name: UCaseProps.java 12 * encoding: US-ASCII 13 * tab size: 8 (not used) 14 * indentation:4 15 * 16 * created on: 2005jan29 17 * created by: Markus W. Scherer 18 * 19 * Low-level Unicode character/string case mapping code. 20 * Java port of ucase.h/.c. 21 */ 22 23 package android.icu.impl; 24 25 import java.io.IOException; 26 import java.nio.ByteBuffer; 27 import java.util.Iterator; 28 import java.util.Locale; 29 30 import android.icu.lang.UCharacter; 31 import android.icu.lang.UProperty; 32 import android.icu.text.UTF16; 33 import android.icu.text.UnicodeSet; 34 import android.icu.util.ICUUncheckedIOException; 35 import android.icu.util.ULocale; 36 37 /** 38 * @hide Only a subset of ICU is exposed in Android 39 */ 40 public final class UCaseProps { 41 42 // constructors etc. 
--------------------------------------------------- *** 43 44 // port of ucase_openProps() 45 private UCaseProps() throws IOException { 46 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME); 47 readData(bytes); 48 } 49 50 private final void readData(ByteBuffer bytes) throws IOException { 51 // read the header 52 ICUBinary.readHeader(bytes, FMT, new IsAcceptable()); 53 54 // read indexes[] 55 int count=bytes.getInt(); 56 if(count<IX_TOP) { 57 throw new IOException("indexes[0] too small in "+DATA_FILE_NAME); 58 } 59 indexes=new int[count]; 60 61 indexes[0]=count; 62 for(int i=1; i<count; ++i) { 63 indexes[i]=bytes.getInt(); 64 } 65 66 // read the trie 67 trie=Trie2_16.createFromSerialized(bytes); 68 int expectedTrieLength=indexes[IX_TRIE_SIZE]; 69 int trieLength=trie.getSerializedLength(); 70 if(trieLength>expectedTrieLength) { 71 throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie"); 72 } 73 // skip padding after trie bytes 74 ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength); 75 76 // read exceptions[] 77 count=indexes[IX_EXC_LENGTH]; 78 if(count>0) { 79 exceptions=ICUBinary.getString(bytes, count, 0); 80 } 81 82 // read unfold[] 83 count=indexes[IX_UNFOLD_LENGTH]; 84 if(count>0) { 85 unfold=ICUBinary.getChars(bytes, count, 0); 86 } 87 } 88 89 // implement ICUBinary.Authenticate 90 private final static class IsAcceptable implements ICUBinary.Authenticate { 91 @Override 92 public boolean isDataVersionAcceptable(byte version[]) { 93 return version[0]==3; 94 } 95 } 96 97 // set of property starts for UnicodeSet ------------------------------- *** 98 99 public final void addPropertyStarts(UnicodeSet set) { 100 /* add the start code point of each same-value range of the trie */ 101 Iterator<Trie2.Range> trieIterator=trie.iterator(); 102 Trie2.Range range; 103 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 104 set.add(range.startCodePoint); 105 } 106 107 /* add code points with hardcoded properties, plus 
the ones following them */ 108 109 /* (none right now, see comment below) */ 110 111 /* 112 * Omit code points with hardcoded specialcasing properties 113 * because we do not build property UnicodeSets for them right now. 114 */ 115 } 116 117 // data access primitives ---------------------------------------------- *** 118 private static final int getExceptionsOffset(int props) { 119 return props>>EXC_SHIFT; 120 } 121 122 private static final boolean propsHasException(int props) { 123 return (props&EXCEPTION)!=0; 124 } 125 126 /* number of bits in an 8-bit integer value */ 127 private static final byte flagsOffset[/*256*/]={ 128 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 129 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 130 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 131 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 132 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 133 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 134 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 135 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 136 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 137 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 138 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 139 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 140 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 141 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 142 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 143 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 144 }; 145 146 private static final boolean hasSlot(int flags, int index) { 147 return (flags&(1<<index))!=0; 148 } 149 private static final byte slotOffset(int flags, int index) { 150 return flagsOffset[flags&((1<<index)-1)]; 151 } 152 153 /* 154 * Get the value of an optional-value slot where hasSlot(excWord, index). 
155 * 156 * @param excWord (in) initial exceptions word 157 * @param index (in) desired slot index 158 * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++); 159 * @return bits 31..0: slot value 160 * 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot 161 */ 162 private final long getSlotValueAndOffset(int excWord, int index, int excOffset) { 163 long value; 164 if((excWord&EXC_DOUBLE_SLOTS)==0) { 165 excOffset+=slotOffset(excWord, index); 166 value=exceptions.charAt(excOffset); 167 } else { 168 excOffset+=2*slotOffset(excWord, index); 169 value=exceptions.charAt(excOffset++); 170 value=(value<<16)|exceptions.charAt(excOffset); 171 } 172 return value |((long)excOffset<<32); 173 } 174 175 /* same as getSlotValueAndOffset() but does not return the slot offset */ 176 private final int getSlotValue(int excWord, int index, int excOffset) { 177 int value; 178 if((excWord&EXC_DOUBLE_SLOTS)==0) { 179 excOffset+=slotOffset(excWord, index); 180 value=exceptions.charAt(excOffset); 181 } else { 182 excOffset+=2*slotOffset(excWord, index); 183 value=exceptions.charAt(excOffset++); 184 value=(value<<16)|exceptions.charAt(excOffset); 185 } 186 return value; 187 } 188 189 // simple case mappings ------------------------------------------------ *** 190 191 public final int tolower(int c) { 192 int props=trie.get(c); 193 if(!propsHasException(props)) { 194 if(getTypeFromProps(props)>=UPPER) { 195 c+=getDelta(props); 196 } 197 } else { 198 int excOffset=getExceptionsOffset(props); 199 int excWord=exceptions.charAt(excOffset++); 200 if(hasSlot(excWord, EXC_LOWER)) { 201 c=getSlotValue(excWord, EXC_LOWER, excOffset); 202 } 203 } 204 return c; 205 } 206 207 public final int toupper(int c) { 208 int props=trie.get(c); 209 if(!propsHasException(props)) { 210 if(getTypeFromProps(props)==LOWER) { 211 c+=getDelta(props); 212 } 213 } else { 214 int excOffset=getExceptionsOffset(props); 215 int 
excWord=exceptions.charAt(excOffset++); 216 if(hasSlot(excWord, EXC_UPPER)) { 217 c=getSlotValue(excWord, EXC_UPPER, excOffset); 218 } 219 } 220 return c; 221 } 222 223 public final int totitle(int c) { 224 int props=trie.get(c); 225 if(!propsHasException(props)) { 226 if(getTypeFromProps(props)==LOWER) { 227 c+=getDelta(props); 228 } 229 } else { 230 int excOffset=getExceptionsOffset(props); 231 int excWord=exceptions.charAt(excOffset++); 232 int index; 233 if(hasSlot(excWord, EXC_TITLE)) { 234 index=EXC_TITLE; 235 } else if(hasSlot(excWord, EXC_UPPER)) { 236 index=EXC_UPPER; 237 } else { 238 return c; 239 } 240 c=getSlotValue(excWord, index, excOffset); 241 } 242 return c; 243 } 244 245 /** 246 * Adds all simple case mappings and the full case folding for c to sa, 247 * and also adds special case closure mappings. 248 * c itself is not added. 249 * For example, the mappings 250 * - for s include long s 251 * - for sharp s include ss 252 * - for k include the Kelvin sign 253 */ 254 public final void addCaseClosure(int c, UnicodeSet set) { 255 /* 256 * Hardcode the case closure of i and its relatives and ignore the 257 * data file data for these characters. 258 * The Turkic dotless i and dotted I with their case mapping conditions 259 * and case folding option make the related characters behave specially. 260 * This code matches their closure behavior to their case folding behavior. 
261 */ 262 263 switch(c) { 264 case 0x49: 265 /* regular i and I are in one equivalence class */ 266 set.add(0x69); 267 return; 268 case 0x69: 269 set.add(0x49); 270 return; 271 case 0x130: 272 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ 273 set.add(iDot); 274 return; 275 case 0x131: 276 /* dotless i is in a class by itself */ 277 return; 278 default: 279 /* otherwise use the data file data */ 280 break; 281 } 282 283 int props=trie.get(c); 284 if(!propsHasException(props)) { 285 if(getTypeFromProps(props)!=NONE) { 286 /* add the one simple case mapping, no matter what type it is */ 287 int delta=getDelta(props); 288 if(delta!=0) { 289 set.add(c+delta); 290 } 291 } 292 } else { 293 /* 294 * c has exceptions, so there may be multiple simple and/or 295 * full case mappings. Add them all. 296 */ 297 int excOffset0, excOffset=getExceptionsOffset(props); 298 int closureOffset; 299 int excWord=exceptions.charAt(excOffset++); 300 int index, closureLength, fullLength, length; 301 302 excOffset0=excOffset; 303 304 /* add all simple case mappings */ 305 for(index=EXC_LOWER; index<=EXC_TITLE; ++index) { 306 if(hasSlot(excWord, index)) { 307 excOffset=excOffset0; 308 c=getSlotValue(excWord, index, excOffset); 309 set.add(c); 310 } 311 } 312 313 /* get the closure string pointer & length */ 314 if(hasSlot(excWord, EXC_CLOSURE)) { 315 excOffset=excOffset0; 316 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset); 317 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */ 318 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */ 319 } else { 320 closureLength=0; 321 closureOffset=0; 322 } 323 324 /* add the full case folding */ 325 if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 326 excOffset=excOffset0; 327 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 328 fullLength=(int)value; 329 330 /* start of full case mapping strings */ 331 
excOffset=(int)(value>>32)+1; 332 333 fullLength&=0xffff; /* bits 16 and higher are reserved */ 334 335 /* skip the lowercase result string */ 336 excOffset+=fullLength&FULL_LOWER; 337 fullLength>>=4; 338 339 /* add the full case folding string */ 340 length=fullLength&0xf; 341 if(length!=0) { 342 set.add(exceptions.substring(excOffset, excOffset+length)); 343 excOffset+=length; 344 } 345 346 /* skip the uppercase and titlecase strings */ 347 fullLength>>=4; 348 excOffset+=fullLength&0xf; 349 fullLength>>=4; 350 excOffset+=fullLength; 351 352 closureOffset=excOffset; /* behind full case mappings */ 353 } 354 355 /* add each code point in the closure string */ 356 int limit=closureOffset+closureLength; 357 for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) { 358 c=exceptions.codePointAt(index); 359 set.add(c); 360 } 361 } 362 } 363 364 /* 365 * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated 366 * must be s.length()>0 and max>0 and s.length()<=max 367 */ 368 private final int strcmpMax(String s, int unfoldOffset, int max) { 369 int i1, length, c1, c2; 370 371 length=s.length(); 372 max-=length; /* we require length<=max, so no need to decrement max in the loop */ 373 i1=0; 374 do { 375 c1=s.charAt(i1++); 376 c2=unfold[unfoldOffset++]; 377 if(c2==0) { 378 return 1; /* reached the end of t but not of s */ 379 } 380 c1-=c2; 381 if(c1!=0) { 382 return c1; /* return difference result */ 383 } 384 } while(--length>0); 385 /* ends with length==0 */ 386 387 if(max==0 || unfold[unfoldOffset]==0) { 388 return 0; /* equal to length of both strings */ 389 } else { 390 return -max; /* return lengh difference */ 391 } 392 } 393 394 /** 395 * Maps the string to single code points and adds the associated case closure 396 * mappings. 397 * The string is mapped to code points if it is their full case folding string. 
398 * In other words, this performs a reverse full case folding and then 399 * adds the case closure items of the resulting code points. 400 * If the string is found and its closure applied, then 401 * the string itself is added as well as part of its code points' closure. 402 * 403 * @return true if the string was found 404 */ 405 public final boolean addStringCaseClosure(String s, UnicodeSet set) { 406 int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth; 407 408 if(unfold==null || s==null) { 409 return false; /* no reverse case folding data, or no string */ 410 } 411 length=s.length(); 412 if(length<=1) { 413 /* the string is too short to find any match */ 414 /* 415 * more precise would be: 416 * if(!u_strHasMoreChar32Than(s, length, 1)) 417 * but this does not make much practical difference because 418 * a single supplementary code point would just not be found 419 */ 420 return false; 421 } 422 423 unfoldRows=unfold[UNFOLD_ROWS]; 424 unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH]; 425 unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH]; 426 //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth; 427 428 if(length>unfoldStringWidth) { 429 /* the string is too long to find any match */ 430 return false; 431 } 432 433 /* do a binary search for the string */ 434 start=0; 435 limit=unfoldRows; 436 while(start<limit) { 437 i=(start+limit)/2; 438 unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above 439 result=strcmpMax(s, unfoldOffset, unfoldStringWidth); 440 441 if(result==0) { 442 /* found the string: add each code point, and its case closure */ 443 int c; 444 445 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) { 446 c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i); 447 set.add(c); 448 addCaseClosure(c, set); 449 } 450 return true; 451 } else if(result<0) { 452 limit=i; 453 } else /* result>0 */ { 454 start=i+1; 455 } 456 } 457 458 return false; /* string not 
found */ 459 } 460 461 /** @return NONE, LOWER, UPPER, TITLE */ 462 public final int getType(int c) { 463 return getTypeFromProps(trie.get(c)); 464 } 465 466 /** @return like getType() but also sets IGNORABLE if c is case-ignorable */ 467 public final int getTypeOrIgnorable(int c) { 468 return getTypeAndIgnorableFromProps(trie.get(c)); 469 } 470 471 /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */ 472 public final int getDotType(int c) { 473 int props=trie.get(c); 474 if(!propsHasException(props)) { 475 return props&DOT_MASK; 476 } else { 477 return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK; 478 } 479 } 480 481 public final boolean isSoftDotted(int c) { 482 return getDotType(c)==SOFT_DOTTED; 483 } 484 485 public final boolean isCaseSensitive(int c) { 486 return (trie.get(c)&SENSITIVE)!=0; 487 } 488 489 // string casing ------------------------------------------------------- *** 490 491 /* 492 * These internal functions form the core of string case mappings. 493 * They map single code points to result code points or strings and take 494 * all necessary conditions (context, locale ID, options) into account. 495 * 496 * They do not iterate over the source or write to the destination 497 * so that the same functions are useful for non-standard string storage, 498 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 499 * For the same reason, the "surrounding text" context is passed in as a 500 * ContextIterator which does not make any assumptions about 501 * the underlying storage. 502 * 503 * This section contains helper functions that check for conditions 504 * in the input text surrounding the current code point 505 * according to SpecialCasing.txt. 
     *
     * Each helper function gets the index
     * - after the current code point if it looks at following text
     * - before the current code point if it looks at preceding text
     *
     * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
     *
     * Final_Sigma
     *   C is preceded by a sequence consisting of
     *     a cased letter and a case-ignorable sequence,
     *   and C is not followed by a sequence consisting of
     *     an ignorable sequence and then a cased letter.
     *
     * More_Above
     *   C is followed by one or more characters of combining class 230 (ABOVE)
     *   in the combining character sequence.
     *
     * After_Soft_Dotted
     *   The last preceding character with combining class of zero before C
     *   was Soft_Dotted,
     *   and there is no intervening combining character class 230 (ABOVE).
     *
     * Before_Dot
     *   C is followed by combining dot above (U+0307).
     *   Any sequence of characters with a combining class that is neither 0 nor 230
     *   may intervene between the current character and the combining dot above.
     *
     * The erratum from 2002-10-31 adds the condition
     *
     * After_I
     *   The last preceding base character was an uppercase I, and there is no
     *   intervening combining character class 230 (ABOVE).
     *
     *   (See Jitterbug 2344 and the comments on After_I below.)
     *
     * Helper definitions in Unicode 3.2 UAX 21:
     *
     * D1. A character C is defined to be cased
     *     if it meets any of the following criteria:
     *
     *   - The general category of C is Titlecase Letter (Lt)
     *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
     *   - Given D = NFD(C), then it is not the case that:
     *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
     *     (This third criterion does not add any characters to the list
     *      for Unicode 3.2. Ignored.)
     *
     * D2.
A character C is defined to be case-ignorable 554 * if it meets either of the following criteria: 555 * 556 * - The general category of C is 557 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or 558 * Letter Modifier (Lm), or Symbol Modifier (Sk) 559 * - C is one of the following characters 560 * U+0027 APOSTROPHE 561 * U+00AD SOFT HYPHEN (SHY) 562 * U+2019 RIGHT SINGLE QUOTATION MARK 563 * (the preferred character for apostrophe) 564 * 565 * D3. A case-ignorable sequence is a sequence of 566 * zero or more case-ignorable characters. 567 */ 568 569 /** 570 * Iterator for string case mappings, which need to look at the 571 * context (surrounding text) of a given character for conditional mappings. 572 * 573 * The iterator only needs to go backward or forward away from the 574 * character in question. It does not use any indexes on this interface. 575 * It does not support random access or an arbitrary change of 576 * iteration direction. 577 * 578 * The code point being case-mapped itself is never returned by 579 * this iterator. 580 */ 581 public interface ContextIterator { 582 /** 583 * Reset the iterator for forward or backward iteration. 584 * @param dir >0: Begin iterating forward from the first code point 585 * after the one that is being case-mapped. 586 * <0: Begin iterating backward from the first code point 587 * before the one that is being case-mapped. 588 */ 589 public void reset(int dir); 590 /** 591 * Iterate and return the next code point, moving in the direction 592 * determined by the reset() call. 593 * @return Next code point, or <0 when the iteration is done. 594 */ 595 public int next(); 596 } 597 598 /** 599 * For string case mappings, a single character (a code point) is mapped 600 * either to itself (in which case in-place mapping functions do nothing), 601 * or to another single code point, or to a string. 
602 * Aside from the string contents, these are indicated with a single int 603 * value as follows: 604 * 605 * Mapping to self: Negative values (~self instead of -self to support U+0000) 606 * 607 * Mapping to another code point: Positive values >MAX_STRING_LENGTH 608 * 609 * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is 610 * returned. Note that the string result may indeed have zero length. 611 */ 612 public static final int MAX_STRING_LENGTH=0x1f; 613 614 //ivate static final int LOC_UNKNOWN=0; 615 public static final int LOC_ROOT=1; 616 private static final int LOC_TURKISH=2; 617 private static final int LOC_LITHUANIAN=3; 618 static final int LOC_GREEK=4; 619 public static final int LOC_DUTCH=5; 620 621 public static final int getCaseLocale(Locale locale) { 622 return getCaseLocale(locale.getLanguage()); 623 } 624 public static final int getCaseLocale(ULocale locale) { 625 return getCaseLocale(locale.getLanguage()); 626 } 627 /** Accepts both 2- and 3-letter language subtags. */ 628 private static final int getCaseLocale(String language) { 629 // Check the subtag length to reduce the number of comparisons 630 // for locales without special behavior. 631 // Fastpath for English "en" which is often used for default (=root locale) case mappings, 632 // and for Chinese "zh": Very common but no special case mapping behavior. 
633 if(language.length()==2) { 634 if(language.equals("en") || language.charAt(0)>'t') { 635 return LOC_ROOT; 636 } else if(language.equals("tr") || language.equals("az")) { 637 return LOC_TURKISH; 638 } else if(language.equals("el")) { 639 return LOC_GREEK; 640 } else if(language.equals("lt")) { 641 return LOC_LITHUANIAN; 642 } else if(language.equals("nl")) { 643 return LOC_DUTCH; 644 } 645 } else if(language.length()==3) { 646 if(language.equals("tur") || language.equals("aze")) { 647 return LOC_TURKISH; 648 } else if(language.equals("ell")) { 649 return LOC_GREEK; 650 } else if(language.equals("lit")) { 651 return LOC_LITHUANIAN; 652 } else if(language.equals("nld")) { 653 return LOC_DUTCH; 654 } 655 } 656 return LOC_ROOT; 657 } 658 659 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */ 660 private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) { 661 int c; 662 663 if(iter==null) { 664 return false; 665 } 666 667 for(iter.reset(dir); (c=iter.next())>=0;) { 668 int type=getTypeOrIgnorable(c); 669 if((type&4)!=0) { 670 /* case-ignorable, continue with the loop */ 671 } else if(type!=NONE) { 672 return true; /* followed by cased letter */ 673 } else { 674 return false; /* uncased and not case-ignorable */ 675 } 676 } 677 678 return false; /* not followed by cased letter */ 679 } 680 681 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? 
*/ 682 private final boolean isPrecededBySoftDotted(ContextIterator iter) { 683 int c; 684 int dotType; 685 686 if(iter==null) { 687 return false; 688 } 689 690 for(iter.reset(-1); (c=iter.next())>=0;) { 691 dotType=getDotType(c); 692 if(dotType==SOFT_DOTTED) { 693 return true; /* preceded by TYPE_i */ 694 } else if(dotType!=OTHER_ACCENT) { 695 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ 696 } 697 } 698 699 return false; /* not preceded by TYPE_i */ 700 } 701 702 /* 703 * See Jitterbug 2344: 704 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above 705 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because 706 * we made those releases compatible with Unicode 3.2 which had not fixed 707 * a related bug in SpecialCasing.txt. 708 * 709 * From the Jitterbug 2344 text: 710 * ... this bug is listed as a Unicode erratum 711 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html 712 * <quote> 713 * There are two errors in SpecialCasing.txt. 714 * 1. Missing semicolons on two lines. ... [irrelevant for ICU] 715 * 2. An incorrect context definition. Correct as follows: 716 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE 717 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE 718 * --- 719 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 720 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 721 * where the context After_I is defined as: 722 * The last preceding base character was an uppercase I, and there is no 723 * intervening combining character class 230 (ABOVE). 724 * </quote> 725 * 726 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: 727 * 728 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
729 * # This matches the behavior of the canonically equivalent I-dot_above 730 * 731 * See also the description in this place in older versions of uchar.c (revision 1.100). 732 * 733 * Markus W. Scherer 2003-feb-15 734 */ 735 736 /* Is preceded by base character 'I' with no intervening cc=230 ? */ 737 private final boolean isPrecededBy_I(ContextIterator iter) { 738 int c; 739 int dotType; 740 741 if(iter==null) { 742 return false; 743 } 744 745 for(iter.reset(-1); (c=iter.next())>=0;) { 746 if(c==0x49) { 747 return true; /* preceded by I */ 748 } 749 dotType=getDotType(c); 750 if(dotType!=OTHER_ACCENT) { 751 return false; /* preceded by different base character (not I), or intervening cc==230 */ 752 } 753 } 754 755 return false; /* not preceded by I */ 756 } 757 758 /* Is followed by one or more cc==230 ? */ 759 private final boolean isFollowedByMoreAbove(ContextIterator iter) { 760 int c; 761 int dotType; 762 763 if(iter==null) { 764 return false; 765 } 766 767 for(iter.reset(1); (c=iter.next())>=0;) { 768 dotType=getDotType(c); 769 if(dotType==ABOVE) { 770 return true; /* at least one cc==230 following */ 771 } else if(dotType!=OTHER_ACCENT) { 772 return false; /* next base character, no more cc==230 following */ 773 } 774 } 775 776 return false; /* no more cc==230 following */ 777 } 778 779 /* Is followed by a dot above (without cc==230 in between) ? 
*/ 780 private final boolean isFollowedByDotAbove(ContextIterator iter) { 781 int c; 782 int dotType; 783 784 if(iter==null) { 785 return false; 786 } 787 788 for(iter.reset(1); (c=iter.next())>=0; ) { 789 if(c==0x307) { 790 return true; 791 } 792 dotType=getDotType(c); 793 if(dotType!=OTHER_ACCENT) { 794 return false; /* next base character or cc==230 in between */ 795 } 796 } 797 798 return false; /* no dot above following */ 799 } 800 801 private static final String 802 iDot= "i\u0307", 803 jDot= "j\u0307", 804 iOgonekDot= "\u012f\u0307", 805 iDotGrave= "i\u0307\u0300", 806 iDotAcute= "i\u0307\u0301", 807 iDotTilde= "i\u0307\u0303"; 808 809 /** 810 * Get the full lowercase mapping for c. 811 * 812 * @param c Character to be mapped. 813 * @param iter Character iterator, used for context-sensitive mappings. 814 * See ContextIterator for details. 815 * If iter==null then a context-independent result is returned. 816 * @param out If the mapping result is a string, then it is appended to out. 817 * @param caseLocale Case locale value from ucase_getCaseLocale(). 818 * @return Output code point or string length, see MAX_STRING_LENGTH. 
819 * 820 * @see ContextIterator 821 * @see #MAX_STRING_LENGTH 822 * @hide draft / provisional / internal are hidden on Android 823 */ 824 public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) { 825 int result, props; 826 827 result=c; 828 props=trie.get(c); 829 if(!propsHasException(props)) { 830 if(getTypeFromProps(props)>=UPPER) { 831 result=c+getDelta(props); 832 } 833 } else { 834 int excOffset=getExceptionsOffset(props), excOffset2; 835 int excWord=exceptions.charAt(excOffset++); 836 int full; 837 838 excOffset2=excOffset; 839 840 if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { 841 /* use hardcoded conditions and mappings */ 842 /* 843 * Test for conditional mappings first 844 * (otherwise the unconditional default mappings are always taken), 845 * then test for characters that have unconditional mappings in SpecialCasing.txt, 846 * then get the UnicodeData.txt mappings. 847 */ 848 if( caseLocale==LOC_LITHUANIAN && 849 /* base characters, find accents above */ 850 (((c==0x49 || c==0x4a || c==0x12e) && 851 isFollowedByMoreAbove(iter)) || 852 /* precomposed with accent above, no need to find one */ 853 (c==0xcc || c==0xcd || c==0x128)) 854 ) { 855 /* 856 # Lithuanian 857 858 # Lithuanian retains the dot in a lowercase i when followed by accents. 859 860 # Introduce an explicit dot above when lowercasing capital I's and J's 861 # whenever there are more accents above. 
862 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 863 864 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 865 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 866 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 867 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 868 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 869 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 870 */ 871 try { 872 switch(c) { 873 case 0x49: /* LATIN CAPITAL LETTER I */ 874 out.append(iDot); 875 return 2; 876 case 0x4a: /* LATIN CAPITAL LETTER J */ 877 out.append(jDot); 878 return 2; 879 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ 880 out.append(iOgonekDot); 881 return 2; 882 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ 883 out.append(iDotGrave); 884 return 3; 885 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ 886 out.append(iDotAcute); 887 return 3; 888 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ 889 out.append(iDotTilde); 890 return 3; 891 default: 892 return 0; /* will not occur */ 893 } 894 } catch (IOException e) { 895 throw new ICUUncheckedIOException(e); 896 } 897 /* # Turkish and Azeri */ 898 } else if(caseLocale==LOC_TURKISH && c==0x130) { 899 /* 900 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 901 # The following rules handle those cases. 902 903 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 904 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE 905 */ 906 return 0x69; 907 } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) { 908 /* 909 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
910 # This matches the behavior of the canonically equivalent I-dot_above 911 912 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 913 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 914 */ 915 return 0; /* remove the dot (continue without output) */ 916 } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) { 917 /* 918 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 919 920 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 921 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 922 */ 923 return 0x131; 924 } else if(c==0x130) { 925 /* 926 # Preserve canonical equivalence for I with dot. Turkic is handled below. 927 928 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 929 */ 930 try { 931 out.append(iDot); 932 return 2; 933 } catch (IOException e) { 934 throw new ICUUncheckedIOException(e); 935 } 936 } else if( c==0x3a3 && 937 !isFollowedByCasedLetter(iter, 1) && 938 isFollowedByCasedLetter(iter, -1) /* -1=preceded */ 939 ) { 940 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ 941 /* 942 # Special case for final form of sigma 943 944 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 945 */ 946 return 0x3c2; /* greek small final sigma */ 947 } else { 948 /* no known conditional special case mapping, use a normal mapping */ 949 } 950 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 951 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 952 full=(int)value&FULL_LOWER; 953 if(full!=0) { 954 /* start of full case mapping strings */ 955 excOffset=(int)(value>>32)+1; 956 957 try { 958 // append the lowercase mapping 959 out.append(exceptions, excOffset, excOffset+full); 960 961 /* return the string length */ 962 return full; 963 } catch (IOException e) { 964 throw new ICUUncheckedIOException(e); 965 } 966 } 967 } 968 969 if(hasSlot(excWord, EXC_LOWER)) { 970 
result=getSlotValue(excWord, EXC_LOWER, excOffset2);
            }
        }

        return (result==c) ? ~result : result;
    }

    /**
     * Shared implementation of full uppercasing and full titlecasing.
     *
     * @param c the code point to map
     * @param iter context iterator; only consulted for the Lithuanian
     *        After_Soft_Dotted condition
     * @param out receives the result string when the mapping is a string
     * @param loc one of the LOC_ case-locale constants
     * @param upperNotTitle true for uppercasing, false for titlecasing
     * @return ~c if c maps to itself; otherwise either the single mapped code point,
     *         or the length of the string that was appended to out;
     *         0 if c is removed (nothing is appended)
     * @throws ICUUncheckedIOException if appending to out fails with an IOException
     */
    private final int toUpperOrTitle(int c, ContextIterator iter,
                                     Appendable out,
                                     int loc,
                                     boolean upperNotTitle) {
        final int props=trie.get(c);

        /* simple case: the properties word carries a delta directly */
        if(!propsHasException(props)) {
            final int mapped=(getTypeFromProps(props)==LOWER) ? c+getDelta(props) : c;
            return (mapped==c) ? ~mapped : mapped;
        }

        int stringsOffset=getExceptionsOffset(props);
        final int excWord=exceptions.charAt(stringsOffset++);
        /* remember where the optional slots start; stringsOffset may move on below */
        final int slotsOffset=stringsOffset;

        if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
            /* use hardcoded conditions and mappings */
            if(loc==LOC_TURKISH && c==0x69) {
/*
# Turkish and Azeri

# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.

# When uppercasing, i turns into a dotted capital I

0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
*/
                return 0x130;
            }
            if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
/*
# Lithuanian

# Lithuanian retains the dot in a lowercase i when followed by accents.

# Remove DOT ABOVE after "i" with upper or titlecase

0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
*/
                return 0; /* remove the dot (continue without output) */
            }
            /* no known conditional special case mapping, use a normal mapping */
        } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
            final long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, stringsOffset);
            /* four 4-bit string lengths: lower, fold, upper, title (low to high) */
            int lengths=(int)value&0xffff;

            /* start of full case mapping strings */
            stringsOffset=(int)(value>>32)+1;

            /* skip the lowercase and case-folding result strings */
            stringsOffset+=lengths&FULL_LOWER;
            lengths>>=4;
            stringsOffset+=lengths&0xf;
            lengths>>=4;

            final int length;
            if(upperNotTitle) {
                length=lengths&0xf;
            } else {
                /* skip the uppercase result string */
                stringsOffset+=lengths&0xf;
                length=(lengths>>4)&0xf;
            }

            if(length!=0) {
                try {
                    // append the result string and report its length
                    out.append(exceptions, stringsOffset, stringsOffset+length);
                    return length;
                } catch (IOException e) {
                    throw new ICUUncheckedIOException(e);
                }
            }
        }

        /* fall back to the simple mapping stored in an optional slot */
        final int index;
        if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
            index=EXC_TITLE;
        } else if(hasSlot(excWord, EXC_UPPER)) {
            /* here, titlecase is same as uppercase */
            index=EXC_UPPER;
        } else {
            return ~c;
        }
        final int result=getSlotValue(excWord, index, slotsOffset);
        return (result==c) ? ~result : result;
    }

    /**
     * Full uppercase mapping of c; see toUpperOrTitle() for the return value convention.
     */
    public final int toFullUpper(int c, ContextIterator iter,
                                 Appendable out,
                                 int caseLocale) {
        final boolean upperNotTitle=true;
        return toUpperOrTitle(c, iter, out, caseLocale, upperNotTitle);
    }

    /**
     * Full titlecase mapping of c; see toUpperOrTitle() for the return value convention.
     */
    public final int toFullTitle(int c, ContextIterator iter,
                                 Appendable out,
                                 int caseLocale) {
        final boolean upperNotTitle=false;
        return toUpperOrTitle(c, iter, out, caseLocale, upperNotTitle);
    }

    /* case folding ------------------------------------------------------------- */

    /*
     * Case folding is similar to lowercasing.
     * The result may be a simple mapping, i.e., a single code point, or
     * a full mapping, i.e., a string.
     * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
     * then only the lowercase mapping is stored.
     *
     * Some special cases are hardcoded because their conditions cannot be
     * parsed and processed from CaseFolding.txt.
     *
     * Unicode 3.2 CaseFolding.txt specifies for its status field:

    # C: common case folding, common mappings shared by both simple and full mappings.
    # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
    # S: simple case folding, mappings to single characters where different from F.
    # T: special case for uppercase I and dotted uppercase I
    #    - For non-Turkic languages, this mapping is normally not used.
    #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
    #
    # Usage:
    #  A. To do a simple case folding, use the mappings with status C + S.
    #  B. To do a full case folding, use the mappings with status C + F.
    #
    #    The mappings with status T can be used or omitted depending on the desired case-folding
    #    behavior. (The default option is to exclude them.)

     * Unicode 3.2 has 'T' mappings as follows:

    0049; T; 0131; # LATIN CAPITAL LETTER I
    0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

     * while the default mappings for these code points are:

    0049; C; 0069; # LATIN CAPITAL LETTER I
    0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

     * U+0130 has no simple case folding (simple-case-folds to itself).
     */

    /**
     * Bit mask for getting just the options from a string compare options word
     * that are relevant for case folding (of a single string or code point).
     * @hide draft / provisional / internal are hidden on Android
     */
    private static final int FOLD_CASE_OPTIONS_MASK = 0xff;

    /**
     * Returns the simple (single code point) case folding of c.
     *
     * @param c the code point to fold
     * @param options folding options; only the bits in FOLD_CASE_OPTIONS_MASK are examined,
     *        and anything other than UCharacter.FOLD_CASE_DEFAULT selects the Turkic mappings
     * @return the folded code point, or c itself if there is no simple folding
     */
    public final int fold(int c, int options) {
        final int props=trie.get(c);
        if(!propsHasException(props)) {
            return (getTypeFromProps(props)>=UPPER) ? c+getDelta(props) : c;
        }

        int slotsOffset=getExceptionsOffset(props);
        final int excWord=exceptions.charAt(slotsOffset++);

        if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
            /* special case folding mappings, hardcoded */
            final boolean defaultFolding=
                (options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT;
            switch(c) {
            case 0x49:
                /* default: 0049; C; 0069;   Turkic: 0049; T; 0131;   # LATIN CAPITAL LETTER I */
                return defaultFolding ? 0x69 : 0x131;
            case 0x130:
                /* default: no simple case folding for U+0130;   Turkic: 0130; T; 0069; */
                return defaultFolding ? c : 0x69;
            default:
                break;
            }
        }

        if(hasSlot(excWord, EXC_FOLD)) {
            return getSlotValue(excWord, EXC_FOLD, slotsOffset);
        }
        if(hasSlot(excWord, EXC_LOWER)) {
            return getSlotValue(excWord, EXC_LOWER, slotsOffset);
        }
        return c;
    }
1180 /* 1181 * Issue for canonical caseless match (UAX #21): 1182 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve 1183 * canonical equivalence, unlike default-option casefolding. 1184 * For example, I-grave and I + grave fold to strings that are not canonically 1185 * equivalent. 1186 * For more details, see the comment in unorm_compare() in unorm.cpp 1187 * and the intermediate prototype changes for Jitterbug 2021. 1188 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) 1189 * 1190 * This did not get fixed because it appears that it is not possible to fix 1191 * it for uppercase and lowercase characters (I-grave vs. i-grave) 1192 * together in a way that they still fold to common result strings. 1193 */ 1194 1195 public final int toFullFolding(int c, Appendable out, int options) { 1196 int result; 1197 int props; 1198 1199 result=c; 1200 props=trie.get(c); 1201 if(!propsHasException(props)) { 1202 if(getTypeFromProps(props)>=UPPER) { 1203 result=c+getDelta(props); 1204 } 1205 } else { 1206 int excOffset=getExceptionsOffset(props), excOffset2; 1207 int excWord=exceptions.charAt(excOffset++); 1208 int full, index; 1209 1210 excOffset2=excOffset; 1211 1212 if((excWord&EXC_CONDITIONAL_FOLD)!=0) { 1213 /* use hardcoded conditions and mappings */ 1214 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) { 1215 /* default mappings */ 1216 if(c==0x49) { 1217 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1218 return 0x69; 1219 } else if(c==0x130) { 1220 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1221 try { 1222 out.append(iDot); 1223 return 2; 1224 } catch (IOException e) { 1225 throw new ICUUncheckedIOException(e); 1226 } 1227 } 1228 } else { 1229 /* Turkic mappings */ 1230 if(c==0x49) { 1231 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1232 return 0x131; 1233 } else if(c==0x130) { 1234 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1235 return 0x69; 1236 } 1237 } 1238 } 
else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 1239 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 1240 full=(int)value&0xffff; 1241 1242 /* start of full case mapping strings */ 1243 excOffset=(int)(value>>32)+1; 1244 1245 /* skip the lowercase result string */ 1246 excOffset+=full&FULL_LOWER; 1247 full=(full>>4)&0xf; 1248 1249 if(full!=0) { 1250 try { 1251 // append the result string 1252 out.append(exceptions, excOffset, excOffset+full); 1253 1254 /* return the string length */ 1255 return full; 1256 } catch (IOException e) { 1257 throw new ICUUncheckedIOException(e); 1258 } 1259 } 1260 } 1261 1262 if(hasSlot(excWord, EXC_FOLD)) { 1263 index=EXC_FOLD; 1264 } else if(hasSlot(excWord, EXC_LOWER)) { 1265 index=EXC_LOWER; 1266 } else { 1267 return ~c; 1268 } 1269 result=getSlotValue(excWord, index, excOffset2); 1270 } 1271 1272 return (result==c) ? ~result : result; 1273 } 1274 1275 /* case mapping properties API ---------------------------------------------- */ 1276 1277 /* 1278 * We need a StringBuilder for multi-code point output from the 1279 * full case mapping functions. However, we do not actually use that output, 1280 * we just check whether the input character was mapped to anything else. 1281 * We use a shared StringBuilder to avoid allocating a new one in each call. 1282 * We remove its contents each time so that it does not grow large over time. 
1283 * 1284 * @internal 1285 */ 1286 public static final StringBuilder dummyStringBuilder = new StringBuilder(); 1287 1288 public final boolean hasBinaryProperty(int c, int which) { 1289 switch(which) { 1290 case UProperty.LOWERCASE: 1291 return LOWER==getType(c); 1292 case UProperty.UPPERCASE: 1293 return UPPER==getType(c); 1294 case UProperty.SOFT_DOTTED: 1295 return isSoftDotted(c); 1296 case UProperty.CASE_SENSITIVE: 1297 return isCaseSensitive(c); 1298 case UProperty.CASED: 1299 return NONE!=getType(c); 1300 case UProperty.CASE_IGNORABLE: 1301 return (getTypeOrIgnorable(c)>>2)!=0; 1302 /* 1303 * Note: The following Changes_When_Xyz are defined as testing whether 1304 * the NFD form of the input changes when Xyz-case-mapped. 1305 * However, this simpler implementation of these properties, 1306 * ignoring NFD, passes the tests. 1307 * The implementation needs to be changed if the tests start failing. 1308 * When that happens, optimizations should be used to work with the 1309 * per-single-code point ucase_toFullXyz() functions unless 1310 * the NFD form has more than one code point, 1311 * and the property starts set needs to be the union of the 1312 * start sets for normalization and case mappings. 
1313 */ 1314 case UProperty.CHANGES_WHEN_LOWERCASED: 1315 dummyStringBuilder.setLength(0); 1316 return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0; 1317 case UProperty.CHANGES_WHEN_UPPERCASED: 1318 dummyStringBuilder.setLength(0); 1319 return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0; 1320 case UProperty.CHANGES_WHEN_TITLECASED: 1321 dummyStringBuilder.setLength(0); 1322 return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0; 1323 /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */ 1324 case UProperty.CHANGES_WHEN_CASEMAPPED: 1325 dummyStringBuilder.setLength(0); 1326 return 1327 toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 || 1328 toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 || 1329 toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0; 1330 default: 1331 return false; 1332 } 1333 } 1334 1335 // data members -------------------------------------------------------- *** 1336 private int indexes[]; 1337 private String exceptions; 1338 private char unfold[]; 1339 1340 private Trie2_16 trie; 1341 1342 // data format constants ----------------------------------------------- *** 1343 private static final String DATA_NAME="ucase"; 1344 private static final String DATA_TYPE="icu"; 1345 private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE; 1346 1347 /* format "cAsE" */ 1348 private static final int FMT=0x63415345; 1349 1350 /* indexes into indexes[] */ 1351 //private static final int IX_INDEX_TOP=0; 1352 //private static final int IX_LENGTH=1; 1353 private static final int IX_TRIE_SIZE=2; 1354 private static final int IX_EXC_LENGTH=3; 1355 private static final int IX_UNFOLD_LENGTH=4; 1356 1357 //private static final int IX_MAX_FULL_LENGTH=15; 1358 private static final int IX_TOP=16; 1359 1360 // definitions for 16-bit case properties word ------------------------- *** 1361 1362 /* 2-bit constants for types of cased characters */ 1363 public static final int TYPE_MASK=3; 1364 public static 
final int NONE=0; 1365 public static final int LOWER=1; 1366 public static final int UPPER=2; 1367 public static final int TITLE=3; 1368 1369 /** @return NONE, LOWER, UPPER, TITLE */ 1370 private static final int getTypeFromProps(int props) { 1371 return props&TYPE_MASK; 1372 } 1373 1374 /** @return like getTypeFromProps() but also sets IGNORABLE if props indicate case-ignorable */ 1375 private static final int getTypeAndIgnorableFromProps(int props) { 1376 return props&7; 1377 } 1378 1379 static final int IGNORABLE=4; 1380 private static final int SENSITIVE= 8; 1381 private static final int EXCEPTION= 0x10; 1382 1383 private static final int DOT_MASK= 0x60; 1384 //private static final int NO_DOT= 0; /* normal characters with cc=0 */ 1385 private static final int SOFT_DOTTED= 0x20; /* soft-dotted characters with cc=0 */ 1386 private static final int ABOVE= 0x40; /* "above" accents with cc=230 */ 1387 private static final int OTHER_ACCENT= 0x60; /* other accent character (0<cc!=230) */ 1388 1389 /* no exception: bits 15..7 are a 9-bit signed case mapping delta */ 1390 private static final int DELTA_SHIFT= 7; 1391 //private static final int DELTA_MASK= 0xff80; 1392 //private static final int MAX_DELTA= 0xff; 1393 //private static final int MIN_DELTA= (-MAX_DELTA-1); 1394 1395 private static final int getDelta(int props) { 1396 return (short)props>>DELTA_SHIFT; 1397 } 1398 1399 /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ 1400 private static final int EXC_SHIFT= 5; 1401 //private static final int EXC_MASK= 0xffe0; 1402 //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1); 1403 1404 /* definitions for 16-bit main exceptions word ------------------------------ */ 1405 1406 /* first 8 bits indicate values in optional slots */ 1407 private static final int EXC_LOWER=0; 1408 private static final int EXC_FOLD=1; 1409 private static final int EXC_UPPER=2; 1410 private static final int EXC_TITLE=3; 1411 //private static 
final int EXC_4=4; /* reserved */ 1412 //private static final int EXC_5=5; /* reserved */ 1413 private static final int EXC_CLOSURE=6; 1414 private static final int EXC_FULL_MAPPINGS=7; 1415 //private static final int EXC_ALL_SLOTS=8; /* one past the last slot */ 1416 1417 /* each slot is 2 uint16_t instead of 1 */ 1418 private static final int EXC_DOUBLE_SLOTS= 0x100; 1419 1420 /* reserved: exception bits 11..9 */ 1421 1422 /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */ 1423 private static final int EXC_DOT_SHIFT=7; 1424 1425 /* normally stored in the main word, but pushed out for larger exception indexes */ 1426 //private static final int EXC_DOT_MASK= 0x3000; 1427 //private static final int EXC_NO_DOT= 0; 1428 //private static final int EXC_SOFT_DOTTED= 0x1000; 1429 //private static final int EXC_ABOVE= 0x2000; /* "above" accents with cc=230 */ 1430 //private static final int EXC_OTHER_ACCENT= 0x3000; /* other character (0<cc!=230) */ 1431 1432 /* complex/conditional mappings */ 1433 private static final int EXC_CONDITIONAL_SPECIAL= 0x4000; 1434 private static final int EXC_CONDITIONAL_FOLD= 0x8000; 1435 1436 /* definitions for lengths word for full case mappings */ 1437 private static final int FULL_LOWER= 0xf; 1438 //private static final int FULL_FOLDING= 0xf0; 1439 //private static final int FULL_UPPER= 0xf00; 1440 //private static final int FULL_TITLE= 0xf000; 1441 1442 /* maximum lengths */ 1443 //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf; 1444 private static final int CLOSURE_MAX_LENGTH=0xf; 1445 1446 /* constants for reverse case folding ("unfold") data */ 1447 private static final int UNFOLD_ROWS=0; 1448 private static final int UNFOLD_ROW_WIDTH=1; 1449 private static final int UNFOLD_STRING_WIDTH=2; 1450 1451 /* 1452 * public singleton instance 1453 */ 1454 public static final UCaseProps INSTANCE; 1455 1456 // This static initializer block must be placed after 1457 // other static member initialization 1458 static { 1459 try { 1460 INSTANCE = 
new UCaseProps(); 1461 } catch (IOException e) { 1462 throw new ICUUncheckedIOException(e); 1463 } 1464 } 1465 } 1466