1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: UCaseProps.java 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2005jan29 16 * created by: Markus W. Scherer 17 * 18 * Low-level Unicode character/string case mapping code. 19 * Java port of ucase.h/.c. 20 */ 21 22 package com.ibm.icu.impl; 23 24 import java.io.IOException; 25 import java.nio.ByteBuffer; 26 import java.util.Iterator; 27 import java.util.Locale; 28 29 import com.ibm.icu.lang.UCharacter; 30 import com.ibm.icu.lang.UProperty; 31 import com.ibm.icu.text.UTF16; 32 import com.ibm.icu.text.UnicodeSet; 33 import com.ibm.icu.util.ICUUncheckedIOException; 34 import com.ibm.icu.util.ULocale; 35 36 public final class UCaseProps { 37 38 // constructors etc. 
--------------------------------------------------- *** 39 40 // port of ucase_openProps() 41 private UCaseProps() throws IOException { 42 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME); 43 readData(bytes); 44 } 45 46 private final void readData(ByteBuffer bytes) throws IOException { 47 // read the header 48 ICUBinary.readHeader(bytes, FMT, new IsAcceptable()); 49 50 // read indexes[] 51 int count=bytes.getInt(); 52 if(count<IX_TOP) { 53 throw new IOException("indexes[0] too small in "+DATA_FILE_NAME); 54 } 55 indexes=new int[count]; 56 57 indexes[0]=count; 58 for(int i=1; i<count; ++i) { 59 indexes[i]=bytes.getInt(); 60 } 61 62 // read the trie 63 trie=Trie2_16.createFromSerialized(bytes); 64 int expectedTrieLength=indexes[IX_TRIE_SIZE]; 65 int trieLength=trie.getSerializedLength(); 66 if(trieLength>expectedTrieLength) { 67 throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie"); 68 } 69 // skip padding after trie bytes 70 ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength); 71 72 // read exceptions[] 73 count=indexes[IX_EXC_LENGTH]; 74 if(count>0) { 75 exceptions=ICUBinary.getString(bytes, count, 0); 76 } 77 78 // read unfold[] 79 count=indexes[IX_UNFOLD_LENGTH]; 80 if(count>0) { 81 unfold=ICUBinary.getChars(bytes, count, 0); 82 } 83 } 84 85 // implement ICUBinary.Authenticate 86 private final static class IsAcceptable implements ICUBinary.Authenticate { 87 @Override 88 public boolean isDataVersionAcceptable(byte version[]) { 89 return version[0]==3; 90 } 91 } 92 93 // set of property starts for UnicodeSet ------------------------------- *** 94 95 public final void addPropertyStarts(UnicodeSet set) { 96 /* add the start code point of each same-value range of the trie */ 97 Iterator<Trie2.Range> trieIterator=trie.iterator(); 98 Trie2.Range range; 99 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 100 set.add(range.startCodePoint); 101 } 102 103 /* add code points with hardcoded properties, plus the 
ones following them */ 104 105 /* (none right now, see comment below) */ 106 107 /* 108 * Omit code points with hardcoded specialcasing properties 109 * because we do not build property UnicodeSets for them right now. 110 */ 111 } 112 113 // data access primitives ---------------------------------------------- *** 114 private static final int getExceptionsOffset(int props) { 115 return props>>EXC_SHIFT; 116 } 117 118 private static final boolean propsHasException(int props) { 119 return (props&EXCEPTION)!=0; 120 } 121 122 /* number of bits in an 8-bit integer value */ 123 private static final byte flagsOffset[/*256*/]={ 124 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 126 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 127 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 128 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 129 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 130 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 131 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 132 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 133 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 134 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 135 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 136 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 137 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 138 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 139 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 140 }; 141 142 private static final boolean hasSlot(int flags, int index) { 143 return (flags&(1<<index))!=0; 144 } 145 private static final byte slotOffset(int flags, int index) { 146 return flagsOffset[flags&((1<<index)-1)]; 147 } 148 149 /* 150 * Get the value of an optional-value slot where hasSlot(excWord, index). 
151 * 152 * @param excWord (in) initial exceptions word 153 * @param index (in) desired slot index 154 * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++); 155 * @return bits 31..0: slot value 156 * 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot 157 */ 158 private final long getSlotValueAndOffset(int excWord, int index, int excOffset) { 159 long value; 160 if((excWord&EXC_DOUBLE_SLOTS)==0) { 161 excOffset+=slotOffset(excWord, index); 162 value=exceptions.charAt(excOffset); 163 } else { 164 excOffset+=2*slotOffset(excWord, index); 165 value=exceptions.charAt(excOffset++); 166 value=(value<<16)|exceptions.charAt(excOffset); 167 } 168 return value |((long)excOffset<<32); 169 } 170 171 /* same as getSlotValueAndOffset() but does not return the slot offset */ 172 private final int getSlotValue(int excWord, int index, int excOffset) { 173 int value; 174 if((excWord&EXC_DOUBLE_SLOTS)==0) { 175 excOffset+=slotOffset(excWord, index); 176 value=exceptions.charAt(excOffset); 177 } else { 178 excOffset+=2*slotOffset(excWord, index); 179 value=exceptions.charAt(excOffset++); 180 value=(value<<16)|exceptions.charAt(excOffset); 181 } 182 return value; 183 } 184 185 // simple case mappings ------------------------------------------------ *** 186 187 public final int tolower(int c) { 188 int props=trie.get(c); 189 if(!propsHasException(props)) { 190 if(getTypeFromProps(props)>=UPPER) { 191 c+=getDelta(props); 192 } 193 } else { 194 int excOffset=getExceptionsOffset(props); 195 int excWord=exceptions.charAt(excOffset++); 196 if(hasSlot(excWord, EXC_LOWER)) { 197 c=getSlotValue(excWord, EXC_LOWER, excOffset); 198 } 199 } 200 return c; 201 } 202 203 public final int toupper(int c) { 204 int props=trie.get(c); 205 if(!propsHasException(props)) { 206 if(getTypeFromProps(props)==LOWER) { 207 c+=getDelta(props); 208 } 209 } else { 210 int excOffset=getExceptionsOffset(props); 211 int 
excWord=exceptions.charAt(excOffset++); 212 if(hasSlot(excWord, EXC_UPPER)) { 213 c=getSlotValue(excWord, EXC_UPPER, excOffset); 214 } 215 } 216 return c; 217 } 218 219 public final int totitle(int c) { 220 int props=trie.get(c); 221 if(!propsHasException(props)) { 222 if(getTypeFromProps(props)==LOWER) { 223 c+=getDelta(props); 224 } 225 } else { 226 int excOffset=getExceptionsOffset(props); 227 int excWord=exceptions.charAt(excOffset++); 228 int index; 229 if(hasSlot(excWord, EXC_TITLE)) { 230 index=EXC_TITLE; 231 } else if(hasSlot(excWord, EXC_UPPER)) { 232 index=EXC_UPPER; 233 } else { 234 return c; 235 } 236 c=getSlotValue(excWord, index, excOffset); 237 } 238 return c; 239 } 240 241 /** 242 * Adds all simple case mappings and the full case folding for c to sa, 243 * and also adds special case closure mappings. 244 * c itself is not added. 245 * For example, the mappings 246 * - for s include long s 247 * - for sharp s include ss 248 * - for k include the Kelvin sign 249 */ 250 public final void addCaseClosure(int c, UnicodeSet set) { 251 /* 252 * Hardcode the case closure of i and its relatives and ignore the 253 * data file data for these characters. 254 * The Turkic dotless i and dotted I with their case mapping conditions 255 * and case folding option make the related characters behave specially. 256 * This code matches their closure behavior to their case folding behavior. 
257 */ 258 259 switch(c) { 260 case 0x49: 261 /* regular i and I are in one equivalence class */ 262 set.add(0x69); 263 return; 264 case 0x69: 265 set.add(0x49); 266 return; 267 case 0x130: 268 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ 269 set.add(iDot); 270 return; 271 case 0x131: 272 /* dotless i is in a class by itself */ 273 return; 274 default: 275 /* otherwise use the data file data */ 276 break; 277 } 278 279 int props=trie.get(c); 280 if(!propsHasException(props)) { 281 if(getTypeFromProps(props)!=NONE) { 282 /* add the one simple case mapping, no matter what type it is */ 283 int delta=getDelta(props); 284 if(delta!=0) { 285 set.add(c+delta); 286 } 287 } 288 } else { 289 /* 290 * c has exceptions, so there may be multiple simple and/or 291 * full case mappings. Add them all. 292 */ 293 int excOffset0, excOffset=getExceptionsOffset(props); 294 int closureOffset; 295 int excWord=exceptions.charAt(excOffset++); 296 int index, closureLength, fullLength, length; 297 298 excOffset0=excOffset; 299 300 /* add all simple case mappings */ 301 for(index=EXC_LOWER; index<=EXC_TITLE; ++index) { 302 if(hasSlot(excWord, index)) { 303 excOffset=excOffset0; 304 c=getSlotValue(excWord, index, excOffset); 305 set.add(c); 306 } 307 } 308 309 /* get the closure string pointer & length */ 310 if(hasSlot(excWord, EXC_CLOSURE)) { 311 excOffset=excOffset0; 312 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset); 313 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */ 314 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */ 315 } else { 316 closureLength=0; 317 closureOffset=0; 318 } 319 320 /* add the full case folding */ 321 if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 322 excOffset=excOffset0; 323 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 324 fullLength=(int)value; 325 326 /* start of full case mapping strings */ 327 
excOffset=(int)(value>>32)+1; 328 329 fullLength&=0xffff; /* bits 16 and higher are reserved */ 330 331 /* skip the lowercase result string */ 332 excOffset+=fullLength&FULL_LOWER; 333 fullLength>>=4; 334 335 /* add the full case folding string */ 336 length=fullLength&0xf; 337 if(length!=0) { 338 set.add(exceptions.substring(excOffset, excOffset+length)); 339 excOffset+=length; 340 } 341 342 /* skip the uppercase and titlecase strings */ 343 fullLength>>=4; 344 excOffset+=fullLength&0xf; 345 fullLength>>=4; 346 excOffset+=fullLength; 347 348 closureOffset=excOffset; /* behind full case mappings */ 349 } 350 351 /* add each code point in the closure string */ 352 int limit=closureOffset+closureLength; 353 for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) { 354 c=exceptions.codePointAt(index); 355 set.add(c); 356 } 357 } 358 } 359 360 /* 361 * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated 362 * must be s.length()>0 and max>0 and s.length()<=max 363 */ 364 private final int strcmpMax(String s, int unfoldOffset, int max) { 365 int i1, length, c1, c2; 366 367 length=s.length(); 368 max-=length; /* we require length<=max, so no need to decrement max in the loop */ 369 i1=0; 370 do { 371 c1=s.charAt(i1++); 372 c2=unfold[unfoldOffset++]; 373 if(c2==0) { 374 return 1; /* reached the end of t but not of s */ 375 } 376 c1-=c2; 377 if(c1!=0) { 378 return c1; /* return difference result */ 379 } 380 } while(--length>0); 381 /* ends with length==0 */ 382 383 if(max==0 || unfold[unfoldOffset]==0) { 384 return 0; /* equal to length of both strings */ 385 } else { 386 return -max; /* return lengh difference */ 387 } 388 } 389 390 /** 391 * Maps the string to single code points and adds the associated case closure 392 * mappings. 393 * The string is mapped to code points if it is their full case folding string. 
394 * In other words, this performs a reverse full case folding and then 395 * adds the case closure items of the resulting code points. 396 * If the string is found and its closure applied, then 397 * the string itself is added as well as part of its code points' closure. 398 * 399 * @return true if the string was found 400 */ 401 public final boolean addStringCaseClosure(String s, UnicodeSet set) { 402 int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth; 403 404 if(unfold==null || s==null) { 405 return false; /* no reverse case folding data, or no string */ 406 } 407 length=s.length(); 408 if(length<=1) { 409 /* the string is too short to find any match */ 410 /* 411 * more precise would be: 412 * if(!u_strHasMoreChar32Than(s, length, 1)) 413 * but this does not make much practical difference because 414 * a single supplementary code point would just not be found 415 */ 416 return false; 417 } 418 419 unfoldRows=unfold[UNFOLD_ROWS]; 420 unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH]; 421 unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH]; 422 //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth; 423 424 if(length>unfoldStringWidth) { 425 /* the string is too long to find any match */ 426 return false; 427 } 428 429 /* do a binary search for the string */ 430 start=0; 431 limit=unfoldRows; 432 while(start<limit) { 433 i=(start+limit)/2; 434 unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above 435 result=strcmpMax(s, unfoldOffset, unfoldStringWidth); 436 437 if(result==0) { 438 /* found the string: add each code point, and its case closure */ 439 int c; 440 441 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) { 442 c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i); 443 set.add(c); 444 addCaseClosure(c, set); 445 } 446 return true; 447 } else if(result<0) { 448 limit=i; 449 } else /* result>0 */ { 450 start=i+1; 451 } 452 } 453 454 return false; /* string not 
found */ 455 } 456 457 /** @return NONE, LOWER, UPPER, TITLE */ 458 public final int getType(int c) { 459 return getTypeFromProps(trie.get(c)); 460 } 461 462 /** @return like getType() but also sets IGNORABLE if c is case-ignorable */ 463 public final int getTypeOrIgnorable(int c) { 464 return getTypeAndIgnorableFromProps(trie.get(c)); 465 } 466 467 /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */ 468 public final int getDotType(int c) { 469 int props=trie.get(c); 470 if(!propsHasException(props)) { 471 return props&DOT_MASK; 472 } else { 473 return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK; 474 } 475 } 476 477 public final boolean isSoftDotted(int c) { 478 return getDotType(c)==SOFT_DOTTED; 479 } 480 481 public final boolean isCaseSensitive(int c) { 482 return (trie.get(c)&SENSITIVE)!=0; 483 } 484 485 // string casing ------------------------------------------------------- *** 486 487 /* 488 * These internal functions form the core of string case mappings. 489 * They map single code points to result code points or strings and take 490 * all necessary conditions (context, locale ID, options) into account. 491 * 492 * They do not iterate over the source or write to the destination 493 * so that the same functions are useful for non-standard string storage, 494 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 495 * For the same reason, the "surrounding text" context is passed in as a 496 * ContextIterator which does not make any assumptions about 497 * the underlying storage. 498 * 499 * This section contains helper functions that check for conditions 500 * in the input text surrounding the current code point 501 * according to SpecialCasing.txt. 
502 * 503 * Each helper function gets the index 504 * - after the current code point if it looks at following text 505 * - before the current code point if it looks at preceding text 506 * 507 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: 508 * 509 * Final_Sigma 510 * C is preceded by a sequence consisting of 511 * a cased letter and a case-ignorable sequence, 512 * and C is not followed by a sequence consisting of 513 * an ignorable sequence and then a cased letter. 514 * 515 * More_Above 516 * C is followed by one or more characters of combining class 230 (ABOVE) 517 * in the combining character sequence. 518 * 519 * After_Soft_Dotted 520 * The last preceding character with combining class of zero before C 521 * was Soft_Dotted, 522 * and there is no intervening combining character class 230 (ABOVE). 523 * 524 * Before_Dot 525 * C is followed by combining dot above (U+0307). 526 * Any sequence of characters with a combining class that is neither 0 nor 230 527 * may intervene between the current character and the combining dot above. 528 * 529 * The erratum from 2002-10-31 adds the condition 530 * 531 * After_I 532 * The last preceding base character was an uppercase I, and there is no 533 * intervening combining character class 230 (ABOVE). 534 * 535 * (See Jitterbug 2344 and the comments on After_I below.) 536 * 537 * Helper definitions in Unicode 3.2 UAX 21: 538 * 539 * D1. A character C is defined to be cased 540 * if it meets any of the following criteria: 541 * 542 * - The general category of C is Titlecase Letter (Lt) 543 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase 544 * - Given D = NFD(C), then it is not the case that: 545 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) 546 * (This third criterium does not add any characters to the list 547 * for Unicode 3.2. Ignored.) 548 * 549 * D2. 
A character C is defined to be case-ignorable 550 * if it meets either of the following criteria: 551 * 552 * - The general category of C is 553 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or 554 * Letter Modifier (Lm), or Symbol Modifier (Sk) 555 * - C is one of the following characters 556 * U+0027 APOSTROPHE 557 * U+00AD SOFT HYPHEN (SHY) 558 * U+2019 RIGHT SINGLE QUOTATION MARK 559 * (the preferred character for apostrophe) 560 * 561 * D3. A case-ignorable sequence is a sequence of 562 * zero or more case-ignorable characters. 563 */ 564 565 /** 566 * Iterator for string case mappings, which need to look at the 567 * context (surrounding text) of a given character for conditional mappings. 568 * 569 * The iterator only needs to go backward or forward away from the 570 * character in question. It does not use any indexes on this interface. 571 * It does not support random access or an arbitrary change of 572 * iteration direction. 573 * 574 * The code point being case-mapped itself is never returned by 575 * this iterator. 576 */ 577 public interface ContextIterator { 578 /** 579 * Reset the iterator for forward or backward iteration. 580 * @param dir >0: Begin iterating forward from the first code point 581 * after the one that is being case-mapped. 582 * <0: Begin iterating backward from the first code point 583 * before the one that is being case-mapped. 584 */ 585 public void reset(int dir); 586 /** 587 * Iterate and return the next code point, moving in the direction 588 * determined by the reset() call. 589 * @return Next code point, or <0 when the iteration is done. 590 */ 591 public int next(); 592 } 593 594 /** 595 * For string case mappings, a single character (a code point) is mapped 596 * either to itself (in which case in-place mapping functions do nothing), 597 * or to another single code point, or to a string. 
598 * Aside from the string contents, these are indicated with a single int 599 * value as follows: 600 * 601 * Mapping to self: Negative values (~self instead of -self to support U+0000) 602 * 603 * Mapping to another code point: Positive values >MAX_STRING_LENGTH 604 * 605 * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is 606 * returned. Note that the string result may indeed have zero length. 607 */ 608 public static final int MAX_STRING_LENGTH=0x1f; 609 610 //ivate static final int LOC_UNKNOWN=0; 611 public static final int LOC_ROOT=1; 612 private static final int LOC_TURKISH=2; 613 private static final int LOC_LITHUANIAN=3; 614 static final int LOC_GREEK=4; 615 public static final int LOC_DUTCH=5; 616 617 public static final int getCaseLocale(Locale locale) { 618 return getCaseLocale(locale.getLanguage()); 619 } 620 public static final int getCaseLocale(ULocale locale) { 621 return getCaseLocale(locale.getLanguage()); 622 } 623 /** Accepts both 2- and 3-letter language subtags. */ 624 private static final int getCaseLocale(String language) { 625 // Check the subtag length to reduce the number of comparisons 626 // for locales without special behavior. 627 // Fastpath for English "en" which is often used for default (=root locale) case mappings, 628 // and for Chinese "zh": Very common but no special case mapping behavior. 
629 if(language.length()==2) { 630 if(language.equals("en") || language.charAt(0)>'t') { 631 return LOC_ROOT; 632 } else if(language.equals("tr") || language.equals("az")) { 633 return LOC_TURKISH; 634 } else if(language.equals("el")) { 635 return LOC_GREEK; 636 } else if(language.equals("lt")) { 637 return LOC_LITHUANIAN; 638 } else if(language.equals("nl")) { 639 return LOC_DUTCH; 640 } 641 } else if(language.length()==3) { 642 if(language.equals("tur") || language.equals("aze")) { 643 return LOC_TURKISH; 644 } else if(language.equals("ell")) { 645 return LOC_GREEK; 646 } else if(language.equals("lit")) { 647 return LOC_LITHUANIAN; 648 } else if(language.equals("nld")) { 649 return LOC_DUTCH; 650 } 651 } 652 return LOC_ROOT; 653 } 654 655 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */ 656 private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) { 657 int c; 658 659 if(iter==null) { 660 return false; 661 } 662 663 for(iter.reset(dir); (c=iter.next())>=0;) { 664 int type=getTypeOrIgnorable(c); 665 if((type&4)!=0) { 666 /* case-ignorable, continue with the loop */ 667 } else if(type!=NONE) { 668 return true; /* followed by cased letter */ 669 } else { 670 return false; /* uncased and not case-ignorable */ 671 } 672 } 673 674 return false; /* not followed by cased letter */ 675 } 676 677 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? 
*/ 678 private final boolean isPrecededBySoftDotted(ContextIterator iter) { 679 int c; 680 int dotType; 681 682 if(iter==null) { 683 return false; 684 } 685 686 for(iter.reset(-1); (c=iter.next())>=0;) { 687 dotType=getDotType(c); 688 if(dotType==SOFT_DOTTED) { 689 return true; /* preceded by TYPE_i */ 690 } else if(dotType!=OTHER_ACCENT) { 691 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ 692 } 693 } 694 695 return false; /* not preceded by TYPE_i */ 696 } 697 698 /* 699 * See Jitterbug 2344: 700 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above 701 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because 702 * we made those releases compatible with Unicode 3.2 which had not fixed 703 * a related bug in SpecialCasing.txt. 704 * 705 * From the Jitterbug 2344 text: 706 * ... this bug is listed as a Unicode erratum 707 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html 708 * <quote> 709 * There are two errors in SpecialCasing.txt. 710 * 1. Missing semicolons on two lines. ... [irrelevant for ICU] 711 * 2. An incorrect context definition. Correct as follows: 712 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE 713 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE 714 * --- 715 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 716 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 717 * where the context After_I is defined as: 718 * The last preceding base character was an uppercase I, and there is no 719 * intervening combining character class 230 (ABOVE). 720 * </quote> 721 * 722 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: 723 * 724 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
725 * # This matches the behavior of the canonically equivalent I-dot_above 726 * 727 * See also the description in this place in older versions of uchar.c (revision 1.100). 728 * 729 * Markus W. Scherer 2003-feb-15 730 */ 731 732 /* Is preceded by base character 'I' with no intervening cc=230 ? */ 733 private final boolean isPrecededBy_I(ContextIterator iter) { 734 int c; 735 int dotType; 736 737 if(iter==null) { 738 return false; 739 } 740 741 for(iter.reset(-1); (c=iter.next())>=0;) { 742 if(c==0x49) { 743 return true; /* preceded by I */ 744 } 745 dotType=getDotType(c); 746 if(dotType!=OTHER_ACCENT) { 747 return false; /* preceded by different base character (not I), or intervening cc==230 */ 748 } 749 } 750 751 return false; /* not preceded by I */ 752 } 753 754 /* Is followed by one or more cc==230 ? */ 755 private final boolean isFollowedByMoreAbove(ContextIterator iter) { 756 int c; 757 int dotType; 758 759 if(iter==null) { 760 return false; 761 } 762 763 for(iter.reset(1); (c=iter.next())>=0;) { 764 dotType=getDotType(c); 765 if(dotType==ABOVE) { 766 return true; /* at least one cc==230 following */ 767 } else if(dotType!=OTHER_ACCENT) { 768 return false; /* next base character, no more cc==230 following */ 769 } 770 } 771 772 return false; /* no more cc==230 following */ 773 } 774 775 /* Is followed by a dot above (without cc==230 in between) ? 
*/ 776 private final boolean isFollowedByDotAbove(ContextIterator iter) { 777 int c; 778 int dotType; 779 780 if(iter==null) { 781 return false; 782 } 783 784 for(iter.reset(1); (c=iter.next())>=0; ) { 785 if(c==0x307) { 786 return true; 787 } 788 dotType=getDotType(c); 789 if(dotType!=OTHER_ACCENT) { 790 return false; /* next base character or cc==230 in between */ 791 } 792 } 793 794 return false; /* no dot above following */ 795 } 796 797 private static final String 798 iDot= "i\u0307", 799 jDot= "j\u0307", 800 iOgonekDot= "\u012f\u0307", 801 iDotGrave= "i\u0307\u0300", 802 iDotAcute= "i\u0307\u0301", 803 iDotTilde= "i\u0307\u0303"; 804 805 /** 806 * Get the full lowercase mapping for c. 807 * 808 * @param c Character to be mapped. 809 * @param iter Character iterator, used for context-sensitive mappings. 810 * See ContextIterator for details. 811 * If iter==null then a context-independent result is returned. 812 * @param out If the mapping result is a string, then it is appended to out. 813 * @param caseLocale Case locale value from ucase_getCaseLocale(). 814 * @return Output code point or string length, see MAX_STRING_LENGTH. 
815 * 816 * @see ContextIterator 817 * @see #MAX_STRING_LENGTH 818 * @internal 819 */ 820 public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) { 821 int result, props; 822 823 result=c; 824 props=trie.get(c); 825 if(!propsHasException(props)) { 826 if(getTypeFromProps(props)>=UPPER) { 827 result=c+getDelta(props); 828 } 829 } else { 830 int excOffset=getExceptionsOffset(props), excOffset2; 831 int excWord=exceptions.charAt(excOffset++); 832 int full; 833 834 excOffset2=excOffset; 835 836 if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { 837 /* use hardcoded conditions and mappings */ 838 /* 839 * Test for conditional mappings first 840 * (otherwise the unconditional default mappings are always taken), 841 * then test for characters that have unconditional mappings in SpecialCasing.txt, 842 * then get the UnicodeData.txt mappings. 843 */ 844 if( caseLocale==LOC_LITHUANIAN && 845 /* base characters, find accents above */ 846 (((c==0x49 || c==0x4a || c==0x12e) && 847 isFollowedByMoreAbove(iter)) || 848 /* precomposed with accent above, no need to find one */ 849 (c==0xcc || c==0xcd || c==0x128)) 850 ) { 851 /* 852 # Lithuanian 853 854 # Lithuanian retains the dot in a lowercase i when followed by accents. 855 856 # Introduce an explicit dot above when lowercasing capital I's and J's 857 # whenever there are more accents above. 
858 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 859 860 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 861 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 862 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 863 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 864 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 865 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 866 */ 867 try { 868 switch(c) { 869 case 0x49: /* LATIN CAPITAL LETTER I */ 870 out.append(iDot); 871 return 2; 872 case 0x4a: /* LATIN CAPITAL LETTER J */ 873 out.append(jDot); 874 return 2; 875 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ 876 out.append(iOgonekDot); 877 return 2; 878 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ 879 out.append(iDotGrave); 880 return 3; 881 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ 882 out.append(iDotAcute); 883 return 3; 884 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ 885 out.append(iDotTilde); 886 return 3; 887 default: 888 return 0; /* will not occur */ 889 } 890 } catch (IOException e) { 891 throw new ICUUncheckedIOException(e); 892 } 893 /* # Turkish and Azeri */ 894 } else if(caseLocale==LOC_TURKISH && c==0x130) { 895 /* 896 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 897 # The following rules handle those cases. 898 899 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 900 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE 901 */ 902 return 0x69; 903 } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) { 904 /* 905 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
906 # This matches the behavior of the canonically equivalent I-dot_above 907 908 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 909 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 910 */ 911 return 0; /* remove the dot (continue without output) */ 912 } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) { 913 /* 914 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 915 916 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 917 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 918 */ 919 return 0x131; 920 } else if(c==0x130) { 921 /* 922 # Preserve canonical equivalence for I with dot. Turkic is handled below. 923 924 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 925 */ 926 try { 927 out.append(iDot); 928 return 2; 929 } catch (IOException e) { 930 throw new ICUUncheckedIOException(e); 931 } 932 } else if( c==0x3a3 && 933 !isFollowedByCasedLetter(iter, 1) && 934 isFollowedByCasedLetter(iter, -1) /* -1=preceded */ 935 ) { 936 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ 937 /* 938 # Special case for final form of sigma 939 940 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 941 */ 942 return 0x3c2; /* greek small final sigma */ 943 } else { 944 /* no known conditional special case mapping, use a normal mapping */ 945 } 946 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 947 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 948 full=(int)value&FULL_LOWER; 949 if(full!=0) { 950 /* start of full case mapping strings */ 951 excOffset=(int)(value>>32)+1; 952 953 try { 954 // append the lowercase mapping 955 out.append(exceptions, excOffset, excOffset+full); 956 957 /* return the string length */ 958 return full; 959 } catch (IOException e) { 960 throw new ICUUncheckedIOException(e); 961 } 962 } 963 } 964 965 if(hasSlot(excWord, EXC_LOWER)) { 966 
result=getSlotValue(excWord, EXC_LOWER, excOffset2); 967 } 968 } 969 970 return (result==c) ? ~result : result; 971 } 972 973 /* internal */ 974 private final int toUpperOrTitle(int c, ContextIterator iter, 975 Appendable out, 976 int loc, 977 boolean upperNotTitle) { 978 int result; 979 int props; 980 981 result=c; 982 props=trie.get(c); 983 if(!propsHasException(props)) { 984 if(getTypeFromProps(props)==LOWER) { 985 result=c+getDelta(props); 986 } 987 } else { 988 int excOffset=getExceptionsOffset(props), excOffset2; 989 int excWord=exceptions.charAt(excOffset++); 990 int full, index; 991 992 excOffset2=excOffset; 993 994 if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { 995 /* use hardcoded conditions and mappings */ 996 if(loc==LOC_TURKISH && c==0x69) { 997 /* 998 # Turkish and Azeri 999 1000 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 1001 # The following rules handle those cases. 1002 1003 # When uppercasing, i turns into a dotted capital I 1004 1005 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 1006 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I 1007 */ 1008 return 0x130; 1009 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) { 1010 /* 1011 # Lithuanian 1012 1013 # Lithuanian retains the dot in a lowercase i when followed by accents. 
1014 1015 # Remove DOT ABOVE after "i" with upper or titlecase 1016 1017 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE 1018 */ 1019 return 0; /* remove the dot (continue without output) */ 1020 } else { 1021 /* no known conditional special case mapping, use a normal mapping */ 1022 } 1023 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 1024 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 1025 full=(int)value&0xffff; 1026 1027 /* start of full case mapping strings */ 1028 excOffset=(int)(value>>32)+1; 1029 1030 /* skip the lowercase and case-folding result strings */ 1031 excOffset+=full&FULL_LOWER; 1032 full>>=4; 1033 excOffset+=full&0xf; 1034 full>>=4; 1035 1036 if(upperNotTitle) { 1037 full&=0xf; 1038 } else { 1039 /* skip the uppercase result string */ 1040 excOffset+=full&0xf; 1041 full=(full>>4)&0xf; 1042 } 1043 1044 if(full!=0) { 1045 try { 1046 // append the result string 1047 out.append(exceptions, excOffset, excOffset+full); 1048 1049 /* return the string length */ 1050 return full; 1051 } catch (IOException e) { 1052 throw new ICUUncheckedIOException(e); 1053 } 1054 } 1055 } 1056 1057 if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) { 1058 index=EXC_TITLE; 1059 } else if(hasSlot(excWord, EXC_UPPER)) { 1060 /* here, titlecase is same as uppercase */ 1061 index=EXC_UPPER; 1062 } else { 1063 return ~c; 1064 } 1065 result=getSlotValue(excWord, index, excOffset2); 1066 } 1067 1068 return (result==c) ? 
~result : result; 1069 } 1070 1071 public final int toFullUpper(int c, ContextIterator iter, 1072 Appendable out, 1073 int caseLocale) { 1074 return toUpperOrTitle(c, iter, out, caseLocale, true); 1075 } 1076 1077 public final int toFullTitle(int c, ContextIterator iter, 1078 Appendable out, 1079 int caseLocale) { 1080 return toUpperOrTitle(c, iter, out, caseLocale, false); 1081 } 1082 1083 /* case folding ------------------------------------------------------------- */ 1084 1085 /* 1086 * Case folding is similar to lowercasing. 1087 * The result may be a simple mapping, i.e., a single code point, or 1088 * a full mapping, i.e., a string. 1089 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping, 1090 * then only the lowercase mapping is stored. 1091 * 1092 * Some special cases are hardcoded because their conditions cannot be 1093 * parsed and processed from CaseFolding.txt. 1094 * 1095 * Unicode 3.2 CaseFolding.txt specifies for its status field: 1096 1097 # C: common case folding, common mappings shared by both simple and full mappings. 1098 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. 1099 # S: simple case folding, mappings to single characters where different from F. 1100 # T: special case for uppercase I and dotted uppercase I 1101 # - For non-Turkic languages, this mapping is normally not used. 1102 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. 1103 # 1104 # Usage: 1105 # A. To do a simple case folding, use the mappings with status C + S. 1106 # B. To do a full case folding, use the mappings with status C + F. 1107 # 1108 # The mappings with status T can be used or omitted depending on the desired case-folding 1109 # behavior. (The default option is to exclude them.) 
1110 1111 * Unicode 3.2 has 'T' mappings as follows: 1112 1113 0049; T; 0131; # LATIN CAPITAL LETTER I 1114 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1115 1116 * while the default mappings for these code points are: 1117 1118 0049; C; 0069; # LATIN CAPITAL LETTER I 1119 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1120 1121 * U+0130 has no simple case folding (simple-case-folds to itself). 1122 */ 1123 1124 /** 1125 * Bit mask for getting just the options from a string compare options word 1126 * that are relevant for case folding (of a single string or code point). 1127 * @internal 1128 */ 1129 private static final int FOLD_CASE_OPTIONS_MASK = 0xff; 1130 1131 /* return the simple case folding mapping for c */ 1132 public final int fold(int c, int options) { 1133 int props=trie.get(c); 1134 if(!propsHasException(props)) { 1135 if(getTypeFromProps(props)>=UPPER) { 1136 c+=getDelta(props); 1137 } 1138 } else { 1139 int excOffset=getExceptionsOffset(props); 1140 int excWord=exceptions.charAt(excOffset++); 1141 int index; 1142 if((excWord&EXC_CONDITIONAL_FOLD)!=0) { 1143 /* special case folding mappings, hardcoded */ 1144 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) { 1145 /* default mappings */ 1146 if(c==0x49) { 1147 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1148 return 0x69; 1149 } else if(c==0x130) { 1150 /* no simple case folding for U+0130 */ 1151 return c; 1152 } 1153 } else { 1154 /* Turkic mappings */ 1155 if(c==0x49) { 1156 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1157 return 0x131; 1158 } else if(c==0x130) { 1159 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1160 return 0x69; 1161 } 1162 } 1163 } 1164 if(hasSlot(excWord, EXC_FOLD)) { 1165 index=EXC_FOLD; 1166 } else if(hasSlot(excWord, EXC_LOWER)) { 1167 index=EXC_LOWER; 1168 } else { 1169 return c; 1170 } 1171 c=getSlotValue(excWord, index, excOffset); 1172 } 1173 return c; 1174 } 1175 1176 /* 1177 * Issue for canonical caseless match 
(UAX #21): 1178 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve 1179 * canonical equivalence, unlike default-option casefolding. 1180 * For example, I-grave and I + grave fold to strings that are not canonically 1181 * equivalent. 1182 * For more details, see the comment in unorm_compare() in unorm.cpp 1183 * and the intermediate prototype changes for Jitterbug 2021. 1184 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) 1185 * 1186 * This did not get fixed because it appears that it is not possible to fix 1187 * it for uppercase and lowercase characters (I-grave vs. i-grave) 1188 * together in a way that they still fold to common result strings. 1189 */ 1190 1191 public final int toFullFolding(int c, Appendable out, int options) { 1192 int result; 1193 int props; 1194 1195 result=c; 1196 props=trie.get(c); 1197 if(!propsHasException(props)) { 1198 if(getTypeFromProps(props)>=UPPER) { 1199 result=c+getDelta(props); 1200 } 1201 } else { 1202 int excOffset=getExceptionsOffset(props), excOffset2; 1203 int excWord=exceptions.charAt(excOffset++); 1204 int full, index; 1205 1206 excOffset2=excOffset; 1207 1208 if((excWord&EXC_CONDITIONAL_FOLD)!=0) { 1209 /* use hardcoded conditions and mappings */ 1210 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) { 1211 /* default mappings */ 1212 if(c==0x49) { 1213 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1214 return 0x69; 1215 } else if(c==0x130) { 1216 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1217 try { 1218 out.append(iDot); 1219 return 2; 1220 } catch (IOException e) { 1221 throw new ICUUncheckedIOException(e); 1222 } 1223 } 1224 } else { 1225 /* Turkic mappings */ 1226 if(c==0x49) { 1227 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1228 return 0x131; 1229 } else if(c==0x130) { 1230 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1231 return 0x69; 1232 } 1233 } 1234 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { 
1235 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); 1236 full=(int)value&0xffff; 1237 1238 /* start of full case mapping strings */ 1239 excOffset=(int)(value>>32)+1; 1240 1241 /* skip the lowercase result string */ 1242 excOffset+=full&FULL_LOWER; 1243 full=(full>>4)&0xf; 1244 1245 if(full!=0) { 1246 try { 1247 // append the result string 1248 out.append(exceptions, excOffset, excOffset+full); 1249 1250 /* return the string length */ 1251 return full; 1252 } catch (IOException e) { 1253 throw new ICUUncheckedIOException(e); 1254 } 1255 } 1256 } 1257 1258 if(hasSlot(excWord, EXC_FOLD)) { 1259 index=EXC_FOLD; 1260 } else if(hasSlot(excWord, EXC_LOWER)) { 1261 index=EXC_LOWER; 1262 } else { 1263 return ~c; 1264 } 1265 result=getSlotValue(excWord, index, excOffset2); 1266 } 1267 1268 return (result==c) ? ~result : result; 1269 } 1270 1271 /* case mapping properties API ---------------------------------------------- */ 1272 1273 /* 1274 * We need a StringBuilder for multi-code point output from the 1275 * full case mapping functions. However, we do not actually use that output, 1276 * we just check whether the input character was mapped to anything else. 1277 * We use a shared StringBuilder to avoid allocating a new one in each call. 1278 * We remove its contents each time so that it does not grow large over time. 
*
     * Thread-safety note: StringBuilder is not safe for use by multiple threads.
     * hasBinaryProperty() below therefore does not write to this shared instance;
     * the field is kept unchanged because it is public.
     *
     * @internal
     */
    public static final StringBuilder dummyStringBuilder = new StringBuilder();

    /*
     * Stateless sink for mapping output that is never read.
     * It has no fields, so it can be shared freely between threads and
     * cannot grow, unlike a shared StringBuilder.
     */
    private static final Appendable DISCARD_OUTPUT = new Appendable() {
        @Override
        public Appendable append(CharSequence csq) {
            return this;
        }

        @Override
        public Appendable append(CharSequence csq, int start, int end) {
            return this;
        }

        @Override
        public Appendable append(char ch) {
            return this;
        }
    };

    /*
     * Tests a case-related binary property of c.
     *
     * c      code point to test
     * which  one of the UProperty constants handled in the switch below
     *
     * Returns the property value, or false for any property
     * that is not handled here.
     *
     * The Changes_When_Xyz properties only need the return value of the
     * full mapping functions (non-negative means "c changes"), so the
     * mapping output goes to DISCARD_OUTPUT instead of a shared buffer.
     */
    public final boolean hasBinaryProperty(int c, int which) {
        switch(which) {
        case UProperty.LOWERCASE:
            return LOWER==getType(c);
        case UProperty.UPPERCASE:
            return UPPER==getType(c);
        case UProperty.SOFT_DOTTED:
            return isSoftDotted(c);
        case UProperty.CASE_SENSITIVE:
            return isCaseSensitive(c);
        case UProperty.CASED:
            return NONE!=getType(c);
        case UProperty.CASE_IGNORABLE:
            /* shifting by 2 drops the two type bits and leaves the IGNORABLE bit */
            return (getTypeOrIgnorable(c)>>2)!=0;
        /*
         * Note: The following Changes_When_Xyz are defined as testing whether
         * the NFD form of the input changes when Xyz-case-mapped.
         * However, this simpler implementation of these properties,
         * ignoring NFD, passes the tests.
         * The implementation needs to be changed if the tests start failing.
         * When that happens, optimizations should be used to work with the
         * per-single-code point ucase_toFullXyz() functions unless
         * the NFD form has more than one code point,
         * and the property starts set needs to be the union of the
         * start sets for normalization and case mappings.
         */
        case UProperty.CHANGES_WHEN_LOWERCASED:
            return toFullLower(c, null, DISCARD_OUTPUT, LOC_ROOT)>=0;
        case UProperty.CHANGES_WHEN_UPPERCASED:
            return toFullUpper(c, null, DISCARD_OUTPUT, LOC_ROOT)>=0;
        case UProperty.CHANGES_WHEN_TITLECASED:
            return toFullTitle(c, null, DISCARD_OUTPUT, LOC_ROOT)>=0;
        /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
        case UProperty.CHANGES_WHEN_CASEMAPPED:
            return
                toFullLower(c, null, DISCARD_OUTPUT, LOC_ROOT)>=0 ||
                toFullUpper(c, null, DISCARD_OUTPUT, LOC_ROOT)>=0 ||
                toFullTitle(c, null, DISCARD_OUTPUT, LOC_ROOT)>=0;
        default:
            return false;
        }
    }

    // data members -------------------------------------------------------- ***
    private int indexes[];
    private String exceptions;
    private char unfold[];

    private Trie2_16 trie;

    // data format constants ----------------------------------------------- ***
    private static final String DATA_NAME="ucase";
    private static final String DATA_TYPE="icu";
    private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;

    /* format "cAsE" */
    private static final int FMT=0x63415345;

    /* indexes into indexes[] */
    //private static final int IX_INDEX_TOP=0;
    //private static final int IX_LENGTH=1;
    private static final int IX_TRIE_SIZE=2;
    private static final int IX_EXC_LENGTH=3;
    private static final int IX_UNFOLD_LENGTH=4;

    //private static final int IX_MAX_FULL_LENGTH=15;
    private static final int IX_TOP=16;

    // definitions for 16-bit case properties word ------------------------- ***

    /* 2-bit constants for types of cased characters */
    public static final int TYPE_MASK=3;
    public static final int NONE=0;
    public static final int LOWER=1;
    public static final int UPPER=2;
    public static final int TITLE=3;

    /** @return NONE, LOWER, UPPER, TITLE */
    private static final int getTypeFromProps(int props) {
        return props&TYPE_MASK;
    }

    /** @return like getTypeFromProps() but also sets IGNORABLE if props indicate case-ignorable */
    private static final int getTypeAndIgnorableFromProps(int props) {
        /* 7 = TYPE_MASK | IGNORABLE */
        return props&7;
    }

    static final int IGNORABLE=4;
    private static final int SENSITIVE=     8;
    private static final int EXCEPTION=     0x10;

    private static final int DOT_MASK=      0x60;
    //private static final int NO_DOT=        0;      /* normal characters with cc=0 */
    private static final int SOFT_DOTTED=   0x20;   /* soft-dotted characters with cc=0 */
    private static final int ABOVE=         0x40;   /* "above" accents with cc=230 */
    private static final int OTHER_ACCENT=  0x60;   /* other accent character (0<cc!=230) */

    /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
    private static final int DELTA_SHIFT=   7;
    //private static final int DELTA_MASK=    0xff80;
    //private static final int MAX_DELTA=     0xff;
    //private static final int MIN_DELTA=     (-MAX_DELTA-1);

    /*
     * Extracts the signed case mapping delta from a properties word.
     * The cast to short happens before the shift, so the shift is
     * arithmetic on a sign-extended 16-bit value.
     */
    private static final int getDelta(int props) {
        return (short)props>>DELTA_SHIFT;
    }

    /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
    private static final int EXC_SHIFT=     5;
    //private static final int EXC_MASK=      0xffe0;
    //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);

    /* definitions for 16-bit main exceptions word ------------------------------ */

    /* first 8 bits indicate values in optional slots */
    private static final int EXC_LOWER=0;
    private static final int EXC_FOLD=1;
    private static final int EXC_UPPER=2;
    private static final int EXC_TITLE=3;
    //private static final int EXC_4=4;   /* reserved */
    //private static final int EXC_5=5;   /* reserved */
    private static final int EXC_CLOSURE=6;
    private static final int EXC_FULL_MAPPINGS=7;
    //private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */

    /* each slot is 2 uint16_t instead of 1 */
    private static final int EXC_DOUBLE_SLOTS=          0x100;

    /* reserved: exception bits 11..9 */

    /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
    private static final int EXC_DOT_SHIFT=7;

    /* normally stored in the main word, but pushed out for larger exception indexes */
    //private static final int EXC_DOT_MASK=              0x3000;
    //private static final int EXC_NO_DOT=                0;
    //private static final int EXC_SOFT_DOTTED=           0x1000;
    //private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
    //private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */

    /* complex/conditional mappings */
    private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
    private static final int EXC_CONDITIONAL_FOLD=      0x8000;

    /* definitions for lengths word for full case mappings */
    private static final int FULL_LOWER=    0xf;
    //private static final int FULL_FOLDING=  0xf0;
    //private static final int FULL_UPPER=    0xf00;
    //private static final int FULL_TITLE=    0xf000;

    /* maximum lengths */
    //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
    private static final int CLOSURE_MAX_LENGTH=0xf;

    /* constants for reverse case folding ("unfold") data */
    private static final int UNFOLD_ROWS=0;
    private static final int UNFOLD_ROW_WIDTH=1;
    private static final int UNFOLD_STRING_WIDTH=2;

    /*
     * public singleton instance
     */
    public static final UCaseProps INSTANCE;

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UCaseProps();
        } catch (IOException e) {
            /* loading the data file failed; surface it as an unchecked exception */
            throw new ICUUncheckedIOException(e);
        }
    }
}