1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.IOException; 13 import java.nio.ByteBuffer; 14 import java.util.ArrayList; 15 import java.util.Iterator; 16 17 import com.ibm.icu.text.UTF16; 18 import com.ibm.icu.text.UnicodeSet; 19 import com.ibm.icu.util.ICUUncheckedIOException; 20 import com.ibm.icu.util.VersionInfo; 21 22 /** 23 * Low-level implementation of the Unicode Normalization Algorithm. 24 * For the data structure and details see the documentation at the end of 25 * C++ normalizer2impl.h and in the design doc at 26 * http://site.icu-project.org/design/normalization/custom 27 */ 28 public final class Normalizer2Impl { 29 public static final class Hangul { 30 /* Korean Hangul and Jamo constants */ 31 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 32 public static final int JAMO_L_END=0x1112; 33 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 34 public static final int JAMO_V_END=0x1175; 35 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 36 public static final int JAMO_T_END=0x11c2; 37 38 public static final int HANGUL_BASE=0xac00; 39 public static final int HANGUL_END=0xd7a3; 40 41 public static final int JAMO_L_COUNT=19; 42 public static final int JAMO_V_COUNT=21; 43 public static final int JAMO_T_COUNT=28; 44 45 public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 46 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 47 48 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 49 50 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 51 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 52 53 public static boolean isHangul(int c) { 54 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 55 } 56 public static boolean isHangulLV(int c) { 57 c-=HANGUL_BASE; 58 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 59 } 60 public static boolean isJamoL(int c) { 61 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 62 } 63 public static boolean isJamoV(int c) { 64 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 65 } 66 public static boolean isJamoT(int c) { 67 int t=c-JAMO_T_BASE; 68 return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself 69 } 70 public static boolean isJamo(int c) { 71 return JAMO_L_BASE<=c && c<=JAMO_T_END && 72 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c); 73 } 74 75 /** 76 * Decomposes c, which must be a Hangul syllable, into buffer 77 * and returns the length of the decomposition (2 or 3). 78 */ 79 public static int decompose(int c, Appendable buffer) { 80 try { 81 c-=HANGUL_BASE; 82 int c2=c%JAMO_T_COUNT; 83 c/=JAMO_T_COUNT; 84 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 85 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 86 if(c2==0) { 87 return 2; 88 } else { 89 buffer.append((char)(JAMO_T_BASE+c2)); 90 return 3; 91 } 92 } catch(IOException e) { 93 // Will not occur because we do not write to I/O. 94 throw new ICUUncheckedIOException(e); 95 } 96 } 97 98 /** 99 * Decomposes c, which must be a Hangul syllable, into buffer. 100 * This is the raw, not recursive, decomposition. Its length is always 2. 101 */ 102 public static void getRawDecomposition(int c, Appendable buffer) { 103 try { 104 int orig=c; 105 c-=HANGUL_BASE; 106 int c2=c%JAMO_T_COUNT; 107 if(c2==0) { 108 c/=JAMO_T_COUNT; 109 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 110 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 111 } else { 112 buffer.append((char)(orig-c2)); // LV syllable 113 buffer.append((char)(JAMO_T_BASE+c2)); 114 } 115 } catch(IOException e) { 116 // Will not occur because we do not write to I/O. 117 throw new ICUUncheckedIOException(e); 118 } 119 } 120 } 121 122 /** 123 * Writable buffer that takes care of canonical ordering. 124 * Its Appendable methods behave like the C++ implementation's 125 * appendZeroCC() methods. 126 * <p> 127 * If dest is a StringBuilder, then the buffer writes directly to it. 128 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 129 * until no further changes are necessary and whole segments are appended. 130 * append() methods that take combining-class values always write to the StringBuilder. 131 * Other append() methods flush and append to the Appendable. 132 */ 133 public static final class ReorderingBuffer implements Appendable { 134 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 135 impl=ni; 136 app=dest; 137 if(app instanceof StringBuilder) { 138 appIsStringBuilder=true; 139 str=(StringBuilder)dest; 140 // In Java, the constructor subsumes public void init(int destCapacity) { 141 str.ensureCapacity(destCapacity); 142 reorderStart=0; 143 if(str.length()==0) { 144 lastCC=0; 145 } else { 146 setIterator(); 147 lastCC=previousCC(); 148 // Set reorderStart after the last code point with cc<=1 if there is one. 149 if(lastCC>1) { 150 while(previousCC()>1) {} 151 } 152 reorderStart=codePointLimit; 153 } 154 } else { 155 appIsStringBuilder=false; 156 str=new StringBuilder(); 157 reorderStart=0; 158 lastCC=0; 159 } 160 } 161 162 public boolean isEmpty() { return str.length()==0; } 163 public int length() { return str.length(); } 164 public int getLastCC() { return lastCC; } 165 166 public StringBuilder getStringBuilder() { return str; } 167 168 public boolean equals(CharSequence s, int start, int limit) { 169 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 170 } 171 172 public void append(int c, int cc) { 173 if(lastCC<=cc || cc==0) { 174 str.appendCodePoint(c); 175 lastCC=cc; 176 if(cc<=1) { 177 reorderStart=str.length(); 178 } 179 } else { 180 insert(c, cc); 181 } 182 } 183 // s must be in NFD, otherwise change the implementation. 184 public void append(CharSequence s, int start, int limit, 185 int leadCC, int trailCC) { 186 if(start==limit) { 187 return; 188 } 189 if(lastCC<=leadCC || leadCC==0) { 190 if(trailCC<=1) { 191 reorderStart=str.length()+(limit-start); 192 } else if(leadCC<=1) { 193 reorderStart=str.length()+1; // Ok if not a code point boundary. 194 } 195 str.append(s, start, limit); 196 lastCC=trailCC; 197 } else { 198 int c=Character.codePointAt(s, start); 199 start+=Character.charCount(c); 200 insert(c, leadCC); // insert first code point 201 while(start<limit) { 202 c=Character.codePointAt(s, start); 203 start+=Character.charCount(c); 204 if(start<limit) { 205 // s must be in NFD, otherwise we need to use getCC(). 206 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 207 } else { 208 leadCC=trailCC; 209 } 210 append(c, leadCC); 211 } 212 } 213 } 214 // The following append() methods work like C++ appendZeroCC(). 215 // They assume that the cc or trailCC of their input is 0. 216 // Most of them implement Appendable interface methods. 217 @Override 218 public ReorderingBuffer append(char c) { 219 str.append(c); 220 lastCC=0; 221 reorderStart=str.length(); 222 return this; 223 } 224 public void appendZeroCC(int c) { 225 str.appendCodePoint(c); 226 lastCC=0; 227 reorderStart=str.length(); 228 } 229 @Override 230 public ReorderingBuffer append(CharSequence s) { 231 if(s.length()!=0) { 232 str.append(s); 233 lastCC=0; 234 reorderStart=str.length(); 235 } 236 return this; 237 } 238 @Override 239 public ReorderingBuffer append(CharSequence s, int start, int limit) { 240 if(start!=limit) { 241 str.append(s, start, limit); 242 lastCC=0; 243 reorderStart=str.length(); 244 } 245 return this; 246 } 247 /** 248 * Flushes from the intermediate StringBuilder to the Appendable, 249 * if they are different objects. 250 * Used after recomposition. 251 * Must be called at the end when writing to a non-StringBuilder Appendable. 252 */ 253 public void flush() { 254 if(appIsStringBuilder) { 255 reorderStart=str.length(); 256 } else { 257 try { 258 app.append(str); 259 str.setLength(0); 260 reorderStart=0; 261 } catch(IOException e) { 262 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 263 } 264 } 265 lastCC=0; 266 } 267 /** 268 * Flushes from the intermediate StringBuilder to the Appendable, 269 * if they are different objects. 270 * Then appends the new text to the Appendable or StringBuilder. 271 * Normally used after quick check loops find a non-empty sequence. 272 */ 273 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 274 if(appIsStringBuilder) { 275 str.append(s, start, limit); 276 reorderStart=str.length(); 277 } else { 278 try { 279 app.append(str).append(s, start, limit); 280 str.setLength(0); 281 reorderStart=0; 282 } catch(IOException e) { 283 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 284 } 285 } 286 lastCC=0; 287 return this; 288 } 289 public void remove() { 290 str.setLength(0); 291 lastCC=0; 292 reorderStart=0; 293 } 294 public void removeSuffix(int suffixLength) { 295 int oldLength=str.length(); 296 str.delete(oldLength-suffixLength, oldLength); 297 lastCC=0; 298 reorderStart=str.length(); 299 } 300 301 /* 302 * TODO: Revisit whether it makes sense to track reorderStart. 303 * It is set to after the last known character with cc<=1, 304 * which stops previousCC() before it reads that character and looks up its cc. 305 * previousCC() is normally only called from insert(). 306 * In other words, reorderStart speeds up the insertion of a combining mark 307 * into a multi-combining mark sequence where it does not belong at the end. 308 * This might not be worth the trouble. 309 * On the other hand, it's not a huge amount of trouble. 310 * 311 * We probably need it for UNORM_SIMPLE_APPEND. 312 */ 313 314 // Inserts c somewhere before the last character. 315 // Requires 0<cc<lastCC which implies reorderStart<limit. 316 private void insert(int c, int cc) { 317 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 318 // insert c at codePointLimit, after the character with prevCC<=cc 319 if(c<=0xffff) { 320 str.insert(codePointLimit, (char)c); 321 if(cc<=1) { 322 reorderStart=codePointLimit+1; 323 } 324 } else { 325 str.insert(codePointLimit, Character.toChars(c)); 326 if(cc<=1) { 327 reorderStart=codePointLimit+2; 328 } 329 } 330 } 331 332 private final Normalizer2Impl impl; 333 private final Appendable app; 334 private final StringBuilder str; 335 private final boolean appIsStringBuilder; 336 private int reorderStart; 337 private int lastCC; 338 339 // private backward iterator 340 private void setIterator() { codePointStart=str.length(); } 341 private void skipPrevious() { // Requires 0<codePointStart. 342 codePointLimit=codePointStart; 343 codePointStart=str.offsetByCodePoints(codePointStart, -1); 344 } 345 private int previousCC() { // Returns 0 if there is no previous character. 346 codePointLimit=codePointStart; 347 if(reorderStart>=codePointStart) { 348 return 0; 349 } 350 int c=str.codePointBefore(codePointStart); 351 codePointStart-=Character.charCount(c); 352 return impl.getCCFromYesOrMaybeCP(c); 353 } 354 355 private int codePointStart, codePointLimit; 356 } 357 358 // TODO: Propose as public API on the UTF16 class. 359 // TODO: Propose widening UTF16 methods that take char to take int. 360 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 361 public static final class UTF16Plus { 362 /** 363 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 364 * is it a lead surrogate? 365 * @param c code unit or code point 366 * @return true or false 367 */ 368 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 369 /** 370 * Compares two CharSequence objects for binary equality. 371 * @param s1 first sequence 372 * @param s2 second sequence 373 * @return true if s1 contains the same text as s2 374 */ 375 public static boolean equal(CharSequence s1, CharSequence s2) { 376 if(s1==s2) { 377 return true; 378 } 379 int length=s1.length(); 380 if(length!=s2.length()) { 381 return false; 382 } 383 for(int i=0; i<length; ++i) { 384 if(s1.charAt(i)!=s2.charAt(i)) { 385 return false; 386 } 387 } 388 return true; 389 } 390 /** 391 * Compares two CharSequence subsequences for binary equality. 392 * @param s1 first sequence 393 * @param start1 start offset in first sequence 394 * @param limit1 limit offset in first sequence 395 * @param s2 second sequence 396 * @param start2 start offset in second sequence 397 * @param limit2 limit offset in second sequence 398 * @return true if s1.subSequence(start1, limit1) contains the same text 399 * as s2.subSequence(start2, limit2) 400 */ 401 public static boolean equal(CharSequence s1, int start1, int limit1, 402 CharSequence s2, int start2, int limit2) { 403 if((limit1-start1)!=(limit2-start2)) { 404 return false; 405 } 406 if(s1==s2 && start1==start2) { 407 return true; 408 } 409 while(start1<limit1) { 410 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 411 return false; 412 } 413 } 414 return true; 415 } 416 } 417 418 public Normalizer2Impl() {} 419 420 private static final class IsAcceptable implements ICUBinary.Authenticate { 421 @Override 422 public boolean isDataVersionAcceptable(byte version[]) { 423 return version[0]==3; 424 } 425 } 426 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 427 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 428 429 public Normalizer2Impl load(ByteBuffer bytes) { 430 try { 431 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 432 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 433 if(indexesLength<=IX_MIN_LCCC_CP) { 434 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 435 } 436 int[] inIndexes=new int[indexesLength]; 437 inIndexes[0]=indexesLength*4; 438 for(int i=1; i<indexesLength; ++i) { 439 inIndexes[i]=bytes.getInt(); 440 } 441 442 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 443 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 444 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 445 446 minYesNo=inIndexes[IX_MIN_YES_NO]; 447 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 448 minNoNo=inIndexes[IX_MIN_NO_NO]; 449 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 450 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 451 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 452 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 453 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 454 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 455 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 456 457 // Read the normTrie. 458 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 459 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 460 normTrie=Trie2_16.createFromSerialized(bytes); 461 int trieLength=normTrie.getSerializedLength(); 462 if(trieLength>(nextOffset-offset)) { 463 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 464 } 465 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 466 467 // Read the composition and mapping data. 468 offset=nextOffset; 469 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 470 int numChars=(nextOffset-offset)/2; 471 if(numChars!=0) { 472 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 473 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 474 } 475 476 // smallFCD: new in formatVersion 2 477 offset=nextOffset; 478 smallFCD=new byte[0x100]; 479 bytes.get(smallFCD); 480 481 return this; 482 } catch(IOException e) { 483 throw new ICUUncheckedIOException(e); 484 } 485 } 486 public Normalizer2Impl load(String name) { 487 return load(ICUBinary.getRequiredData(name)); 488 } 489 490 private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) { 491 if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) { 492 set.add(start, end); 493 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { 494 int fcd16=getFCD16(start); 495 if(fcd16>0xff) { set.add(start, end); } 496 } 497 } 498 499 private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) { 500 /* add the start code point to the USet */ 501 set.add(start); 502 if(start!=end && isAlgorithmicNoNo(value) && (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) { 503 // Range of code points with same-norm16-value algorithmic decompositions. 504 // They might have different non-zero FCD16 values. 505 int prevFCD16=getFCD16(start); 506 while(++start<=end) { 507 int fcd16=getFCD16(start); 508 if(fcd16!=prevFCD16) { 509 set.add(start); 510 prevFCD16=fcd16; 511 } 512 } 513 } 514 } 515 516 public void addLcccChars(UnicodeSet set) { 517 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 518 Trie2.Range range; 519 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 520 enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set); 521 } 522 } 523 524 public void addPropertyStarts(UnicodeSet set) { 525 /* add the start code point of each same-value range of each trie */ 526 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 527 Trie2.Range range; 528 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 529 enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set); 530 } 531 532 /* add Hangul LV syllables and LV+1 because of skippables */ 533 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 534 set.add(c); 535 set.add(c+1); 536 } 537 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 538 } 539 540 public void addCanonIterPropertyStarts(UnicodeSet set) { 541 /* add the start code point of each same-value range of the canonical iterator data trie */ 542 ensureCanonIterData(); 543 // currently only used for the SEGMENT_STARTER property 544 Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper); 545 Trie2.Range range; 546 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 547 /* add the start code point to the USet */ 548 set.add(range.startCodePoint); 549 } 550 } 551 private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() { 552 @Override 553 public int map(int in) { 554 return in&CANON_NOT_SEGMENT_STARTER; 555 } 556 }; 557 558 // low-level properties ------------------------------------------------ *** 559 560 // Note: Normalizer2Impl.java r30983 (2011-nov-27) 561 // still had getFCDTrie() which built and cached an FCD trie. 562 // That provided faster access to FCD data than getFCD16FromNormData() 563 // but required synchronization and consumed some 10kB of heap memory 564 // in any process that uses FCD (e.g., via collation). 565 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance, 566 // at least for ASCII & CJK. 567 568 /** 569 * Builds the canonical-iterator data for this instance. 570 * This is required before any of {@link #isCanonSegmentStarter(int)} or 571 * {@link #getCanonStartSet(int, UnicodeSet)} are called, 572 * or else they crash. 573 * @return this 574 */ 575 public synchronized Normalizer2Impl ensureCanonIterData() { 576 if(canonIterData==null) { 577 Trie2Writable newData=new Trie2Writable(0, 0); 578 canonStartSets=new ArrayList<UnicodeSet>(); 579 Iterator<Trie2.Range> trieIterator=normTrie.iterator(); 580 Trie2.Range range; 581 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 582 final int norm16=range.value; 583 if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) { 584 // Inert, or 2-way mapping (including Hangul syllable). 585 // We do not write a canonStartSet for any yesNo character. 586 // Composites from 2-way mappings are added at runtime from the 587 // starter's compositions list, and the other characters in 588 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 589 // "maybe" characters. 590 continue; 591 } 592 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) { 593 final int oldValue=newData.get(c); 594 int newValue=oldValue; 595 if(isMaybeOrNonZeroCC(norm16)) { 596 // not a segment starter if it occurs in a decomposition or has cc!=0 597 newValue|=CANON_NOT_SEGMENT_STARTER; 598 if(norm16<MIN_NORMAL_MAYBE_YES) { 599 newValue|=CANON_HAS_COMPOSITIONS; 600 } 601 } else if(norm16<minYesNo) { 602 newValue|=CANON_HAS_COMPOSITIONS; 603 } else { 604 // c has a one-way decomposition 605 int c2=c; 606 // Do not modify the whole-range norm16 value. 607 int norm16_2=norm16; 608 if (isDecompNoAlgorithmic(norm16_2)) { 609 // Maps to an isCompYesAndZeroCC. 610 c2 = mapAlgorithmic(c2, norm16_2); 611 norm16_2 = getNorm16(c2); 612 // No compatibility mappings for the CanonicalIterator. 613 assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))); 614 } 615 if (norm16_2 > minYesNo) { 616 // c decomposes, get everything from the variable-length extra data 617 int mapping=norm16_2>>OFFSET_SHIFT; 618 int firstUnit=extraData.charAt(mapping); 619 int length=firstUnit&MAPPING_LENGTH_MASK; 620 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 621 if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) { 622 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 623 } 624 } 625 // Skip empty mappings (no characters in the decomposition). 626 if(length!=0) { 627 ++mapping; // skip over the firstUnit 628 // add c to first code point's start set 629 int limit=mapping+length; 630 c2=extraData.codePointAt(mapping); 631 addToStartSet(newData, c, c2); 632 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 633 // one-way mapping. A 2-way mapping is possible here after 634 // intermediate algorithmic mapping. 635 if(norm16_2>=minNoNo) { 636 while((mapping+=Character.charCount(c2))<limit) { 637 c2=extraData.codePointAt(mapping); 638 int c2Value=newData.get(c2); 639 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 640 newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER); 641 } 642 } 643 } 644 } 645 } else { 646 // c decomposed to c2 algorithmically; c has cc==0 647 addToStartSet(newData, c, c2); 648 } 649 } 650 if(newValue!=oldValue) { 651 newData.set(c, newValue); 652 } 653 } 654 } 655 canonIterData=newData.toTrie2_32(); 656 } 657 return this; 658 } 659 660 public int getNorm16(int c) { return normTrie.get(c); } 661 662 public int getCompQuickCheck(int norm16) { 663 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 664 return 1; // yes 665 } else if(minMaybeYes<=norm16) { 666 return 2; // maybe 667 } else { 668 return 0; // no 669 } 670 } 671 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 672 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 673 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 674 675 public int getCC(int norm16) { 676 if(norm16>=MIN_NORMAL_MAYBE_YES) { 677 return getCCFromNormalYesOrMaybe(norm16); 678 } 679 if(norm16<minNoNo || limitNoNo<=norm16) { 680 return 0; 681 } 682 return getCCFromNoNo(norm16); 683 } 684 public static int getCCFromNormalYesOrMaybe(int norm16) { 685 return (norm16 >> OFFSET_SHIFT) & 0xff; 686 } 687 public static int getCCFromYesOrMaybe(int norm16) { 688 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 689 } 690 public int getCCFromYesOrMaybeCP(int c) { 691 if (c < minCompNoMaybeCP) { return 0; } 692 return getCCFromYesOrMaybe(getNorm16(c)); 693 } 694 695 /** 696 * Returns the FCD data for code point c. 697 * @param c A Unicode code point. 698 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 699 */ 700 public int getFCD16(int c) { 701 if(c<minDecompNoCP) { 702 return 0; 703 } else if(c<=0xffff) { 704 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 705 } 706 return getFCD16FromNormData(c); 707 } 708 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 709 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 710 // 0<=lead<=0xffff 711 byte bits=smallFCD[lead>>8]; 712 if(bits==0) { return false; } 713 return ((bits>>((lead>>5)&7))&1)!=0; 714 } 715 716 /** Gets the FCD value from the regular normalization data. */ 717 public int getFCD16FromNormData(int c) { 718 int norm16=getNorm16(c); 719 if (norm16 >= limitNoNo) { 720 if(norm16>=MIN_NORMAL_MAYBE_YES) { 721 // combining mark 722 norm16=getCCFromNormalYesOrMaybe(norm16); 723 return norm16|(norm16<<8); 724 } else if(norm16>=minMaybeYes) { 725 return 0; 726 } else { // isDecompNoAlgorithmic(norm16) 727 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 728 if (deltaTrailCC <= DELTA_TCCC_1) { 729 return deltaTrailCC >> OFFSET_SHIFT; 730 } 731 // Maps to an isCompYesAndZeroCC. 732 c=mapAlgorithmic(c, norm16); 733 norm16=getNorm16(c); 734 } 735 } 736 if(norm16<=minYesNo || isHangulLVT(norm16)) { 737 // no decomposition or Hangul syllable, all zeros 738 return 0; 739 } 740 // c decomposes, get everything from the variable-length extra data 741 int mapping=norm16>>OFFSET_SHIFT; 742 int firstUnit=extraData.charAt(mapping); 743 int fcd16=firstUnit>>8; // tccc 744 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 745 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 746 } 747 return fcd16; 748 } 749 750 /** 751 * Gets the decomposition for one code point. 752 * @param c code point 753 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 754 */ 755 public String getDecomposition(int c) { 756 int norm16; 757 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 758 // c does not decompose 759 return null; 760 } 761 int decomp = -1; 762 if(isDecompNoAlgorithmic(norm16)) { 763 // Maps to an isCompYesAndZeroCC. 764 decomp=c=mapAlgorithmic(c, norm16); 765 // The mapping might decompose further. 766 norm16 = getNorm16(c); 767 } 768 if (norm16 < minYesNo) { 769 if(decomp<0) { 770 return null; 771 } else { 772 return UTF16.valueOf(decomp); 773 } 774 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 775 // Hangul syllable: decompose algorithmically 776 StringBuilder buffer=new StringBuilder(); 777 Hangul.decompose(c, buffer); 778 return buffer.toString(); 779 } 780 // c decomposes, get everything from the variable-length extra data 781 int mapping=norm16>>OFFSET_SHIFT; 782 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 783 return extraData.substring(mapping, mapping+length); 784 } 785 786 /** 787 * Gets the raw decomposition for one code point. 788 * @param c code point 789 * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 790 */ 791 public String getRawDecomposition(int c) { 792 int norm16; 793 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 794 // c does not decompose 795 return null; 796 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 797 // Hangul syllable: decompose algorithmically 798 StringBuilder buffer=new StringBuilder(); 799 Hangul.getRawDecomposition(c, buffer); 800 return buffer.toString(); 801 } else if(isDecompNoAlgorithmic(norm16)) { 802 return UTF16.valueOf(mapAlgorithmic(c, norm16)); 803 } 804 // c decomposes, get everything from the variable-length extra data 805 int mapping=norm16>>OFFSET_SHIFT; 806 int firstUnit=extraData.charAt(mapping); 807 int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 808 if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 809 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 810 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 811 int rawMapping=mapping-((firstUnit>>7)&1)-1; 812 char rm0=extraData.charAt(rawMapping); 813 if(rm0<=MAPPING_LENGTH_MASK) { 814 return extraData.substring(rawMapping-rm0, rawMapping); 815 } else { 816 // Copy the normal mapping and replace its first two code units with rm0. 817 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 818 mapping+=1+2; // skip over the firstUnit and the first two mapping code units 819 return buffer.append(extraData, mapping, mapping+mLength-2).toString(); 820 } 821 } else { 822 mapping+=1; // skip over the firstUnit 823 return extraData.substring(mapping, mapping+mLength); 824 } 825 } 826 827 /** 828 * Returns true if code point c starts a canonical-iterator string segment. 829 * <b>{@link #ensureCanonIterData()} must have been called before this method, 830 * or else this method will crash.</b> 831 * @param c A Unicode code point. 832 * @return true if c starts a canonical-iterator string segment. 833 */ 834 public boolean isCanonSegmentStarter(int c) { 835 return canonIterData.get(c)>=0; 836 } 837 /** 838 * Returns true if there are characters whose decomposition starts with c. 839 * If so, then the set is cleared and then filled with those characters. 840 * <b>{@link #ensureCanonIterData()} must have been called before this method, 841 * or else this method will crash.</b> 842 * @param c A Unicode code point. 843 * @param set A UnicodeSet to receive the characters whose decompositions 844 * start with c, if there are any. 845 * @return true if there are characters whose decomposition starts with c. 846 */ 847 public boolean getCanonStartSet(int c, UnicodeSet set) { 848 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 849 if(canonValue==0) { 850 return false; 851 } 852 set.clear(); 853 int value=canonValue&CANON_VALUE_MASK; 854 if((canonValue&CANON_HAS_SET)!=0) { 855 set.addAll(canonStartSets.get(value)); 856 } else if(value!=0) { 857 set.add(value); 858 } 859 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 860 int norm16=getNorm16(c); 861 if(norm16==JAMO_L) { 862 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 863 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 864 } else { 865 addComposites(getCompositionsList(norm16), set); 866 } 867 } 868 return true; 869 } 870 871 // Fixed norm16 values. 872 public static final int MIN_YES_YES_WITH_CC=0xfe02; 873 public static final int JAMO_VT=0xfe00; 874 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 875 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 876 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 877 878 // norm16 bit 0 is comp-boundary-after. 879 public static final int HAS_COMP_BOUNDARY_AFTER=1; 880 public static final int OFFSET_SHIFT=1; 881 882 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 883 // tccc (0, 1, >1) for quick FCC boundary-after tests. 884 public static final int DELTA_TCCC_0=0; 885 public static final int DELTA_TCCC_1=2; 886 public static final int DELTA_TCCC_GT_1=4; 887 public static final int DELTA_TCCC_MASK=6; 888 public static final int DELTA_SHIFT=3; 889 890 public static final int MAX_DELTA=0x40; 891 892 // Byte offsets from the start of the data, after the generic header. 893 public static final int IX_NORM_TRIE_OFFSET=0; 894 public static final int IX_EXTRA_DATA_OFFSET=1; 895 public static final int IX_SMALL_FCD_OFFSET=2; 896 public static final int IX_RESERVED3_OFFSET=3; 897 public static final int IX_TOTAL_SIZE=7; 898 899 // Code point thresholds for quick check codes. 900 public static final int IX_MIN_DECOMP_NO_CP=8; 901 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 902 903 // Norm16 value thresholds for quick check combinations and types of extra data. 904 905 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 906 public static final int IX_MIN_YES_NO=10; 907 /** Mappings are comp-normalized. */ 908 public static final int IX_MIN_NO_NO=11; 909 public static final int IX_LIMIT_NO_NO=12; 910 public static final int IX_MIN_MAYBE_YES=13; 911 912 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 913 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 914 /** Mappings are not comp-normalized but have a comp boundary before. */ 915 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 916 /** Mappings do not have a comp boundary before. */ 917 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 918 /** Mappings to the empty string. */ 919 public static final int IX_MIN_NO_NO_EMPTY=17; 920 921 public static final int IX_MIN_LCCC_CP=18; 922 public static final int IX_COUNT=20; 923 924 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 925 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 926 // unused bit 0x20; 927 public static final int MAPPING_LENGTH_MASK=0x1f; 928 929 public static final int COMP_1_LAST_TUPLE=0x8000; 930 public static final int COMP_1_TRIPLE=1; 931 public static final int COMP_1_TRAIL_LIMIT=0x3400; 932 public static final int COMP_1_TRAIL_MASK=0x7ffe; 933 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 934 public static final int COMP_2_TRAIL_SHIFT=6; 935 public static final int COMP_2_TRAIL_MASK=0xffc0; 936 937 // higher-level functionality ------------------------------------------ *** 938 939 // NFD without an NFD Normalizer2 instance. 940 public Appendable decompose(CharSequence s, StringBuilder dest) { 941 decompose(s, 0, s.length(), dest, s.length()); 942 return dest; 943 } 944 /** 945 * Decomposes s[src, limit[ and writes the result to dest. 946 * limit can be NULL if src is NUL-terminated. 947 * destLengthEstimate is the initial dest buffer capacity and can be -1. 948 */ 949 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 950 int destLengthEstimate) { 951 if(destLengthEstimate<0) { 952 destLengthEstimate=limit-src; 953 } 954 dest.setLength(0); 955 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 956 decompose(s, src, limit, buffer); 957 } 958 959 // Dual functionality: 960 // buffer!=NULL: normalize 961 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 962 public int decompose(CharSequence s, int src, int limit, 963 ReorderingBuffer buffer) { 964 int minNoCP=minDecompNoCP; 965 966 int prevSrc; 967 int c=0; 968 int norm16=0; 969 970 // only for quick check 971 int prevBoundary=src; 972 int prevCC=0; 973 974 for(;;) { 975 // count code units below the minimum or with irrelevant data for the quick check 976 for(prevSrc=src; src!=limit;) { 977 if( (c=s.charAt(src))<minNoCP || 978 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 979 ) { 980 ++src; 981 } else if(!UTF16.isSurrogate((char)c)) { 982 break; 983 } else { 984 char c2; 985 if(UTF16Plus.isSurrogateLead(c)) { 986 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 987 c=Character.toCodePoint((char)c, c2); 988 } 989 } else /* trail surrogate */ { 990 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 991 --src; 992 c=Character.toCodePoint(c2, (char)c); 993 } 994 } 995 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 996 src+=Character.charCount(c); 997 } else { 998 break; 999 } 1000 } 1001 } 1002 // copy these code units all at once 1003 if(src!=prevSrc) { 1004 if(buffer!=null) { 1005 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1006 } else { 1007 prevCC=0; 1008 prevBoundary=src; 1009 } 1010 } 1011 if(src==limit) { 1012 break; 1013 } 1014 1015 // Check one above-minimum, relevant code point. 1016 src+=Character.charCount(c); 1017 if(buffer!=null) { 1018 decompose(c, norm16, buffer); 1019 } else { 1020 if(isDecompYes(norm16)) { 1021 int cc=getCCFromYesOrMaybe(norm16); 1022 if(prevCC<=cc || cc==0) { 1023 prevCC=cc; 1024 if(cc<=1) { 1025 prevBoundary=src; 1026 } 1027 continue; 1028 } 1029 } 1030 return prevBoundary; // "no" or cc out of order 1031 } 1032 } 1033 return src; 1034 } 1035 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 1036 int limit=s.length(); 1037 if(limit==0) { 1038 return; 1039 } 1040 if(doDecompose) { 1041 decompose(s, 0, limit, buffer); 1042 return; 1043 } 1044 // Just merge the strings at the boundary. 1045 int c=Character.codePointAt(s, 0); 1046 int src=0; 1047 int firstCC, prevCC, cc; 1048 firstCC=prevCC=cc=getCC(getNorm16(c)); 1049 while(cc!=0) { 1050 prevCC=cc; 1051 src+=Character.charCount(c); 1052 if(src>=limit) { 1053 break; 1054 } 1055 c=Character.codePointAt(s, src); 1056 cc=getCC(getNorm16(c)); 1057 }; 1058 buffer.append(s, 0, src, firstCC, prevCC); 1059 buffer.append(s, src, limit); 1060 } 1061 1062 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 1063 // doCompose: normalize 1064 // !doCompose: isNormalized (buffer must be empty and initialized) 1065 public boolean compose(CharSequence s, int src, int limit, 1066 boolean onlyContiguous, 1067 boolean doCompose, 1068 ReorderingBuffer buffer) { 1069 int prevBoundary=src; 1070 int minNoMaybeCP=minCompNoMaybeCP; 1071 1072 for (;;) { 1073 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1074 // or with (compYes && ccc==0) properties. 1075 int prevSrc; 1076 int c = 0; 1077 int norm16 = 0; 1078 for (;;) { 1079 if (src == limit) { 1080 if (prevBoundary != limit && doCompose) { 1081 buffer.append(s, prevBoundary, limit); 1082 } 1083 return true; 1084 } 1085 if( (c=s.charAt(src))<minNoMaybeCP || 1086 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1087 ) { 1088 ++src; 1089 } else { 1090 prevSrc = src++; 1091 if(!UTF16.isSurrogate((char)c)) { 1092 break; 1093 } else { 1094 char c2; 1095 if(UTF16Plus.isSurrogateLead(c)) { 1096 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 1097 ++src; 1098 c=Character.toCodePoint((char)c, c2); 1099 } 1100 } else /* trail surrogate */ { 1101 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 1102 --prevSrc; 1103 c=Character.toCodePoint(c2, (char)c); 1104 } 1105 } 1106 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 1107 break; 1108 } 1109 } 1110 } 1111 } 1112 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1113 // The current character is either a "noNo" (has a mapping) 1114 // or a "maybeYes" (combines backward) 1115 // or a "yesYes" with ccc!=0. 1116 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1117 1118 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 1119 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 1120 if (!doCompose) { 1121 return false; 1122 } 1123 // Fast path for mapping a character that is immediately surrounded by boundaries. 1124 // In this case, we need not decompose around the current character. 1125 if (isDecompNoAlgorithmic(norm16)) { 1126 // Maps to a single isCompYesAndZeroCC character 1127 // which also implies hasCompBoundaryBefore. 1128 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1129 hasCompBoundaryBefore(s, src, limit)) { 1130 if (prevBoundary != prevSrc) { 1131 buffer.append(s, prevBoundary, prevSrc); 1132 } 1133 buffer.append(mapAlgorithmic(c, norm16), 0); 1134 prevBoundary = src; 1135 continue; 1136 } 1137 } else if (norm16 < minNoNoCompBoundaryBefore) { 1138 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 1139 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1140 hasCompBoundaryBefore(s, src, limit)) { 1141 if (prevBoundary != prevSrc) { 1142 buffer.append(s, prevBoundary, prevSrc); 1143 } 1144 int mapping = norm16 >> OFFSET_SHIFT; 1145 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 1146 buffer.append(extraData, mapping, mapping + length); 1147 prevBoundary = src; 1148 continue; 1149 } 1150 } else if (norm16 >= minNoNoEmpty) { 1151 // The current character maps to nothing. 1152 // Simply omit it from the output if there is a boundary before _or_ after it. 1153 // The character itself implies no boundaries. 1154 if (hasCompBoundaryBefore(s, src, limit) || 1155 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 1156 if (prevBoundary != prevSrc) { 1157 buffer.append(s, prevBoundary, prevSrc); 1158 } 1159 prevBoundary = src; 1160 continue; 1161 } 1162 } 1163 // Other "noNo" type, or need to examine more text around this character: 1164 // Fall through to the slow path. 1165 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 1166 char prev=s.charAt(prevSrc-1); 1167 if(c<Hangul.JAMO_T_BASE) { 1168 // The current character is a Jamo Vowel, 1169 // compose with previous Jamo L and following Jamo T. 1170 char l = (char)(prev-Hangul.JAMO_L_BASE); 1171 if(l<Hangul.JAMO_L_COUNT) { 1172 if (!doCompose) { 1173 return false; 1174 } 1175 int t; 1176 if (src != limit && 1177 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 1178 t < Hangul.JAMO_T_COUNT) { 1179 // The next character is a Jamo T. 1180 ++src; 1181 } else if (hasCompBoundaryBefore(s, src, limit)) { 1182 // No Jamo T follows, not even via decomposition. 1183 t = 0; 1184 } else { 1185 t = -1; 1186 } 1187 if (t >= 0) { 1188 int syllable = Hangul.HANGUL_BASE + 1189 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 1190 Hangul.JAMO_T_COUNT + t; 1191 --prevSrc; // Replace the Jamo L as well. 1192 if (prevBoundary != prevSrc) { 1193 buffer.append(s, prevBoundary, prevSrc); 1194 } 1195 buffer.append((char)syllable); 1196 prevBoundary = src; 1197 continue; 1198 } 1199 // If we see L+V+x where x!=T then we drop to the slow path, 1200 // decompose and recompose. 1201 // This is to deal with NFKC finding normal L and V but a 1202 // compatibility variant of a T. 1203 // We need to either fully compose that combination here 1204 // (which would complicate the code and may not work with strange custom data) 1205 // or use the slow path. 1206 } 1207 } else if (Hangul.isHangulLV(prev)) { 1208 // The current character is a Jamo Trailing consonant, 1209 // compose with previous Hangul LV that does not contain a Jamo T. 1210 if (!doCompose) { 1211 return false; 1212 } 1213 int syllable = prev + c - Hangul.JAMO_T_BASE; 1214 --prevSrc; // Replace the Hangul LV as well. 1215 if (prevBoundary != prevSrc) { 1216 buffer.append(s, prevBoundary, prevSrc); 1217 } 1218 buffer.append((char)syllable); 1219 prevBoundary = src; 1220 continue; 1221 } 1222 // No matching context, or may need to decompose surrounding text first: 1223 // Fall through to the slow path. 1224 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 1225 // One or more combining marks that do not combine-back: 1226 // Check for canonical order, copy unchanged if ok and 1227 // if followed by a character with a boundary-before. 1228 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 1229 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 1230 // Fails FCD test, need to decompose and contiguously recompose. 1231 if (!doCompose) { 1232 return false; 1233 } 1234 } else { 1235 // If !onlyContiguous (not FCC), then we ignore the tccc of 1236 // the previous character which passed the quick check "yes && ccc==0" test. 1237 int n16; 1238 for (;;) { 1239 if (src == limit) { 1240 if (doCompose) { 1241 buffer.append(s, prevBoundary, limit); 1242 } 1243 return true; 1244 } 1245 int prevCC = cc; 1246 c = Character.codePointAt(s, src); 1247 n16 = normTrie.get(c); 1248 if (n16 >= MIN_YES_YES_WITH_CC) { 1249 cc = getCCFromNormalYesOrMaybe(n16); 1250 if (prevCC > cc) { 1251 if (!doCompose) { 1252 return false; 1253 } 1254 break; 1255 } 1256 } else { 1257 break; 1258 } 1259 src += Character.charCount(c); 1260 } 1261 // p is after the last in-order combining mark. 1262 // If there is a boundary here, then we continue with no change. 1263 if (norm16HasCompBoundaryBefore(n16)) { 1264 if (isCompYesAndZeroCC(n16)) { 1265 src += Character.charCount(c); 1266 } 1267 continue; 1268 } 1269 // Use the slow path. There is no boundary in [prevSrc, src[. 1270 } 1271 } 1272 1273 // Slow path: Find the nearest boundaries around the current character, 1274 // decompose and recompose. 1275 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 1276 c = Character.codePointBefore(s, prevSrc); 1277 norm16 = normTrie.get(c); 1278 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1279 prevSrc -= Character.charCount(c); 1280 } 1281 } 1282 if (doCompose && prevBoundary != prevSrc) { 1283 buffer.append(s, prevBoundary, prevSrc); 1284 } 1285 int recomposeStartIndex=buffer.length(); 1286 // We know there is not a boundary here. 1287 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 1288 buffer); 1289 // Decompose until the next boundary. 1290 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 1291 buffer); 1292 recompose(buffer, recomposeStartIndex, onlyContiguous); 1293 if(!doCompose) { 1294 if(!buffer.equals(s, prevSrc, src)) { 1295 return false; 1296 } 1297 buffer.remove(); 1298 } 1299 prevBoundary=src; 1300 } 1301 } 1302 1303 /** 1304 * Very similar to compose(): Make the same changes in both places if relevant. 1305 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 1306 * !doSpan: quickCheck 1307 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 1308 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 1309 * then the quick check result is "no" 1310 */ 1311 public int composeQuickCheck(CharSequence s, int src, int limit, 1312 boolean onlyContiguous, boolean doSpan) { 1313 int qcResult=0; 1314 int prevBoundary=src; 1315 int minNoMaybeCP=minCompNoMaybeCP; 1316 1317 for(;;) { 1318 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1319 // or with (compYes && ccc==0) properties. 1320 int prevSrc; 1321 int c = 0; 1322 int norm16 = 0; 1323 for (;;) { 1324 if(src==limit) { 1325 return (src<<1)|qcResult; // "yes" or "maybe" 1326 } 1327 if( (c=s.charAt(src))<minNoMaybeCP || 1328 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1329 ) { 1330 ++src; 1331 } else { 1332 prevSrc = src++; 1333 if(!UTF16.isSurrogate((char)c)) { 1334 break; 1335 } else { 1336 char c2; 1337 if(UTF16Plus.isSurrogateLead(c)) { 1338 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 1339 ++src; 1340 c=Character.toCodePoint((char)c, c2); 1341 } 1342 } else /* trail surrogate */ { 1343 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 1344 --prevSrc; 1345 c=Character.toCodePoint(c2, (char)c); 1346 } 1347 } 1348 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 1349 break; 1350 } 1351 } 1352 } 1353 } 1354 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1355 // The current character is either a "noNo" (has a mapping) 1356 // or a "maybeYes" (combines backward) 1357 // or a "yesYes" with ccc!=0. 1358 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1359 1360 int prevNorm16 = INERT; 1361 if (prevBoundary != prevSrc) { 1362 prevBoundary = prevSrc; 1363 if (!norm16HasCompBoundaryBefore(norm16)) { 1364 c = Character.codePointBefore(s, prevSrc); 1365 int n16 = getNorm16(c); 1366 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1367 prevBoundary -= Character.charCount(c); 1368 prevNorm16 = n16; 1369 } 1370 } 1371 } 1372 1373 if(isMaybeOrNonZeroCC(norm16)) { 1374 int cc=getCCFromYesOrMaybe(norm16); 1375 if (onlyContiguous /* FCC */ && cc != 0 && 1376 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1377 // The [prevBoundary..prevSrc[ character 1378 // passed the quick check "yes && ccc==0" test 1379 // but is out of canonical order with the current combining mark. 1380 } else { 1381 // If !onlyContiguous (not FCC), then we ignore the tccc of 1382 // the previous character which passed the quick check "yes && ccc==0" test. 1383 for (;;) { 1384 if (norm16 < MIN_YES_YES_WITH_CC) { 1385 if (!doSpan) { 1386 qcResult = 1; 1387 } else { 1388 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1389 } 1390 } 1391 if (src == limit) { 1392 return (src<<1) | qcResult; // "yes" or "maybe" 1393 } 1394 int prevCC = cc; 1395 c = Character.codePointAt(s, src); 1396 norm16 = getNorm16(c); 1397 if (isMaybeOrNonZeroCC(norm16)) { 1398 cc = getCCFromYesOrMaybe(norm16); 1399 if (!(prevCC <= cc || cc == 0)) { 1400 break; 1401 } 1402 } else { 1403 break; 1404 } 1405 src += Character.charCount(c); 1406 } 1407 // src is after the last in-order combining mark. 1408 if (isCompYesAndZeroCC(norm16)) { 1409 prevBoundary = src; 1410 src += Character.charCount(c); 1411 continue; 1412 } 1413 } 1414 } 1415 return prevBoundary<<1; // "no" 1416 } 1417 } 1418 public void composeAndAppend(CharSequence s, 1419 boolean doCompose, 1420 boolean onlyContiguous, 1421 ReorderingBuffer buffer) { 1422 int src=0, limit=s.length(); 1423 if(!buffer.isEmpty()) { 1424 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1425 if(0!=firstStarterInSrc) { 1426 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1427 buffer.length(), onlyContiguous); 1428 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1429 firstStarterInSrc+16); 1430 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1431 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1432 middle.append(s, 0, firstStarterInSrc); 1433 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1434 src=firstStarterInSrc; 1435 } 1436 } 1437 if(doCompose) { 1438 compose(s, src, limit, onlyContiguous, true, buffer); 1439 } else { 1440 buffer.append(s, src, limit); 1441 } 1442 } 1443 // Dual functionality: 1444 // buffer!=NULL: normalize 1445 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1446 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1447 // Note: In this function we use buffer->appendZeroCC() because we track 1448 // the lead and trail combining classes here, rather than leaving it to 1449 // the ReorderingBuffer. 1450 // The exception is the call to decomposeShort() which uses the buffer 1451 // in the normal way. 1452 1453 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1454 // Similar to the prevBoundary in the compose() implementation. 1455 int prevBoundary=src; 1456 int prevSrc; 1457 int c=0; 1458 int prevFCD16=0; 1459 int fcd16=0; 1460 1461 for(;;) { 1462 // count code units with lccc==0 1463 for(prevSrc=src; src!=limit;) { 1464 if((c=s.charAt(src))<minLcccCP) { 1465 prevFCD16=~c; 1466 ++src; 1467 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1468 prevFCD16=0; 1469 ++src; 1470 } else { 1471 if(UTF16.isSurrogate((char)c)) { 1472 char c2; 1473 if(UTF16Plus.isSurrogateLead(c)) { 1474 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1475 c=Character.toCodePoint((char)c, c2); 1476 } 1477 } else /* trail surrogate */ { 1478 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1479 --src; 1480 c=Character.toCodePoint(c2, (char)c); 1481 } 1482 } 1483 } 1484 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1485 prevFCD16=fcd16; 1486 src+=Character.charCount(c); 1487 } else { 1488 break; 1489 } 1490 } 1491 } 1492 // copy these code units all at once 1493 if(src!=prevSrc) { 1494 if(src==limit) { 1495 if(buffer!=null) { 1496 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1497 } 1498 break; 1499 } 1500 prevBoundary=src; 1501 // We know that the previous character's lccc==0. 1502 if(prevFCD16<0) { 1503 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1504 int prev=~prevFCD16; 1505 if(prev<minDecompNoCP) { 1506 prevFCD16=0; 1507 } else { 1508 prevFCD16=getFCD16FromNormData(prev); 1509 if(prevFCD16>1) { 1510 --prevBoundary; 1511 } 1512 } 1513 } else { 1514 int p=src-1; 1515 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1516 Character.isHighSurrogate(s.charAt(p-1)) 1517 ) { 1518 --p; 1519 // Need to fetch the previous character's FCD value because 1520 // prevFCD16 was just for the trail surrogate code point. 1521 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1522 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1523 } 1524 if(prevFCD16>1) { 1525 prevBoundary=p; 1526 } 1527 } 1528 if(buffer!=null) { 1529 // The last lccc==0 character is excluded from the 1530 // flush-and-append call in case it needs to be modified. 1531 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1532 buffer.append(s, prevBoundary, src); 1533 } 1534 // The start of the current character (c). 1535 prevSrc=src; 1536 } else if(src==limit) { 1537 break; 1538 } 1539 1540 src+=Character.charCount(c); 1541 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1542 // Check for proper order, and decompose locally if necessary. 1543 if((prevFCD16&0xff)<=(fcd16>>8)) { 1544 // proper order: prev tccc <= current lccc 1545 if((fcd16&0xff)<=1) { 1546 prevBoundary=src; 1547 } 1548 if(buffer!=null) { 1549 buffer.appendZeroCC(c); 1550 } 1551 prevFCD16=fcd16; 1552 continue; 1553 } else if(buffer==null) { 1554 return prevBoundary; // quick check "no" 1555 } else { 1556 /* 1557 * Back out the part of the source that we copied or appended 1558 * already but is now going to be decomposed. 1559 * prevSrc is set to after what was copied/appended. 1560 */ 1561 buffer.removeSuffix(prevSrc-prevBoundary); 1562 /* 1563 * Find the part of the source that needs to be decomposed, 1564 * up to the next safe boundary. 1565 */ 1566 src=findNextFCDBoundary(s, src, limit); 1567 /* 1568 * The source text does not fulfill the conditions for FCD. 1569 * Decompose and reorder a limited piece of the text. 1570 */ 1571 decomposeShort(s, prevBoundary, src, false, false, buffer); 1572 prevBoundary=src; 1573 prevFCD16=0; 1574 } 1575 } 1576 return src; 1577 } 1578 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 1579 int src=0, limit=s.length(); 1580 if(!buffer.isEmpty()) { 1581 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 1582 if(0!=firstBoundaryInSrc) { 1583 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 1584 buffer.length()); 1585 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 1586 firstBoundaryInSrc+16); 1587 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 1588 buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 1589 middle.append(s, 0, firstBoundaryInSrc); 1590 makeFCD(middle, 0, middle.length(), buffer); 1591 src=firstBoundaryInSrc; 1592 } 1593 } 1594 if(doMakeFCD) { 1595 makeFCD(s, src, limit, buffer); 1596 } else { 1597 buffer.append(s, src, limit); 1598 } 1599 } 1600 1601 public boolean hasDecompBoundaryBefore(int c) { 1602 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1603 norm16HasDecompBoundaryBefore(getNorm16(c)); 1604 } 1605 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1606 if (norm16 < minNoNoCompNoMaybeCC) { 1607 return true; 1608 } 1609 if (norm16 >= limitNoNo) { 1610 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1611 } 1612 // c decomposes, get everything from the variable-length extra data 1613 int mapping=norm16>>OFFSET_SHIFT; 1614 int firstUnit=extraData.charAt(mapping); 1615 // true if leadCC==0 (hasFCDBoundaryBefore()) 1616 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1617 } 1618 public boolean hasDecompBoundaryAfter(int c) { 1619 if (c < minDecompNoCP) { 1620 return true; 1621 } 1622 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1623 return true; 1624 } 1625 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1626 } 1627 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1628 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1629 return true; 1630 } 1631 if (norm16 >= limitNoNo) { 1632 if (isMaybeOrNonZeroCC(norm16)) { 1633 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1634 } 1635 // Maps to an isCompYesAndZeroCC. 1636 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1637 } 1638 // c decomposes, get everything from the variable-length extra data 1639 int mapping=norm16>>OFFSET_SHIFT; 1640 int firstUnit=extraData.charAt(mapping); 1641 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1642 // fcd16<=1 || trailCC==0 1643 if(firstUnit>0x1ff) { 1644 return false; // trailCC>1 1645 } 1646 if(firstUnit<=0xff) { 1647 return true; // trailCC==0 1648 } 1649 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1650 // true if leadCC==0 (hasFCDBoundaryBefore()) 1651 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1652 } 1653 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1654 1655 public boolean hasCompBoundaryBefore(int c) { 1656 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1657 } 1658 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1659 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1660 } 1661 public boolean isCompInert(int c, boolean onlyContiguous) { 1662 int norm16=getNorm16(c); 1663 return isCompYesAndZeroCC(norm16) && 1664 (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1665 (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff); 1666 } 1667 1668 public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); } 1669 public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); } 1670 public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 1671 1672 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1673 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1674 private static boolean isInert(int norm16) { return norm16==INERT; } 1675 private static boolean isJamoL(int norm16) { return norm16==JAMO_L; } 1676 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1677 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } 1678 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } 1679 private boolean isHangulLVT(int norm16) { 1680 return norm16==hangulLVT(); 1681 } 1682 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1683 // UBool isCompYes(uint16_t norm16) const { 1684 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1685 // } 1686 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1687 // return norm16<minNoNo || minMaybeYes<=norm16; 1688 // } 1689 // private boolean hasZeroCCFromDecompYes(int norm16) { 1690 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1691 // } 1692 private boolean isDecompYesAndZeroCC(int norm16) { 1693 return norm16<minYesNo || 1694 norm16==JAMO_VT || 1695 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1696 } 1697 /** 1698 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1699 * the MaybeYes which combine-forward and have ccc=0. 1700 * (Standard Unicode 10 normalization does not have such characters.) 1701 */ 1702 private boolean isMostDecompYesAndZeroCC(int norm16) { 1703 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1704 } 1705 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1706 1707 // For use with isCompYes(). 1708 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1709 // static uint8_t getCCFromYes(uint16_t norm16) { 1710 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1711 // } 1712 private int getCCFromNoNo(int norm16) { 1713 int mapping=norm16>>OFFSET_SHIFT; 1714 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1715 return extraData.charAt(mapping-1)&0xff; 1716 } else { 1717 return 0; 1718 } 1719 } 1720 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1721 if(norm16<=minYesNo) { 1722 return 0; // yesYes and Hangul LV have ccc=tccc=0 1723 } else { 1724 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1725 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1726 } 1727 } 1728 1729 // Requires algorithmic-NoNo. 1730 private int mapAlgorithmic(int c, int norm16) { 1731 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1732 } 1733 1734 // Requires minYesNo<norm16<limitNoNo. 1735 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1736 1737 /** 1738 * @return index into maybeYesCompositions, or -1 1739 */ 1740 private int getCompositionsListForDecompYes(int norm16) { 1741 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1742 return -1; 1743 } else { 1744 if((norm16-=minMaybeYes)<0) { 1745 // norm16<minMaybeYes: index into extraData which is a substring at 1746 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1747 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1748 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1749 } 1750 return norm16>>OFFSET_SHIFT; 1751 } 1752 } 1753 /** 1754 * @return index into maybeYesCompositions 1755 */ 1756 private int getCompositionsListForComposite(int norm16) { 1757 // A composite has both mapping & compositions list. 1758 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1759 int firstUnit=maybeYesCompositions.charAt(list); 1760 return list+ // mapping in maybeYesCompositions 1761 1+ // +1 to skip the first unit with the mapping length 1762 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1763 } 1764 private int getCompositionsListForMaybe(int norm16) { 1765 // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES 1766 return (norm16-minMaybeYes)>>OFFSET_SHIFT; 1767 } 1768 /** 1769 * @param c code point must have compositions 1770 * @return index into maybeYesCompositions 1771 */ 1772 private int getCompositionsList(int norm16) { 1773 return isDecompYes(norm16) ? 1774 getCompositionsListForDecompYes(norm16) : 1775 getCompositionsListForComposite(norm16); 1776 } 1777 1778 // Decompose a short piece of text which is likely to contain characters that 1779 // fail the quick check loop and/or where the quick check loop's overhead 1780 // is unlikely to be amortized. 1781 // Called by the compose() and makeFCD() implementations. 1782 // Public in Java for collation implementation code. 1783 private int decomposeShort( 1784 CharSequence s, int src, int limit, 1785 boolean stopAtCompBoundary, boolean onlyContiguous, 1786 ReorderingBuffer buffer) { 1787 while(src<limit) { 1788 int c=Character.codePointAt(s, src); 1789 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1790 return src; 1791 } 1792 int norm16 = getNorm16(c); 1793 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1794 return src; 1795 } 1796 src+=Character.charCount(c); 1797 decompose(c, norm16, buffer); 1798 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1799 return src; 1800 } 1801 } 1802 return src; 1803 } 1804 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1805 // get the decomposition and the lead and trail cc's 1806 if (norm16 >= limitNoNo) { 1807 if (isMaybeOrNonZeroCC(norm16)) { 1808 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1809 return; 1810 } 1811 // Maps to an isCompYesAndZeroCC. 1812 c=mapAlgorithmic(c, norm16); 1813 norm16=getNorm16(c); 1814 } 1815 if (norm16 < minYesNo) { 1816 // c does not decompose 1817 buffer.append(c, 0); 1818 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1819 // Hangul syllable: decompose algorithmically 1820 Hangul.decompose(c, buffer); 1821 } else { 1822 // c decomposes, get everything from the variable-length extra data 1823 int mapping=norm16>>OFFSET_SHIFT; 1824 int firstUnit=extraData.charAt(mapping); 1825 int length=firstUnit&MAPPING_LENGTH_MASK; 1826 int leadCC, trailCC; 1827 trailCC=firstUnit>>8; 1828 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1829 leadCC=extraData.charAt(mapping-1)>>8; 1830 } else { 1831 leadCC=0; 1832 } 1833 ++mapping; // skip over the firstUnit 1834 buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); 1835 } 1836 } 1837 1838 /** 1839 * Finds the recomposition result for 1840 * a forward-combining "lead" character, 1841 * specified with a pointer to its compositions list, 1842 * and a backward-combining "trail" character. 1843 * 1844 * <p>If the lead and trail characters combine, then this function returns 1845 * the following "compositeAndFwd" value: 1846 * <pre> 1847 * Bits 21..1 composite character 1848 * Bit 0 set if the composite is a forward-combining starter 1849 * </pre> 1850 * otherwise it returns -1. 1851 * 1852 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1853 * encoded as either pairs or triples of 16-bit units. 1854 * The last entry has the high bit of its first unit set. 1855 * 1856 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1857 * A linear search is used. 1858 * 1859 * <p>See normalizer2impl.h for a more detailed description 1860 * of the compositions list format. 1861 */ 1862 private static int combine(String compositions, int list, int trail) { 1863 int key1, firstUnit; 1864 if(trail<COMP_1_TRAIL_LIMIT) { 1865 // trail character is 0..33FF 1866 // result entry may have 2 or 3 units 1867 key1=(trail<<1); 1868 while(key1>(firstUnit=compositions.charAt(list))) { 1869 list+=2+(firstUnit&COMP_1_TRIPLE); 1870 } 1871 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1872 if((firstUnit&COMP_1_TRIPLE)!=0) { 1873 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1874 } else { 1875 return compositions.charAt(list+1); 1876 } 1877 } 1878 } else { 1879 // trail character is 3400..10FFFF 1880 // result entry has 3 units 1881 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1882 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1883 int secondUnit; 1884 for(;;) { 1885 if(key1>(firstUnit=compositions.charAt(list))) { 1886 list+=2+(firstUnit&COMP_1_TRIPLE); 1887 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1888 if(key2>(secondUnit=compositions.charAt(list+1))) { 1889 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1890 break; 1891 } else { 1892 list+=3; 1893 } 1894 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1895 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1896 } else { 1897 break; 1898 } 1899 } else { 1900 break; 1901 } 1902 } 1903 } 1904 return -1; 1905 } 1906 /** 1907 * @param list some character's compositions list 1908 * @param set recursively receives the composites from these compositions 1909 */ 1910 private void addComposites(int list, UnicodeSet set) { 1911 int firstUnit, compositeAndFwd; 1912 do { 1913 firstUnit=maybeYesCompositions.charAt(list); 1914 if((firstUnit&COMP_1_TRIPLE)==0) { 1915 compositeAndFwd=maybeYesCompositions.charAt(list+1); 1916 list+=2; 1917 } else { 1918 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 1919 maybeYesCompositions.charAt(list+2); 1920 list+=3; 1921 } 1922 int composite=compositeAndFwd>>1; 1923 if((compositeAndFwd&1)!=0) { 1924 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 1925 } 1926 set.add(composite); 1927 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 1928 } 1929 /* 1930 * Recomposes the buffer text starting at recomposeStartIndex 1931 * (which is in NFD - decomposed and canonically ordered), 1932 * and truncates the buffer contents. 1933 * 1934 * Note that recomposition never lengthens the text: 1935 * Any character consists of either one or two code units; 1936 * a composition may contain at most one more code unit than the original starter, 1937 * while the combining mark that is removed has at least one code unit. 1938 */ 1939 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1940 boolean onlyContiguous) { 1941 StringBuilder sb=buffer.getStringBuilder(); 1942 int p=recomposeStartIndex; 1943 if(p==sb.length()) { 1944 return; 1945 } 1946 1947 int starter, pRemove; 1948 int compositionsList; 1949 int c, compositeAndFwd; 1950 int norm16; 1951 int cc, prevCC; 1952 boolean starterIsSupplementary; 1953 1954 // Some of the following variables are not used until we have a forward-combining starter 1955 // and are only initialized now to avoid compiler warnings. 1956 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1957 starter=-1; 1958 starterIsSupplementary=false; 1959 prevCC=0; 1960 1961 for(;;) { 1962 c=sb.codePointAt(p); 1963 p+=Character.charCount(c); 1964 norm16=getNorm16(c); 1965 cc=getCCFromYesOrMaybe(norm16); 1966 if( // this character combines backward and 1967 isMaybe(norm16) && 1968 // we have seen a starter that combines forward and 1969 compositionsList>=0 && 1970 // the backward-combining character is not blocked 1971 (prevCC<cc || prevCC==0) 1972 ) { 1973 if(isJamoVT(norm16)) { 1974 // c is a Jamo V/T, see if we can compose it with the previous character. 1975 if(c<Hangul.JAMO_T_BASE) { 1976 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1977 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1978 if(prev<Hangul.JAMO_L_COUNT) { 1979 pRemove=p-1; 1980 char syllable=(char) 1981 (Hangul.HANGUL_BASE+ 1982 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1983 Hangul.JAMO_T_COUNT); 1984 char t; 1985 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1986 ++p; 1987 syllable+=t; // The next character was a Jamo T. 1988 } 1989 sb.setCharAt(starter, syllable); 1990 // remove the Jamo V/T 1991 sb.delete(pRemove, p); 1992 p=pRemove; 1993 } 1994 } 1995 /* 1996 * No "else" for Jamo T: 1997 * Since the input is in NFD, there are no Hangul LV syllables that 1998 * a Jamo T could combine with. 1999 * All Jamo Ts are combined above when handling Jamo Vs. 2000 */ 2001 if(p==sb.length()) { 2002 break; 2003 } 2004 compositionsList=-1; 2005 continue; 2006 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 2007 // The starter and the combining mark (c) do combine. 2008 int composite=compositeAndFwd>>1; 2009 2010 // Remove the combining mark. 2011 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 2012 sb.delete(pRemove, p); 2013 p=pRemove; 2014 // Replace the starter with the composite. 2015 if(starterIsSupplementary) { 2016 if(composite>0xffff) { 2017 // both are supplementary 2018 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2019 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 2020 } else { 2021 sb.setCharAt(starter, (char)c); 2022 sb.deleteCharAt(starter+1); 2023 // The composite is shorter than the starter, 2024 // move the intermediate characters forward one. 2025 starterIsSupplementary=false; 2026 --p; 2027 } 2028 } else if(composite>0xffff) { 2029 // The composite is longer than the starter, 2030 // move the intermediate characters back one. 2031 starterIsSupplementary=true; 2032 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 2033 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 2034 ++p; 2035 } else { 2036 // both are on the BMP 2037 sb.setCharAt(starter, (char)composite); 2038 } 2039 2040 // Keep prevCC because we removed the combining mark. 2041 2042 if(p==sb.length()) { 2043 break; 2044 } 2045 // Is the composite a starter that combines forward? 2046 if((compositeAndFwd&1)!=0) { 2047 compositionsList= 2048 getCompositionsListForComposite(getNorm16(composite)); 2049 } else { 2050 compositionsList=-1; 2051 } 2052 2053 // We combined; continue with looking for compositions. 2054 continue; 2055 } 2056 } 2057 2058 // no combination this time 2059 prevCC=cc; 2060 if(p==sb.length()) { 2061 break; 2062 } 2063 2064 // If c did not combine, then check if it is a starter. 2065 if(cc==0) { 2066 // Found a new starter. 2067 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2068 // It may combine with something, prepare for it. 2069 if(c<=0xffff) { 2070 starterIsSupplementary=false; 2071 starter=p-1; 2072 } else { 2073 starterIsSupplementary=true; 2074 starter=p-2; 2075 } 2076 } 2077 } else if(onlyContiguous) { 2078 // FCC: no discontiguous compositions; any intervening character blocks. 2079 compositionsList=-1; 2080 } 2081 } 2082 buffer.flush(); 2083 } 2084 2085 public int composePair(int a, int b) { 2086 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 2087 int list; 2088 if(isInert(norm16)) { 2089 return -1; 2090 } else if(norm16<minYesNoMappingsOnly) { 2091 // a combines forward. 2092 if(isJamoL(norm16)) { 2093 b-=Hangul.JAMO_V_BASE; 2094 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2095 return 2096 (Hangul.HANGUL_BASE+ 2097 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2098 Hangul.JAMO_T_COUNT); 2099 } else { 2100 return -1; 2101 } 2102 } else if(isHangulLV(norm16)) { 2103 b-=Hangul.JAMO_T_BASE; 2104 if(0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2105 return a+b; 2106 } else { 2107 return -1; 2108 } 2109 } else { 2110 // 'a' has a compositions list in extraData 2111 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 2112 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2113 list+= // mapping pointer 2114 1+ // +1 to skip the first unit with the mapping length 2115 (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2116 } 2117 } 2118 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2119 return -1; 2120 } else { 2121 list=getCompositionsListForMaybe(norm16); // offset into maybeYesCompositions 2122 } 2123 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2124 return -1; 2125 } 2126 return combine(maybeYesCompositions, list, b)>>1; 2127 } 2128 2129 /** 2130 * Does c have a composition boundary before it? 2131 * True if its decomposition begins with a character that has 2132 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 2133 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 2134 * (isCompYesAndZeroCC()) so we need not decompose. 2135 */ 2136 private boolean hasCompBoundaryBefore(int c, int norm16) { 2137 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 2138 } 2139 private boolean norm16HasCompBoundaryBefore(int norm16) { 2140 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 2141 } 2142 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 2143 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 2144 } 2145 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 2146 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 2147 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 2148 } 2149 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 2150 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 2151 } 2152 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ 2153 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 2154 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 2155 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 2156 } 2157 2158 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 2159 while(p>0) { 2160 int c=Character.codePointBefore(s, p); 2161 int norm16 = getNorm16(c); 2162 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2163 break; 2164 } 2165 p-=Character.charCount(c); 2166 if(hasCompBoundaryBefore(c, norm16)) { 2167 break; 2168 } 2169 } 2170 return p; 2171 } 2172 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 2173 while(p<limit) { 2174 int c=Character.codePointAt(s, p); 2175 int norm16=normTrie.get(c); 2176 if(hasCompBoundaryBefore(c, norm16)) { 2177 break; 2178 } 2179 p+=Character.charCount(c); 2180 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2181 break; 2182 } 2183 } 2184 return p; 2185 } 2186 2187 private int findPreviousFCDBoundary(CharSequence s, int p) { 2188 while(p>0) { 2189 int c=Character.codePointBefore(s, p); 2190 int norm16; 2191 if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) { 2192 break; 2193 } 2194 p-=Character.charCount(c); 2195 if (norm16HasDecompBoundaryBefore(norm16)) { 2196 break; 2197 } 2198 } 2199 return p; 2200 } 2201 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 2202 while(p<limit) { 2203 int c=Character.codePointAt(s, p); 2204 int norm16; 2205 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 2206 break; 2207 } 2208 p+=Character.charCount(c); 2209 if (norm16HasDecompBoundaryAfter(norm16)) { 2210 break; 2211 } 2212 } 2213 return p; 2214 } 2215 2216 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2217 if (start == p) { 2218 return 0; 2219 } 2220 return getFCD16(Character.codePointBefore(s, p)); 2221 } 2222 2223 private void addToStartSet(Trie2Writable newData, int origin, int decompLead) { 2224 int canonValue=newData.get(decompLead); 2225 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2226 // origin is the first character whose decomposition starts with 2227 // the character for which we are setting the value. 2228 newData.set(decompLead, canonValue|origin); 2229 } else { 2230 // origin is not the first character, or it is U+0000. 2231 UnicodeSet set; 2232 if((canonValue&CANON_HAS_SET)==0) { 2233 int firstOrigin=canonValue&CANON_VALUE_MASK; 2234 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 2235 newData.set(decompLead, canonValue); 2236 canonStartSets.add(set=new UnicodeSet()); 2237 if(firstOrigin!=0) { 2238 set.add(firstOrigin); 2239 } 2240 } else { 2241 set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 2242 } 2243 set.add(origin); 2244 } 2245 } 2246 2247 @SuppressWarnings("unused") 2248 private VersionInfo dataVersion; 2249 2250 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2251 private int minDecompNoCP; 2252 private int minCompNoMaybeCP; 2253 private int minLcccCP; 2254 2255 // Norm16 value thresholds for quick check combinations and types of extra data. 2256 private int minYesNo; 2257 private int minYesNoMappingsOnly; 2258 private int minNoNo; 2259 private int minNoNoCompBoundaryBefore; 2260 private int minNoNoCompNoMaybeCC; 2261 private int minNoNoEmpty; 2262 private int limitNoNo; 2263 private int centerNoNoDelta; 2264 private int minMaybeYes; 2265 2266 private Trie2_16 normTrie; 2267 private String maybeYesCompositions; 2268 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2269 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2270 2271 private Trie2_32 canonIterData; 2272 private ArrayList<UnicodeSet> canonStartSets; 2273 2274 // bits in canonIterData 2275 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 2276 private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 2277 private static final int CANON_HAS_SET = 0x200000; 2278 private static final int CANON_VALUE_MASK = 0x1fffff; 2279 } 2280