1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /** 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.text; 12 13 import android.icu.impl.Utility; 14 15 /** 16 * <p> 17 * Standalone utility class providing UTF16 character conversions and indexing conversions. 18 * </p> 19 * <p> 20 * Code that uses strings alone rarely need modification. By design, UTF-16 does not allow overlap, 21 * so searching for strings is a safe operation. Similarly, concatenation is always safe. 22 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the 23 * values for start and end are on those boundaries, since they arose from operations like 24 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>. 25 * </p> 26 * <strong>Examples:</strong> 27 * <p> 28 * The following examples illustrate use of some of these methods. 
29 * 30 * <pre> 31 * // iteration forwards: Original 32 * for (int i = 0; i < s.length(); ++i) { 33 * char ch = s.charAt(i); 34 * doSomethingWith(ch); 35 * } 36 * 37 * // iteration forwards: Changes for UTF-32 38 * int ch; 39 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) { 40 * ch = UTF16.charAt(s, i); 41 * doSomethingWith(ch); 42 * } 43 * 44 * // iteration backwards: Original 45 * for (int i = s.length() - 1; i >= 0; --i) { 46 * char ch = s.charAt(i); 47 * doSomethingWith(ch); 48 * } 49 * 50 * // iteration backwards: Changes for UTF-32 51 * int ch; 52 * for (int i = s.length() - 1; i > 0; i -= UTF16.getCharCount(ch)) { 53 * ch = UTF16.charAt(s, i); 54 * doSomethingWith(ch); 55 * } 56 * </pre> 57 * 58 * <strong>Notes:</strong> 59 * <ul> 60 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code> 61 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string. 62 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16 63 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32 64 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li> 65 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a 66 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16 67 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>. 68 * </li> 69 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out 70 * of bounds. Other than than that, all methods will behave reasonably, even if unmatched surrogates 71 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to 72 * check for validity if desired. 
</li> 73 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then 74 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It 75 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, 76 * 5.5). </li> 77 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the 78 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceeding small 79 * percentage of all the text in the world, the singleton case should always be optimized for. </li> 80 * </ul> 81 * 82 * @author Mark Davis, with help from Markus Scherer 83 * @hide Only a subset of ICU is exposed in Android 84 */ 85 86 public final class UTF16 { 87 // public variables --------------------------------------------------- 88 89 /** 90 * Value returned in {@link #bounds(String, int) bounds()}. 91 * These values are chosen specifically so that it actually represents the position of the 92 * character [offset16 - (value >> 2), offset16 + (value & 3)] 93 */ 94 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2, 95 TRAIL_SURROGATE_BOUNDARY = 5; 96 97 /** 98 * The lowest Unicode code point value. 99 */ 100 public static final int CODEPOINT_MIN_VALUE = 0; 101 102 /** 103 * The highest Unicode code point value (scalar value) according to the Unicode Standard. 
104 */ 105 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 106 107 /** 108 * The minimum value for Supplementary code points 109 */ 110 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 111 112 /** 113 * Lead surrogate minimum value 114 */ 115 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 116 117 /** 118 * Trail surrogate minimum value 119 */ 120 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 121 122 /** 123 * Lead surrogate maximum value 124 */ 125 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 126 127 /** 128 * Trail surrogate maximum value 129 */ 130 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 131 132 /** 133 * Surrogate minimum value 134 */ 135 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 136 137 /** 138 * Maximum surrogate value 139 */ 140 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE; 141 142 /** 143 * Lead surrogate bitmask 144 */ 145 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 146 147 /** 148 * Trail surrogate bitmask 149 */ 150 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 151 152 /** 153 * Surrogate bitmask 154 */ 155 private static final int SURROGATE_BITMASK = 0xFFFFF800; 156 157 /** 158 * Lead surrogate bits 159 */ 160 private static final int LEAD_SURROGATE_BITS = 0xD800; 161 162 /** 163 * Trail surrogate bits 164 */ 165 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 166 167 /** 168 * Surrogate bits 169 */ 170 private static final int SURROGATE_BITS = 0xD800; 171 172 // constructor -------------------------------------------------------- 173 174 // /CLOVER:OFF 175 /** 176 * Prevent instance from being created. 177 */ 178 private UTF16() { 179 } 180 181 // /CLOVER:ON 182 // public method ------------------------------------------------------ 183 184 /** 185 * Extract a single UTF-32 value from a string. 
Used when iterating forwards or backwards (with 186 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 187 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 188 * UCharacter.isLegal()</a></code> 189 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 190 * character will be returned. If a complete supplementary character is not found the incomplete 191 * character will be returned 192 * 193 * @param source Array of UTF-16 chars 194 * @param offset16 UTF-16 offset to the start of the character. 195 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 196 * of that codepoint are the same as in <code>bounds32()</code>. 197 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 198 */ 199 public static int charAt(String source, int offset16) { 200 char single = source.charAt(offset16); 201 if (single < LEAD_SURROGATE_MIN_VALUE) { 202 return single; 203 } 204 return _charAt(source, offset16, single); 205 } 206 207 private static int _charAt(String source, int offset16, char single) { 208 if (single > TRAIL_SURROGATE_MAX_VALUE) { 209 return single; 210 } 211 212 // Convert the UTF-16 surrogate pair if necessary. 213 // For simplicity in usage, and because the frequency of pairs is 214 // low, look both directions. 
215 216 if (single <= LEAD_SURROGATE_MAX_VALUE) { 217 ++offset16; 218 if (source.length() != offset16) { 219 char trail = source.charAt(offset16); 220 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 221 return Character.toCodePoint(single, trail); 222 } 223 } 224 } else { 225 --offset16; 226 if (offset16 >= 0) { 227 // single is a trail surrogate so 228 char lead = source.charAt(offset16); 229 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 230 return Character.toCodePoint(lead, single); 231 } 232 } 233 } 234 return single; // return unmatched surrogate 235 } 236 237 /** 238 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 239 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 240 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)"> 241 * UCharacter.isLegal()</a></code> 242 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 243 * character will be returned. If a complete supplementary character is not found the incomplete 244 * character will be returned 245 * 246 * @param source Array of UTF-16 chars 247 * @param offset16 UTF-16 offset to the start of the character. 248 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 249 * of that codepoint are the same as in <code>bounds32()</code>. 250 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 251 */ 252 public static int charAt(CharSequence source, int offset16) { 253 char single = source.charAt(offset16); 254 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 255 return single; 256 } 257 return _charAt(source, offset16, single); 258 } 259 260 private static int _charAt(CharSequence source, int offset16, char single) { 261 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 262 return single; 263 } 264 265 // Convert the UTF-16 surrogate pair if necessary. 
266 // For simplicity in usage, and because the frequency of pairs is 267 // low, look both directions. 268 269 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 270 ++offset16; 271 if (source.length() != offset16) { 272 char trail = source.charAt(offset16); 273 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 274 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 275 return Character.toCodePoint(single, trail); 276 } 277 } 278 } else { 279 --offset16; 280 if (offset16 >= 0) { 281 // single is a trail surrogate so 282 char lead = source.charAt(offset16); 283 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 284 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 285 return Character.toCodePoint(lead, single); 286 } 287 } 288 } 289 return single; // return unmatched surrogate 290 } 291 292 /** 293 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 294 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 295 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 296 * </a></code> 297 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 298 * character will be returned. If a complete supplementary character is not found the incomplete 299 * character will be returned 300 * 301 * @param source UTF-16 chars string buffer 302 * @param offset16 UTF-16 offset to the start of the character. 303 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 304 * of that codepoint are the same as in <code>bounds32()</code>. 305 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 
306 */ 307 public static int charAt(StringBuffer source, int offset16) { 308 if (offset16 < 0 || offset16 >= source.length()) { 309 throw new StringIndexOutOfBoundsException(offset16); 310 } 311 312 char single = source.charAt(offset16); 313 if (!isSurrogate(single)) { 314 return single; 315 } 316 317 // Convert the UTF-16 surrogate pair if necessary. 318 // For simplicity in usage, and because the frequency of pairs is 319 // low, look both directions. 320 321 if (single <= LEAD_SURROGATE_MAX_VALUE) { 322 ++offset16; 323 if (source.length() != offset16) { 324 char trail = source.charAt(offset16); 325 if (isTrailSurrogate(trail)) 326 return Character.toCodePoint(single, trail); 327 } 328 } else { 329 --offset16; 330 if (offset16 >= 0) { 331 // single is a trail surrogate so 332 char lead = source.charAt(offset16); 333 if (isLeadSurrogate(lead)) { 334 return Character.toCodePoint(lead, single); 335 } 336 } 337 } 338 return single; // return unmatched surrogate 339 } 340 341 /** 342 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 343 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 344 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 345 * </a></code> 346 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 347 * character will be returned. If a complete supplementary character is not found the incomplete 348 * character will be returned 349 * 350 * @param source Array of UTF-16 chars 351 * @param start Offset to substring in the source array for analyzing 352 * @param limit Offset to substring in the source array for analyzing 353 * @param offset16 UTF-16 offset relative to start 354 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 355 * of that codepoint are the same as in <code>bounds32()</code>. 
356 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 357 */ 358 public static int charAt(char source[], int start, int limit, int offset16) { 359 offset16 += start; 360 if (offset16 < start || offset16 >= limit) { 361 throw new ArrayIndexOutOfBoundsException(offset16); 362 } 363 364 char single = source[offset16]; 365 if (!isSurrogate(single)) { 366 return single; 367 } 368 369 // Convert the UTF-16 surrogate pair if necessary. 370 // For simplicity in usage, and because the frequency of pairs is 371 // low, look both directions. 372 if (single <= LEAD_SURROGATE_MAX_VALUE) { 373 offset16++; 374 if (offset16 >= limit) { 375 return single; 376 } 377 char trail = source[offset16]; 378 if (isTrailSurrogate(trail)) { 379 return Character.toCodePoint(single, trail); 380 } 381 } else { // isTrailSurrogate(single), so 382 if (offset16 == start) { 383 return single; 384 } 385 offset16--; 386 char lead = source[offset16]; 387 if (isLeadSurrogate(lead)) 388 return Character.toCodePoint(lead, single); 389 } 390 return single; // return unmatched surrogate 391 } 392 393 /** 394 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with 395 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 396 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 397 * </a></code> 398 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 399 * character will be returned. If a complete supplementary character is not found the incomplete 400 * character will be returned 401 * 402 * @param source UTF-16 chars string buffer 403 * @param offset16 UTF-16 offset to the start of the character. 404 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 405 * of that codepoint are the same as in <code>bounds32()</code>. 
406 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds. 407 */ 408 public static int charAt(Replaceable source, int offset16) { 409 if (offset16 < 0 || offset16 >= source.length()) { 410 throw new StringIndexOutOfBoundsException(offset16); 411 } 412 413 char single = source.charAt(offset16); 414 if (!isSurrogate(single)) { 415 return single; 416 } 417 418 // Convert the UTF-16 surrogate pair if necessary. 419 // For simplicity in usage, and because the frequency of pairs is 420 // low, look both directions. 421 422 if (single <= LEAD_SURROGATE_MAX_VALUE) { 423 ++offset16; 424 if (source.length() != offset16) { 425 char trail = source.charAt(offset16); 426 if (isTrailSurrogate(trail)) 427 return Character.toCodePoint(single, trail); 428 } 429 } else { 430 --offset16; 431 if (offset16 >= 0) { 432 // single is a trail surrogate so 433 char lead = source.charAt(offset16); 434 if (isLeadSurrogate(lead)) { 435 return Character.toCodePoint(lead, single); 436 } 437 } 438 } 439 return single; // return unmatched surrogate 440 } 441 442 /** 443 * Determines how many chars this char32 requires. If a validity check is required, use <code> 444 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 445 * on char32 before calling. 446 * 447 * @param char32 The input codepoint. 448 * @return 2 if is in supplementary space, otherwise 1. 449 */ 450 public static int getCharCount(int char32) { 451 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 452 return 1; 453 } 454 return 2; 455 } 456 457 /** 458 * Returns the type of the boundaries around the char at offset16. Used for random access. 
459 * 460 * @param source Text to analyse 461 * @param offset16 UTF-16 offset 462 * @return 463 * <ul> 464 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1] 465 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 466 * are [offset16, offset16 + 2] 467 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 468 * bounds are [offset16 - 1, offset16 + 1] 469 * </ul> 470 * For bit-twiddlers, the return values for these are chosen so that the boundaries 471 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 472 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 473 */ 474 public static int bounds(String source, int offset16) { 475 char ch = source.charAt(offset16); 476 if (isSurrogate(ch)) { 477 if (isLeadSurrogate(ch)) { 478 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 479 return LEAD_SURROGATE_BOUNDARY; 480 } 481 } else { 482 // isTrailSurrogate(ch), so 483 --offset16; 484 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 485 return TRAIL_SURROGATE_BOUNDARY; 486 } 487 } 488 } 489 return SINGLE_CHAR_BOUNDARY; 490 } 491 492 /** 493 * Returns the type of the boundaries around the char at offset16. Used for random access. 494 * 495 * @param source String buffer to analyse 496 * @param offset16 UTF16 offset 497 * @return 498 * <ul> 499 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1] 500 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 501 * are [offset16, offset16 + 2] 502 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 503 * bounds are [offset16 - 1, offset16 + 1] 504 * </ul> 505 * For bit-twiddlers, the return values for these are chosen so that the boundaries 506 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. 
507 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 508 */ 509 public static int bounds(StringBuffer source, int offset16) { 510 char ch = source.charAt(offset16); 511 if (isSurrogate(ch)) { 512 if (isLeadSurrogate(ch)) { 513 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) { 514 return LEAD_SURROGATE_BOUNDARY; 515 } 516 } else { 517 // isTrailSurrogate(ch), so 518 --offset16; 519 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) { 520 return TRAIL_SURROGATE_BOUNDARY; 521 } 522 } 523 } 524 return SINGLE_CHAR_BOUNDARY; 525 } 526 527 /** 528 * Returns the type of the boundaries around the char at offset16. Used for random access. Note 529 * that the boundaries are determined with respect to the subarray, hence the char array 530 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1. 531 * 532 * @param source Char array to analyse 533 * @param start Offset to substring in the source array for analyzing 534 * @param limit Offset to substring in the source array for analyzing 535 * @param offset16 UTF16 offset relative to start 536 * @return 537 * <ul> 538 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are 539 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds 540 * are [offset16, offset16 + 2] 541 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the 542 * bounds are [offset16 - 1, offset16 + 1] 543 * </ul> 544 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries 545 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)]. 546 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
547 */ 548 public static int bounds(char source[], int start, int limit, int offset16) { 549 offset16 += start; 550 if (offset16 < start || offset16 >= limit) { 551 throw new ArrayIndexOutOfBoundsException(offset16); 552 } 553 char ch = source[offset16]; 554 if (isSurrogate(ch)) { 555 if (isLeadSurrogate(ch)) { 556 ++offset16; 557 if (offset16 < limit && isTrailSurrogate(source[offset16])) { 558 return LEAD_SURROGATE_BOUNDARY; 559 } 560 } else { // isTrailSurrogate(ch), so 561 --offset16; 562 if (offset16 >= start && isLeadSurrogate(source[offset16])) { 563 return TRAIL_SURROGATE_BOUNDARY; 564 } 565 } 566 } 567 return SINGLE_CHAR_BOUNDARY; 568 } 569 570 /** 571 * Determines whether the code value is a surrogate. 572 * 573 * @param char16 The input character. 574 * @return true If the input character is a surrogate. 575 */ 576 public static boolean isSurrogate(char char16) { 577 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 578 } 579 580 /** 581 * Determines whether the character is a trail surrogate. 582 * 583 * @param char16 The input character. 584 * @return true If the input character is a trail surrogate. 585 */ 586 public static boolean isTrailSurrogate(char char16) { 587 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 588 } 589 590 /** 591 * Determines whether the character is a lead surrogate. 592 * 593 * @param char16 The input character. 594 * @return true If the input character is a lead surrogate 595 */ 596 public static boolean isLeadSurrogate(char char16) { 597 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 598 } 599 600 /** 601 * Returns the lead surrogate. If a validity check is required, use 602 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 603 * before calling. 604 * 605 * @param char32 The input character. 606 * @return lead surrogate if the getCharCount(ch) is 2; <br> 607 * and 0 otherwise (note: 0 is not a valid lead surrogate). 
608 */ 609 public static char getLeadSurrogate(int char32) { 610 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 611 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_)); 612 } 613 return 0; 614 } 615 616 /** 617 * Returns the trail surrogate. If a validity check is required, use 618 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 619 * before calling. 620 * 621 * @param char32 The input character. 622 * @return the trail surrogate if the getCharCount(ch) is 2; <br> 623 * otherwise the character itself 624 */ 625 public static char getTrailSurrogate(int char32) { 626 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 627 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_)); 628 } 629 return (char) char32; 630 } 631 632 /** 633 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string 634 * containing the UTF-32 value in UTF16 format. If a validity check is required, use 635 * {@link android.icu.lang.UCharacter#isLegal(int)} on char32 before calling. 636 * 637 * @param char32 The input character. 638 * @return string value of char32 in UTF16 format 639 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint. 640 */ 641 public static String valueOf(int char32) { 642 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 643 throw new IllegalArgumentException("Illegal codepoint"); 644 } 645 return toString(char32); 646 } 647 648 /** 649 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or 650 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate 651 * character, the whole supplementary codepoint will be returned. If a validity check is 652 * required, use {@link android.icu.lang.UCharacter#isLegal(int)} on the 653 * codepoint at offset16 before calling. The result returned will be a newly created String 654 * obtained by calling source.substring(..) 
with the appropriate indexes. 655 * 656 * @param source The input string. 657 * @param offset16 The UTF16 index to the codepoint in source 658 * @return string value of char32 in UTF16 format 659 */ 660 public static String valueOf(String source, int offset16) { 661 switch (bounds(source, offset16)) { 662 case LEAD_SURROGATE_BOUNDARY: 663 return source.substring(offset16, offset16 + 2); 664 case TRAIL_SURROGATE_BOUNDARY: 665 return source.substring(offset16 - 1, offset16 + 1); 666 default: 667 return source.substring(offset16, offset16 + 1); 668 } 669 } 670 671 /** 672 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a 673 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a 674 * surrogate character, the whole supplementary codepoint will be returned. If a validity check 675 * is required, use {@link android.icu.lang.UCharacter#isLegal(int)} on 676 * the codepoint at offset16 before calling. The result returned will be a newly created String 677 * obtained by calling source.substring(..) with the appropriate indexes. 678 * 679 * @param source The input string buffer. 680 * @param offset16 The UTF16 index to the codepoint in source 681 * @return string value of char32 in UTF16 format 682 */ 683 public static String valueOf(StringBuffer source, int offset16) { 684 switch (bounds(source, offset16)) { 685 case LEAD_SURROGATE_BOUNDARY: 686 return source.substring(offset16, offset16 + 2); 687 case TRAIL_SURROGATE_BOUNDARY: 688 return source.substring(offset16 - 1, offset16 + 1); 689 default: 690 return source.substring(offset16, offset16 + 1); 691 } 692 } 693 694 /** 695 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16 696 * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be 697 * returned, except when either the leading or trailing surrogate character lies out of the 698 * specified subarray. 
In the latter case, only the surrogate character within bounds will be 699 * returned. If a validity check is required, use 700 * {@link android.icu.lang.UCharacter#isLegal(int)} on the codepoint at 701 * offset16 before calling. The result returned will be a newly created String containing the 702 * relevant characters. 703 * 704 * @param source The input char array. 705 * @param start Start index of the subarray 706 * @param limit End index of the subarray 707 * @param offset16 The UTF16 index to the codepoint in source relative to start 708 * @return string value of char32 in UTF16 format 709 */ 710 public static String valueOf(char source[], int start, int limit, int offset16) { 711 switch (bounds(source, start, limit, offset16)) { 712 case LEAD_SURROGATE_BOUNDARY: 713 return new String(source, start + offset16, 2); 714 case TRAIL_SURROGATE_BOUNDARY: 715 return new String(source, start + offset16 - 1, 2); 716 } 717 return new String(source, start + offset16, 1); 718 } 719 720 /** 721 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 722 * the {@link UTF16 class description} for notes on roundtripping. 723 * 724 * @param source The UTF-16 string 725 * @param offset32 UTF-32 offset 726 * @return UTF-16 offset 727 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 
728 */ 729 public static int findOffsetFromCodePoint(String source, int offset32) { 730 char ch; 731 int size = source.length(), result = 0, count = offset32; 732 if (offset32 < 0 || offset32 > size) { 733 throw new StringIndexOutOfBoundsException(offset32); 734 } 735 while (result < size && count > 0) { 736 ch = source.charAt(result); 737 if (isLeadSurrogate(ch) && ((result + 1) < size) 738 && isTrailSurrogate(source.charAt(result + 1))) { 739 result++; 740 } 741 742 count--; 743 result++; 744 } 745 if (count != 0) { 746 throw new StringIndexOutOfBoundsException(offset32); 747 } 748 return result; 749 } 750 751 /** 752 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 753 * the {@link UTF16 class description} for notes on roundtripping. 754 * 755 * @param source The UTF-16 string buffer 756 * @param offset32 UTF-32 offset 757 * @return UTF-16 offset 758 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 759 */ 760 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) { 761 char ch; 762 int size = source.length(), result = 0, count = offset32; 763 if (offset32 < 0 || offset32 > size) { 764 throw new StringIndexOutOfBoundsException(offset32); 765 } 766 while (result < size && count > 0) { 767 ch = source.charAt(result); 768 if (isLeadSurrogate(ch) && ((result + 1) < size) 769 && isTrailSurrogate(source.charAt(result + 1))) { 770 result++; 771 } 772 773 count--; 774 result++; 775 } 776 if (count != 0) { 777 throw new StringIndexOutOfBoundsException(offset32); 778 } 779 return result; 780 } 781 782 /** 783 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See 784 * the {@link UTF16 class description} for notes on roundtripping. 
785 * 786 * @param source The UTF-16 char array whose substring is to be analysed 787 * @param start Offset of the substring to be analysed 788 * @param limit Offset of the substring to be analysed 789 * @param offset32 UTF-32 offset relative to start 790 * @return UTF-16 offset relative to start 791 * @exception IndexOutOfBoundsException If offset32 is out of bounds. 792 */ 793 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) { 794 char ch; 795 int result = start, count = offset32; 796 if (offset32 > limit - start) { 797 throw new ArrayIndexOutOfBoundsException(offset32); 798 } 799 while (result < limit && count > 0) { 800 ch = source[result]; 801 if (isLeadSurrogate(ch) && ((result + 1) < limit) 802 && isTrailSurrogate(source[result + 1])) { 803 result++; 804 } 805 806 count--; 807 result++; 808 } 809 if (count != 0) { 810 throw new ArrayIndexOutOfBoundsException(offset32); 811 } 812 return result - start; 813 } 814 815 /** 816 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given 817 * UTF-16 offset. Used for random access. See the {@link UTF16 class description} for 818 * notes on roundtripping.<br> 819 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 820 * of the <strong>lead</strong> of the pair is returned. </i> 821 * <p> 822 * To find the UTF-32 length of a string, use: 823 * 824 * <pre> 825 * len32 = countCodePoint(source, source.length()); 826 * </pre> 827 * 828 * @param source Text to analyse 829 * @param offset16 UTF-16 offset < source text length. 830 * @return UTF-32 offset 831 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
832 */ 833 public static int findCodePointOffset(String source, int offset16) { 834 if (offset16 < 0 || offset16 > source.length()) { 835 throw new StringIndexOutOfBoundsException(offset16); 836 } 837 838 int result = 0; 839 char ch; 840 boolean hadLeadSurrogate = false; 841 842 for (int i = 0; i < offset16; ++i) { 843 ch = source.charAt(i); 844 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 845 hadLeadSurrogate = false; // count valid trail as zero 846 } else { 847 hadLeadSurrogate = isLeadSurrogate(ch); 848 ++result; // count others as 1 849 } 850 } 851 852 if (offset16 == source.length()) { 853 return result; 854 } 855 856 // end of source being the less significant surrogate character 857 // shift result back to the start of the supplementary character 858 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 859 result--; 860 } 861 862 return result; 863 } 864 865 /** 866 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 867 * offset. Used for random access. See the {@link UTF16 class description} for notes on 868 * roundtripping.<br> 869 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 870 * of the <strong>lead</strong> of the pair is returned. </i> 871 * <p> 872 * To find the UTF-32 length of a string, use: 873 * 874 * <pre> 875 * len32 = countCodePoint(source); 876 * </pre> 877 * 878 * @param source Text to analyse 879 * @param offset16 UTF-16 offset < source text length. 880 * @return UTF-32 offset 881 * @exception IndexOutOfBoundsException If offset16 is out of bounds. 
882 */ 883 public static int findCodePointOffset(StringBuffer source, int offset16) { 884 if (offset16 < 0 || offset16 > source.length()) { 885 throw new StringIndexOutOfBoundsException(offset16); 886 } 887 888 int result = 0; 889 char ch; 890 boolean hadLeadSurrogate = false; 891 892 for (int i = 0; i < offset16; ++i) { 893 ch = source.charAt(i); 894 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 895 hadLeadSurrogate = false; // count valid trail as zero 896 } else { 897 hadLeadSurrogate = isLeadSurrogate(ch); 898 ++result; // count others as 1 899 } 900 } 901 902 if (offset16 == source.length()) { 903 return result; 904 } 905 906 // end of source being the less significant surrogate character 907 // shift result back to the start of the supplementary character 908 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) { 909 result--; 910 } 911 912 return result; 913 } 914 915 /** 916 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16 917 * offset. Used for random access. See the {@link UTF16 class description} for notes on 918 * roundtripping.<br> 919 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset 920 * of the <strong>lead</strong> of the pair is returned. </i> 921 * <p> 922 * To find the UTF-32 length of a substring, use: 923 * 924 * <pre> 925 * len32 = countCodePoint(source, start, limit); 926 * </pre> 927 * 928 * @param source Text to analyse 929 * @param start Offset of the substring 930 * @param limit Offset of the substring 931 * @param offset16 UTF-16 relative to start 932 * @return UTF-32 offset relative to start 933 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit. 
934 */ 935 public static int findCodePointOffset(char source[], int start, int limit, int offset16) { 936 offset16 += start; 937 if (offset16 > limit) { 938 throw new StringIndexOutOfBoundsException(offset16); 939 } 940 941 int result = 0; 942 char ch; 943 boolean hadLeadSurrogate = false; 944 945 for (int i = start; i < offset16; ++i) { 946 ch = source[i]; 947 if (hadLeadSurrogate && isTrailSurrogate(ch)) { 948 hadLeadSurrogate = false; // count valid trail as zero 949 } else { 950 hadLeadSurrogate = isLeadSurrogate(ch); 951 ++result; // count others as 1 952 } 953 } 954 955 if (offset16 == limit) { 956 return result; 957 } 958 959 // end of source being the less significant surrogate character 960 // shift result back to the start of the supplementary character 961 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) { 962 result--; 963 } 964 965 return result; 966 } 967 968 /** 969 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required, 970 * use {@link android.icu.lang.UCharacter#isLegal(int)} on char32 before 971 * calling. 972 * 973 * @param target The buffer to append to 974 * @param char32 Value to append. 975 * @return the updated StringBuffer 976 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints 977 */ 978 public static StringBuffer append(StringBuffer target, int char32) { 979 // Check for irregular values 980 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 981 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 982 } 983 984 // Write the UTF-16 values 985 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 986 target.append(getLeadSurrogate(char32)); 987 target.append(getTrailSurrogate(char32)); 988 } else { 989 target.append((char) char32); 990 } 991 return target; 992 } 993 994 /** 995 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a 996 * convenience. 
997 * 998 * @param target The buffer to append to 999 * @param cp The code point to append 1000 * @return the updated StringBuffer 1001 * @throws IllegalArgumentException If cp is not a valid code point 1002 */ 1003 public static StringBuffer appendCodePoint(StringBuffer target, int cp) { 1004 return append(target, cp); 1005 } 1006 1007 /** 1008 * Adds a codepoint to offset16 position of the argument char array. 1009 * 1010 * @param target Char array to be append with the new code point 1011 * @param limit UTF16 offset which the codepoint will be appended. 1012 * @param char32 Code point to be appended 1013 * @return offset after char32 in the array. 1014 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not 1015 * lie within the range of the Unicode codepoints. 1016 */ 1017 public static int append(char[] target, int limit, int char32) { 1018 // Check for irregular values 1019 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1020 throw new IllegalArgumentException("Illegal codepoint"); 1021 } 1022 // Write the UTF-16 values 1023 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 1024 target[limit++] = getLeadSurrogate(char32); 1025 target[limit++] = getTrailSurrogate(char32); 1026 } else { 1027 target[limit++] = (char) char32; 1028 } 1029 return limit; 1030 } 1031 1032 /** 1033 * Number of codepoints in a UTF16 String 1034 * 1035 * @param source UTF16 string 1036 * @return number of codepoint in string 1037 */ 1038 public static int countCodePoint(String source) { 1039 if (source == null || source.length() == 0) { 1040 return 0; 1041 } 1042 return findCodePointOffset(source, source.length()); 1043 } 1044 1045 /** 1046 * Number of codepoints in a UTF16 String buffer 1047 * 1048 * @param source UTF16 string buffer 1049 * @return number of codepoint in string 1050 */ 1051 public static int countCodePoint(StringBuffer source) { 1052 if (source == null || source.length() == 0) { 1053 return 0; 1054 } 
1055 return findCodePointOffset(source, source.length()); 1056 } 1057 1058 /** 1059 * Number of codepoints in a UTF16 char array substring 1060 * 1061 * @param source UTF16 char array 1062 * @param start Offset of the substring 1063 * @param limit Offset of the substring 1064 * @return number of codepoint in the substring 1065 * @exception IndexOutOfBoundsException If start and limit are not valid. 1066 */ 1067 public static int countCodePoint(char source[], int start, int limit) { 1068 if (source == null || source.length == 0) { 1069 return 0; 1070 } 1071 return findCodePointOffset(source, start, limit, limit - start); 1072 } 1073 1074 /** 1075 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a 1076 * non-supplementary codepoint with a supplementary and vice versa. 1077 * 1078 * @param target Stringbuffer 1079 * @param offset16 UTF16 position to insert into 1080 * @param char32 Code point 1081 */ 1082 public static void setCharAt(StringBuffer target, int offset16, int char32) { 1083 int count = 1; 1084 char single = target.charAt(offset16); 1085 1086 if (isSurrogate(single)) { 1087 // pairs of the surrogate with offset16 at the lead char found 1088 if (isLeadSurrogate(single) && (target.length() > offset16 + 1) 1089 && isTrailSurrogate(target.charAt(offset16 + 1))) { 1090 count++; 1091 } else { 1092 // pairs of the surrogate with offset16 at the trail char 1093 // found 1094 if (isTrailSurrogate(single) && (offset16 > 0) 1095 && isLeadSurrogate(target.charAt(offset16 - 1))) { 1096 offset16--; 1097 count++; 1098 } 1099 } 1100 } 1101 target.replace(offset16, offset16 + count, valueOf(char32)); 1102 } 1103 1104 /** 1105 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are 1106 * replacing a non-supplementary codepoint with a supplementary and vice versa. 1107 * 1108 * @param target char array 1109 * @param limit numbers of valid chars in target, different from target.length. 
limit counts the 1110 * number of chars in target that represents a string, not the size of array target. 1111 * @param offset16 UTF16 position to insert into 1112 * @param char32 code point 1113 * @return new number of chars in target that represents a string 1114 * @exception IndexOutOfBoundsException if offset16 is out of range 1115 */ 1116 public static int setCharAt(char target[], int limit, int offset16, int char32) { 1117 if (offset16 >= limit) { 1118 throw new ArrayIndexOutOfBoundsException(offset16); 1119 } 1120 int count = 1; 1121 char single = target[offset16]; 1122 1123 if (isSurrogate(single)) { 1124 // pairs of the surrogate with offset16 at the lead char found 1125 if (isLeadSurrogate(single) && (target.length > offset16 + 1) 1126 && isTrailSurrogate(target[offset16 + 1])) { 1127 count++; 1128 } else { 1129 // pairs of the surrogate with offset16 at the trail char 1130 // found 1131 if (isTrailSurrogate(single) && (offset16 > 0) 1132 && isLeadSurrogate(target[offset16 - 1])) { 1133 offset16--; 1134 count++; 1135 } 1136 } 1137 } 1138 1139 String str = valueOf(char32); 1140 int result = limit; 1141 int strlength = str.length(); 1142 target[offset16] = str.charAt(0); 1143 if (count == strlength) { 1144 if (count == 2) { 1145 target[offset16 + 1] = str.charAt(1); 1146 } 1147 } else { 1148 // this is not exact match in space, we'll have to do some 1149 // shifting 1150 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit 1151 - (offset16 + count)); 1152 if (count < strlength) { 1153 // char32 is a supplementary character trying to squeeze into 1154 // a non-supplementary space 1155 target[offset16 + 1] = str.charAt(1); 1156 result++; 1157 if (result < target.length) { 1158 target[result] = 0; 1159 } 1160 } else { 1161 // char32 is a non-supplementary character trying to fill 1162 // into a supplementary space 1163 result--; 1164 target[result] = 0; 1165 } 1166 } 1167 return result; 1168 } 1169 1170 /** 1171 * Shifts offset16 by 
the argument number of codepoints 1172 * 1173 * @param source string 1174 * @param offset16 UTF16 position to shift 1175 * @param shift32 number of codepoints to shift 1176 * @return new shifted offset16 1177 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds. 1178 */ 1179 public static int moveCodePointOffset(String source, int offset16, int shift32) { 1180 int result = offset16; 1181 int size = source.length(); 1182 int count; 1183 char ch; 1184 if (offset16 < 0 || offset16 > size) { 1185 throw new StringIndexOutOfBoundsException(offset16); 1186 } 1187 if (shift32 > 0) { 1188 if (shift32 + offset16 > size) { 1189 throw new StringIndexOutOfBoundsException(offset16); 1190 } 1191 count = shift32; 1192 while (result < size && count > 0) { 1193 ch = source.charAt(result); 1194 if (isLeadSurrogate(ch) && ((result + 1) < size) 1195 && isTrailSurrogate(source.charAt(result + 1))) { 1196 result++; 1197 } 1198 count--; 1199 result++; 1200 } 1201 } else { 1202 if (offset16 + shift32 < 0) { 1203 throw new StringIndexOutOfBoundsException(offset16); 1204 } 1205 for (count = -shift32; count > 0; count--) { 1206 result--; 1207 if (result < 0) { 1208 break; 1209 } 1210 ch = source.charAt(result); 1211 if (isTrailSurrogate(ch) && result > 0 1212 && isLeadSurrogate(source.charAt(result - 1))) { 1213 result--; 1214 } 1215 } 1216 } 1217 if (count != 0) { 1218 throw new StringIndexOutOfBoundsException(shift32); 1219 } 1220 return result; 1221 } 1222 1223 /** 1224 * Shifts offset16 by the argument number of codepoints 1225 * 1226 * @param source String buffer 1227 * @param offset16 UTF16 position to shift 1228 * @param shift32 Number of codepoints to shift 1229 * @return new shifted offset16 1230 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds. 
1231 */ 1232 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) { 1233 int result = offset16; 1234 int size = source.length(); 1235 int count; 1236 char ch; 1237 if (offset16 < 0 || offset16 > size) { 1238 throw new StringIndexOutOfBoundsException(offset16); 1239 } 1240 if (shift32 > 0) { 1241 if (shift32 + offset16 > size) { 1242 throw new StringIndexOutOfBoundsException(offset16); 1243 } 1244 count = shift32; 1245 while (result < size && count > 0) { 1246 ch = source.charAt(result); 1247 if (isLeadSurrogate(ch) && ((result + 1) < size) 1248 && isTrailSurrogate(source.charAt(result + 1))) { 1249 result++; 1250 } 1251 count--; 1252 result++; 1253 } 1254 } else { 1255 if (offset16 + shift32 < 0) { 1256 throw new StringIndexOutOfBoundsException(offset16); 1257 } 1258 for (count = -shift32; count > 0; count--) { 1259 result--; 1260 if (result < 0) { 1261 break; 1262 } 1263 ch = source.charAt(result); 1264 if (isTrailSurrogate(ch) && result > 0 1265 && isLeadSurrogate(source.charAt(result - 1))) { 1266 result--; 1267 } 1268 } 1269 } 1270 if (count != 0) { 1271 throw new StringIndexOutOfBoundsException(shift32); 1272 } 1273 return result; 1274 } 1275 1276 /** 1277 * Shifts offset16 by the argument number of codepoints within a subarray. 1278 * 1279 * @param source Char array 1280 * @param start Position of the subarray to be performed on 1281 * @param limit Position of the subarray to be performed on 1282 * @param offset16 UTF16 position to shift relative to start 1283 * @param shift32 Number of codepoints to shift 1284 * @return new shifted offset16 relative to start 1285 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the 1286 * subarray bounds are out of range. 
1287 */ 1288 public static int moveCodePointOffset(char source[], int start, int limit, int offset16, 1289 int shift32) { 1290 int size = source.length; 1291 int count; 1292 char ch; 1293 int result = offset16 + start; 1294 if (start < 0 || limit < start) { 1295 throw new StringIndexOutOfBoundsException(start); 1296 } 1297 if (limit > size) { 1298 throw new StringIndexOutOfBoundsException(limit); 1299 } 1300 if (offset16 < 0 || result > limit) { 1301 throw new StringIndexOutOfBoundsException(offset16); 1302 } 1303 if (shift32 > 0) { 1304 if (shift32 + result > size) { 1305 throw new StringIndexOutOfBoundsException(result); 1306 } 1307 count = shift32; 1308 while (result < limit && count > 0) { 1309 ch = source[result]; 1310 if (isLeadSurrogate(ch) && (result + 1 < limit) 1311 && isTrailSurrogate(source[result + 1])) { 1312 result++; 1313 } 1314 count--; 1315 result++; 1316 } 1317 } else { 1318 if (result + shift32 < start) { 1319 throw new StringIndexOutOfBoundsException(result); 1320 } 1321 for (count = -shift32; count > 0; count--) { 1322 result--; 1323 if (result < start) { 1324 break; 1325 } 1326 ch = source[result]; 1327 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 1328 result--; 1329 } 1330 } 1331 } 1332 if (count != 0) { 1333 throw new StringIndexOutOfBoundsException(shift32); 1334 } 1335 result -= start; 1336 return result; 1337 } 1338 1339 /** 1340 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1341 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1342 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2 1343 * otherwise. 1344 * <p> 1345 * The overall effect is exactly as if the argument were converted to a string by the method 1346 * valueOf(char) and the characters in that string were then inserted into target at the 1347 * position indicated by offset16. 
1348 * </p> 1349 * <p> 1350 * The offset argument must be greater than or equal to 0, and less than or equal to the length 1351 * of source. 1352 * 1353 * @param target String buffer to insert to 1354 * @param offset16 Offset which char32 will be inserted in 1355 * @param char32 Codepoint to be inserted 1356 * @return a reference to target 1357 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1358 */ 1359 public static StringBuffer insert(StringBuffer target, int offset16, int char32) { 1360 String str = valueOf(char32); 1361 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1362 offset16++; 1363 } 1364 target.insert(offset16, str); 1365 return target; 1366 } 1367 1368 /** 1369 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the 1370 * middle of a supplementary codepoint, char32 will be inserted after the supplementary 1371 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise. 1372 * <p> 1373 * The overall effect is exactly as if the argument were converted to a string by the method 1374 * valueOf(char) and the characters in that string were then inserted into target at the 1375 * position indicated by offset16. 1376 * </p> 1377 * <p> 1378 * The offset argument must be greater than or equal to 0, and less than or equal to the limit. 1379 * 1380 * @param target Char array to insert to 1381 * @param limit End index of the char array, limit <= target.length 1382 * @param offset16 Offset which char32 will be inserted in 1383 * @param char32 Codepoint to be inserted 1384 * @return new limit size 1385 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1386 */ 1387 public static int insert(char target[], int limit, int offset16, int char32) { 1388 String str = valueOf(char32); 1389 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) { 1390 offset16++; 1391 } 1392 int size = str.length(); 1393 if (limit + size > target.length) { 1394 throw new ArrayIndexOutOfBoundsException(offset16 + size); 1395 } 1396 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16); 1397 target[offset16] = str.charAt(0); 1398 if (size == 2) { 1399 target[offset16 + 1] = str.charAt(1); 1400 } 1401 return limit + size; 1402 } 1403 1404 /** 1405 * Removes the codepoint at the specified position in this target (shortening target by 1 1406 * character if the codepoint is a non-supplementary, 2 otherwise). 1407 * 1408 * @param target String buffer to remove codepoint from 1409 * @param offset16 Offset which the codepoint will be removed 1410 * @return a reference to target 1411 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 1412 */ 1413 public static StringBuffer delete(StringBuffer target, int offset16) { 1414 int count = 1; 1415 switch (bounds(target, offset16)) { 1416 case LEAD_SURROGATE_BOUNDARY: 1417 count++; 1418 break; 1419 case TRAIL_SURROGATE_BOUNDARY: 1420 count++; 1421 offset16--; 1422 break; 1423 } 1424 target.delete(offset16, offset16 + count); 1425 return target; 1426 } 1427 1428 /** 1429 * Removes the codepoint at the specified position in this target (shortening target by 1 1430 * character if the codepoint is a non-supplementary, 2 otherwise). 1431 * 1432 * @param target String buffer to remove codepoint from 1433 * @param limit End index of the char array, limit <= target.length 1434 * @param offset16 Offset which the codepoint will be removed 1435 * @return a new limit size 1436 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid. 
1437 */ 1438 public static int delete(char target[], int limit, int offset16) { 1439 int count = 1; 1440 switch (bounds(target, 0, limit, offset16)) { 1441 case LEAD_SURROGATE_BOUNDARY: 1442 count++; 1443 break; 1444 case TRAIL_SURROGATE_BOUNDARY: 1445 count++; 1446 offset16--; 1447 break; 1448 } 1449 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count)); 1450 target[limit - count] = 0; 1451 return limit - count; 1452 } 1453 1454 /** 1455 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1456 * the argument codepoint. I.e., the smallest index <code>i</code> such that 1457 * <code>UTF16.charAt(source, i) == 1458 * char32</code> is true. 1459 * <p> 1460 * If no such character occurs in this string, then -1 is returned. 1461 * </p> 1462 * <p> 1463 * Examples:<br> 1464 * UTF16.indexOf("abc", 'a') returns 0<br> 1465 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1466 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1467 * </p> 1468 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1469 * characters to its fullest. 1470 * 1471 * @param source UTF16 format Unicode string that will be searched 1472 * @param char32 Codepoint to search for 1473 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1474 * -1 if the codepoint does not occur. 
1475 */ 1476 public static int indexOf(String source, int char32) { 1477 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1478 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1479 } 1480 // non-surrogate bmp 1481 if (char32 < LEAD_SURROGATE_MIN_VALUE 1482 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1483 return source.indexOf((char) char32); 1484 } 1485 // surrogate 1486 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1487 int result = source.indexOf((char) char32); 1488 if (result >= 0) { 1489 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1490 && isTrailSurrogate(source.charAt(result + 1))) { 1491 return indexOf(source, char32, result + 1); 1492 } 1493 // trail surrogate 1494 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1495 return indexOf(source, char32, result + 1); 1496 } 1497 } 1498 return result; 1499 } 1500 // supplementary 1501 String char32str = toString(char32); 1502 return source.indexOf(char32str); 1503 } 1504 1505 /** 1506 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1507 * the argument string str. This method is implemented based on codepoints, hence a "lead 1508 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1509 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1510 * character before str found at in source will not have a valid match. Vice versa for lead 1511 * surrogates that ends str. See example below. 1512 * <p> 1513 * If no such string str occurs in this source, then -1 is returned. 
1514 * </p> 1515 * <p> 1516 * Examples:<br> 1517 * UTF16.indexOf("abc", "ab") returns 0<br> 1518 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1519 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1520 * </p> 1521 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1522 * characters to its fullest. 1523 * 1524 * @param source UTF16 format Unicode string that will be searched 1525 * @param str UTF16 format Unicode string to search for 1526 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or 1527 * -1 if the codepoint does not occur. 1528 */ 1529 public static int indexOf(String source, String str) { 1530 int strLength = str.length(); 1531 // non-surrogate ends 1532 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1533 return source.indexOf(str); 1534 } 1535 1536 int result = source.indexOf(str); 1537 int resultEnd = result + strLength; 1538 if (result >= 0) { 1539 // check last character 1540 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1541 && isTrailSurrogate(source.charAt(resultEnd + 1))) { 1542 return indexOf(source, str, resultEnd + 1); 1543 } 1544 // check first character which is a trail surrogate 1545 if (isTrailSurrogate(str.charAt(0)) && result > 0 1546 && isLeadSurrogate(source.charAt(result - 1))) { 1547 return indexOf(source, str, resultEnd + 1); 1548 } 1549 } 1550 return result; 1551 } 1552 1553 /** 1554 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1555 * the argument codepoint. I.e., the smallest index i such that: <br> 1556 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true. 1557 * <p> 1558 * If no such character occurs in this string, then -1 is returned. 
1559 * </p> 1560 * <p> 1561 * Examples:<br> 1562 * UTF16.indexOf("abc", 'a', 1) returns -1<br> 1563 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br> 1564 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br> 1565 * </p> 1566 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1567 * characters to its fullest. 1568 * 1569 * @param source UTF16 format Unicode string that will be searched 1570 * @param char32 Codepoint to search for 1571 * @param fromIndex The index to start the search from. 1572 * @return the index of the first occurrence of the codepoint in the argument Unicode string at 1573 * or after fromIndex, or -1 if the codepoint does not occur. 1574 */ 1575 public static int indexOf(String source, int char32, int fromIndex) { 1576 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1577 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1578 } 1579 // non-surrogate bmp 1580 if (char32 < LEAD_SURROGATE_MIN_VALUE 1581 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1582 return source.indexOf((char) char32, fromIndex); 1583 } 1584 // surrogate 1585 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1586 int result = source.indexOf((char) char32, fromIndex); 1587 if (result >= 0) { 1588 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1589 && isTrailSurrogate(source.charAt(result + 1))) { 1590 return indexOf(source, char32, result + 1); 1591 } 1592 // trail surrogate 1593 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1594 return indexOf(source, char32, result + 1); 1595 } 1596 } 1597 return result; 1598 } 1599 // supplementary 1600 String char32str = toString(char32); 1601 return source.indexOf(char32str, fromIndex); 1602 } 1603 1604 /** 1605 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of 1606 * the argument string str. 
* This method is implemented based on codepoints, hence a "lead
* surrogate character + trail surrogate character" is treated as one entity. Hence if str
* starts with a trail surrogate character, an occurrence of str in source that is immediately
* preceded by a lead surrogate character is not a valid match. Vice versa for a lead
* surrogate that ends str. See example below.
* <p>
* If no such string str occurs in this source, then -1 is returned.
* </p>
* <p>
* Examples:<br>
* UTF16.indexOf("abc", "ab", 0) returns 0<br>
* UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
* UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
* UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
* </p>
* Note this method is provided as support to jdk 1.3, which does not support supplementary
* characters to its fullest.
*
* @param source UTF16 format Unicode string that will be searched
* @param str UTF16 format Unicode string to search for
* @param fromIndex The index to start the search from.
* @return the index of the first occurrence of str in the argument Unicode string at or after
*         fromIndex, or -1 if str does not occur.
1629 */ 1630 public static int indexOf(String source, String str, int fromIndex) { 1631 int strLength = str.length(); 1632 // non-surrogate ends 1633 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1634 return source.indexOf(str, fromIndex); 1635 } 1636 1637 int result = source.indexOf(str, fromIndex); 1638 int resultEnd = result + strLength; 1639 if (result >= 0) { 1640 // check last character 1641 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1642 && isTrailSurrogate(source.charAt(resultEnd))) { 1643 return indexOf(source, str, resultEnd + 1); 1644 } 1645 // check first character which is a trail surrogate 1646 if (isTrailSurrogate(str.charAt(0)) && result > 0 1647 && isLeadSurrogate(source.charAt(result - 1))) { 1648 return indexOf(source, str, resultEnd + 1); 1649 } 1650 } 1651 return result; 1652 } 1653 1654 /** 1655 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1656 * the argument codepoint. I.e., the index returned is the largest value i such that: 1657 * UTF16.charAt(source, i) == char32 is true. 1658 * <p> 1659 * Examples:<br> 1660 * UTF16.lastIndexOf("abc", 'a') returns 0<br> 1661 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br> 1662 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1663 * </p> 1664 * <p> 1665 * source is searched backwards starting at the last character. 1666 * </p> 1667 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1668 * characters to its fullest. 1669 * 1670 * @param source UTF16 format Unicode string that will be searched 1671 * @param char32 Codepoint to search for 1672 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1673 * does not occur. 
1674 */ 1675 public static int lastIndexOf(String source, int char32) { 1676 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1677 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1678 } 1679 // non-surrogate bmp 1680 if (char32 < LEAD_SURROGATE_MIN_VALUE 1681 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1682 return source.lastIndexOf((char) char32); 1683 } 1684 // surrogate 1685 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1686 int result = source.lastIndexOf((char) char32); 1687 if (result >= 0) { 1688 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1689 && isTrailSurrogate(source.charAt(result + 1))) { 1690 return lastIndexOf(source, char32, result - 1); 1691 } 1692 // trail surrogate 1693 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1694 return lastIndexOf(source, char32, result - 1); 1695 } 1696 } 1697 return result; 1698 } 1699 // supplementary 1700 String char32str = toString(char32); 1701 return source.lastIndexOf(char32str); 1702 } 1703 1704 /** 1705 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1706 * the argument string str. This method is implemented based on codepoints, hence a "lead 1707 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str 1708 * starts with trail surrogate character at index 0, a source with a leading a surrogate 1709 * character before str found at in source will not have a valid match. Vice versa for lead 1710 * surrogates that ends str. See example below. 1711 * <p> 1712 * Examples:<br> 1713 * UTF16.lastIndexOf("abc", "a") returns 0<br> 1714 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br> 1715 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br> 1716 * </p> 1717 * <p> 1718 * source is searched backwards starting at the last character. 
1719 * </p> 1720 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1721 * characters to its fullest. 1722 * 1723 * @param source UTF16 format Unicode string that will be searched 1724 * @param str UTF16 format Unicode string to search for 1725 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1726 * does not occur. 1727 */ 1728 public static int lastIndexOf(String source, String str) { 1729 int strLength = str.length(); 1730 // non-surrogate ends 1731 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1732 return source.lastIndexOf(str); 1733 } 1734 1735 int result = source.lastIndexOf(str); 1736 if (result >= 0) { 1737 // check last character 1738 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1739 && isTrailSurrogate(source.charAt(result + strLength + 1))) { 1740 return lastIndexOf(source, str, result - 1); 1741 } 1742 // check first character which is a trail surrogate 1743 if (isTrailSurrogate(str.charAt(0)) && result > 0 1744 && isLeadSurrogate(source.charAt(result - 1))) { 1745 return lastIndexOf(source, str, result - 1); 1746 } 1747 } 1748 return result; 1749 } 1750 1751 /** 1752 * <p> 1753 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1754 * the argument codepoint, where the result is less than or equals to fromIndex. 1755 * </p> 1756 * <p> 1757 * This method is implemented based on codepoints, hence a single surrogate character will not 1758 * match a supplementary character. 1759 * </p> 1760 * <p> 1761 * source is searched backwards starting at the last character starting at the specified index. 
1762 * </p> 1763 * <p> 1764 * Examples:<br> 1765 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br> 1766 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br> 1767 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br> 1768 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br> 1769 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br> 1770 * </p> 1771 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1772 * characters to its fullest. 1773 * 1774 * @param source UTF16 format Unicode string that will be searched 1775 * @param char32 Codepoint to search for 1776 * @param fromIndex the index to start the search from. There is no restriction on the value of 1777 * fromIndex. If it is greater than or equal to the length of this string, it has the 1778 * same effect as if it were equal to one less than the length of this string: this 1779 * entire string may be searched. If it is negative, it has the same effect as if it 1780 * were -1: -1 is returned. 1781 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1782 * does not occur. 
1783 */ 1784 public static int lastIndexOf(String source, int char32, int fromIndex) { 1785 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 1786 throw new IllegalArgumentException("Argument char32 is not a valid codepoint"); 1787 } 1788 // non-surrogate bmp 1789 if (char32 < LEAD_SURROGATE_MIN_VALUE 1790 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) { 1791 return source.lastIndexOf((char) char32, fromIndex); 1792 } 1793 // surrogate 1794 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 1795 int result = source.lastIndexOf((char) char32, fromIndex); 1796 if (result >= 0) { 1797 if (isLeadSurrogate((char) char32) && (result < source.length() - 1) 1798 && isTrailSurrogate(source.charAt(result + 1))) { 1799 return lastIndexOf(source, char32, result - 1); 1800 } 1801 // trail surrogate 1802 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) { 1803 return lastIndexOf(source, char32, result - 1); 1804 } 1805 } 1806 return result; 1807 } 1808 // supplementary 1809 String char32str = toString(char32); 1810 return source.lastIndexOf(char32str, fromIndex); 1811 } 1812 1813 /** 1814 * <p> 1815 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of 1816 * the argument string str, where the result is less than or equals to fromIndex. 1817 * </p> 1818 * <p> 1819 * This method is implemented based on codepoints, hence a "lead surrogate character + trail 1820 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate 1821 * character at index 0, a source with a leading a surrogate character before str found at in 1822 * source will not have a valid match. Vice versa for lead surrogates that ends str. 1823 * </p> 1824 * See example below. 
1825 * <p> 1826 * Examples:<br> 1827 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br> 1828 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br> 1829 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br> 1830 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br> 1831 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br> 1832 * </p> 1833 * <p> 1834 * source is searched backwards starting at the last character. 1835 * </p> 1836 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1837 * characters to its fullest. 1838 * 1839 * @param source UTF16 format Unicode string that will be searched 1840 * @param str UTF16 format Unicode string to search for 1841 * @param fromIndex the index to start the search from. There is no restriction on the value of 1842 * fromIndex. If it is greater than or equal to the length of this string, it has the 1843 * same effect as if it were equal to one less than the length of this string: this 1844 * entire string may be searched. If it is negative, it has the same effect as if it 1845 * were -1: -1 is returned. 1846 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint 1847 * does not occur. 
1848 */ 1849 public static int lastIndexOf(String source, String str, int fromIndex) { 1850 int strLength = str.length(); 1851 // non-surrogate ends 1852 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) { 1853 return source.lastIndexOf(str, fromIndex); 1854 } 1855 1856 int result = source.lastIndexOf(str, fromIndex); 1857 if (result >= 0) { 1858 // check last character 1859 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1) 1860 && isTrailSurrogate(source.charAt(result + strLength))) { 1861 return lastIndexOf(source, str, result - 1); 1862 } 1863 // check first character which is a trail surrogate 1864 if (isTrailSurrogate(str.charAt(0)) && result > 0 1865 && isLeadSurrogate(source.charAt(result - 1))) { 1866 return lastIndexOf(source, str, result - 1); 1867 } 1868 } 1869 return result; 1870 } 1871 1872 /** 1873 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of 1874 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16 1875 * format Unicode string source, then source will be returned. Otherwise, a new String object is 1876 * created that represents a codepoint sequence identical to the codepoint sequence represented 1877 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of 1878 * newChar32. 
1879 * <p> 1880 * Examples: <br> 1881 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br> 1882 * returns "mosquito in your collar"<br> 1883 * UTF16.replace("JonL", 'q', 'x');<br> 1884 * returns "JonL" (no change)<br> 1885 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br> 1886 * returns "Supplementary character !"<br> 1887 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br> 1888 * returns "Supplementary character \ud800\udc00"<br> 1889 * </p> 1890 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1891 * characters to its fullest. 1892 * 1893 * @param source UTF16 format Unicode string which the codepoint replacements will be based on. 1894 * @param oldChar32 Non-zero old codepoint to be replaced. 1895 * @param newChar32 The new codepoint to replace oldChar32 1896 * @return new String derived from source by replacing every occurrence of oldChar32 with 1897 * newChar32, unless when no oldChar32 is found in source then source will be returned. 
1898 */ 1899 public static String replace(String source, int oldChar32, int newChar32) { 1900 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) { 1901 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint"); 1902 } 1903 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) { 1904 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint"); 1905 } 1906 1907 int index = indexOf(source, oldChar32); 1908 if (index == -1) { 1909 return source; 1910 } 1911 String newChar32Str = toString(newChar32); 1912 int oldChar32Size = 1; 1913 int newChar32Size = newChar32Str.length(); 1914 StringBuffer result = new StringBuffer(source); 1915 int resultIndex = index; 1916 1917 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) { 1918 oldChar32Size = 2; 1919 } 1920 1921 while (index != -1) { 1922 int endResultIndex = resultIndex + oldChar32Size; 1923 result.replace(resultIndex, endResultIndex, newChar32Str); 1924 int lastEndIndex = index + oldChar32Size; 1925 index = indexOf(source, oldChar32, lastEndIndex); 1926 resultIndex += newChar32Size + index - lastEndIndex; 1927 } 1928 return result.toString(); 1929 } 1930 1931 /** 1932 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr 1933 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string 1934 * source, then source will be returned. Otherwise, a new String object is created that 1935 * represents a codepoint sequence identical to the codepoint sequence represented by source, 1936 * except that every occurrence of oldStr is replaced by an occurrence of newStr. 
1937 * <p> 1938 * Examples: <br> 1939 * UTF16.replace("mesquite in your cellar", "e", "o");<br> 1940 * returns "mosquito in your collar"<br> 1941 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br> 1942 * returns "cat in your cellar"<br> 1943 * UTF16.replace("JonL", "q", "x");<br> 1944 * returns "JonL" (no change)<br> 1945 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br> 1946 * returns "Supplementary character !"<br> 1947 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br> 1948 * returns "Supplementary character \ud800\udc00"<br> 1949 * </p> 1950 * Note this method is provided as support to jdk 1.3, which does not support supplementary 1951 * characters to its fullest. 1952 * 1953 * @param source UTF16 format Unicode string which the replacements will be based on. 1954 * @param oldStr Non-zero-length string to be replaced. 1955 * @param newStr The new string to replace oldStr 1956 * @return new String derived from source by replacing every occurrence of oldStr with newStr. 1957 * When no oldStr is found in source, then source will be returned. 1958 */ 1959 public static String replace(String source, String oldStr, String newStr) { 1960 int index = indexOf(source, oldStr); 1961 if (index == -1) { 1962 return source; 1963 } 1964 int oldStrSize = oldStr.length(); 1965 int newStrSize = newStr.length(); 1966 StringBuffer result = new StringBuffer(source); 1967 int resultIndex = index; 1968 1969 while (index != -1) { 1970 int endResultIndex = resultIndex + oldStrSize; 1971 result.replace(resultIndex, endResultIndex, newStr); 1972 int lastEndIndex = index + oldStrSize; 1973 index = indexOf(source, oldStr, lastEndIndex); 1974 resultIndex += newStrSize + index - lastEndIndex; 1975 } 1976 return result.toString(); 1977 } 1978 1979 /** 1980 * Reverses a UTF16 format Unicode string and replaces source's content with it. 
This method 1981 * will reverse surrogate characters correctly, instead of blindly reversing every character. 1982 * <p> 1983 * Examples:<br> 1984 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br> 1985 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS". 1986 * 1987 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed 1988 * @return a modified source with reversed UTF16 format Unicode string. 1989 */ 1990 public static StringBuffer reverse(StringBuffer source) { 1991 int length = source.length(); 1992 StringBuffer result = new StringBuffer(length); 1993 for (int i = length; i-- > 0;) { 1994 char ch = source.charAt(i); 1995 if (isTrailSurrogate(ch) && i > 0) { 1996 char ch2 = source.charAt(i - 1); 1997 if (isLeadSurrogate(ch2)) { 1998 result.append(ch2); 1999 result.append(ch); 2000 --i; 2001 continue; 2002 } 2003 } 2004 result.append(ch); 2005 } 2006 return result; 2007 } 2008 2009 /** 2010 * Check if the string contains more Unicode code points than a certain number. This is more 2011 * efficient than counting all code points in the entire string and comparing that number with a 2012 * threshold. This function may not need to scan the string at all if the length is within a 2013 * certain range, and never needs to count more than 'number + 1' code points. Logically 2014 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two 2015 * code units. 2016 * 2017 * @param source The input string. 2018 * @param number The number of code points in the string is compared against the 'number' 2019 * parameter. 2020 * @return boolean value for whether the string contains more Unicode code points than 'number'. 
2021 */ 2022 public static boolean hasMoreCodePointsThan(String source, int number) { 2023 if (number < 0) { 2024 return true; 2025 } 2026 if (source == null) { 2027 return false; 2028 } 2029 int length = source.length(); 2030 2031 // length >= 0 known 2032 // source contains at least (length + 1) / 2 code points: <= 2 2033 // chars per cp 2034 if (((length + 1) >> 1) > number) { 2035 return true; 2036 } 2037 2038 // check if source does not even contain enough chars 2039 int maxsupplementary = length - number; 2040 if (maxsupplementary <= 0) { 2041 return false; 2042 } 2043 2044 // there are maxsupplementary = length - number more chars than 2045 // asked-for code points 2046 2047 // count code points until they exceed and also check that there are 2048 // no more than maxsupplementary supplementary code points (char pairs) 2049 int start = 0; 2050 while (true) { 2051 if (length == 0) { 2052 return false; 2053 } 2054 if (number == 0) { 2055 return true; 2056 } 2057 if (isLeadSurrogate(source.charAt(start++)) && start != length 2058 && isTrailSurrogate(source.charAt(start))) { 2059 start++; 2060 if (--maxsupplementary <= 0) { 2061 // too many pairs - too few code points 2062 return false; 2063 } 2064 } 2065 --number; 2066 } 2067 } 2068 2069 /** 2070 * Check if the sub-range of char array, from argument start to limit, contains more Unicode 2071 * code points than a certain number. This is more efficient than counting all code points in 2072 * the entire char array range and comparing that number with a threshold. This function may not 2073 * need to scan the char array at all if start and limit is within a certain range, and never 2074 * needs to count more than 'number + 1' code points. Logically equivalent to 2075 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one 2076 * or two code units. 
2077 * 2078 * @param source Array of UTF-16 chars 2079 * @param start Offset to substring in the source array for analyzing 2080 * @param limit Offset to substring in the source array for analyzing 2081 * @param number The number of code points in the string is compared against the 'number' 2082 * parameter. 2083 * @return boolean value for whether the string contains more Unicode code points than 'number'. 2084 * @exception IndexOutOfBoundsException Thrown when limit < start 2085 */ 2086 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) { 2087 int length = limit - start; 2088 if (length < 0 || start < 0 || limit < 0) { 2089 throw new IndexOutOfBoundsException( 2090 "Start and limit indexes should be non-negative and start <= limit"); 2091 } 2092 if (number < 0) { 2093 return true; 2094 } 2095 if (source == null) { 2096 return false; 2097 } 2098 2099 // length >= 0 known 2100 // source contains at least (length + 1) / 2 code points: <= 2 2101 // chars per cp 2102 if (((length + 1) >> 1) > number) { 2103 return true; 2104 } 2105 2106 // check if source does not even contain enough chars 2107 int maxsupplementary = length - number; 2108 if (maxsupplementary <= 0) { 2109 return false; 2110 } 2111 2112 // there are maxsupplementary = length - number more chars than 2113 // asked-for code points 2114 2115 // count code points until they exceed and also check that there are 2116 // no more than maxsupplementary supplementary code points (char pairs) 2117 while (true) { 2118 if (length == 0) { 2119 return false; 2120 } 2121 if (number == 0) { 2122 return true; 2123 } 2124 if (isLeadSurrogate(source[start++]) && start != limit 2125 && isTrailSurrogate(source[start])) { 2126 start++; 2127 if (--maxsupplementary <= 0) { 2128 // too many pairs - too few code points 2129 return false; 2130 } 2131 } 2132 --number; 2133 } 2134 } 2135 2136 /** 2137 * Check if the string buffer contains more Unicode code points than a certain number. 
This is 2138 * more efficient than counting all code points in the entire string buffer and comparing that 2139 * number with a threshold. This function may not need to scan the string buffer at all if the 2140 * length is within a certain range, and never needs to count more than 'number + 1' code 2141 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may 2142 * occupy either one or two code units. 2143 * 2144 * @param source The input string buffer. 2145 * @param number The number of code points in the string buffer is compared against the 'number' 2146 * parameter. 2147 * @return boolean value for whether the string buffer contains more Unicode code points than 2148 * 'number'. 2149 */ 2150 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) { 2151 if (number < 0) { 2152 return true; 2153 } 2154 if (source == null) { 2155 return false; 2156 } 2157 int length = source.length(); 2158 2159 // length >= 0 known 2160 // source contains at least (length + 1) / 2 code points: <= 2 2161 // chars per cp 2162 if (((length + 1) >> 1) > number) { 2163 return true; 2164 } 2165 2166 // check if source does not even contain enough chars 2167 int maxsupplementary = length - number; 2168 if (maxsupplementary <= 0) { 2169 return false; 2170 } 2171 2172 // there are maxsupplementary = length - number more chars than 2173 // asked-for code points 2174 2175 // count code points until they exceed and also check that there are 2176 // no more than maxsupplementary supplementary code points (char pairs) 2177 int start = 0; 2178 while (true) { 2179 if (length == 0) { 2180 return false; 2181 } 2182 if (number == 0) { 2183 return true; 2184 } 2185 if (isLeadSurrogate(source.charAt(start++)) && start != length 2186 && isTrailSurrogate(source.charAt(start))) { 2187 start++; 2188 if (--maxsupplementary <= 0) { 2189 // too many pairs - too few code points 2190 return false; 2191 } 2192 } 2193 --number; 2194 } 2195 } 2196 2197 /** 
2198 * Cover JDK 1.5 API. Create a String from an array of codePoints. 2199 * 2200 * @param codePoints The code array 2201 * @param offset The start of the text in the code point array 2202 * @param count The number of code points 2203 * @return a String representing the code points between offset and count 2204 * @throws IllegalArgumentException If an invalid code point is encountered 2205 * @throws IndexOutOfBoundsException If the offset or count are out of bounds. 2206 */ 2207 public static String newString(int[] codePoints, int offset, int count) { 2208 if (count < 0) { 2209 throw new IllegalArgumentException(); 2210 } 2211 char[] chars = new char[count]; 2212 int w = 0; 2213 for (int r = offset, e = offset + count; r < e; ++r) { 2214 int cp = codePoints[r]; 2215 if (cp < 0 || cp > 0x10ffff) { 2216 throw new IllegalArgumentException(); 2217 } 2218 while (true) { 2219 try { 2220 if (cp < 0x010000) { 2221 chars[w] = (char) cp; 2222 w++; 2223 } else { 2224 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); 2225 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); 2226 w += 2; 2227 } 2228 break; 2229 } catch (IndexOutOfBoundsException ex) { 2230 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2) 2231 / (r - offset + 1))); 2232 char[] temp = new char[newlen]; 2233 System.arraycopy(chars, 0, temp, 0, w); 2234 chars = temp; 2235 } 2236 } 2237 } 2238 return new String(chars, 0, w); 2239 } 2240 2241 /** 2242 * <p> 2243 * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various 2244 * modes 2245 * </p> 2246 * <ul> 2247 * <li> Code point comparison or code unit comparison 2248 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison 2249 * with special handling for character 'i'. 
2250 * </ul> 2251 * <p> 2252 * The code unit or code point comparison differ only when comparing supplementary code points 2253 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e., 2254 * \ue000..\uffff). In code unit comparison, high BMP code points sort after 2255 * supplementary code points because they are stored as pairs of surrogates which are at 2256 * \ud800..\udfff. 2257 * </p> 2258 * 2259 * @see #FOLD_CASE_DEFAULT 2260 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2261 */ 2262 public static final class StringComparator implements java.util.Comparator<String> { 2263 // public constructor ------------------------------------------------ 2264 2265 /** 2266 * Default constructor that does code unit comparison and case sensitive comparison. 2267 */ 2268 public StringComparator() { 2269 this(false, false, FOLD_CASE_DEFAULT); 2270 } 2271 2272 /** 2273 * Constructor that does comparison based on the argument options. 2274 * 2275 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit 2276 * comparison. 2277 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison 2278 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2279 * when ignorecase is set to true. If ignorecase is false, this option is 2280 * ignored. 
2281 * @see #FOLD_CASE_DEFAULT 2282 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2283 * @throws IllegalArgumentException If foldcaseoption is out of range 2284 */ 2285 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) { 2286 setCodePointCompare(codepointcompare); 2287 m_ignoreCase_ = ignorecase; 2288 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2289 throw new IllegalArgumentException("Invalid fold case option"); 2290 } 2291 m_foldCase_ = foldcaseoption; 2292 } 2293 2294 // public data member ------------------------------------------------ 2295 2296 /** 2297 * Option value for case folding comparison: 2298 * 2299 * <p>Comparison is case insensitive, strings are folded using default mappings defined in 2300 * Unicode data file CaseFolding.txt, before comparison. 2301 */ 2302 public static final int FOLD_CASE_DEFAULT = 0; 2303 2304 /** 2305 * Option value for case folding: 2306 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I 2307 * and dotless i appropriately for Turkic languages (tr, az). 2308 * 2309 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in 2310 * Unicode data file CaseFolding.txt, before comparison. 2311 * 2312 * @see android.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I 2313 */ 2314 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1; 2315 2316 // public methods ---------------------------------------------------- 2317 2318 // public setters ---------------------------------------------------- 2319 2320 /** 2321 * Sets the comparison mode to code point compare if flag is true. 
Otherwise comparison mode 2322 * is set to code unit compare 2323 * 2324 * @param flag True for code point compare, false for code unit compare 2325 */ 2326 public void setCodePointCompare(boolean flag) { 2327 if (flag) { 2328 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER; 2329 } else { 2330 m_codePointCompare_ = 0; 2331 } 2332 } 2333 2334 /** 2335 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise 2336 * case sensitive comparison mode if set to false. 2337 * 2338 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison 2339 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only 2340 * when ignorecase is set to true. If ignorecase is false, this option is 2341 * ignored. 2342 * @see #FOLD_CASE_DEFAULT 2343 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2344 */ 2345 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) { 2346 m_ignoreCase_ = ignorecase; 2347 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) { 2348 throw new IllegalArgumentException("Invalid fold case option"); 2349 } 2350 m_foldCase_ = foldcaseoption; 2351 } 2352 2353 // public getters ---------------------------------------------------- 2354 2355 /** 2356 * Checks if the comparison mode is code point compare. 2357 * 2358 * @return true for code point compare, false for code unit compare 2359 */ 2360 public boolean getCodePointCompare() { 2361 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2362 } 2363 2364 /** 2365 * Checks if Comparator is in the case insensitive mode. 2366 * 2367 * @return true if Comparator performs case insensitive comparison, false otherwise 2368 */ 2369 public boolean getIgnoreCase() { 2370 return m_ignoreCase_; 2371 } 2372 2373 /** 2374 * Gets the fold case options set in Comparator to be used with case insensitive comparison. 
2375 * 2376 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I 2377 * @see #FOLD_CASE_DEFAULT 2378 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I 2379 */ 2380 public int getIgnoreCaseOption() { 2381 return m_foldCase_; 2382 } 2383 2384 // public other methods ---------------------------------------------- 2385 2386 /** 2387 * Compare two strings depending on the options selected during construction. 2388 * 2389 * @param a first source string. 2390 * @param b second source string. 2391 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b, 2392 * a positive value is returned. 2393 * @exception ClassCastException thrown when either a or b is not a String object 2394 */ 2395 @Override 2396 public int compare(String a, String b) { 2397 if (Utility.sameObjects(a, b)) { 2398 return 0; 2399 } 2400 if (a == null) { 2401 return -1; 2402 } 2403 if (b == null) { 2404 return 1; 2405 } 2406 2407 if (m_ignoreCase_) { 2408 return compareCaseInsensitive(a, b); 2409 } 2410 return compareCaseSensitive(a, b); 2411 } 2412 2413 // private data member ---------------------------------------------- 2414 2415 /** 2416 * Code unit comparison flag. True if code unit comparison is required. False if code point 2417 * comparison is required. 2418 */ 2419 private int m_codePointCompare_; 2420 2421 /** 2422 * Fold case comparison option. 2423 */ 2424 private int m_foldCase_; 2425 2426 /** 2427 * Flag indicator if ignore case is to be used during comparison 2428 */ 2429 private boolean m_ignoreCase_; 2430 2431 /** 2432 * Code point order offset for surrogate characters 2433 */ 2434 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800; 2435 2436 // private method --------------------------------------------------- 2437 2438 /** 2439 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life 2440 * easier. 
2441 * 2442 * @param s1 2443 * first string to compare 2444 * @param s2 2445 * second string to compare 2446 * @return -1 is s1 < s2, 0 if equals, 2447 */ 2448 private int compareCaseInsensitive(String s1, String s2) { 2449 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_ 2450 | Normalizer.COMPARE_IGNORE_CASE); 2451 } 2452 2453 /** 2454 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life 2455 * easier. 2456 * 2457 * @param s1 2458 * first string to compare 2459 * @param s2 2460 * second string to compare 2461 * @return -1 is s1 < s2, 0 if equals, 2462 */ 2463 private int compareCaseSensitive(String s1, String s2) { 2464 // compare identical prefixes - they do not need to be fixed up 2465 // limit1 = start1 + min(lenght1, length2) 2466 int length1 = s1.length(); 2467 int length2 = s2.length(); 2468 int minlength = length1; 2469 int result = 0; 2470 if (length1 < length2) { 2471 result = -1; 2472 } else if (length1 > length2) { 2473 result = 1; 2474 minlength = length2; 2475 } 2476 2477 char c1 = 0; 2478 char c2 = 0; 2479 int index = 0; 2480 for (; index < minlength; index++) { 2481 c1 = s1.charAt(index); 2482 c2 = s2.charAt(index); 2483 // check pseudo-limit 2484 if (c1 != c2) { 2485 break; 2486 } 2487 } 2488 2489 if (index == minlength) { 2490 return result; 2491 } 2492 2493 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER; 2494 // if both values are in or above the surrogate range, fix them up 2495 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE 2496 && codepointcompare) { 2497 // subtract 0x2800 from BMP code points to make them smaller 2498 // than supplementary ones 2499 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1))) 2500 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) { 2501 // part of a surrogate pair, leave >=d800 2502 } else { 2503 // BMP code point - 
may be surrogate code point - make 2504 // < d800 2505 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2506 } 2507 2508 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1))) 2509 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) { 2510 // part of a surrogate pair, leave >=d800 2511 } else { 2512 // BMP code point - may be surrogate code point - make <d800 2513 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_; 2514 } 2515 } 2516 2517 // now c1 and c2 are in UTF-32-compatible order 2518 return c1 - c2; 2519 } 2520 } 2521 2522 /** 2523 * Utility for getting a code point from a CharSequence that contains exactly one code point. 2524 * @return the code point IF the string is non-null and consists of a single code point. 2525 * otherwise returns -1. 2526 * @param s to test 2527 */ 2528 public static int getSingleCodePoint(CharSequence s) { 2529 if (s == null || s.length() == 0) { 2530 return -1; 2531 } else if (s.length() == 1) { 2532 return s.charAt(0); 2533 } else if (s.length() > 2) { 2534 return -1; 2535 } 2536 2537 // at this point, len = 2 2538 int cp = Character.codePointAt(s, 0); 2539 if (cp > 0xFFFF) { // is surrogate pair 2540 return cp; 2541 } 2542 return -1; 2543 } 2544 2545 /** 2546 * Utility for comparing a code point to a string without having to create a new string. Returns the same results 2547 * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if 2548 * <pre> 2549 * sc = new StringComparator(true,false,0); 2550 * fast = UTF16.compareCodePoint(codePoint, charSequence) 2551 * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString()) 2552 * </pre> 2553 * then 2554 * <pre> 2555 * Integer.signum(fast) == Integer.signum(slower) 2556 * </pre> 2557 * @param codePoint to test 2558 * @param s to test 2559 * @return equivalent of code point comparator comparing two strings. 
2560 */ 2561 public static int compareCodePoint(int codePoint, CharSequence s) { 2562 if (s == null) { 2563 return 1; 2564 } 2565 final int strLen = s.length(); 2566 if (strLen == 0) { 2567 return 1; 2568 } 2569 int second = Character.codePointAt(s, 0); 2570 int diff = codePoint - second; 2571 if (diff != 0) { 2572 return diff; 2573 } 2574 return strLen == Character.charCount(codePoint) ? 0 : -1; 2575 } 2576 2577 // private data members ------------------------------------------------- 2578 2579 /** 2580 * Shift value for lead surrogate to form a supplementary character. 2581 */ 2582 private static final int LEAD_SURROGATE_SHIFT_ = 10; 2583 2584 /** 2585 * Mask to retrieve the significant value from a trail surrogate. 2586 */ 2587 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 2588 2589 /** 2590 * Value that all lead surrogate starts with 2591 */ 2592 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE 2593 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); 2594 2595 // private methods ------------------------------------------------------ 2596 2597 /** 2598 * <p> 2599 * Converts argument code point and returns a String object representing the code point's value 2600 * in UTF16 format. 2601 * </p> 2602 * <p> 2603 * This method does not check for the validity of the codepoint, the results are not guaranteed 2604 * if a invalid codepoint is passed as argument. 2605 * </p> 2606 * <p> 2607 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise. 2608 * </p> 2609 * 2610 * @param ch 2611 * code point 2612 * @return string representation of the code point 2613 */ 2614 private static String toString(int ch) { 2615 if (ch < SUPPLEMENTARY_MIN_VALUE) { 2616 return String.valueOf((char) ch); 2617 } 2618 2619 StringBuilder result = new StringBuilder(); 2620 result.append(getLeadSurrogate(ch)); 2621 result.append(getTrailSurrogate(ch)); 2622 return result.toString(); 2623 } 2624 } 2625 // eof 2626