1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2015, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package android.icu.impl; 11 12 import java.io.IOException; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Locale; 16 import java.util.regex.Pattern; 17 18 import android.icu.lang.UCharacter; 19 import android.icu.text.Replaceable; 20 import android.icu.text.UTF16; 21 import android.icu.text.UnicodeMatcher; 22 23 /** 24 * @hide Only a subset of ICU is exposed in Android 25 */ 26 public final class Utility { 27 28 private static final char APOSTROPHE = '\''; 29 private static final char BACKSLASH = '\\'; 30 private static final int MAGIC_UNSIGNED = 0x80000000; 31 32 /** 33 * Convenience utility to compare two Object[]s. 34 * Ought to be in System 35 */ 36 public final static boolean arrayEquals(Object[] source, Object target) { 37 if (source == null) return (target == null); 38 if (!(target instanceof Object[])) return false; 39 Object[] targ = (Object[]) target; 40 return (source.length == targ.length 41 && arrayRegionMatches(source, 0, targ, 0, source.length)); 42 } 43 44 /** 45 * Convenience utility to compare two int[]s 46 * Ought to be in System 47 */ 48 public final static boolean arrayEquals(int[] source, Object target) { 49 if (source == null) return (target == null); 50 if (!(target instanceof int[])) return false; 51 int[] targ = (int[]) target; 52 return (source.length == targ.length 53 && arrayRegionMatches(source, 0, targ, 0, source.length)); 54 } 55 56 /** 57 * Convenience utility to compare two double[]s 58 * Ought to be in System 59 */ 60 public final static boolean arrayEquals(double[] source, Object target) { 61 if (source == null) return (target == null); 62 if (!(target instanceof double[])) return false; 63 double[] targ = (double[]) target; 64 return (source.length == targ.length 65 && arrayRegionMatches(source, 0, targ, 0, source.length)); 66 } 67 public final static boolean arrayEquals(byte[] source, Object target) { 68 if (source == null) return (target == null); 69 if (!(target instanceof byte[])) return false; 70 byte[] targ = (byte[]) target; 71 return (source.length == targ.length 72 && arrayRegionMatches(source, 0, targ, 0, source.length)); 73 } 74 75 /** 76 * Convenience utility to compare two Object[]s 77 * Ought to be in System 78 */ 79 public final static boolean arrayEquals(Object source, Object target) { 80 if (source == null) return (target == null); 81 // for some reason, the correct arrayEquals is not being called 82 // so do it by hand for now. 83 if (source instanceof Object[]) 84 return(arrayEquals((Object[]) source,target)); 85 if (source instanceof int[]) 86 return(arrayEquals((int[]) source,target)); 87 if (source instanceof double[]) 88 return(arrayEquals((double[]) source, target)); 89 if (source instanceof byte[]) 90 return(arrayEquals((byte[]) source,target)); 91 return source.equals(target); 92 } 93 94 /** 95 * Convenience utility to compare two Object[]s 96 * Ought to be in System. 97 * @param len the length to compare. 98 * The start indices and start+len must be valid. 99 */ 100 public final static boolean arrayRegionMatches(Object[] source, int sourceStart, 101 Object[] target, int targetStart, 102 int len) 103 { 104 int sourceEnd = sourceStart + len; 105 int delta = targetStart - sourceStart; 106 for (int i = sourceStart; i < sourceEnd; i++) { 107 if (!arrayEquals(source[i],target[i + delta])) 108 return false; 109 } 110 return true; 111 } 112 113 /** 114 * Convenience utility to compare two Object[]s 115 * Ought to be in System. 116 * @param len the length to compare. 117 * The start indices and start+len must be valid. 118 */ 119 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 120 char[] target, int targetStart, 121 int len) 122 { 123 int sourceEnd = sourceStart + len; 124 int delta = targetStart - sourceStart; 125 for (int i = sourceStart; i < sourceEnd; i++) { 126 if (source[i]!=target[i + delta]) 127 return false; 128 } 129 return true; 130 } 131 132 /** 133 * Convenience utility to compare two int[]s. 134 * @param len the length to compare. 135 * The start indices and start+len must be valid. 136 * Ought to be in System 137 */ 138 public final static boolean arrayRegionMatches(int[] source, int sourceStart, 139 int[] target, int targetStart, 140 int len) 141 { 142 int sourceEnd = sourceStart + len; 143 int delta = targetStart - sourceStart; 144 for (int i = sourceStart; i < sourceEnd; i++) { 145 if (source[i] != target[i + delta]) 146 return false; 147 } 148 return true; 149 } 150 151 /** 152 * Convenience utility to compare two arrays of doubles. 153 * @param len the length to compare. 154 * The start indices and start+len must be valid. 155 * Ought to be in System 156 */ 157 public final static boolean arrayRegionMatches(double[] source, int sourceStart, 158 double[] target, int targetStart, 159 int len) 160 { 161 int sourceEnd = sourceStart + len; 162 int delta = targetStart - sourceStart; 163 for (int i = sourceStart; i < sourceEnd; i++) { 164 if (source[i] != target[i + delta]) 165 return false; 166 } 167 return true; 168 } 169 public final static boolean arrayRegionMatches(byte[] source, int sourceStart, 170 byte[] target, int targetStart, int len){ 171 int sourceEnd = sourceStart + len; 172 int delta = targetStart - sourceStart; 173 for (int i = sourceStart; i < sourceEnd; i++) { 174 if (source[i] != target[i + delta]) 175 return false; 176 } 177 return true; 178 } 179 180 /** 181 * Trivial reference equality. 182 * This method should help document that we really want == not equals(), 183 * and to have a single place to suppress warnings from static analysis tools. 184 */ 185 public static final boolean sameObjects(Object a, Object b) { 186 return a == b; 187 } 188 189 /** 190 * Convenience utility. Does null checks on objects, then calls equals. 191 */ 192 public final static boolean objectEquals(Object a, Object b) { 193 return a == null ? 194 b == null ? true : false : 195 b == null ? false : a.equals(b); 196 } 197 198 /** 199 * Convenience utility. Does null checks on objects, then calls compare. 200 */ 201 public static <T extends Comparable<T>> int checkCompare(T a, T b) { 202 return a == null ? 203 b == null ? 0 : -1 : 204 b == null ? 1 : a.compareTo(b); 205 } 206 207 /** 208 * Convenience utility. Does null checks on object, then calls hashCode. 209 */ 210 public static int checkHash(Object a) { 211 return a == null ? 0 : a.hashCode(); 212 } 213 214 /** 215 * The ESCAPE character is used during run-length encoding. It signals 216 * a run of identical chars. 217 */ 218 private static final char ESCAPE = '\uA5A5'; 219 220 /** 221 * The ESCAPE_BYTE character is used during run-length encoding. It signals 222 * a run of identical bytes. 223 */ 224 static final byte ESCAPE_BYTE = (byte)0xA5; 225 226 /** 227 * Construct a string representing an int array. Use run-length encoding. 228 * A character represents itself, unless it is the ESCAPE character. Then 229 * the following notations are possible: 230 * ESCAPE ESCAPE ESCAPE literal 231 * ESCAPE n c n instances of character c 232 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 233 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 234 * If we encounter a run where n == ESCAPE, we represent this as: 235 * c ESCAPE n-1 c 236 * The ESCAPE value is chosen so as not to collide with commonly 237 * seen values. 238 */ 239 static public final String arrayToRLEString(int[] a) { 240 StringBuilder buffer = new StringBuilder(); 241 242 appendInt(buffer, a.length); 243 int runValue = a[0]; 244 int runLength = 1; 245 for (int i=1; i<a.length; ++i) { 246 int s = a[i]; 247 if (s == runValue && runLength < 0xFFFF) { 248 ++runLength; 249 } else { 250 encodeRun(buffer, runValue, runLength); 251 runValue = s; 252 runLength = 1; 253 } 254 } 255 encodeRun(buffer, runValue, runLength); 256 return buffer.toString(); 257 } 258 259 /** 260 * Construct a string representing a short array. Use run-length encoding. 261 * A character represents itself, unless it is the ESCAPE character. Then 262 * the following notations are possible: 263 * ESCAPE ESCAPE ESCAPE literal 264 * ESCAPE n c n instances of character c 265 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 266 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 267 * If we encounter a run where n == ESCAPE, we represent this as: 268 * c ESCAPE n-1 c 269 * The ESCAPE value is chosen so as not to collide with commonly 270 * seen values. 271 */ 272 static public final String arrayToRLEString(short[] a) { 273 StringBuilder buffer = new StringBuilder(); 274 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]); 275 buffer.append((char) (a.length >> 16)); 276 buffer.append((char) a.length); 277 short runValue = a[0]; 278 int runLength = 1; 279 for (int i=1; i<a.length; ++i) { 280 short s = a[i]; 281 if (s == runValue && runLength < 0xFFFF) ++runLength; 282 else { 283 encodeRun(buffer, runValue, runLength); 284 runValue = s; 285 runLength = 1; 286 } 287 } 288 encodeRun(buffer, runValue, runLength); 289 return buffer.toString(); 290 } 291 292 /** 293 * Construct a string representing a char array. Use run-length encoding. 294 * A character represents itself, unless it is the ESCAPE character. Then 295 * the following notations are possible: 296 * ESCAPE ESCAPE ESCAPE literal 297 * ESCAPE n c n instances of character c 298 * Since an encoded run occupies 3 characters, we only encode runs of 4 or 299 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. 300 * If we encounter a run where n == ESCAPE, we represent this as: 301 * c ESCAPE n-1 c 302 * The ESCAPE value is chosen so as not to collide with commonly 303 * seen values. 304 */ 305 static public final String arrayToRLEString(char[] a) { 306 StringBuilder buffer = new StringBuilder(); 307 buffer.append((char) (a.length >> 16)); 308 buffer.append((char) a.length); 309 char runValue = a[0]; 310 int runLength = 1; 311 for (int i=1; i<a.length; ++i) { 312 char s = a[i]; 313 if (s == runValue && runLength < 0xFFFF) ++runLength; 314 else { 315 encodeRun(buffer, (short)runValue, runLength); 316 runValue = s; 317 runLength = 1; 318 } 319 } 320 encodeRun(buffer, (short)runValue, runLength); 321 return buffer.toString(); 322 } 323 324 /** 325 * Construct a string representing a byte array. Use run-length encoding. 326 * Two bytes are packed into a single char, with a single extra zero byte at 327 * the end if needed. A byte represents itself, unless it is the 328 * ESCAPE_BYTE. Then the following notations are possible: 329 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal 330 * ESCAPE_BYTE n b n instances of byte b 331 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or 332 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. 333 * If we encounter a run where n == ESCAPE_BYTE, we represent this as: 334 * b ESCAPE_BYTE n-1 b 335 * The ESCAPE_BYTE value is chosen so as not to collide with commonly 336 * seen values. 337 */ 338 static public final String arrayToRLEString(byte[] a) { 339 StringBuilder buffer = new StringBuilder(); 340 buffer.append((char) (a.length >> 16)); 341 buffer.append((char) a.length); 342 byte runValue = a[0]; 343 int runLength = 1; 344 byte[] state = new byte[2]; 345 for (int i=1; i<a.length; ++i) { 346 byte b = a[i]; 347 if (b == runValue && runLength < 0xFF) ++runLength; 348 else { 349 encodeRun(buffer, runValue, runLength, state); 350 runValue = b; 351 runLength = 1; 352 } 353 } 354 encodeRun(buffer, runValue, runLength, state); 355 356 // We must save the final byte, if there is one, by padding 357 // an extra zero. 358 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state); 359 360 return buffer.toString(); 361 } 362 363 /** 364 * Encode a run, possibly a degenerate run (of < 4 values). 365 * @param length The length of the run; must be > 0 && <= 0xFFFF. 366 */ 367 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) { 368 if (length < 4) { 369 for (int j=0; j<length; ++j) { 370 if (value == ESCAPE) { 371 appendInt(buffer, value); 372 } 373 appendInt(buffer, value); 374 } 375 } 376 else { 377 if (length == ESCAPE) { 378 if (value == ESCAPE) { 379 appendInt(buffer, ESCAPE); 380 } 381 appendInt(buffer, value); 382 --length; 383 } 384 appendInt(buffer, ESCAPE); 385 appendInt(buffer, length); 386 appendInt(buffer, value); // Don't need to escape this value 387 } 388 } 389 390 private static final <T extends Appendable> void appendInt(T buffer, int value) { 391 try { 392 buffer.append((char)(value >>> 16)); 393 buffer.append((char)(value & 0xFFFF)); 394 } catch (IOException e) { 395 throw new IllegalIcuArgumentException(e); 396 } 397 } 398 399 /** 400 * Encode a run, possibly a degenerate run (of < 4 values). 401 * @param length The length of the run; must be > 0 && <= 0xFFFF. 402 */ 403 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) { 404 try { 405 char valueChar = (char) value; 406 if (length < 4) { 407 for (int j=0; j<length; ++j) { 408 if (valueChar == ESCAPE) { 409 buffer.append(ESCAPE); 410 } 411 buffer.append(valueChar); 412 } 413 } 414 else { 415 if (length == ESCAPE) { 416 if (valueChar == ESCAPE) { 417 buffer.append(ESCAPE); 418 } 419 buffer.append(valueChar); 420 --length; 421 } 422 buffer.append(ESCAPE); 423 buffer.append((char) length); 424 buffer.append(valueChar); // Don't need to escape this value 425 } 426 } catch (IOException e) { 427 throw new IllegalIcuArgumentException(e); 428 } 429 } 430 431 /** 432 * Encode a run, possibly a degenerate run (of < 4 values). 433 * @param length The length of the run; must be > 0 && <= 0xFF. 434 */ 435 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length, 436 byte[] state) { 437 if (length < 4) { 438 for (int j=0; j<length; ++j) { 439 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 440 appendEncodedByte(buffer, value, state); 441 } 442 } 443 else { 444 if ((byte)length == ESCAPE_BYTE) { 445 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state); 446 appendEncodedByte(buffer, value, state); 447 --length; 448 } 449 appendEncodedByte(buffer, ESCAPE_BYTE, state); 450 appendEncodedByte(buffer, (byte)length, state); 451 appendEncodedByte(buffer, value, state); // Don't need to escape this value 452 } 453 } 454 455 /** 456 * Append a byte to the given Appendable, packing two bytes into each 457 * character. The state parameter maintains intermediary data between 458 * calls. 459 * @param state A two-element array, with state[0] == 0 if this is the 460 * first byte of a pair, or state[0] != 0 if this is the second byte 461 * of a pair, in which case state[1] is the first byte. 462 */ 463 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value, 464 byte[] state) { 465 try { 466 if (state[0] != 0) { 467 char c = (char) ((state[1] << 8) | ((value) & 0xFF)); 468 buffer.append(c); 469 state[0] = 0; 470 } 471 else { 472 state[0] = 1; 473 state[1] = value; 474 } 475 } catch (IOException e) { 476 throw new IllegalIcuArgumentException(e); 477 } 478 } 479 480 /** 481 * Construct an array of ints from a run-length encoded string. 482 */ 483 static public final int[] RLEStringToIntArray(String s) { 484 int length = getInt(s, 0); 485 int[] array = new int[length]; 486 int ai = 0, i = 1; 487 488 int maxI = s.length() / 2; 489 while (ai < length && i < maxI) { 490 int c = getInt(s, i++); 491 492 if (c == ESCAPE) { 493 c = getInt(s, i++); 494 if (c == ESCAPE) { 495 array[ai++] = c; 496 } else { 497 int runLength = c; 498 int runValue = getInt(s, i++); 499 for (int j=0; j<runLength; ++j) { 500 array[ai++] = runValue; 501 } 502 } 503 } 504 else { 505 array[ai++] = c; 506 } 507 } 508 509 if (ai != length || i != maxI) { 510 throw new IllegalStateException("Bad run-length encoded int array"); 511 } 512 513 return array; 514 } 515 static final int getInt(String s, int i) { 516 return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1); 517 } 518 519 /** 520 * Construct an array of shorts from a run-length encoded string. 521 */ 522 static public final short[] RLEStringToShortArray(String s) { 523 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 524 short[] array = new short[length]; 525 int ai = 0; 526 for (int i=2; i<s.length(); ++i) { 527 char c = s.charAt(i); 528 if (c == ESCAPE) { 529 c = s.charAt(++i); 530 if (c == ESCAPE) { 531 array[ai++] = (short) c; 532 } else { 533 int runLength = c; 534 short runValue = (short) s.charAt(++i); 535 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 536 } 537 } 538 else { 539 array[ai++] = (short) c; 540 } 541 } 542 543 if (ai != length) 544 throw new IllegalStateException("Bad run-length encoded short array"); 545 546 return array; 547 } 548 549 /** 550 * Construct an array of shorts from a run-length encoded string. 551 */ 552 static public final char[] RLEStringToCharArray(String s) { 553 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 554 char[] array = new char[length]; 555 int ai = 0; 556 for (int i=2; i<s.length(); ++i) { 557 char c = s.charAt(i); 558 if (c == ESCAPE) { 559 c = s.charAt(++i); 560 if (c == ESCAPE) { 561 array[ai++] = c; 562 } else { 563 int runLength = c; 564 char runValue = s.charAt(++i); 565 for (int j=0; j<runLength; ++j) array[ai++] = runValue; 566 } 567 } 568 else { 569 array[ai++] = c; 570 } 571 } 572 573 if (ai != length) 574 throw new IllegalStateException("Bad run-length encoded short array"); 575 576 return array; 577 } 578 579 /** 580 * Construct an array of bytes from a run-length encoded string. 581 */ 582 static public final byte[] RLEStringToByteArray(String s) { 583 int length = ((s.charAt(0)) << 16) | (s.charAt(1)); 584 byte[] array = new byte[length]; 585 boolean nextChar = true; 586 char c = 0; 587 int node = 0; 588 int runLength = 0; 589 int i = 2; 590 for (int ai=0; ai<length; ) { 591 // This part of the loop places the next byte into the local 592 // variable 'b' each time through the loop. It keeps the 593 // current character in 'c' and uses the boolean 'nextChar' 594 // to see if we've taken both bytes out of 'c' yet. 595 byte b; 596 if (nextChar) { 597 c = s.charAt(i++); 598 b = (byte) (c >> 8); 599 nextChar = false; 600 } 601 else { 602 b = (byte) (c & 0xFF); 603 nextChar = true; 604 } 605 606 // This part of the loop is a tiny state machine which handles 607 // the parsing of the run-length encoding. This would be simpler 608 // if we could look ahead, but we can't, so we use 'node' to 609 // move between three nodes in the state machine. 610 switch (node) { 611 case 0: 612 // Normal idle node 613 if (b == ESCAPE_BYTE) { 614 node = 1; 615 } 616 else { 617 array[ai++] = b; 618 } 619 break; 620 case 1: 621 // We have seen one ESCAPE_BYTE; we expect either a second 622 // one, or a run length and value. 623 if (b == ESCAPE_BYTE) { 624 array[ai++] = ESCAPE_BYTE; 625 node = 0; 626 } 627 else { 628 runLength = b; 629 // Interpret signed byte as unsigned 630 if (runLength < 0) runLength += 0x100; 631 node = 2; 632 } 633 break; 634 case 2: 635 // We have seen an ESCAPE_BYTE and length byte. We interpret 636 // the next byte as the value to be repeated. 637 for (int j=0; j<runLength; ++j) array[ai++] = b; 638 node = 0; 639 break; 640 } 641 } 642 643 if (node != 0) 644 throw new IllegalStateException("Bad run-length encoded byte array"); 645 646 if (i != s.length()) 647 throw new IllegalStateException("Excess data in RLE byte array string"); 648 649 return array; 650 } 651 652 static public String LINE_SEPARATOR = System.getProperty("line.separator"); 653 654 /** 655 * Format a String for representation in a source file. This includes 656 * breaking it into lines and escaping characters using octal notation 657 * when necessary (control characters and double quotes). 658 */ 659 static public final String formatForSource(String s) { 660 StringBuilder buffer = new StringBuilder(); 661 for (int i=0; i<s.length();) { 662 if (i > 0) buffer.append('+').append(LINE_SEPARATOR); 663 buffer.append(" \""); 664 int count = 11; 665 while (i<s.length() && count<80) { 666 char c = s.charAt(i++); 667 if (c < '\u0020' || c == '"' || c == '\\') { 668 if (c == '\n') { 669 buffer.append("\\n"); 670 count += 2; 671 } else if (c == '\t') { 672 buffer.append("\\t"); 673 count += 2; 674 } else if (c == '\r') { 675 buffer.append("\\r"); 676 count += 2; 677 } else { 678 // Represent control characters, backslash and double quote 679 // using octal notation; otherwise the string we form 680 // won't compile, since Unicode escape sequences are 681 // processed before tokenization. 682 buffer.append('\\'); 683 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 684 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 685 buffer.append(HEX_DIGIT[(c & 0007)]); 686 count += 4; 687 } 688 } 689 else if (c <= '\u007E') { 690 buffer.append(c); 691 count += 1; 692 } 693 else { 694 buffer.append("\\u"); 695 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 696 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 697 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 698 buffer.append(HEX_DIGIT[(c & 0x000F)]); 699 count += 6; 700 } 701 } 702 buffer.append('"'); 703 } 704 return buffer.toString(); 705 } 706 707 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7', 708 '8','9','A','B','C','D','E','F'}; 709 710 /** 711 * Format a String for representation in a source file. Like 712 * formatForSource but does not do line breaking. 713 */ 714 static public final String format1ForSource(String s) { 715 StringBuilder buffer = new StringBuilder(); 716 buffer.append("\""); 717 for (int i=0; i<s.length();) { 718 char c = s.charAt(i++); 719 if (c < '\u0020' || c == '"' || c == '\\') { 720 if (c == '\n') { 721 buffer.append("\\n"); 722 } else if (c == '\t') { 723 buffer.append("\\t"); 724 } else if (c == '\r') { 725 buffer.append("\\r"); 726 } else { 727 // Represent control characters, backslash and double quote 728 // using octal notation; otherwise the string we form 729 // won't compile, since Unicode escape sequences are 730 // processed before tokenization. 731 buffer.append('\\'); 732 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal 733 buffer.append(HEX_DIGIT[(c & 0070) >> 3]); 734 buffer.append(HEX_DIGIT[(c & 0007)]); 735 } 736 } 737 else if (c <= '\u007E') { 738 buffer.append(c); 739 } 740 else { 741 buffer.append("\\u"); 742 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]); 743 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]); 744 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]); 745 buffer.append(HEX_DIGIT[(c & 0x000F)]); 746 } 747 } 748 buffer.append('"'); 749 return buffer.toString(); 750 } 751 752 /** 753 * Convert characters outside the range U+0020 to U+007F to 754 * Unicode escapes, and convert backslash to a double backslash. 755 */ 756 public static final String escape(String s) { 757 StringBuilder buf = new StringBuilder(); 758 for (int i=0; i<s.length(); ) { 759 int c = Character.codePointAt(s, i); 760 i += UTF16.getCharCount(c); 761 if (c >= ' ' && c <= 0x007F) { 762 if (c == '\\') { 763 buf.append("\\\\"); // That is, "\\" 764 } else { 765 buf.append((char)c); 766 } 767 } else { 768 boolean four = c <= 0xFFFF; 769 buf.append(four ? "\\u" : "\\U"); 770 buf.append(hex(c, four ? 4 : 8)); 771 } 772 } 773 return buf.toString(); 774 } 775 776 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 777 static private final char[] UNESCAPE_MAP = { 778 /*" 0x22, 0x22 */ 779 /*' 0x27, 0x27 */ 780 /*? 0x3F, 0x3F */ 781 /*\ 0x5C, 0x5C */ 782 /*a*/ 0x61, 0x07, 783 /*b*/ 0x62, 0x08, 784 /*e*/ 0x65, 0x1b, 785 /*f*/ 0x66, 0x0c, 786 /*n*/ 0x6E, 0x0a, 787 /*r*/ 0x72, 0x0d, 788 /*t*/ 0x74, 0x09, 789 /*v*/ 0x76, 0x0b 790 }; 791 792 /** 793 * Convert an escape to a 32-bit code point value. We attempt 794 * to parallel the icu4c unescapeAt() function. 795 * @param offset16 an array containing offset to the character 796 * <em>after</em> the backslash. Upon return offset16[0] will 797 * be updated to point after the escape sequence. 798 * @return character value from 0 to 10FFFF, or -1 on error. 799 */ 800 public static int unescapeAt(String s, int[] offset16) { 801 int c; 802 int result = 0; 803 int n = 0; 804 int minDig = 0; 805 int maxDig = 0; 806 int bitsPerDigit = 4; 807 int dig; 808 int i; 809 boolean braces = false; 810 811 /* Check that offset is in range */ 812 int offset = offset16[0]; 813 int length = s.length(); 814 if (offset < 0 || offset >= length) { 815 return -1; 816 } 817 818 /* Fetch first UChar after '\\' */ 819 c = Character.codePointAt(s, offset); 820 offset += UTF16.getCharCount(c); 821 822 /* Convert hexadecimal and octal escapes */ 823 switch (c) { 824 case 'u': 825 minDig = maxDig = 4; 826 break; 827 case 'U': 828 minDig = maxDig = 8; 829 break; 830 case 'x': 831 minDig = 1; 832 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 833 ++offset; 834 braces = true; 835 maxDig = 8; 836 } else { 837 maxDig = 2; 838 } 839 break; 840 default: 841 dig = UCharacter.digit(c, 8); 842 if (dig >= 0) { 843 minDig = 1; 844 maxDig = 3; 845 n = 1; /* Already have first octal digit */ 846 bitsPerDigit = 3; 847 result = dig; 848 } 849 break; 850 } 851 if (minDig != 0) { 852 while (offset < length && n < maxDig) { 853 c = UTF16.charAt(s, offset); 854 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 855 if (dig < 0) { 856 break; 857 } 858 result = (result << bitsPerDigit) | dig; 859 offset += UTF16.getCharCount(c); 860 ++n; 861 } 862 if (n < minDig) { 863 return -1; 864 } 865 if (braces) { 866 if (c != 0x7D /*}*/) { 867 return -1; 868 } 869 ++offset; 870 } 871 if (result < 0 || result >= 0x110000) { 872 return -1; 873 } 874 // If an escape sequence specifies a lead surrogate, see 875 // if there is a trail surrogate after it, either as an 876 // escape or as a literal. If so, join them up into a 877 // supplementary. 878 if (offset < length && 879 UTF16.isLeadSurrogate((char) result)) { 880 int ahead = offset+1; 881 c = s.charAt(offset); // [sic] get 16-bit code unit 882 if (c == '\\' && ahead < length) { 883 int o[] = new int[] { ahead }; 884 c = unescapeAt(s, o); 885 ahead = o[0]; 886 } 887 if (UTF16.isTrailSurrogate((char) c)) { 888 offset = ahead; 889 result = Character.toCodePoint((char) result, (char) c); 890 } 891 } 892 offset16[0] = offset; 893 return result; 894 } 895 896 /* Convert C-style escapes in table */ 897 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 898 if (c == UNESCAPE_MAP[i]) { 899 offset16[0] = offset; 900 return UNESCAPE_MAP[i+1]; 901 } else if (c < UNESCAPE_MAP[i]) { 902 break; 903 } 904 } 905 906 /* Map \cX to control-X: X & 0x1F */ 907 if (c == 'c' && offset < length) { 908 c = UTF16.charAt(s, offset); 909 offset16[0] = offset + UTF16.getCharCount(c); 910 return 0x1F & c; 911 } 912 913 /* If no special forms are recognized, then consider 914 * the backslash to generically escape the next character. */ 915 offset16[0] = offset; 916 return c; 917 } 918 919 /** 920 * Convert all escapes in a given string using unescapeAt(). 921 * @exception IllegalArgumentException if an invalid escape is 922 * seen. 923 */ 924 public static String unescape(String s) { 925 StringBuilder buf = new StringBuilder(); 926 int[] pos = new int[1]; 927 for (int i=0; i<s.length(); ) { 928 char c = s.charAt(i++); 929 if (c == '\\') { 930 pos[0] = i; 931 int e = unescapeAt(s, pos); 932 if (e < 0) { 933 throw new IllegalArgumentException("Invalid escape sequence " + 934 s.substring(i-1, Math.min(i+8, s.length()))); 935 } 936 buf.appendCodePoint(e); 937 i = pos[0]; 938 } else { 939 buf.append(c); 940 } 941 } 942 return buf.toString(); 943 } 944 945 /** 946 * Convert all escapes in a given string using unescapeAt(). 947 * Leave invalid escape sequences unchanged. 948 */ 949 public static String unescapeLeniently(String s) { 950 StringBuilder buf = new StringBuilder(); 951 int[] pos = new int[1]; 952 for (int i=0; i<s.length(); ) { 953 char c = s.charAt(i++); 954 if (c == '\\') { 955 pos[0] = i; 956 int e = unescapeAt(s, pos); 957 if (e < 0) { 958 buf.append(c); 959 } else { 960 buf.appendCodePoint(e); 961 i = pos[0]; 962 } 963 } else { 964 buf.append(c); 965 } 966 } 967 return buf.toString(); 968 } 969 970 /** 971 * Convert a char to 4 hex uppercase digits. E.g., hex('a') => 972 * "0041". 973 */ 974 public static String hex(long ch) { 975 return hex(ch, 4); 976 } 977 978 /** 979 * Supplies a zero-padded hex representation of an integer (without 0x) 980 */ 981 static public String hex(long i, int places) { 982 if (i == Long.MIN_VALUE) return "-8000000000000000"; 983 boolean negative = i < 0; 984 if (negative) { 985 i = -i; 986 } 987 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); 988 if (result.length() < places) { 989 result = "0000000000000000".substring(result.length(),places) + result; 990 } 991 if (negative) { 992 return '-' + result; 993 } 994 return result; 995 } 996 997 /** 998 * Convert a string to comma-separated groups of 4 hex uppercase 999 * digits. E.g., hex('ab') => "0041,0042". 1000 */ 1001 public static String hex(CharSequence s) { 1002 return hex(s, 4, ",", true, new StringBuilder()).toString(); 1003 } 1004 1005 /** 1006 * Convert a string to separated groups of hex uppercase 1007 * digits. E.g., hex('ab'...) => "0041,0042". Append the output 1008 * to the given Appendable. 1009 */ 1010 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) { 1011 try { 1012 if (useCodePoints) { 1013 int cp; 1014 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1015 cp = Character.codePointAt(s, i); 1016 if (i != 0) { 1017 result.append(separator); 1018 } 1019 result.append(hex(cp,width)); 1020 } 1021 } else { 1022 for (int i = 0; i < s.length(); ++i) { 1023 if (i != 0) { 1024 result.append(separator); 1025 } 1026 result.append(hex(s.charAt(i),width)); 1027 } 1028 } 1029 return result; 1030 } catch (IOException e) { 1031 throw new IllegalIcuArgumentException(e); 1032 } 1033 } 1034 1035 public static String hex(byte[] o, int start, int end, String separator) { 1036 StringBuilder result = new StringBuilder(); 1037 //int ch; 1038 for (int i = start; i < end; ++i) { 1039 if (i != 0) result.append(separator); 1040 result.append(hex(o[i])); 1041 } 1042 return result.toString(); 1043 } 1044 1045 /** 1046 * Convert a string to comma-separated groups of 4 hex uppercase 1047 * digits. E.g., hex('ab') => "0041,0042". 1048 */ 1049 public static <S extends CharSequence> String hex(S s, int width, S separator) { 1050 return hex(s, width, separator, true, new StringBuilder()).toString(); 1051 } 1052 1053 /** 1054 * Split a string into pieces based on the given divider character 1055 * @param s the string to split 1056 * @param divider the character on which to split. Occurrences of 1057 * this character are not included in the output 1058 * @param output an array to receive the substrings between 1059 * instances of divider. It must be large enough on entry to 1060 * accomodate all output. Adjacent instances of the divider 1061 * character will place empty strings into output. Before 1062 * returning, output is padded out with empty strings. 1063 */ 1064 public static void split(String s, char divider, String[] output) { 1065 int last = 0; 1066 int current = 0; 1067 int i; 1068 for (i = 0; i < s.length(); ++i) { 1069 if (s.charAt(i) == divider) { 1070 output[current++] = s.substring(last,i); 1071 last = i+1; 1072 } 1073 } 1074 output[current++] = s.substring(last,i); 1075 while (current < output.length) { 1076 output[current++] = ""; 1077 } 1078 } 1079 1080 /** 1081 * Split a string into pieces based on the given divider character 1082 * @param s the string to split 1083 * @param divider the character on which to split. Occurrences of 1084 * this character are not included in the output 1085 * @return output an array to receive the substrings between 1086 * instances of divider. Adjacent instances of the divider 1087 * character will place empty strings into output. 1088 */ 1089 public static String[] split(String s, char divider) { 1090 int last = 0; 1091 int i; 1092 ArrayList<String> output = new ArrayList<String>(); 1093 for (i = 0; i < s.length(); ++i) { 1094 if (s.charAt(i) == divider) { 1095 output.add(s.substring(last,i)); 1096 last = i+1; 1097 } 1098 } 1099 output.add( s.substring(last,i)); 1100 return output.toArray(new String[output.size()]); 1101 } 1102 1103 /** 1104 * Look up a given string in a string array. Returns the index at 1105 * which the first occurrence of the string was found in the 1106 * array, or -1 if it was not found. 1107 * @param source the string to search for 1108 * @param target the array of zero or more strings in which to 1109 * look for source 1110 * @return the index of target at which source first occurs, or -1 1111 * if not found 1112 */ 1113 public static int lookup(String source, String[] target) { 1114 for (int i = 0; i < target.length; ++i) { 1115 if (source.equals(target[i])) return i; 1116 } 1117 return -1; 1118 } 1119 1120 /** 1121 * Parse a single non-whitespace character 'ch', optionally 1122 * preceded by whitespace. 1123 * @param id the string to be parsed 1124 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1125 * offset of the first character to be parsed. On output, pos[0] 1126 * is the index after the last parsed character. If the parse 1127 * fails, pos[0] will be unchanged. 1128 * @param ch the non-whitespace character to be parsed. 1129 * @return true if 'ch' is seen preceded by zero or more 1130 * whitespace characters. 1131 */ 1132 public static boolean parseChar(String id, int[] pos, char ch) { 1133 int start = pos[0]; 1134 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]); 1135 if (pos[0] == id.length() || 1136 id.charAt(pos[0]) != ch) { 1137 pos[0] = start; 1138 return false; 1139 } 1140 ++pos[0]; 1141 return true; 1142 } 1143 1144 /** 1145 * Parse a pattern string starting at offset pos. Keywords are 1146 * matched case-insensitively. Spaces may be skipped and may be 1147 * optional or required. Integer values may be parsed, and if 1148 * they are, they will be returned in the given array. If 1149 * successful, the offset of the next non-space character is 1150 * returned. On failure, -1 is returned. 1151 * @param pattern must only contain lowercase characters, which 1152 * will match their uppercase equivalents as well. A space 1153 * character matches one or more required spaces. A '~' character 1154 * matches zero or more optional spaces. A '#' character matches 1155 * an integer and stores it in parsedInts, which the caller must 1156 * ensure has enough capacity. 1157 * @param parsedInts array to receive parsed integers. Caller 1158 * must ensure that parsedInts.length is >= the number of '#' 1159 * signs in 'pattern'. 1160 * @return the position after the last character parsed, or -1 if 1161 * the parse failed 1162 */ 1163 @SuppressWarnings("fallthrough") 1164 public static int parsePattern(String rule, int pos, int limit, 1165 String pattern, int[] parsedInts) { 1166 // TODO Update this to handle surrogates 1167 int[] p = new int[1]; 1168 int intCount = 0; // number of integers parsed 1169 for (int i=0; i<pattern.length(); ++i) { 1170 char cpat = pattern.charAt(i); 1171 char c; 1172 switch (cpat) { 1173 case ' ': 1174 if (pos >= limit) { 1175 return -1; 1176 } 1177 c = rule.charAt(pos++); 1178 if (!PatternProps.isWhiteSpace(c)) { 1179 return -1; 1180 } 1181 // FALL THROUGH to skipWhitespace 1182 case '~': 1183 pos = PatternProps.skipWhiteSpace(rule, pos); 1184 break; 1185 case '#': 1186 p[0] = pos; 1187 parsedInts[intCount++] = parseInteger(rule, p, limit); 1188 if (p[0] == pos) { 1189 // Syntax error; failed to parse integer 1190 return -1; 1191 } 1192 pos = p[0]; 1193 break; 1194 default: 1195 if (pos >= limit) { 1196 return -1; 1197 } 1198 c = (char) UCharacter.toLowerCase(rule.charAt(pos++)); 1199 if (c != cpat) { 1200 return -1; 1201 } 1202 break; 1203 } 1204 } 1205 return pos; 1206 } 1207 1208 /** 1209 * Parse a pattern string within the given Replaceable and a parsing 1210 * pattern. Characters are matched literally and case-sensitively 1211 * except for the following special characters: 1212 * 1213 * ~ zero or more Pattern_White_Space chars 1214 * 1215 * If end of pattern is reached with all matches along the way, 1216 * pos is advanced to the first unparsed index and returned. 1217 * Otherwise -1 is returned. 1218 * @param pat pattern that controls parsing 1219 * @param text text to be parsed, starting at index 1220 * @param index offset to first character to parse 1221 * @param limit offset after last character to parse 1222 * @return index after last parsed character, or -1 on parse failure. 1223 */ 1224 public static int parsePattern(String pat, 1225 Replaceable text, 1226 int index, 1227 int limit) { 1228 int ipat = 0; 1229 1230 // empty pattern matches immediately 1231 if (ipat == pat.length()) { 1232 return index; 1233 } 1234 1235 int cpat = Character.codePointAt(pat, ipat); 1236 1237 while (index < limit) { 1238 int c = text.char32At(index); 1239 1240 // parse \s* 1241 if (cpat == '~') { 1242 if (PatternProps.isWhiteSpace(c)) { 1243 index += UTF16.getCharCount(c); 1244 continue; 1245 } else { 1246 if (++ipat == pat.length()) { 1247 return index; // success; c unparsed 1248 } 1249 // fall thru; process c again with next cpat 1250 } 1251 } 1252 1253 // parse literal 1254 else if (c == cpat) { 1255 int n = UTF16.getCharCount(c); 1256 index += n; 1257 ipat += n; 1258 if (ipat == pat.length()) { 1259 return index; // success; c parsed 1260 } 1261 // fall thru; get next cpat 1262 } 1263 1264 // match failure of literal 1265 else { 1266 return -1; 1267 } 1268 1269 cpat = UTF16.charAt(pat, ipat); 1270 } 1271 1272 return -1; // text ended before end of pat 1273 } 1274 1275 /** 1276 * Parse an integer at pos, either of the form \d+ or of the form 1277 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1278 * or octal format. 1279 * @param pos INPUT-OUTPUT parameter. On input, the first 1280 * character to parse. On output, the character after the last 1281 * parsed character. 1282 */ 1283 public static int parseInteger(String rule, int[] pos, int limit) { 1284 int count = 0; 1285 int value = 0; 1286 int p = pos[0]; 1287 int radix = 10; 1288 1289 if (rule.regionMatches(true, p, "0x", 0, 2)) { 1290 p += 2; 1291 radix = 16; 1292 } else if (p < limit && rule.charAt(p) == '0') { 1293 p++; 1294 count = 1; 1295 radix = 8; 1296 } 1297 1298 while (p < limit) { 1299 int d = UCharacter.digit(rule.charAt(p++), radix); 1300 if (d < 0) { 1301 --p; 1302 break; 1303 } 1304 ++count; 1305 int v = (value * radix) + d; 1306 if (v <= value) { 1307 // If there are too many input digits, at some point 1308 // the value will go negative, e.g., if we have seen 1309 // "0x8000000" already and there is another '0', when 1310 // we parse the next 0 the value will go negative. 1311 return 0; 1312 } 1313 value = v; 1314 } 1315 if (count > 0) { 1316 pos[0] = p; 1317 } 1318 return value; 1319 } 1320 1321 /** 1322 * Parse a Unicode identifier from the given string at the given 1323 * position. Return the identifier, or null if there is no 1324 * identifier. 1325 * @param str the string to parse 1326 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the 1327 * first character to examine. It must be less than str.length(), 1328 * and it must not point to a whitespace character. That is, must 1329 * have pos[0] < str.length(). On 1330 * OUTPUT, the position after the last parsed character. 1331 * @return the Unicode identifier, or null if there is no valid 1332 * identifier at pos[0]. 1333 */ 1334 public static String parseUnicodeIdentifier(String str, int[] pos) { 1335 // assert(pos[0] < str.length()); 1336 StringBuilder buf = new StringBuilder(); 1337 int p = pos[0]; 1338 while (p < str.length()) { 1339 int ch = Character.codePointAt(str, p); 1340 if (buf.length() == 0) { 1341 if (UCharacter.isUnicodeIdentifierStart(ch)) { 1342 buf.appendCodePoint(ch); 1343 } else { 1344 return null; 1345 } 1346 } else { 1347 if (UCharacter.isUnicodeIdentifierPart(ch)) { 1348 buf.appendCodePoint(ch); 1349 } else { 1350 break; 1351 } 1352 } 1353 p += UTF16.getCharCount(ch); 1354 } 1355 pos[0] = p; 1356 return buf.toString(); 1357 } 1358 1359 static final char DIGITS[] = { 1360 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 1361 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 1362 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 1363 'U', 'V', 'W', 'X', 'Y', 'Z' 1364 }; 1365 1366 /** 1367 * Append the digits of a positive integer to the given 1368 * <code>Appendable</code> in the given radix. This is 1369 * done recursively since it is easiest to generate the low- 1370 * order digit first, but it must be appended last. 1371 * 1372 * @param result is the <code>Appendable</code> to append to 1373 * @param n is the positive integer 1374 * @param radix is the radix, from 2 to 36 inclusive 1375 * @param minDigits is the minimum number of digits to append. 1376 */ 1377 private static <T extends Appendable> void recursiveAppendNumber(T result, int n, 1378 int radix, int minDigits) 1379 { 1380 try { 1381 int digit = n % radix; 1382 1383 if (n >= radix || minDigits > 1) { 1384 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 1385 } 1386 result.append(DIGITS[digit]); 1387 } catch (IOException e) { 1388 throw new IllegalIcuArgumentException(e); 1389 } 1390 } 1391 1392 /** 1393 * Append a number to the given Appendable in the given radix. 1394 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 1395 * radices 11 through 36. 1396 * @param result the digits of the number are appended here 1397 * @param n the number to be converted to digits; may be negative. 1398 * If negative, a '-' is prepended to the digits. 1399 * @param radix a radix from 2 to 36 inclusive. 1400 * @param minDigits the minimum number of digits, not including 1401 * any '-', to produce. Values less than 2 have no effect. One 1402 * digit is always emitted regardless of this parameter. 1403 * @return a reference to result 1404 */ 1405 public static <T extends Appendable> T appendNumber(T result, int n, 1406 int radix, int minDigits) 1407 { 1408 try { 1409 if (radix < 2 || radix > 36) { 1410 throw new IllegalArgumentException("Illegal radix " + radix); 1411 } 1412 1413 1414 int abs = n; 1415 1416 if (n < 0) { 1417 abs = -n; 1418 result.append("-"); 1419 } 1420 1421 recursiveAppendNumber(result, abs, radix, minDigits); 1422 1423 return result; 1424 } catch (IOException e) { 1425 throw new IllegalIcuArgumentException(e); 1426 } 1427 1428 } 1429 1430 /** 1431 * Parse an unsigned 31-bit integer at the given offset. Use 1432 * UCharacter.digit() to parse individual characters into digits. 1433 * @param text the text to be parsed 1434 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1435 * offset within text at which to start parsing; it should point 1436 * to a valid digit. On exit, pos[0] is the offset after the last 1437 * parsed character. If the parse failed, it will be unchanged on 1438 * exit. Must be >= 0 on entry. 1439 * @param radix the radix in which to parse; must be >= 2 and <= 1440 * 36. 1441 * @return a non-negative parsed number, or -1 upon parse failure. 1442 * Parse fails if there are no digits, that is, if pos[0] does not 1443 * point to a valid digit on entry, or if the number to be parsed 1444 * does not fit into a 31-bit unsigned integer. 1445 */ 1446 public static int parseNumber(String text, int[] pos, int radix) { 1447 // assert(pos[0] >= 0); 1448 // assert(radix >= 2); 1449 // assert(radix <= 36); 1450 int n = 0; 1451 int p = pos[0]; 1452 while (p < text.length()) { 1453 int ch = Character.codePointAt(text, p); 1454 int d = UCharacter.digit(ch, radix); 1455 if (d < 0) { 1456 break; 1457 } 1458 n = radix*n + d; 1459 // ASSUME that when a 32-bit integer overflows it becomes 1460 // negative. E.g., 214748364 * 10 + 8 => negative value. 1461 if (n < 0) { 1462 return -1; 1463 } 1464 ++p; 1465 } 1466 if (p == pos[0]) { 1467 return -1; 1468 } 1469 pos[0] = p; 1470 return n; 1471 } 1472 1473 /** 1474 * Return true if the character is NOT printable ASCII. The tab, 1475 * newline and linefeed characters are considered unprintable. 1476 */ 1477 public static boolean isUnprintable(int c) { 1478 //0x20 = 32 and 0x7E = 126 1479 return !(c >= 0x20 && c <= 0x7E); 1480 } 1481 1482 /** 1483 * Escape unprintable characters using <backslash>uxxxx notation 1484 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and 1485 * above. If the character is printable ASCII, then do nothing 1486 * and return FALSE. Otherwise, append the escaped notation and 1487 * return TRUE. 1488 */ 1489 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { 1490 try { 1491 if (isUnprintable(c)) { 1492 result.append('\\'); 1493 if ((c & ~0xFFFF) != 0) { 1494 result.append('U'); 1495 result.append(DIGITS[0xF&(c>>28)]); 1496 result.append(DIGITS[0xF&(c>>24)]); 1497 result.append(DIGITS[0xF&(c>>20)]); 1498 result.append(DIGITS[0xF&(c>>16)]); 1499 } else { 1500 result.append('u'); 1501 } 1502 result.append(DIGITS[0xF&(c>>12)]); 1503 result.append(DIGITS[0xF&(c>>8)]); 1504 result.append(DIGITS[0xF&(c>>4)]); 1505 result.append(DIGITS[0xF&c]); 1506 return true; 1507 } 1508 return false; 1509 } catch (IOException e) { 1510 throw new IllegalIcuArgumentException(e); 1511 } 1512 } 1513 1514 /** 1515 * Returns the index of the first character in a set, ignoring quoted text. 1516 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1517 * found by a search for "h". Unlike String.indexOf(), this method searches 1518 * not for a single character, but for any character of the string 1519 * <code>setOfChars</code>. 1520 * @param text text to be searched 1521 * @param start the beginning index, inclusive; <code>0 <= start 1522 * <= limit</code>. 1523 * @param limit the ending index, exclusive; <code>start <= limit 1524 * <= text.length()</code>. 1525 * @param setOfChars string with one or more distinct characters 1526 * @return Offset of the first character in <code>setOfChars</code> 1527 * found, or -1 if not found. 1528 * @see String#indexOf 1529 */ 1530 public static int quotedIndexOf(String text, int start, int limit, 1531 String setOfChars) { 1532 for (int i=start; i<limit; ++i) { 1533 char c = text.charAt(i); 1534 if (c == BACKSLASH) { 1535 ++i; 1536 } else if (c == APOSTROPHE) { 1537 while (++i < limit 1538 && text.charAt(i) != APOSTROPHE) {} 1539 } else if (setOfChars.indexOf(c) >= 0) { 1540 return i; 1541 } 1542 } 1543 return -1; 1544 } 1545 1546 /** 1547 * Append a character to a rule that is being built up. To flush 1548 * the quoteBuf to rule, make one final call with isLiteral == true. 1549 * If there is no final character, pass in (int)-1 as c. 1550 * @param rule the string to append the character to 1551 * @param c the character to append, or (int)-1 if none. 1552 * @param isLiteral if true, then the given character should not be 1553 * quoted or escaped. Usually this means it is a syntactic element 1554 * such as > or $ 1555 * @param escapeUnprintable if true, then unprintable characters 1556 * should be escaped using escapeUnprintable(). These escapes will 1557 * appear outside of quotes. 1558 * @param quoteBuf a buffer which is used to build up quoted 1559 * substrings. The caller should initially supply an empty buffer, 1560 * and thereafter should not modify the buffer. The buffer should be 1561 * cleared out by, at the end, calling this method with a literal 1562 * character (which may be -1). 1563 */ 1564 public static void appendToRule(StringBuffer rule, 1565 int c, 1566 boolean isLiteral, 1567 boolean escapeUnprintable, 1568 StringBuffer quoteBuf) { 1569 // If we are escaping unprintables, then escape them outside 1570 // quotes. \\u and \\U are not recognized within quotes. The same 1571 // logic applies to literals, but literals are never escaped. 1572 if (isLiteral || 1573 (escapeUnprintable && Utility.isUnprintable(c))) { 1574 if (quoteBuf.length() > 0) { 1575 // We prefer backslash APOSTROPHE to double APOSTROPHE 1576 // (more readable, less similar to ") so if there are 1577 // double APOSTROPHEs at the ends, we pull them outside 1578 // of the quote. 1579 1580 // If the first thing in the quoteBuf is APOSTROPHE 1581 // (doubled) then pull it out. 1582 while (quoteBuf.length() >= 2 && 1583 quoteBuf.charAt(0) == APOSTROPHE && 1584 quoteBuf.charAt(1) == APOSTROPHE) { 1585 rule.append(BACKSLASH).append(APOSTROPHE); 1586 quoteBuf.delete(0, 2); 1587 } 1588 // If the last thing in the quoteBuf is APOSTROPHE 1589 // (doubled) then remove and count it and add it after. 1590 int trailingCount = 0; 1591 while (quoteBuf.length() >= 2 && 1592 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 1593 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 1594 quoteBuf.setLength(quoteBuf.length()-2); 1595 ++trailingCount; 1596 } 1597 if (quoteBuf.length() > 0) { 1598 rule.append(APOSTROPHE); 1599 rule.append(quoteBuf); 1600 rule.append(APOSTROPHE); 1601 quoteBuf.setLength(0); 1602 } 1603 while (trailingCount-- > 0) { 1604 rule.append(BACKSLASH).append(APOSTROPHE); 1605 } 1606 } 1607 if (c != -1) { 1608 /* Since spaces are ignored during parsing, they are 1609 * emitted only for readability. We emit one here 1610 * only if there isn't already one at the end of the 1611 * rule. 1612 */ 1613 if (c == ' ') { 1614 int len = rule.length(); 1615 if (len > 0 && rule.charAt(len-1) != ' ') { 1616 rule.append(' '); 1617 } 1618 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) { 1619 rule.appendCodePoint(c); 1620 } 1621 } 1622 } 1623 1624 // Escape ' and '\' and don't begin a quote just for them 1625 else if (quoteBuf.length() == 0 && 1626 (c == APOSTROPHE || c == BACKSLASH)) { 1627 rule.append(BACKSLASH).append((char)c); 1628 } 1629 1630 // Specials (printable ascii that isn't [0-9a-zA-Z]) and 1631 // whitespace need quoting. Also append stuff to quotes if we are 1632 // building up a quoted substring already. 1633 else if (quoteBuf.length() > 0 || 1634 (c >= 0x0021 && c <= 0x007E && 1635 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 1636 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 1637 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 1638 PatternProps.isWhiteSpace(c)) { 1639 quoteBuf.appendCodePoint(c); 1640 // Double ' within a quote 1641 if (c == APOSTROPHE) { 1642 quoteBuf.append((char)c); 1643 } 1644 } 1645 1646 // Otherwise just append 1647 else { 1648 rule.appendCodePoint(c); 1649 } 1650 } 1651 1652 /** 1653 * Append the given string to the rule. Calls the single-character 1654 * version of appendToRule for each character. 1655 */ 1656 public static void appendToRule(StringBuffer rule, 1657 String text, 1658 boolean isLiteral, 1659 boolean escapeUnprintable, 1660 StringBuffer quoteBuf) { 1661 for (int i=0; i<text.length(); ++i) { 1662 // Okay to process in 16-bit code units here 1663 appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf); 1664 } 1665 } 1666 1667 /** 1668 * Given a matcher reference, which may be null, append its 1669 * pattern as a literal to the given rule. 1670 */ 1671 public static void appendToRule(StringBuffer rule, 1672 UnicodeMatcher matcher, 1673 boolean escapeUnprintable, 1674 StringBuffer quoteBuf) { 1675 if (matcher != null) { 1676 appendToRule(rule, matcher.toPattern(escapeUnprintable), 1677 true, escapeUnprintable, quoteBuf); 1678 } 1679 } 1680 1681 /** 1682 * Compares 2 unsigned integers 1683 * @param source 32 bit unsigned integer 1684 * @param target 32 bit unsigned integer 1685 * @return 0 if equals, 1 if source is greater than target and -1 1686 * otherwise 1687 */ 1688 public static final int compareUnsigned(int source, int target) 1689 { 1690 source += MAGIC_UNSIGNED; 1691 target += MAGIC_UNSIGNED; 1692 if (source < target) { 1693 return -1; 1694 } 1695 else if (source > target) { 1696 return 1; 1697 } 1698 return 0; 1699 } 1700 1701 /** 1702 * Find the highest bit in a positive integer. This is done 1703 * by doing a binary search through the bits. 1704 * 1705 * @param n is the integer 1706 * 1707 * @return the bit number of the highest bit, with 0 being 1708 * the low order bit, or -1 if <code>n</code> is not positive 1709 */ 1710 public static final byte highBit(int n) 1711 { 1712 if (n <= 0) { 1713 return -1; 1714 } 1715 1716 byte bit = 0; 1717 1718 if (n >= 1 << 16) { 1719 n >>= 16; 1720 bit += 16; 1721 } 1722 1723 if (n >= 1 << 8) { 1724 n >>= 8; 1725 bit += 8; 1726 } 1727 1728 if (n >= 1 << 4) { 1729 n >>= 4; 1730 bit += 4; 1731 } 1732 1733 if (n >= 1 << 2) { 1734 n >>= 2; 1735 bit += 2; 1736 } 1737 1738 if (n >= 1 << 1) { 1739 n >>= 1; 1740 bit += 1; 1741 } 1742 1743 return bit; 1744 } 1745 /** 1746 * Utility method to take a int[] containing codepoints and return 1747 * a string representation with code units. 1748 */ 1749 public static String valueOf(int[]source){ 1750 // TODO: Investigate why this method is not on UTF16 class 1751 StringBuilder result = new StringBuilder(source.length); 1752 for(int i=0; i<source.length; i++){ 1753 result.appendCodePoint(source[i]); 1754 } 1755 return result.toString(); 1756 } 1757 1758 1759 /** 1760 * Utility to duplicate a string count times 1761 * @param s String to be duplicated. 1762 * @param count Number of times to duplicate a string. 1763 */ 1764 public static String repeat(String s, int count) { 1765 if (count <= 0) return ""; 1766 if (count == 1) return s; 1767 StringBuilder result = new StringBuilder(); 1768 for (int i = 0; i < count; ++i) { 1769 result.append(s); 1770 } 1771 return result.toString(); 1772 } 1773 1774 public static String[] splitString(String src, String target) { 1775 return src.split("\\Q" + target + "\\E"); 1776 } 1777 1778 /** 1779 * Split the string at runs of ascii whitespace characters. 1780 */ 1781 public static String[] splitWhitespace(String src) { 1782 return src.split("\\s+"); 1783 } 1784 1785 /** 1786 * Parse a list of hex numbers and return a string 1787 * @param string String of hex numbers. 1788 * @param minLength Minimal length. 1789 * @param separator Separator. 1790 * @return A string from hex numbers. 1791 */ 1792 public static String fromHex(String string, int minLength, String separator) { 1793 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+")); 1794 } 1795 1796 /** 1797 * Parse a list of hex numbers and return a string 1798 * @param string String of hex numbers. 1799 * @param minLength Minimal length. 1800 * @param separator Separator. 1801 * @return A string from hex numbers. 1802 */ 1803 public static String fromHex(String string, int minLength, Pattern separator) { 1804 StringBuilder buffer = new StringBuilder(); 1805 String[] parts = separator.split(string); 1806 for (String part : parts) { 1807 if (part.length() < minLength) { 1808 throw new IllegalArgumentException("code point too short: " + part); 1809 } 1810 int cp = Integer.parseInt(part, 16); 1811 buffer.appendCodePoint(cp); 1812 } 1813 return buffer.toString(); 1814 } 1815 1816 /** 1817 * This implementation is equivalent to Java 7+ Objects#equals(Object a, Object b) 1818 * 1819 * @param a an object 1820 * @param b an object to be compared with a for equality 1821 * @return true if the arguments are equal to each other and false otherwise 1822 */ 1823 public static boolean equals(Object a, Object b) { 1824 return (a == b) 1825 || (a != null && b != null && a.equals(b)); 1826 } 1827 1828 /** 1829 * This implementation is equivalent to Java 7+ Objects#hash(Object... values) 1830 * @param values the values to be hashed 1831 * @return a hash value of the sequence of input values 1832 */ 1833 public static int hash(Object... values) { 1834 return Arrays.hashCode(values); 1835 } 1836 1837 /** 1838 * This implementation is equivalent to Java 7+ Objects#hashCode(Object o) 1839 * @param o an object 1840 * @return a hash value of a non-null argument and 0 for null argument 1841 */ 1842 public static int hashCode(Object o) { 1843 return o == null ? 0 : o.hashCode(); 1844 } 1845 1846 /** 1847 * This implementation is equivalent to Java 7+ Objects#toString(Object o) 1848 * @param o an object 1849 * @return the result of calling toStirng for a non-null argument and "null" for a 1850 * null argument 1851 */ 1852 public static String toString(Object o) { 1853 return o == null ? "null" : o.toString(); 1854 } 1855 } 1856