1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 */ 11 12 package com.ibm.icu.lang; 13 14 import com.ibm.icu.text.UTF16; 15 16 /** 17 * <code>UScriptRun</code> is used to find runs of characters in 18 * the same script, as defined in the <code>UScript</code> class. 19 * It implements a simple iterator over an array of characters. 20 * The iterator will assign <code>COMMON</code> and <code>INHERITED</code> 21 * characters to the same script as the preceeding characters. If the 22 * COMMON and INHERITED characters are first, they will be assigned to 23 * the same script as the following characters. 24 * 25 * The iterator will try to match paired punctuation. If it sees an 26 * opening punctuation character, it will remember the script that 27 * was assigned to that character, and assign the same script to the 28 * matching closing punctuation. 29 * 30 * No attempt is made to combine related scripts into a single run. In 31 * particular, Hiragana, Katakana, and Han characters will appear in separate 32 * runs. 33 34 * Here is an example of how to iterate over script runs: 35 * <pre> 36 * void printScriptRuns(char[] text) 37 * { 38 * UScriptRun scriptRun = new UScriptRun(text); 39 * 40 * while (scriptRun.next()) { 41 * int start = scriptRun.getScriptStart(); 42 * int limit = scriptRun.getScriptLimit(); 43 * int script = scriptRun.getScriptCode(); 44 * 45 * System.out.println("Script \"" + UScript.getName(script) + "\" from " + 46 * start + " to " + limit + "."); 47 * } 48 * } 49 * </pre> 50 * 51 * @internal 52 * @deprecated This API is ICU internal only. 53 */ 54 @Deprecated 55 public final class UScriptRun 56 { 57 /** 58 * Construct an empty <code>UScriptRun</code> object. The <code>next()</code> 59 * method will return <code>false</code> the first time it is called. 60 * 61 * @internal 62 * @deprecated This API is ICU internal only. 63 */ 64 @Deprecated 65 public UScriptRun() 66 { 67 char[] nullChars = null; 68 69 reset(nullChars, 0, 0); 70 } 71 72 /** 73 * Construct a <code>UScriptRun</code> object which iterates over the 74 * characters in the given string. 75 * 76 * @param text the string of characters over which to iterate. 77 * 78 * @internal 79 * @deprecated This API is ICU internal only. 80 */ 81 @Deprecated 82 public UScriptRun(String text) 83 { 84 reset (text); 85 } 86 87 /** 88 * Construct a <code>UScriptRun</code> object which iterates over a subrange 89 * of the characetrs in the given string. 90 * 91 * @param text the string of characters over which to iterate. 92 * @param start the index of the first character over which to iterate 93 * @param count the number of characters over which to iterate 94 * 95 * @internal 96 * @deprecated This API is ICU internal only. 97 */ 98 @Deprecated 99 public UScriptRun(String text, int start, int count) 100 { 101 reset(text, start, count); 102 } 103 104 /** 105 * Construct a <code>UScriptRun</code> object which iterates over the given 106 * characetrs. 107 * 108 * @param chars the array of characters over which to iterate. 109 * 110 * @internal 111 * @deprecated This API is ICU internal only. 112 */ 113 @Deprecated 114 public UScriptRun(char[] chars) 115 { 116 reset(chars); 117 } 118 119 /** 120 * Construct a <code>UScriptRun</code> object which iterates over a subrange 121 * of the given characetrs. 122 * 123 * @param chars the array of characters over which to iterate. 124 * @param start the index of the first character over which to iterate 125 * @param count the number of characters over which to iterate 126 * 127 * @internal 128 * @deprecated This API is ICU internal only. 129 */ 130 @Deprecated 131 public UScriptRun(char[] chars, int start, int count) 132 { 133 reset(chars, start, count); 134 } 135 136 137 /** 138 * Reset the iterator to the start of the text. 139 * 140 * @internal 141 * @deprecated This API is ICU internal only. 142 */ 143 @Deprecated 144 public final void reset() 145 { 146 // empty any old parenStack contents. 147 // NOTE: this is not the most efficient way 148 // to do this, but it's the easiest to write... 149 while (stackIsNotEmpty()) { 150 pop(); 151 } 152 153 scriptStart = textStart; 154 scriptLimit = textStart; 155 scriptCode = UScript.INVALID_CODE; 156 parenSP = -1; 157 pushCount = 0; 158 fixupCount = 0; 159 160 textIndex = textStart; 161 } 162 163 /** 164 * Reset the iterator to iterate over the given range of the text. Throws 165 * IllegalArgumentException if the range is outside of the bounds of the 166 * character array. 167 * 168 * @param start the index of the new first character over which to iterate 169 * @param count the new number of characters over which to iterate. 170 * @exception IllegalArgumentException If invalid arguments are passed. 171 * 172 * @internal 173 * @deprecated This API is ICU internal only. 174 */ 175 @Deprecated 176 public final void reset(int start, int count) 177 throws IllegalArgumentException 178 { 179 int len = 0; 180 181 if (text != null) { 182 len = text.length; 183 } 184 185 if (start < 0 || count < 0 || start > len - count) { 186 throw new IllegalArgumentException(); 187 } 188 189 textStart = start; 190 textLimit = start + count; 191 192 reset(); 193 } 194 195 /** 196 * Reset the iterator to iterate over <code>count</code> characters 197 * in <code>chars</code> starting at <code>start</code>. This allows 198 * clients to reuse an iterator. 199 * 200 * @param chars the new array of characters over which to iterate. 201 * @param start the index of the first character over which to iterate. 202 * @param count the number of characters over which to iterate. 203 * 204 * @internal 205 * @deprecated This API is ICU internal only. 206 */ 207 @Deprecated 208 public final void reset(char[] chars, int start, int count) 209 { 210 if (chars == null) { 211 chars = emptyCharArray; 212 } 213 214 text = chars; 215 216 reset(start, count); 217 } 218 219 /** 220 * Reset the iterator to iterate over the characters 221 * in <code>chars</code>. This allows clients to reuse an iterator. 222 * 223 * @param chars the new array of characters over which to iterate. 224 * 225 * @internal 226 * @deprecated This API is ICU internal only. 227 */ 228 @Deprecated 229 public final void reset(char[] chars) 230 { 231 int length = 0; 232 233 if (chars != null) { 234 length = chars.length; 235 } 236 237 reset(chars, 0, length); 238 } 239 240 /** 241 * Reset the iterator to iterate over <code>count</code> characters 242 * in <code>text</code> starting at <code>start</code>. This allows 243 * clients to reuse an iterator. 244 * 245 * @param str the new string of characters over which to iterate. 246 * @param start the index of the first character over which to iterate. 247 * @param count the nuber of characters over which to iterate. 248 * 249 * @internal 250 * @deprecated This API is ICU internal only. 251 */ 252 @Deprecated 253 public final void reset(String str, int start, int count) 254 { 255 char[] chars = null; 256 257 if (str != null) { 258 chars = str.toCharArray(); 259 } 260 261 reset(chars, start, count); 262 } 263 264 /** 265 * Reset the iterator to iterate over the characters 266 * in <code>text</code>. This allows clients to reuse an iterator. 267 * 268 * @param str the new string of characters over which to iterate. 269 * 270 * @internal 271 * @deprecated This API is ICU internal only. 272 */ 273 @Deprecated 274 public final void reset(String str) 275 { 276 int length = 0; 277 278 if (str != null) { 279 length = str.length(); 280 } 281 282 reset(str, 0, length); 283 } 284 285 286 287 /** 288 * Get the starting index of the current script run. 289 * 290 * @return the index of the first character in the current script run. 291 * 292 * @internal 293 * @deprecated This API is ICU internal only. 294 */ 295 @Deprecated 296 public final int getScriptStart() 297 { 298 return scriptStart; 299 } 300 301 /** 302 * Get the index of the first character after the current script run. 303 * 304 * @return the index of the first character after the current script run. 305 * 306 * @internal 307 * @deprecated This API is ICU internal only. 308 */ 309 @Deprecated 310 public final int getScriptLimit() 311 { 312 return scriptLimit; 313 } 314 315 /** 316 * Get the script code for the script of the current script run. 317 * 318 * @return the script code for the script of the current script run. 319 * @see com.ibm.icu.lang.UScript 320 * 321 * @internal 322 * @deprecated This API is ICU internal only. 323 */ 324 @Deprecated 325 public final int getScriptCode() 326 { 327 return scriptCode; 328 } 329 330 /** 331 * Find the next script run. Returns <code>false</code> if there 332 * isn't another run, returns <code>true</code> if there is. 333 * 334 * @return <code>false</code> if there isn't another run, <code>true</code> if there is. 335 * 336 * @internal 337 * @deprecated This API is ICU internal only. 338 */ 339 @Deprecated 340 public final boolean next() 341 { 342 // if we've fallen off the end of the text, we're done 343 if (scriptLimit >= textLimit) { 344 return false; 345 } 346 347 scriptCode = UScript.COMMON; 348 scriptStart = scriptLimit; 349 350 syncFixup(); 351 352 while (textIndex < textLimit) { 353 int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart); 354 int codePointCount = UTF16.getCharCount(ch); 355 int sc = UScript.getScript(ch); 356 int pairIndex = getPairIndex(ch); 357 358 textIndex += codePointCount; 359 360 // Paired character handling: 361 // 362 // if it's an open character, push it onto the stack. 363 // if it's a close character, find the matching open on the 364 // stack, and use that script code. Any non-matching open 365 // characters above it on the stack will be poped. 366 if (pairIndex >= 0) { 367 if ((pairIndex & 1) == 0) { 368 push(pairIndex, scriptCode); 369 } else { 370 int pi = pairIndex & ~1; 371 372 while (stackIsNotEmpty() && top().pairIndex != pi) { 373 pop(); 374 } 375 376 if (stackIsNotEmpty()) { 377 sc = top().scriptCode; 378 } 379 } 380 } 381 382 if (sameScript(scriptCode, sc)) { 383 if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { 384 scriptCode = sc; 385 386 fixup(scriptCode); 387 } 388 389 // if this character is a close paired character, 390 // pop the matching open character from the stack 391 if (pairIndex >= 0 && (pairIndex & 1) != 0) { 392 pop(); 393 } 394 } else { 395 // We've just seen the first character of 396 // the next run. Back over it so we'll see 397 // it again the next time. 398 textIndex -= codePointCount; 399 break; 400 } 401 } 402 403 scriptLimit = textIndex; 404 return true; 405 } 406 407 /** 408 * Compare two script codes to see if they are in the same script. If one script is 409 * a strong script, and the other is INHERITED or COMMON, it will compare equal. 410 * 411 * @param scriptOne one of the script codes. 412 * @param scriptTwo the other script code. 413 * @return <code>true</code> if the two scripts are the same. 414 * @see com.ibm.icu.lang.UScript 415 */ 416 private static boolean sameScript(int scriptOne, int scriptTwo) 417 { 418 return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; 419 } 420 421 /* 422 * An internal class which holds entries on the paren stack. 423 */ 424 private static final class ParenStackEntry 425 { 426 int pairIndex; 427 int scriptCode; 428 429 public ParenStackEntry(int thePairIndex, int theScriptCode) 430 { 431 pairIndex = thePairIndex; 432 scriptCode = theScriptCode; 433 } 434 } 435 436 private static final int mod(int sp) 437 { 438 return sp % PAREN_STACK_DEPTH; 439 } 440 441 private static final int inc(int sp, int count) 442 { 443 return mod(sp + count); 444 } 445 446 private static final int inc(int sp) 447 { 448 return inc(sp, 1); 449 } 450 451 private static final int dec(int sp, int count) 452 { 453 return mod(sp + PAREN_STACK_DEPTH - count); 454 } 455 456 private static final int dec(int sp) 457 { 458 return dec(sp, 1); 459 } 460 461 private static final int limitInc(int count) 462 { 463 if (count < PAREN_STACK_DEPTH) { 464 count += 1; 465 } 466 467 return count; 468 } 469 470 private final boolean stackIsEmpty() 471 { 472 return pushCount <= 0; 473 } 474 475 private final boolean stackIsNotEmpty() 476 { 477 return ! stackIsEmpty(); 478 } 479 480 private final void push(int pairIndex, int scrptCode) 481 { 482 pushCount = limitInc(pushCount); 483 fixupCount = limitInc(fixupCount); 484 485 parenSP = inc(parenSP); 486 parenStack[parenSP] = new ParenStackEntry(pairIndex, scrptCode); 487 } 488 489 private final void pop() 490 { 491 492 if (stackIsEmpty()) { 493 return; 494 } 495 496 parenStack[parenSP] = null; 497 498 if (fixupCount > 0) { 499 fixupCount -= 1; 500 } 501 502 pushCount -= 1; 503 parenSP = dec(parenSP); 504 505 // If the stack is now empty, reset the stack 506 // pointers to their initial values. 507 if (stackIsEmpty()) { 508 parenSP = -1; 509 } 510 } 511 512 private final ParenStackEntry top() 513 { 514 return parenStack[parenSP]; 515 } 516 517 private final void syncFixup() 518 { 519 fixupCount = 0; 520 } 521 522 private final void fixup(int scrptCode) 523 { 524 int fixupSP = dec(parenSP, fixupCount); 525 526 while (fixupCount-- > 0) { 527 fixupSP = inc(fixupSP); 528 parenStack[fixupSP].scriptCode = scrptCode; 529 } 530 } 531 532 private char[] emptyCharArray = {}; 533 534 private char[] text; 535 536 private int textIndex; 537 private int textStart; 538 private int textLimit; 539 540 private int scriptStart; 541 private int scriptLimit; 542 private int scriptCode; 543 544 private static int PAREN_STACK_DEPTH = 32; 545 private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH]; 546 private int parenSP = -1; 547 private int pushCount = 0; 548 private int fixupCount = 0; 549 550 /** 551 * Find the highest bit that's set in a word. Uses a binary search through 552 * the bits. 553 * 554 * @param n the word in which to find the highest bit that's set. 555 * @return the bit number (counting from the low order bit) of the highest bit. 556 */ 557 private static final byte highBit(int n) 558 { 559 if (n <= 0) { 560 return -32; 561 } 562 563 byte bit = 0; 564 565 if (n >= 1 << 16) { 566 n >>= 16; 567 bit += 16; 568 } 569 570 if (n >= 1 << 8) { 571 n >>= 8; 572 bit += 8; 573 } 574 575 if (n >= 1 << 4) { 576 n >>= 4; 577 bit += 4; 578 } 579 580 if (n >= 1 << 2) { 581 n >>= 2; 582 bit += 2; 583 } 584 585 if (n >= 1 << 1) { 586 n >>= 1; 587 bit += 1; 588 } 589 590 return bit; 591 } 592 593 /** 594 * Search the pairedChars array for the given character. 595 * 596 * @param ch the character for which to search. 597 * @return the index of the character in the table, or -1 if it's not there. 598 */ 599 private static int getPairIndex(int ch) 600 { 601 int probe = pairedCharPower; 602 int index = 0; 603 604 if (ch >= pairedChars[pairedCharExtra]) { 605 index = pairedCharExtra; 606 } 607 608 while (probe > (1 << 0)) { 609 probe >>= 1; 610 611 if (ch >= pairedChars[index + probe]) { 612 index += probe; 613 } 614 } 615 616 if (pairedChars[index] != ch) { 617 index = -1; 618 } 619 620 return index; 621 } 622 623 private static int pairedChars[] = { 624 0x0028, 0x0029, // ascii paired punctuation 625 0x003c, 0x003e, 626 0x005b, 0x005d, 627 0x007b, 0x007d, 628 0x00ab, 0x00bb, // guillemets 629 0x2018, 0x2019, // general punctuation 630 0x201c, 0x201d, 631 0x2039, 0x203a, 632 0x3008, 0x3009, // chinese paired punctuation 633 0x300a, 0x300b, 634 0x300c, 0x300d, 635 0x300e, 0x300f, 636 0x3010, 0x3011, 637 0x3014, 0x3015, 638 0x3016, 0x3017, 639 0x3018, 0x3019, 640 0x301a, 0x301b 641 }; 642 643 private static int pairedCharPower = 1 << highBit(pairedChars.length); 644 private static int pairedCharExtra = pairedChars.length - pairedCharPower; 645 } 646 647