1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.text.MessageFormat; 12 import java.util.ArrayList; 13 import java.util.Collections; 14 import java.util.Enumeration; 15 import java.util.HashMap; 16 import java.util.List; 17 import java.util.Locale; 18 import java.util.Map; 19 import java.util.MissingResourceException; 20 21 import com.ibm.icu.impl.ICUData; 22 import com.ibm.icu.impl.ICUResourceBundle; 23 import com.ibm.icu.impl.Utility; 24 import com.ibm.icu.impl.UtilityExtensions; 25 import com.ibm.icu.text.RuleBasedTransliterator.Data; 26 import com.ibm.icu.text.TransliteratorIDParser.SingleID; 27 import com.ibm.icu.util.CaseInsensitiveString; 28 import com.ibm.icu.util.ULocale; 29 import com.ibm.icu.util.ULocale.Category; 30 import com.ibm.icu.util.UResourceBundle; 31 32 /** 33 * <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common 34 * kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator 35 * changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not 36 * <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference 37 * to the meanings of words and sentences. 38 * 39 * <p> 40 * Although script conversion is its most common use, a transliterator can actually perform a more general class of 41 * tasks. 
In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the 42 * input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of 43 * <code>Transliterator</code>. 44 * 45 * <p> 46 * <b>Transliterators are stateless</b> 47 * 48 * <p> 49 * <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to 50 * <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might 51 * seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex 52 * transliterations by delaying the replacement of text until it is known that no other replacements are possible. In 53 * other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the 54 * needed information, and delayed operation allows arbitrary complexity. 55 * 56 * <p> 57 * <b>Batch transliteration</b> 58 * 59 * <p> 60 * The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as 61 * <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>, 62 * the call 63 * 64 * <blockquote><code>String result = t.transliterate(input); 65 * </code></blockquote> 66 * 67 * will transliterate it and return the result. Other methods allow the client to specify a substring to be 68 * transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band 69 * information (such as text styles). 70 * 71 * <p> 72 * <b>Keyboard transliteration</b> 73 * 74 * <p> 75 * Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that 76 * is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal 77 * fashion. 
78 * 79 * <p> 80 * In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as 81 * possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being 82 * modified as each new character arrives. 83 * 84 * <p> 85 * Consider the simple <code>RuleBasedTransliterator</code>: 86 * 87 * <blockquote><code> 88 * th>{theta}<br> 89 * t>{tau} 90 * </code></blockquote> 91 * 92 * When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is 93 * 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string: 94 * 95 * <blockquote><code> 96 * t>|{tau}<br> 97 * {tau}h>{theta} 98 * </code></blockquote> 99 * 100 * Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is 101 * accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across 102 * calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a 103 * case like the one above, it will precede the insertion point. 104 * 105 * <p> 106 * Keyboard transliteration methods maintain a set of three indices that are updated with each call to 107 * <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and 108 * they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring 109 * that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index; 110 * that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the 111 * transliterator last stopped, either because it reached the end, or because it required more characters to 112 * disambiguate between possible inputs. 
The <code>cursor</code> can also be explicitly set by rules in a
 * <code>RuleBasedTransliterator</code>. Any characters before the <code>cursor</code> index are frozen; future keyboard
 * transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code>
 * index, which marks the end of the substring that the transliterator looks at.
 *
 * <p>
 * Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It
 * only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the
 * client code knows that no more characters are forthcoming, perhaps because the user has performed some input
 * termination operation, then it should call <code>finishTransliteration()</code> to complete any pending
 * transliterations.
 *
 * <p>
 * <b>Inverses</b>
 *
 * <p>
 * Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates
 * characters by incrementing their Unicode value (so "abc" -> "def"), and transliterator <b>B</b> decrements character
 * values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound
 * transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input
 * text.
 *
 * <p>
 * The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists,
 * or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true
 * mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider
 * two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates
 * 'B' to 'A'.
It might seem that these are exact inverses, since 139 * 140 * <blockquote>"A" x <b>AB</b> -> "B"<br> 141 * "B" x <b>BA</b> -> "A"</blockquote> 142 * 143 * where 'x' represents transliteration. However, 144 * 145 * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br> 146 * "BBCD" x <b>BA</b> -> "AACD"</blockquote> 147 * 148 * so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be 149 * <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return 150 * <b>BA</b>. 151 * 152 * <p> 153 * <b>Filtering</b> 154 * <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The 155 * filter affects just the characters that are changed -- the characters outside of the filter are still part of the 156 * context for the filter. For example, in the following even though 'x' is filtered out, and doesn't convert to y, it does affect the conversion of 'a'. 157 * 158 * <pre> 159 * String rules = "x > y; x{a} > b; "; 160 * Transliterator tempTrans = Transliterator.createFromRules("temp", rules, Transliterator.FORWARD); 161 * tempTrans.setFilter(new UnicodeSet("[a]")); 162 * String tempResult = tempTrans.transform("xa"); 163 * // results in "xb" 164 *</pre> 165 * <p> 166 * <b>IDs and display names</b> 167 * 168 * <p> 169 * A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format 170 * <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em> 171 * describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of 172 * characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from 173 * Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1 174 * characters might be named "KeyboardEscape-Latin1". 
By convention, system entity names are in English, with the 175 * initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes. 176 * 177 * <p> 178 * In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces, 179 * returned by {@link #getDisplayName}. 180 * 181 * <p> 182 * <b>Factory methods and registration</b> 183 * 184 * <p> 185 * In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a 186 * transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators 187 * are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object. 188 * 189 * <p> 190 * In addition to the system transliterators registered at startup, user transliterators may be registered by calling 191 * <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it 192 * is needed), users may call <code>registerClass()</code>. 193 * 194 * <p> 195 * <b>Composed transliterators</b> 196 * 197 * <p> 198 * In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em> 199 * transliterators. These are implemented by composing two or more component transliterators. For example, if we have 200 * scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12 201 * transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an 202 * intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M", 203 * "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em> 204 * <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. 
With 9 scripts, it's 18 vs. 72
 * rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that
 * the given transliterator is intended to be composed with others, rather than be used as is.
 *
 * <p>
 * Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati"
 * is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this
 * transliterator is instantiated, it appears externally to be a standard transliterator (e.g.,
 * <code>getID()</code> returns "Devanagari-Gujarati").
 *
 * <p>
 * <b>Subclassing</b>
 *
 * <p>
 * Subclasses must implement the abstract method <code>handleTransliterate()</code>.
 * <p>
 * Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the
 * <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of
 * these methods can be improved over the performance obtained by the default implementations in this class.
 *
 * @author Alan Liu
 * @stable ICU 2.0
 */
public abstract class Transliterator implements StringTransform {
    /**
     * Direction constant indicating the forward direction in a transliterator,
     * e.g., the forward rules of a RuleBasedTransliterator. An "A-B"
     * transliterator transliterates A to B when operating in the forward
     * direction, and B to A when operating in the reverse direction.
     * @see #REVERSE
     * @stable ICU 2.0
     */
    public static final int FORWARD = 0;

    /**
     * Direction constant indicating the reverse direction in a transliterator,
     * e.g., the reverse rules of a RuleBasedTransliterator. An "A-B"
     * transliterator transliterates A to B when operating in the forward
     * direction, and B to A when operating in the reverse direction.
242 * @stable ICU 2.0 243 */ 244 public static final int REVERSE = 1; 245 246 /** 247 * Position structure for incremental transliteration. This data 248 * structure defines two substrings of the text being 249 * transliterated. The first region, [contextStart, 250 * contextLimit), defines what characters the transliterator will 251 * read as context. The second region, [start, limit), defines 252 * what characters will actually be transliterated. The second 253 * region should be a subset of the first. 254 * 255 * <p>After a transliteration operation, some of the indices in this 256 * structure will be modified. See the field descriptions for 257 * details. 258 * 259 * <p>contextStart <= start <= limit <= contextLimit 260 * 261 * <p>Note: All index values in this structure must be at code point 262 * boundaries. That is, none of them may occur between two code units 263 * of a surrogate pair. If any index does split a surrogate pair, 264 * results are unspecified. 265 * @stable ICU 2.0 266 */ 267 public static class Position { 268 269 /** 270 * Beginning index, inclusive, of the context to be considered for 271 * a transliteration operation. The transliterator will ignore 272 * anything before this index. INPUT/OUTPUT parameter: This parameter 273 * is updated by a transliteration operation to reflect the maximum 274 * amount of antecontext needed by a transliterator. 275 * @stable ICU 2.0 276 */ 277 public int contextStart; 278 279 /** 280 * Ending index, exclusive, of the context to be considered for a 281 * transliteration operation. The transliterator will ignore 282 * anything at or after this index. INPUT/OUTPUT parameter: This 283 * parameter is updated to reflect changes in the length of the 284 * text, but points to the same logical position in the text. 285 * @stable ICU 2.0 286 */ 287 public int contextLimit; 288 289 /** 290 * Beginning index, inclusive, of the text to be transliteratd. 
291 * INPUT/OUTPUT parameter: This parameter is advanced past 292 * characters that have already been transliterated by a 293 * transliteration operation. 294 * @stable ICU 2.0 295 */ 296 public int start; 297 298 /** 299 * Ending index, exclusive, of the text to be transliteratd. 300 * INPUT/OUTPUT parameter: This parameter is updated to reflect 301 * changes in the length of the text, but points to the same 302 * logical position in the text. 303 * @stable ICU 2.0 304 */ 305 public int limit; 306 307 /** 308 * Constructs a Position object with start, limit, 309 * contextStart, and contextLimit all equal to zero. 310 * @stable ICU 2.0 311 */ 312 public Position() { 313 this(0, 0, 0, 0); 314 } 315 316 /** 317 * Constructs a Position object with the given start, 318 * contextStart, and contextLimit. The limit is set to the 319 * contextLimit. 320 * @stable ICU 2.0 321 */ 322 public Position(int contextStart, int contextLimit, int start) { 323 this(contextStart, contextLimit, start, contextLimit); 324 } 325 326 /** 327 * Constructs a Position object with the given start, limit, 328 * contextStart, and contextLimit. 329 * @stable ICU 2.0 330 */ 331 public Position(int contextStart, int contextLimit, 332 int start, int limit) { 333 this.contextStart = contextStart; 334 this.contextLimit = contextLimit; 335 this.start = start; 336 this.limit = limit; 337 } 338 339 /** 340 * Constructs a Position object that is a copy of another. 341 * @stable ICU 2.6 342 */ 343 public Position(Position pos) { 344 set(pos); 345 } 346 347 /** 348 * Copies the indices of this position from another. 349 * @stable ICU 2.6 350 */ 351 public void set(Position pos) { 352 contextStart = pos.contextStart; 353 contextLimit = pos.contextLimit; 354 start = pos.start; 355 limit = pos.limit; 356 } 357 358 /** 359 * Returns true if this Position is equal to the given object. 
360 * @stable ICU 2.6 361 */ 362 @Override 363 public boolean equals(Object obj) { 364 if (obj instanceof Position) { 365 Position pos = (Position) obj; 366 return contextStart == pos.contextStart && 367 contextLimit == pos.contextLimit && 368 start == pos.start && 369 limit == pos.limit; 370 } 371 return false; 372 } 373 374 /** 375 * Mock implementation of hashCode(). This implementation always returns a constant 376 * value. When Java assertion is enabled, this method triggers an assertion failure. 377 * @internal 378 * @deprecated This API is ICU internal only. 379 */ 380 @Override 381 @Deprecated 382 public int hashCode() { 383 assert false : "hashCode not designed"; 384 return 42; 385 } 386 387 /** 388 * Returns a string representation of this Position. 389 * @stable ICU 2.6 390 */ 391 @Override 392 public String toString() { 393 return "[cs=" + contextStart 394 + ", s=" + start 395 + ", l=" + limit 396 + ", cl=" + contextLimit 397 + "]"; 398 } 399 400 /** 401 * Check all bounds. If they are invalid, throw an exception. 402 * @param length the length of the string this object applies to 403 * @exception IllegalArgumentException if any indices are out 404 * of bounds 405 * @stable ICU 2.0 406 */ 407 public final void validate(int length) { 408 if (contextStart < 0 || 409 start < contextStart || 410 limit < start || 411 contextLimit < limit || 412 length < contextLimit) { 413 throw new IllegalArgumentException("Invalid Position {cs=" + 414 contextStart + ", s=" + 415 start + ", l=" + 416 limit + ", cl=" + 417 contextLimit + "}, len=" + 418 length); 419 } 420 } 421 } 422 423 /** 424 * Programmatic name, e.g., "Latin-Arabic". 425 */ 426 private String ID; 427 428 /** 429 * This transliterator's filter. Any character for which 430 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 431 * altered by this transliterator. If <tt>filter</tt> is 432 * <tt>null</tt> then no filtering is applied. 
*/
    private UnicodeSet filter;

    /**
     * Maximum context length; initialized to 0.
     * NOTE(review): presumably the longest context this transliterator
     * needs to look at -- the code that reads and writes this field is
     * outside this section, so confirm there.
     */
    private int maximumContextLength = 0;

    /**
     * System transliterator registry.
     */
    private static TransliteratorRegistry registry;

    /**
     * Map from a case-insensitive string key to a String.
     * NOTE(review): presumably caches display names keyed by transliterator
     * ID -- it is populated outside this section, so confirm there.
     */
    private static Map<CaseInsensitiveString, String> displayNameCache;

    /**
     * Prefix for resource bundle key for the display name for a
     * transliterator. The ID is appended to this to form the key.
     * The resource bundle value should be a String.
     */
    private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%";

    /**
     * Prefix for resource bundle key for the display name for a
     * transliterator SCRIPT. The ID is appended to this to form the key.
     * The resource bundle value should be a String.
     */
    private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%";

    /**
     * Resource bundle key for display name pattern.
     * The resource bundle value should be a String forming a
     * MessageFormat pattern, e.g.:
     * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
     */
    private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";

    /**
     * Delimiter between elements in a compound ID.
     */
    static final char ID_DELIM = ';';

    /**
     * Delimiter before target in an ID.
     */
    static final char ID_SEP = '-';

    /**
     * Delimiter before variant in an ID.
     */
    static final char VARIANT_SEP = '/';

    /**
     * To enable debugging output in the Transliterator component, set
     * DEBUG to true.
     *
     * N.B. Make sure to recompile all of the com.ibm.icu.text package
     * after changing this. Easiest way to do this is 'ant clean
     * core' ('ant' will NOT pick up the dependency automatically).
     *
     * <<This generates a lot of output.>>
     */
    static final boolean DEBUG = false;

    /**
     * Default constructor.
     * @param ID the string identifier for this transliterator
     * @param filter the filter.
Any character for which 498 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 499 * altered by this transliterator. If <tt>filter</tt> is 500 * <tt>null</tt> then no filtering is applied. 501 * @stable ICU 2.0 502 */ 503 protected Transliterator(String ID, UnicodeFilter filter) { 504 if (ID == null) { 505 throw new NullPointerException(); 506 } 507 this.ID = ID; 508 setFilter(filter); 509 } 510 511 /** 512 * Transliterates a segment of a string, with optional filtering. 513 * 514 * @param text the string to be transliterated 515 * @param start the beginning index, inclusive; <code>0 <= start 516 * <= limit</code>. 517 * @param limit the ending index, exclusive; <code>start <= limit 518 * <= text.length()</code>. 519 * @return The new limit index. The text previously occupying <code>[start, 520 * limit)</code> has been transliterated, possibly to a string of a different 521 * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where 522 * <em>new-limit</em> is the return value. If the input offsets are out of bounds, 523 * the returned value is -1 and the input string remains unchanged. 524 * @stable ICU 2.0 525 */ 526 public final int transliterate(Replaceable text, int start, int limit) { 527 if (start < 0 || 528 limit < start || 529 text.length() < limit) { 530 return -1; 531 } 532 533 Position pos = new Position(start, limit, start); 534 filteredTransliterate(text, pos, false, true); 535 return pos.limit; 536 } 537 538 /** 539 * Transliterates an entire string in place. Convenience method. 540 * @param text the string to be transliterated 541 * @stable ICU 2.0 542 */ 543 public final void transliterate(Replaceable text) { 544 transliterate(text, 0, text.length()); 545 } 546 547 /** 548 * Transliterate an entire string and returns the result. Convenience method. 
549 * 550 * @param text the string to be transliterated 551 * @return The transliterated text 552 * @stable ICU 2.0 553 */ 554 public final String transliterate(String text) { 555 ReplaceableString result = new ReplaceableString(text); 556 transliterate(result); 557 return result.toString(); 558 } 559 560 /** 561 * Transliterates the portion of the text buffer that can be 562 * transliterated unambiguosly after new text has been inserted, 563 * typically as a result of a keyboard event. The new text in 564 * <code>insertion</code> will be inserted into <code>text</code> 565 * at <code>index.contextLimit</code>, advancing 566 * <code>index.contextLimit</code> by <code>insertion.length()</code>. 567 * Then the transliterator will try to transliterate characters of 568 * <code>text</code> between <code>index.start</code> and 569 * <code>index.contextLimit</code>. Characters before 570 * <code>index.start</code> will not be changed. 571 * 572 * <p>Upon return, values in <code>index</code> will be updated. 573 * <code>index.contextStart</code> will be advanced to the first 574 * character that future calls to this method will read. 575 * <code>index.start</code> and <code>index.contextLimit</code> will 576 * be adjusted to delimit the range of text that future calls to 577 * this method may change. 578 * 579 * <p>Typical usage of this method begins with an initial call 580 * with <code>index.contextStart</code> and <code>index.contextLimit</code> 581 * set to indicate the portion of <code>text</code> to be 582 * transliterated, and <code>index.start == index.contextStart</code>. 583 * Thereafter, <code>index</code> can be used without 584 * modification in future calls, provided that all changes to 585 * <code>text</code> are made via this method. 586 * 587 * <p>This method assumes that future calls may be made that will 588 * insert new text into the buffer. As a result, it only performs 589 * unambiguous transliterations. 
After the last call to this 590 * method, there may be untransliterated text that is waiting for 591 * more input to resolve an ambiguity. In order to perform these 592 * pending transliterations, clients should call {@link 593 * #finishTransliteration} after the last call to this 594 * method has been made. 595 * 596 * @param text the buffer holding transliterated and untransliterated text 597 * @param index the start and limit of the text, the position 598 * of the cursor, and the start and limit of transliteration. 599 * @param insertion text to be inserted and possibly 600 * transliterated into the translation buffer at 601 * <code>index.contextLimit</code>. If <code>null</code> then no text 602 * is inserted. 603 * @see #handleTransliterate 604 * @exception IllegalArgumentException if <code>index</code> 605 * is invalid 606 * @stable ICU 2.0 607 */ 608 public final void transliterate(Replaceable text, Position index, 609 String insertion) { 610 index.validate(text.length()); 611 612 // int originalStart = index.contextStart; 613 if (insertion != null) { 614 text.replace(index.limit, index.limit, insertion); 615 index.limit += insertion.length(); 616 index.contextLimit += insertion.length(); 617 } 618 619 if (index.limit > 0 && 620 UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) { 621 // Oops, there is a dangling lead surrogate in the buffer. 622 // This will break most transliterators, since they will 623 // assume it is part of a pair. Don't transliterate until 624 // more text comes in. 625 return; 626 } 627 628 filteredTransliterate(text, index, true, true); 629 630 // TODO 631 // This doesn't work once we add quantifier support. Need to rewrite 632 // this code to support quantifiers and 'use maximum backup <n>;'. 
633 // 634 // index.contextStart = Math.max(index.start - getMaximumContextLength(), 635 // originalStart); 636 } 637 638 /** 639 * Transliterates the portion of the text buffer that can be 640 * transliterated unambiguosly after a new character has been 641 * inserted, typically as a result of a keyboard event. This is a 642 * convenience method; see {@link #transliterate(Replaceable, 643 * Transliterator.Position, String)} for details. 644 * @param text the buffer holding transliterated and 645 * untransliterated text 646 * @param index the start and limit of the text, the position 647 * of the cursor, and the start and limit of transliteration. 648 * @param insertion text to be inserted and possibly 649 * transliterated into the translation buffer at 650 * <code>index.contextLimit</code>. 651 * @see #transliterate(Replaceable, Transliterator.Position, String) 652 * @stable ICU 2.0 653 */ 654 public final void transliterate(Replaceable text, Position index, 655 int insertion) { 656 transliterate(text, index, UTF16.valueOf(insertion)); 657 } 658 659 /** 660 * Transliterates the portion of the text buffer that can be 661 * transliterated unambiguosly. This is a convenience method; see 662 * {@link #transliterate(Replaceable, Transliterator.Position, 663 * String)} for details. 664 * @param text the buffer holding transliterated and 665 * untransliterated text 666 * @param index the start and limit of the text, the position 667 * of the cursor, and the start and limit of transliteration. 668 * @see #transliterate(Replaceable, Transliterator.Position, String) 669 * @stable ICU 2.0 670 */ 671 public final void transliterate(Replaceable text, Position index) { 672 transliterate(text, index, null); 673 } 674 675 /** 676 * Finishes any pending transliterations that were waiting for 677 * more characters. Clients should call this method as the last 678 * call after a sequence of one or more calls to 679 * <code>transliterate()</code>. 
680 * @param text the buffer holding transliterated and 681 * untransliterated text. 682 * @param index the array of indices previously passed to {@link 683 * #transliterate} 684 * @stable ICU 2.0 685 */ 686 public final void finishTransliteration(Replaceable text, 687 Position index) { 688 index.validate(text.length()); 689 filteredTransliterate(text, index, false, true); 690 } 691 692 /** 693 * Abstract method that concrete subclasses define to implement 694 * their transliteration algorithm. This method handles both 695 * incremental and non-incremental transliteration. Let 696 * <code>originalStart</code> refer to the value of 697 * <code>pos.start</code> upon entry. 698 * 699 * <ul> 700 * <li>If <code>incremental</code> is false, then this method 701 * should transliterate all characters between 702 * <code>pos.start</code> and <code>pos.limit</code>. Upon return 703 * <code>pos.start</code> must == <code> pos.limit</code>.</li> 704 * 705 * <li>If <code>incremental</code> is true, then this method 706 * should transliterate all characters between 707 * <code>pos.start</code> and <code>pos.limit</code> that can be 708 * unambiguously transliterated, regardless of future insertions 709 * of text at <code>pos.limit</code>. Upon return, 710 * <code>pos.start</code> should be in the range 711 * [<code>originalStart</code>, <code>pos.limit</code>). 712 * <code>pos.start</code> should be positioned such that 713 * characters [<code>originalStart</code>, <code> 714 * pos.start</code>) will not be changed in the future by this 715 * transliterator and characters [<code>pos.start</code>, 716 * <code>pos.limit</code>) are unchanged.</li> 717 * </ul> 718 * 719 * <p>Implementations of this method should also obey the 720 * following invariants:</p> 721 * 722 * <ul> 723 * <li> <code>pos.limit</code> and <code>pos.contextLimit</code> 724 * should be updated to reflect changes in length of the text 725 * between <code>pos.start</code> and <code>pos.limit</code>. 
The 726 * difference <code> pos.contextLimit - pos.limit</code> should 727 * not change.</li> 728 * 729 * <li><code>pos.contextStart</code> should not change.</li> 730 * 731 * <li>Upon return, neither <code>pos.start</code> nor 732 * <code>pos.limit</code> should be less than 733 * <code>originalStart</code>.</li> 734 * 735 * <li>Text before <code>originalStart</code> and text after 736 * <code>pos.limit</code> should not change.</li> 737 * 738 * <li>Text before <code>pos.contextStart</code> and text after 739 * <code> pos.contextLimit</code> should be ignored.</li> 740 * </ul> 741 * 742 * <p>Subclasses may safely assume that all characters in 743 * [<code>pos.start</code>, <code>pos.limit</code>) are filtered. 744 * In other words, the filter has already been applied by the time 745 * this method is called. See 746 * <code>filteredTransliterate()</code>. 747 * 748 * <p>This method is <b>not</b> for public consumption. Calling 749 * this method directly will transliterate 750 * [<code>pos.start</code>, <code>pos.limit</code>) without 751 * applying the filter. End user code should call <code> 752 * transliterate()</code> instead of this method. Subclass code 753 * should call <code>filteredTransliterate()</code> instead of 754 * this method.<p> 755 * 756 * @param text the buffer holding transliterated and 757 * untransliterated text 758 * 759 * @param pos the indices indicating the start, limit, context 760 * start, and context limit of the text. 761 * 762 * @param incremental if true, assume more text may be inserted at 763 * <code>pos.limit</code> and act accordingly. Otherwise, 764 * transliterate all text between <code>pos.start</code> and 765 * <code>pos.limit</code> and move <code>pos.start</code> up to 766 * <code>pos.limit</code>. 
*
 * @see #transliterate
 * @stable ICU 2.0
 */
protected abstract void handleTransliterate(Replaceable text,
                                            Position pos, boolean incremental);

/**
 * Top-level transliteration method, handling filtering, incremental and
 * non-incremental transliteration, and rollback.  All transliteration
 * public API methods eventually call this method with a rollback argument
 * of TRUE.  Other entities may call this method but rollback should be
 * FALSE.
 *
 * <p>If this transliterator has a filter, break up the input text into runs
 * of unfiltered characters.  Pass each run to
 * {@code <subclass>.handleTransliterate()}.
 *
 * <p>In incremental mode, if rollback is TRUE, perform a special
 * incremental procedure in which several passes are made over the input
 * text, adding one character at a time, and committing successful
 * transliterations as they occur.  Unsuccessful transliterations are rolled
 * back and retried with additional characters to give correct results.
 *
 * <p>While a rollback run is in progress, a copy of the run is appended at
 * the end of <code>text</code> and deleted again before this method
 * returns.  On return <code>index.limit</code> is the original limit
 * adjusted for all insertions and deletions.
 *
 * @param text the text to be transliterated
 * @param index the position indices
 * @param incremental if TRUE, then assume more characters may be inserted
 * at index.limit, and postpone processing to accommodate future incoming
 * characters
 * @param rollback if TRUE and if incremental is TRUE, then perform special
 * incremental processing, as described above, and undo partial
 * transliterations where necessary.  If incremental is FALSE then this
 * parameter is ignored.
 * @throws RuntimeException if a non-incremental run is handed to
 * <code>handleTransliterate()</code> and it returns with
 * <code>index.start != index.limit</code>
 */
private void filteredTransliterate(Replaceable text,
                                   Position index,
                                   boolean incremental,
                                   boolean rollback) {
    // Short circuit path for transliterators with no filter when
    // rollback processing has not been requested: there are no runs
    // to split and nothing to undo, so delegate directly.
    if (filter == null && !rollback) {
        handleTransliterate(text, index, incremental);
        return;
    }

    //----------------------------------------------------------------------
    // This method processes text in two groupings:
    //
    // RUNS -- A run is a contiguous group of characters which are contained
    // in the filter for this transliterator (filter.contains(ch) == true).
    // Text outside of runs may appear as context but it is not modified.
    // The start and limit Position values are narrowed to each run.
    //
    // PASSES (incremental only) -- To make incremental mode work correctly,
    // each run is broken up into n passes, where n is the length (in code
    // points) of the run.  Each pass contains the first n characters.  If a
    // pass is completely transliterated, it is committed, and further passes
    // include characters after the committed text.  If a pass is blocked,
    // and does not transliterate completely, then this method rolls back
    // the changes made during the pass, extends the pass by one code point,
    // and tries again.
    //----------------------------------------------------------------------

    // globalLimit is the limit value for the entire operation.  We
    // set index.limit to the end of each unfiltered run before
    // calling handleTransliterate(), so we need to maintain the real
    // value of index.limit here.  After each transliteration, we
    // update globalLimit for insertions or deletions that have
    // happened.
    int globalLimit = index.limit;

    // If there is a non-null filter, then break the input text up.  Say the
    // input text has the form:
    //   xxxabcxxdefxx
    // where 'x' represents a filtered character (filter.contains('x') ==
    // false).  Then we break this up into:
    //   xxxabc xxdef xx
    // Each pass through the loop consumes a run of filtered
    // characters (which are ignored) and a subsequent run of
    // unfiltered characters (which are transliterated).

    // Trace buffer; only allocated and used when DEBUG is set.
    StringBuffer log = null;
    if (DEBUG) {
        log = new StringBuffer();
    }

    for (;;) {

        if (filter != null) {
            // Narrow the range to be transliterated to the first run
            // of unfiltered characters at or after index.start.

            // Advance past filtered chars
            int c;
            while (index.start < globalLimit &&
                   !filter.contains(c=text.char32At(index.start))) {
                index.start += UTF16.getCharCount(c);
            }

            // Find the end of this run of unfiltered chars
            index.limit = index.start;
            while (index.limit < globalLimit &&
                   filter.contains(c=text.char32At(index.limit))) {
                index.limit += UTF16.getCharCount(c);
            }
        }

        // Check to see if the unfiltered run is empty.  This only
        // happens at the end of the string when all the remaining
        // characters are filtered.
        if (index.start == index.limit) {
            break;
        }

        // Is this run incremental?  If there is additional
        // filtered text (if limit < globalLimit) then we pass in
        // an incremental value of FALSE to force the subclass to
        // complete the transliteration for this run.
        boolean isIncrementalRun =
            (index.limit < globalLimit ? false : incremental);

        // Change in text length produced by the most recent
        // handleTransliterate() call.
        int delta;

        // Implement rollback.  To understand the need for rollback,
        // consider the following transliterator:
        //
        //  "t" is "a > A;"
        //  "u" is "A > b;"
        //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
        //
        // Now apply "v" to the input text "a".  The result is "b".  But if
        // the transliteration is done incrementally, then the NFD holds
        // things up after "t" has already transformed "a" to "A".  When
        // finishTransliteration() is called, "A" is _not_ processed because
        // it gets excluded by the [:Ll:] filter, and the end result is "A"
        // -- incorrect.  The problem is that the filter is applied to a
        // partially-transliterated result, when we only want it to apply to
        // input text.  Although this example describes a compound
        // transliterator containing NFD and a specific filter, it can
        // happen with any transliterator which does a partial
        // transformation in incremental mode into characters outside its
        // filter.
        //
        // To handle this, when in incremental mode we supply characters to
        // handleTransliterate() in several passes.  Each pass adds one more
        // input character to the input text.  That is, for input "ABCD", we
        // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
        // any point we block (upon return, start < limit) then we roll
        // back.  If at any point we complete the run (upon return start ==
        // limit) then we commit that run.

        if (rollback && isIncrementalRun) {

            if (DEBUG) {
                log.setLength(0);
                System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
                                   UtilityExtensions.formatInput(text, index));
            }

            int runStart = index.start;
            int runLimit = index.limit;
            int runLength = runLimit - runStart;

            // Make a rollback copy at the end of the string
            int rollbackOrigin = text.length();
            text.copy(runStart, runLimit, rollbackOrigin);

            // Variables reflecting the commitment of completely
            // transliterated text.  passStart is the runStart, advanced
            // past committed text.  rollbackStart is the rollbackOrigin,
            // advanced past rollback text that corresponds to committed
            // text.
            int passStart = runStart;
            int rollbackStart = rollbackOrigin;

            // The limit for each pass; we advance by one code point with
            // each iteration.
            int passLimit = index.start;

            // Total length, in 16-bit code units, of uncommitted text.
            // This is the length to be rolled back.
            int uncommittedLength = 0;

            // Total delta (change in length) for all passes
            int totalDelta = 0;

            // PASS MAIN LOOP -- Start with a single character, and extend
            // the text by one character at a time.  Roll back partial
            // transliterations and commit complete transliterations.
            for (;;) {
                // Length of additional code point, either one or two
                int charLength =
                    UTF16.getCharCount(text.char32At(passLimit));
                passLimit += charLength;
                // Extending the pass beyond the run means every code
                // point of the run has been offered; stop.
                if (passLimit > runLimit) {
                    break;
                }
                uncommittedLength += charLength;

                index.limit = passLimit;

                if (DEBUG) {
                    log.setLength(0);
                    log.append("filteredTransliterate{"+getID()+"}i: ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                // Delegate to subclass for actual transliteration.  Upon
                // return, start will be updated to point after the
                // transliterated text, and limit and contextLimit will be
                // adjusted for length changes.
                handleTransliterate(text, index, true);

                if (DEBUG) {
                    log.append(" => ");
                    UtilityExtensions.formatInput(log, text, index);
                }

                delta = index.limit - passLimit; // change in length

                // We failed to completely transliterate this pass.
                // Roll back the text.  Indices remain unchanged; reset
                // them where necessary.
                if (index.start != index.limit) {
                    // Find the rollbackStart, adjusted for length changes
                    // and the deletion of partially transliterated text.
                    int rs = rollbackStart + delta - (index.limit - passStart);

                    // Delete the partially transliterated text
                    text.replace(passStart, index.limit, "");

                    // Copy the rollback text back
                    text.copy(rs, rs + uncommittedLength, passStart);

                    // Restore indices to their original values
                    index.start = passStart;
                    index.limit = passLimit;
                    index.contextLimit -= delta;

                    if (DEBUG) {
                        log.append(" (ROLLBACK)");
                    }
                }

                // We did completely transliterate this pass.  Update the
                // commit indices to record how far we got.  Adjust indices
                // for length change.
                else {
                    // Move the pass indices past the committed text.
                    passStart = passLimit = index.start;

                    // Adjust the rollbackStart for length changes and move
                    // it past the committed text.  All characters we've
                    // processed to this point are committed now, so zero
                    // out the uncommittedLength.
                    rollbackStart += delta + uncommittedLength;
                    uncommittedLength = 0;

                    // Adjust indices for length changes.
                    runLimit += delta;
                    totalDelta += delta;
                }

                if (DEBUG) {
                    System.out.println(Utility.escape(log.toString()));
                }
            }

            // Adjust overall limit and rollbackOrigin for insertions and
            // deletions.  Don't need to worry about contextLimit because
            // handleTransliterate() maintains that.
            rollbackOrigin += totalDelta;
            globalLimit += totalDelta;

            // Delete the rollback copy
            text.replace(rollbackOrigin, rollbackOrigin + runLength, "");

            // Move start past committed text
            index.start = passStart;
        }

        else {
            // Delegate to subclass for actual transliteration.
            if (DEBUG) {
                log.setLength(0);
                log.append("filteredTransliterate{"+getID()+"}: ");
                UtilityExtensions.formatInput(log, text, index);
            }

            // Remember the run limit so the length change can be measured.
            int limit = index.limit;
            handleTransliterate(text, index, isIncrementalRun);
            delta = index.limit - limit; // change in length

            if (DEBUG) {
                log.append(" => ");
                UtilityExtensions.formatInput(log, text, index);
            }

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this, and throw
            // an exception.  (Just pinning start to limit is a bad idea,
            // because what's probably happening is that the subclass
            // isn't transliterating all the way to the end, and it should
            // in non-incremental mode.)
            if (!isIncrementalRun && index.start != index.limit) {
                throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + getID());
            }

            // Adjust overall limit for insertions/deletions.  Don't need
            // to worry about contextLimit because handleTransliterate()
            // maintains that.
            globalLimit += delta;

            if (DEBUG) {
                System.out.println(Utility.escape(log.toString()));
            }
        }

        // Without a filter there is only one run, and an incremental run
        // always extends to globalLimit, so in either case we are done.
        if (filter == null || isIncrementalRun) {
            break;
        }

        // If we did completely transliterate this
        // run, then repeat with the next unfiltered run.
    }

    // Start is valid where it is.  Limit needs to be put back where
    // it was, modulo adjustments for deletions/insertions.
    index.limit = globalLimit;

    if (DEBUG) {
        System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
                           UtilityExtensions.formatInput(text, index));
    }
}

/**
 * Transliterate a substring of text, as specified by index, taking filters
 * into account.
This method is for subclasses that need to delegate to 1107 * another transliterator, such as CompoundTransliterator. 1108 * @param text the text to be transliterated 1109 * @param index the position indices 1110 * @param incremental if TRUE, then assume more characters may be inserted 1111 * at index.limit, and postpone processing to accomodate future incoming 1112 * characters 1113 * @stable ICU 2.0 1114 */ 1115 public void filteredTransliterate(Replaceable text, 1116 Position index, 1117 boolean incremental) { 1118 filteredTransliterate(text, index, incremental, false); 1119 } 1120 1121 /** 1122 * Returns the length of the longest context required by this transliterator. 1123 * This is <em>preceding</em> context. The default value is zero, but 1124 * subclasses can change this by calling <code>setMaximumContextLength()</code>. 1125 * For example, if a transliterator translates "ddd" (where 1126 * d is any digit) to "555" when preceded by "(ddd)", then the preceding 1127 * context length is 5, the length of "(ddd)". 1128 * 1129 * @return The maximum number of preceding context characters this 1130 * transliterator needs to examine 1131 * @stable ICU 2.0 1132 */ 1133 public final int getMaximumContextLength() { 1134 return maximumContextLength; 1135 } 1136 1137 /** 1138 * Method for subclasses to use to set the maximum context length. 1139 * @see #getMaximumContextLength 1140 * @stable ICU 2.0 1141 */ 1142 protected void setMaximumContextLength(int a) { 1143 if (a < 0) { 1144 throw new IllegalArgumentException("Invalid context length " + a); 1145 } 1146 maximumContextLength = a; 1147 } 1148 1149 /** 1150 * Returns a programmatic identifier for this transliterator. 1151 * If this identifier is passed to <code>getInstance()</code>, it 1152 * will return this object, if it has been registered. 
1153 * @see #registerClass 1154 * @see #getAvailableIDs 1155 * @stable ICU 2.0 1156 */ 1157 public final String getID() { 1158 return ID; 1159 } 1160 1161 /** 1162 * Set the programmatic identifier for this transliterator. Only 1163 * for use by subclasses. 1164 * @stable ICU 2.0 1165 */ 1166 protected final void setID(String id) { 1167 ID = id; 1168 } 1169 1170 /** 1171 * Returns a name for this transliterator that is appropriate for 1172 * display to the user in the default <code>DISPLAY</code> locale. See {@link 1173 * #getDisplayName(String,Locale)} for details. 1174 * @see com.ibm.icu.util.ULocale.Category#DISPLAY 1175 * @stable ICU 2.0 1176 */ 1177 public final static String getDisplayName(String ID) { 1178 return getDisplayName(ID, ULocale.getDefault(Category.DISPLAY)); 1179 } 1180 1181 /** 1182 * Returns a name for this transliterator that is appropriate for 1183 * display to the user in the given locale. This name is taken 1184 * from the locale resource data in the standard manner of the 1185 * <code>java.text</code> package. 1186 * 1187 * <p>If no localized names exist in the system resource bundles, 1188 * a name is synthesized using a localized 1189 * <code>MessageFormat</code> pattern from the resource data. The 1190 * arguments to this pattern are an integer followed by one or two 1191 * strings. The integer is the number of strings, either 1 or 2. 1192 * The strings are formed by splitting the ID for this 1193 * transliterator at the first '-'. If there is no '-', then the 1194 * entire ID forms the only string. 1195 * @param inLocale the Locale in which the display name should be 1196 * localized. 1197 * @see java.text.MessageFormat 1198 * @stable ICU 2.0 1199 */ 1200 public static String getDisplayName(String id, Locale inLocale) { 1201 return getDisplayName(id, ULocale.forLocale(inLocale)); 1202 } 1203 1204 /** 1205 * Returns a name for this transliterator that is appropriate for 1206 * display to the user in the given locale. 
This name is taken 1207 * from the locale resource data in the standard manner of the 1208 * <code>java.text</code> package. 1209 * 1210 * <p>If no localized names exist in the system resource bundles, 1211 * a name is synthesized using a localized 1212 * <code>MessageFormat</code> pattern from the resource data. The 1213 * arguments to this pattern are an integer followed by one or two 1214 * strings. The integer is the number of strings, either 1 or 2. 1215 * The strings are formed by splitting the ID for this 1216 * transliterator at the first '-'. If there is no '-', then the 1217 * entire ID forms the only string. 1218 * @param inLocale the ULocale in which the display name should be 1219 * localized. 1220 * @see java.text.MessageFormat 1221 * @stable ICU 3.2 1222 */ 1223 public static String getDisplayName(String id, ULocale inLocale) { 1224 1225 // Resource bundle containing display name keys and the 1226 // RB_RULE_BASED_IDS array. 1227 // 1228 //If we ever integrate this with the Sun JDK, the resource bundle 1229 // root will change to sun.text.resources.LocaleElements 1230 1231 ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle. 1232 getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, inLocale); 1233 1234 // Normalize the ID 1235 String stv[] = TransliteratorIDParser.IDtoSTV(id); 1236 if (stv == null) { 1237 // No target; malformed id 1238 return ""; 1239 } 1240 String ID = stv[0] + '-' + stv[1]; 1241 if (stv[2] != null && stv[2].length() > 0) { 1242 ID = ID + '/' + stv[2]; 1243 } 1244 1245 // Use the registered display name, if any 1246 String n = displayNameCache.get(new CaseInsensitiveString(ID)); 1247 if (n != null) { 1248 return n; 1249 } 1250 1251 // Use display name for the entire transliterator, if it 1252 // exists. 
1253 try { 1254 return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID); 1255 } catch (MissingResourceException e) {} 1256 1257 try { 1258 // Construct the formatter first; if getString() fails 1259 // we'll exit the try block 1260 MessageFormat format = new MessageFormat( 1261 bundle.getString(RB_DISPLAY_NAME_PATTERN)); 1262 // Construct the argument array 1263 Object[] args = new Object[] { Integer.valueOf(2), stv[0], stv[1] }; 1264 1265 // Use display names for the scripts, if they exist 1266 for (int j=1; j<=2; ++j) { 1267 try { 1268 args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX + 1269 (String) args[j]); 1270 } catch (MissingResourceException e) {} 1271 } 1272 1273 // Format it using the pattern in the resource 1274 return (stv[2].length() > 0) ? 1275 (format.format(args) + '/' + stv[2]) : 1276 format.format(args); 1277 } catch (MissingResourceException e2) {} 1278 1279 // We should not reach this point unless there is something 1280 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has 1281 // been deleted from the root RB_LOCALE_ELEMENTS resource. 1282 throw new RuntimeException(); 1283 } 1284 1285 /** 1286 * Returns the filter used by this transliterator, or <tt>null</tt> 1287 * if this transliterator uses no filter. 1288 * @stable ICU 2.0 1289 */ 1290 public final UnicodeFilter getFilter() { 1291 return filter; 1292 } 1293 1294 /** 1295 * Changes the filter used by this transliterator. If the filter 1296 * is set to <tt>null</tt> then no filtering will occur. 1297 * 1298 * <p>Callers must take care if a transliterator is in use by 1299 * multiple threads. The filter should not be changed by one 1300 * thread while another thread may be transliterating. 
1301 * @stable ICU 2.0 1302 */ 1303 public void setFilter(UnicodeFilter filter) { 1304 if (filter == null) { 1305 this.filter = null; 1306 } else { 1307 try { 1308 // fast high-runner case 1309 this.filter = new UnicodeSet((UnicodeSet)filter).freeze(); 1310 } catch (Exception e) { 1311 this.filter = new UnicodeSet(); 1312 filter.addMatchSetTo(this.filter); 1313 this.filter.freeze(); 1314 } 1315 } 1316 } 1317 1318 /** 1319 * Returns a <code>Transliterator</code> object given its ID. 1320 * The ID must be either a system transliterator ID or a ID registered 1321 * using <code>registerClass()</code>. 1322 * 1323 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> 1324 * @return A <code>Transliterator</code> object with the given ID 1325 * @exception IllegalArgumentException if the given ID is invalid. 1326 * @stable ICU 2.0 1327 */ 1328 public static final Transliterator getInstance(String ID) { 1329 return getInstance(ID, FORWARD); 1330 } 1331 1332 /** 1333 * Returns a <code>Transliterator</code> object given its ID. 1334 * The ID must be either a system transliterator ID or a ID registered 1335 * using <code>registerClass()</code>. 1336 * 1337 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> 1338 * @param dir either FORWARD or REVERSE. If REVERSE then the 1339 * inverse of the given ID is instantiated. 1340 * @return A <code>Transliterator</code> object with the given ID 1341 * @exception IllegalArgumentException if the given ID is invalid. 
1342 * @see #registerClass 1343 * @see #getAvailableIDs 1344 * @see #getID 1345 * @stable ICU 2.0 1346 */ 1347 public static Transliterator getInstance(String ID, 1348 int dir) { 1349 StringBuffer canonID = new StringBuffer(); 1350 List<SingleID> list = new ArrayList<SingleID>(); 1351 UnicodeSet[] globalFilter = new UnicodeSet[1]; 1352 if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) { 1353 throw new IllegalArgumentException("Invalid ID " + ID); 1354 } 1355 1356 List<Transliterator> translits = TransliteratorIDParser.instantiateList(list); 1357 1358 // assert(list.size() > 0); 1359 Transliterator t = null; 1360 if (list.size() > 1 || canonID.indexOf(";") >= 0) { 1361 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only 1362 // has one child transliterator. This is so that toRules() will return the right thing 1363 // (without any inactive ID), but our main ID still comes out correct. That is, if we 1364 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;" 1365 // even though the ID is "(Lower);Latin-Greek;". 1366 t = new CompoundTransliterator(translits); 1367 } 1368 else { 1369 t = translits.get(0); 1370 } 1371 1372 t.setID(canonID.toString()); 1373 if (globalFilter[0] != null) { 1374 t.setFilter(globalFilter[0]); 1375 } 1376 return t; 1377 } 1378 1379 /** 1380 * Create a transliterator from a basic ID. This is an ID 1381 * containing only the forward direction source, target, and 1382 * variant. 1383 * @param id a basic ID of the form S-T or S-T/V. 1384 * @param canonID canonical ID to apply to the result, or 1385 * null to leave the ID unchanged 1386 * @return a newly created Transliterator or null if the ID is 1387 * invalid. 
1388 */ 1389 static Transliterator getBasicInstance(String id, String canonID) { 1390 StringBuffer s = new StringBuffer(); 1391 Transliterator t = registry.get(id, s); 1392 if (s.length() != 0) { 1393 // assert(t==0); 1394 // Instantiate an alias 1395 t = getInstance(s.toString(), FORWARD); 1396 } 1397 if (t != null && canonID != null) { 1398 t.setID(canonID); 1399 } 1400 return t; 1401 } 1402 1403 /** 1404 * Returns a <code>Transliterator</code> object constructed from 1405 * the given rule string. This will be a RuleBasedTransliterator, 1406 * if the rule string contains only rules, or a 1407 * CompoundTransliterator, if it contains ID blocks, or a 1408 * NullTransliterator, if it contains ID blocks which parse as 1409 * empty for the given direction. 1410 * @stable ICU 2.0 1411 */ 1412 public static final Transliterator createFromRules(String ID, String rules, int dir) { 1413 Transliterator t = null; 1414 1415 TransliteratorParser parser = new TransliteratorParser(); 1416 parser.parse(rules, dir); 1417 1418 // NOTE: The logic here matches that in TransliteratorRegistry. 1419 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) { 1420 t = new NullTransliterator(); 1421 } 1422 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) { 1423 t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter); 1424 } 1425 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) { 1426 // idBlock, no data -- this is an alias. The ID has 1427 // been munged from reverse into forward mode, if 1428 // necessary, so instantiate the ID in the forward 1429 // direction. 
1430 if (parser.compoundFilter != null) { 1431 t = getInstance(parser.compoundFilter.toPattern(false) + ";" 1432 + parser.idBlockVector.get(0)); 1433 } else { 1434 t = getInstance(parser.idBlockVector.get(0)); 1435 } 1436 1437 if (t != null) { 1438 t.setID(ID); 1439 } 1440 } 1441 else { 1442 List<Transliterator> transliterators = new ArrayList<Transliterator>(); 1443 int passNumber = 1; 1444 1445 int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size()); 1446 for (int i = 0; i < limit; i++) { 1447 if (i < parser.idBlockVector.size()) { 1448 String idBlock = parser.idBlockVector.get(i); 1449 if (idBlock.length() > 0) { 1450 Transliterator temp = getInstance(idBlock); 1451 if (!(temp instanceof NullTransliterator)) 1452 transliterators.add(getInstance(idBlock)); 1453 } 1454 } 1455 if (i < parser.dataVector.size()) { 1456 Data data = parser.dataVector.get(i); 1457 transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null)); 1458 } 1459 } 1460 1461 t = new CompoundTransliterator(transliterators, passNumber - 1); 1462 t.setID(ID); 1463 if (parser.compoundFilter != null) { 1464 t.setFilter(parser.compoundFilter); 1465 } 1466 } 1467 1468 return t; 1469 } 1470 1471 /** 1472 * Returns a rule string for this transliterator. 1473 * @param escapeUnprintable if true, then unprintable characters 1474 * will be converted to escape form backslash-'u' or 1475 * backslash-'U'. 1476 * @stable ICU 2.0 1477 */ 1478 public String toRules(boolean escapeUnprintable) { 1479 return baseToRules(escapeUnprintable); 1480 } 1481 1482 /** 1483 * Returns a rule string for this transliterator. This is 1484 * a non-overrideable base class implementation that subclasses 1485 * may call. It simply munges the ID into the correct format, 1486 * that is, "foo" => "::foo". 1487 * @param escapeUnprintable if true, then unprintable characters 1488 * will be converted to escape form backslash-'u' or 1489 * backslash-'U'. 
1490 * @stable ICU 2.0 1491 */ 1492 protected final String baseToRules(boolean escapeUnprintable) { 1493 // The base class implementation of toRules munges the ID into 1494 // the correct format. That is: foo => ::foo 1495 // KEEP in sync with rbt_pars 1496 if (escapeUnprintable) { 1497 StringBuffer rulesSource = new StringBuffer(); 1498 String id = getID(); 1499 for (int i=0; i<id.length();) { 1500 int c = UTF16.charAt(id, i); 1501 if (!Utility.escapeUnprintable(rulesSource, c)) { 1502 UTF16.append(rulesSource, c); 1503 } 1504 i += UTF16.getCharCount(c); 1505 } 1506 rulesSource.insert(0, "::"); 1507 rulesSource.append(ID_DELIM); 1508 return rulesSource.toString(); 1509 } 1510 return "::" + getID() + ID_DELIM; 1511 } 1512 1513 /** 1514 * Return the elements that make up this transliterator. For 1515 * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek" 1516 * were created, the return value of this method would be an array 1517 * of the three transliterator objects that make up that 1518 * transliterator: [NFD, Jamo-Latin, Latin-Greek]. 1519 * 1520 * <p>If this transliterator is not composed of other 1521 * transliterators, then this method will return an array of 1522 * length one containing a reference to this transliterator. 1523 * @return an array of one or more transliterators that make up 1524 * this transliterator 1525 * @stable ICU 3.0 1526 */ 1527 public Transliterator[] getElements() { 1528 Transliterator result[]; 1529 if (this instanceof CompoundTransliterator) { 1530 CompoundTransliterator cpd = (CompoundTransliterator) this; 1531 result = new Transliterator[cpd.getCount()]; 1532 for (int i=0; i<result.length; ++i) { 1533 result[i] = cpd.getTransliterator(i); 1534 } 1535 } else { 1536 result = new Transliterator[] { this }; 1537 } 1538 return result; 1539 } 1540 1541 /** 1542 * Returns the set of all characters that may be modified in the 1543 * input text by this Transliterator. 
This incorporates this 1544 * object's current filter; if the filter is changed, the return 1545 * value of this function will change. The default implementation 1546 * returns an empty set. Some subclasses may override {@link 1547 * #handleGetSourceSet} to return a more precise result. The 1548 * return result is approximate in any case and is intended for 1549 * use by tests, tools, or utilities. 1550 * @see #getTargetSet 1551 * @see #handleGetSourceSet 1552 * @stable ICU 2.2 1553 */ 1554 public final UnicodeSet getSourceSet() { 1555 UnicodeSet result = new UnicodeSet(); 1556 addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), result, new UnicodeSet()); 1557 return result; 1558 } 1559 1560 /** 1561 * Framework method that returns the set of all characters that 1562 * may be modified in the input text by this Transliterator, 1563 * ignoring the effect of this object's filter. The base class 1564 * implementation returns the empty set. Subclasses that wish to 1565 * implement this should override this method. 1566 * @return the set of characters that this transliterator may 1567 * modify. The set may be modified, so subclasses should return a 1568 * newly-created object. 1569 * @see #getSourceSet 1570 * @see #getTargetSet 1571 * @stable ICU 2.2 1572 */ 1573 protected UnicodeSet handleGetSourceSet() { 1574 return new UnicodeSet(); 1575 } 1576 1577 /** 1578 * Returns the set of all characters that may be generated as 1579 * replacement text by this transliterator. The default 1580 * implementation returns the empty set. Some subclasses may 1581 * override this method to return a more precise result. The 1582 * return result is approximate in any case and is intended for 1583 * use by tests, tools, or utilities requiring such 1584 * meta-information. 1585 * <p>Warning. You might expect an empty filter to always produce an empty target. 
1586 * However, consider the following: 1587 * <pre> 1588 * [Pp]{}[\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB] > \'; 1589 * </pre> 1590 * With a filter of [], you still get some elements in the target set, because this rule will still match. It could 1591 * be recast to the following if it were important. 1592 * <pre> 1593 * [Pp]{([\u03A3\u03C2\u03C3\u03F7\u03F8\u03FA\u03FB])} > \' | $1; 1594 * </pre> 1595 * @see #getTargetSet 1596 * @stable ICU 2.2 1597 */ 1598 public UnicodeSet getTargetSet() { 1599 UnicodeSet result = new UnicodeSet(); 1600 addSourceTargetSet(getFilterAsUnicodeSet(UnicodeSet.ALL_CODE_POINTS), new UnicodeSet(), result); 1601 return result; 1602 } 1603 1604 /** 1605 * Returns the set of all characters that may be generated as 1606 * replacement text by this transliterator, filtered by BOTH the input filter, and the current getFilter(). 1607 * <p>SHOULD BE OVERRIDEN BY SUBCLASSES. 1608 * It is probably an error for any transliterator to NOT override this, but we can't force them to 1609 * for backwards compatibility. 1610 * <p>Other methods vector through this. 1611 * <p>When gathering the information on source and target, the compound transliterator makes things complicated. 1612 * For example, suppose we have: 1613 * <pre> 1614 * Global FILTER = [ax] 1615 * a > b; 1616 * :: NULL; 1617 * b > c; 1618 * x > d; 1619 * </pre> 1620 * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets 1621 * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to 1622 * the global filter, intersect that transliterator's filter. Based on that we get the target. 1623 * The next transliterator gets as a global filter (global + last target). And so on. 
1624 * <p>There is another complication: 1625 * <pre> 1626 * Global FILTER = [ax] 1627 * a >|b; 1628 * b >c; 1629 * </pre> 1630 * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will 1631 * change the global filter as we go. 1632 * @param targetSet TODO 1633 * @see #getTargetSet 1634 * @internal 1635 * @deprecated This API is ICU internal only. 1636 */ 1637 @Deprecated 1638 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { 1639 UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); 1640 UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter); 1641 // use old method, if we don't have anything better 1642 sourceSet.addAll(temp); 1643 // clumsy guess with target 1644 for (String s : temp) { 1645 String t = transliterate(s); 1646 if (!s.equals(t)) { 1647 targetSet.addAll(t); 1648 } 1649 } 1650 } 1651 1652 /** 1653 * Returns the intersectionof this instance's filter intersected with an external filter. 1654 * The externalFilter must be frozen (it is frozen if not). 1655 * The result may be frozen, so don't attempt to modify. 1656 * @internal 1657 * @deprecated This API is ICU internal only. 1658 */ 1659 @Deprecated 1660 // TODO change to getMergedFilter 1661 public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) { 1662 if (filter == null) { 1663 return externalFilter; 1664 } 1665 UnicodeSet filterSet = new UnicodeSet(externalFilter); 1666 // Most, but not all filters will be UnicodeSets. Optimize for 1667 // the high-runner case. 1668 UnicodeSet temp; 1669 try { 1670 temp = filter; 1671 } catch (ClassCastException e) { 1672 filter.addMatchSetTo(temp = new UnicodeSet()); 1673 } 1674 return filterSet.retainAll(temp).freeze(); 1675 } 1676 1677 /** 1678 * Returns this transliterator's inverse. See the class 1679 * documentation for details. 
This implementation simply inverts 1680 * the two entities in the ID and attempts to retrieve the 1681 * resulting transliterator. That is, if <code>getID()</code> 1682 * returns "A-B", then this method will return the result of 1683 * <code>getInstance("B-A")</code>, or <code>null</code> if that 1684 * call fails. 1685 * 1686 * <p>Subclasses with knowledge of their inverse may wish to 1687 * override this method. 1688 * 1689 * @return a transliterator that is an inverse, not necessarily 1690 * exact, of this transliterator, or <code>null</code> if no such 1691 * transliterator is registered. 1692 * @see #registerClass 1693 * @stable ICU 2.0 1694 */ 1695 public final Transliterator getInverse() { 1696 return getInstance(ID, REVERSE); 1697 } 1698 1699 /** 1700 * Registers a subclass of <code>Transliterator</code> with the 1701 * system. This subclass must have a public constructor taking no 1702 * arguments. When that constructor is called, the resulting 1703 * object must return the <code>ID</code> passed to this method if 1704 * its <code>getID()</code> method is called. 1705 * 1706 * @param ID the result of <code>getID()</code> for this 1707 * transliterator 1708 * @param transClass a subclass of <code>Transliterator</code> 1709 * @see #unregister 1710 * @stable ICU 2.0 1711 */ 1712 public static void registerClass(String ID, Class<? extends Transliterator> transClass, String displayName) { 1713 registry.put(ID, transClass, true); 1714 if (displayName != null) { 1715 displayNameCache.put(new CaseInsensitiveString(ID), displayName); 1716 } 1717 } 1718 1719 /** 1720 * Register a factory object with the given ID. The factory 1721 * method should return a new instance of the given transliterator. 1722 * 1723 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1724 * be called at application startup, prior to any calls to 1725 * Transliterator.getInstance to avoid undefined behavior. 
1726 * 1727 * @param ID the ID of this transliterator 1728 * @param factory the factory object 1729 * @stable ICU 2.0 1730 */ 1731 public static void registerFactory(String ID, Factory factory) { 1732 registry.put(ID, factory, true); 1733 } 1734 1735 /** 1736 * Register a Transliterator object with the given ID. 1737 * 1738 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1739 * be called at application startup, prior to any calls to 1740 * Transliterator.getInstance to avoid undefined behavior. 1741 * 1742 * @param trans the Transliterator object 1743 * @stable ICU 2.2 1744 */ 1745 public static void registerInstance(Transliterator trans) { 1746 registry.put(trans.getID(), trans, true); 1747 } 1748 1749 /** 1750 * Register a Transliterator object. 1751 * 1752 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1753 * be called at application startup, prior to any calls to 1754 * Transliterator.getInstance to avoid undefined behavior. 1755 * 1756 * @param trans the Transliterator object 1757 */ 1758 static void registerInstance(Transliterator trans, boolean visible) { 1759 registry.put(trans.getID(), trans, visible); 1760 } 1761 1762 /** 1763 * Register an ID as an alias of another ID. Instantiating 1764 * alias ID produces the same result as instantiating the original ID. 1765 * This is generally used to create short aliases of compound IDs. 1766 * 1767 * <p>Because ICU may choose to cache Transliterator objects internally, this must 1768 * be called at application startup, prior to any calls to 1769 * Transliterator.getInstance to avoid undefined behavior. 1770 * 1771 * @param aliasID The new ID being registered. 1772 * @param realID The existing ID that the new ID should be an alias of. 
1773 * @stable ICU 3.6 1774 */ 1775 public static void registerAlias(String aliasID, String realID) { 1776 registry.put(aliasID, realID, true); 1777 } 1778 1779 /** 1780 * Register two targets as being inverses of one another. For 1781 * example, calling registerSpecialInverse("NFC", "NFD", true) causes 1782 * Transliterator to form the following inverse relationships: 1783 * 1784 * <pre>NFC => NFD 1785 * Any-NFC => Any-NFD 1786 * NFD => NFC 1787 * Any-NFD => Any-NFC</pre> 1788 * 1789 * (Without the special inverse registration, the inverse of NFC 1790 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 1791 * that the presence or absence of "Any-" is preserved. 1792 * 1793 * <p>The relationship is symmetrical; registering (a, b) is 1794 * equivalent to registering (b, a). 1795 * 1796 * <p>The relevant IDs must still be registered separately as 1797 * factories or classes. 1798 * 1799 * <p>Only the targets are specified. Special inverses always 1800 * have the form Any-Target1 <=> Any-Target2. The target should 1801 * have canonical casing (the casing desired to be produced when 1802 * an inverse is formed) and should contain no whitespace or other 1803 * extraneous characters. 1804 * 1805 * @param target the target against which to register the inverse 1806 * @param inverseTarget the inverse of target, that is 1807 * Any-target.getInverse() => Any-inverseTarget 1808 * @param bidirectional if true, register the reverse relation 1809 * as well, that is, Any-inverseTarget.getInverse() => Any-target 1810 */ 1811 static void registerSpecialInverse(String target, 1812 String inverseTarget, 1813 boolean bidirectional) { 1814 TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional); 1815 } 1816 1817 /** 1818 * Unregisters a transliterator or class. This may be either 1819 * a system transliterator or a user transliterator or class. 
1820 * 1821 * @param ID the ID of the transliterator or class 1822 * @see #registerClass 1823 * @stable ICU 2.0 1824 */ 1825 public static void unregister(String ID) { 1826 displayNameCache.remove(new CaseInsensitiveString(ID)); 1827 registry.remove(ID); 1828 } 1829 1830 /** 1831 * Returns an enumeration over the programmatic names of registered 1832 * <code>Transliterator</code> objects. This includes both system 1833 * transliterators and user transliterators registered using 1834 * <code>registerClass()</code>. The enumerated names may be 1835 * passed to <code>getInstance()</code>. 1836 * 1837 * @return An <code>Enumeration</code> over <code>String</code> objects 1838 * @see #getInstance 1839 * @see #registerClass 1840 * @stable ICU 2.0 1841 */ 1842 public static final Enumeration<String> getAvailableIDs() { 1843 return registry.getAvailableIDs(); 1844 } 1845 1846 /** 1847 * Returns an enumeration over the source names of registered 1848 * transliterators. Source names may be passed to 1849 * getAvailableTargets() to obtain available targets for each 1850 * source. 1851 * @stable ICU 2.0 1852 */ 1853 public static final Enumeration<String> getAvailableSources() { 1854 return registry.getAvailableSources(); 1855 } 1856 1857 /** 1858 * Returns an enumeration over the target names of registered 1859 * transliterators having a given source name. Target names may 1860 * be passed to getAvailableVariants() to obtain available 1861 * variants for each source and target pair. 1862 * @stable ICU 2.0 1863 */ 1864 public static final Enumeration<String> getAvailableTargets(String source) { 1865 return registry.getAvailableTargets(source); 1866 } 1867 1868 /** 1869 * Returns an enumeration over the variant names of registered 1870 * transliterators having a given source name and target name. 
1871 * @stable ICU 2.0 1872 */ 1873 public static final Enumeration<String> getAvailableVariants(String source, 1874 String target) { 1875 return registry.getAvailableVariants(source, target); 1876 } 1877 private static final String ROOT = "root", 1878 RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs"; 1879 static { 1880 registry = new TransliteratorRegistry(); 1881 1882 // The display name cache starts out empty 1883 displayNameCache = Collections.synchronizedMap(new HashMap<CaseInsensitiveString, String>()); 1884 /* The following code parses the index table located in 1885 * icu/data/translit/root.txt. The index is an n x 4 table 1886 * that follows this format: 1887 * <id>{ 1888 * file{ 1889 * resource{"<resource>"} 1890 * direction{"<direction>"} 1891 * } 1892 * } 1893 * <id>{ 1894 * internal{ 1895 * resource{"<resource>"} 1896 * direction{"<direction"} 1897 * } 1898 * } 1899 * <id>{ 1900 * alias{"<getInstanceArg"} 1901 * } 1902 * <id> is the ID of the system transliterator being defined. These 1903 * are public IDs enumerated by Transliterator.getAvailableIDs(), 1904 * unless the second field is "internal". 1905 * 1906 * <resource> is a ResourceReader resource name. Currently these refer 1907 * to file names under com/ibm/text/resources. This string is passed 1908 * directly to ResourceReader, together with <encoding>. 1909 * 1910 * <direction> is either "FORWARD" or "REVERSE". 1911 * 1912 * <getInstanceArg> is a string to be passed directly to 1913 * Transliterator.getInstance(). The returned Transliterator object 1914 * then has its ID changed to <id> and is returned. 1915 * 1916 * The extra blank field on "alias" lines is to make the array square. 
1917 */ 1918 UResourceBundle bundle, transIDs, colBund; 1919 bundle = UResourceBundle.getBundleInstance(ICUData.ICU_TRANSLIT_BASE_NAME, ROOT); 1920 transIDs = bundle.get(RB_RULE_BASED_IDS); 1921 1922 int row, maxRows; 1923 maxRows = transIDs.getSize(); 1924 for (row = 0; row < maxRows; row++) { 1925 colBund = transIDs.get(row); 1926 String ID = colBund.getKey(); 1927 if (ID.indexOf("-t-") >= 0) { 1928 continue; 1929 } 1930 UResourceBundle res = colBund.get(0); 1931 String type = res.getKey(); 1932 if (type.equals("file") || type.equals("internal")) { 1933 // Rest of line is <resource>:<encoding>:<direction> 1934 // pos colon c2 1935 String resString = res.getString("resource"); 1936 int dir; 1937 String direction = res.getString("direction"); 1938 switch (direction.charAt(0)) { 1939 case 'F': 1940 dir = FORWARD; 1941 break; 1942 case 'R': 1943 dir = REVERSE; 1944 break; 1945 default: 1946 throw new RuntimeException("Can't parse direction: " + direction); 1947 } 1948 registry.put(ID, 1949 resString, // resource 1950 dir, 1951 !type.equals("internal")); 1952 } else if (type.equals("alias")) { 1953 //'alias'; row[2]=createInstance argument 1954 String resString = res.getString(); 1955 registry.put(ID, resString, true); 1956 } else { 1957 // Unknown type 1958 throw new RuntimeException("Unknow type: " + type); 1959 } 1960 } 1961 1962 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false); 1963 1964 // Register non-rule-based transliterators 1965 registerClass(NullTransliterator._ID, 1966 NullTransliterator.class, null); 1967 RemoveTransliterator.register(); 1968 EscapeTransliterator.register(); 1969 UnescapeTransliterator.register(); 1970 LowercaseTransliterator.register(); 1971 UppercaseTransliterator.register(); 1972 TitlecaseTransliterator.register(); 1973 CaseFoldTransliterator.register(); 1974 UnicodeNameTransliterator.register(); 1975 NameUnicodeTransliterator.register(); 1976 NormalizationTransliterator.register(); 1977 
BreakTransliterator.register(); 1978 AnyTransliterator.register(); // do this last! 1979 } 1980 1981 /** 1982 * Register the script-based "Any" transliterators: Any-Latin, Any-Greek 1983 * @internal 1984 * @deprecated This API is ICU internal only. 1985 */ 1986 @Deprecated 1987 public static void registerAny() { 1988 AnyTransliterator.register(); 1989 } 1990 1991 /** 1992 * The factory interface for transliterators. Transliterator 1993 * subclasses can register factory objects for IDs using the 1994 * registerFactory() method of Transliterator. When invoked, the 1995 * factory object will be passed the ID being instantiated. This 1996 * makes it possible to register one factory method to more than 1997 * one ID, or for a factory method to parameterize its result 1998 * based on the variant. 1999 * @stable ICU 2.0 2000 */ 2001 public static interface Factory { 2002 /** 2003 * Return a transliterator for the given ID. 2004 * @stable ICU 2.0 2005 */ 2006 Transliterator getInstance(String ID); 2007 } 2008 2009 /** 2010 * Implements StringTransform via this method. 2011 * @param source text to be transformed (eg lowercased) 2012 * @return result 2013 * @stable ICU 3.8 2014 */ 2015 @Override 2016 public String transform(String source) { 2017 return transliterate(source); 2018 } 2019 } 2020