// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.util.List;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UtilityExtensions;

/**
 * A transliterator that is composed of two or more other
 * transliterator objects linked together.  For example, if one
 * transliterator transliterates from script A to script B, and
 * another transliterates from script B to script C, the two may be
 * combined to form a new transliterator from A to C.
 *
 * <p>Composed transliterators may not behave as expected.  For
 * example, inverses may not combine to form the identity
 * transliterator.  See the class documentation for {@link
 * Transliterator} for details.
 *
 * <p>Copyright © IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
 */
class CompoundTransliterator extends Transliterator {

    /** The component transliterators, in the order in which they are applied. */
    private Transliterator[] trans;

    /**
     * Number of anonymous rule-based transliterators among the components,
     * as supplied by the caller. Consulted by {@link #toRules(boolean)} to
     * decide whether to emit the global filter and "::Null;" separators.
     */
    private int numAnonymousRBTs = 0;

    /**
     * Constructs a new compound transliterator given an array of
     * transliterators.  The array of transliterators may be of any
     * length, including zero or one, however, useful compound
     * transliterators have at least two components.
     * @param transliterators array of <code>Transliterator</code>
     * objects
     * @param filter the filter.  Any character for which
     * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     */
    /*public CompoundTransliterator(Transliterator[] transliterators,
                                  UnicodeFilter filter) {
        super(joinIDs(transliterators), filter);
        trans = new Transliterator[transliterators.length];
        System.arraycopy(transliterators, 0, trans, 0, trans.length);
        computeMaximumContextLength();
    }*/

    /**
     * Constructs a new compound transliterator given an array of
     * transliterators.  The array of transliterators may be of any
     * length, including zero or one, however, useful compound
     * transliterators have at least two components.
     * @param transliterators array of <code>Transliterator</code>
     * objects
     */
    /*public CompoundTransliterator(Transliterator[] transliterators) {
        this(transliterators, null);
    }*/

    /**
     * Constructs a new compound transliterator.
     * @param ID compound ID
     * @param direction either Transliterator.FORWARD or Transliterator.REVERSE
     * @param filter a global filter for this compound transliterator
     * or null
     */
    /*public CompoundTransliterator(String ID, int direction,
                                  UnicodeFilter filter) {
        super(ID, filter);
        init(ID, direction, true);
    }*/

    /**
     * Constructs a new compound transliterator with no filter.
     * @param ID compound ID
     * @param direction either Transliterator.FORWARD or Transliterator.REVERSE
     */
    /*public CompoundTransliterator(String ID, int direction) {
        this(ID, direction, null);
    }*/

    /**
     * Constructs a new forward compound transliterator with no filter.
     * @param ID compound ID
     */
    /*public CompoundTransliterator(String ID) {
        this(ID, FORWARD, null);
    }*/

    /**
     * Package private constructor for Transliterator from a vector of
     * transliterators.  The caller is responsible for fixing up the
     * ID.
     * @param list the component transliterators, in forward order
     */
    CompoundTransliterator(List<Transliterator> list) {
        this(list, 0);
    }

    /**
     * Package private constructor for Transliterator from a vector of
     * transliterators plus a count of anonymous rule-based transliterators.
     * The transliterator is created with an empty ID and no filter; the
     * caller is responsible for fixing up the ID.
     * @param list the component transliterators, in forward order
     * @param numAnonymousRBTs the number of anonymous rule-based
     * transliterators contained in <code>list</code>
     */
    CompoundTransliterator(List<Transliterator> list, int numAnonymousRBTs) {
        super("", null);
        trans = null;
        init(list, FORWARD, false);
        this.numAnonymousRBTs = numAnonymousRBTs;
        // assume caller will fixup ID
    }

    /**
     * Internal method for safeClone...
     * The component array is stored as given (it is not copied).
     * @param id the ID of the new transliterator
     * @param filter2 the filter of the new transliterator, or null
     * @param trans2 the component transliterators, in application order
     * @param numAnonymousRBTs2 the number of anonymous rule-based
     * transliterators among the components
     */
    CompoundTransliterator(String id, UnicodeFilter filter2, Transliterator[] trans2, int numAnonymousRBTs2) {
        super(id, filter2);
        trans = trans2;
        numAnonymousRBTs = numAnonymousRBTs2;
        // The list-based constructors reach computeMaximumContextLength()
        // through init(). This constructor bypasses init(), so without the
        // call below an instance built here (e.g. by safeClone()) would never
        // have its maximum context length set from its components.
        if (trans != null) {
            computeMaximumContextLength();
        }
    }

    /**
     * Finish constructing a transliterator: only to be called by
     * constructors.  Before calling init(), set trans and filter to NULL.
     * @param id the id containing ';'-separated entries
     * @param direction either FORWARD or REVERSE
     * @param idSplitPoint the index into id at which the
     * splitTrans should be inserted, if there is one, or
     * -1 if there is none.
     * @param splitTrans a transliterator to be inserted
     * before the entry at offset idSplitPoint in the id string.  May be
     * NULL to insert no entry.
     * @param fixReverseID if TRUE, then reconstruct the ID of reverse
     * entries by calling getID() of component entries.  Some constructors
     * do not require this because they apply a facade ID anyway.
     */
    /*private void init(String id,
                      int direction,
                      boolean fixReverseID) {
        // assert(trans == 0);

        Vector list = new Vector();
        UnicodeSet[] compoundFilter = new UnicodeSet[1];
        StringBuffer regenID = new StringBuffer();
        if (!TransliteratorIDParser.parseCompoundID(id, direction,
                 regenID, list, compoundFilter)) {
            throw new IllegalArgumentException("Invalid ID " + id);
        }

        TransliteratorIDParser.instantiateList(list);

        init(list, direction, fixReverseID);

        if (compoundFilter[0] != null) {
            setFilter(compoundFilter[0]);
        }
    }*/


    /**
     * Finish constructing a transliterator: only to be called by
     * constructors.  Before calling init(), set trans and filter to NULL.
     * @param list a vector of transliterator objects to be adopted.  It
     * should NOT be empty.  The list should be in declared order.  That
     * is, it should be in the FORWARD order; if direction is REVERSE then
     * the list order will be reversed.
     * @param direction either FORWARD or REVERSE
     * @param fixReverseID if TRUE, then reconstruct the ID of reverse
     * entries by calling getID() of component entries.  Some constructors
     * do not require this because they apply a facade ID anyway.
     */
    private void init(List<Transliterator> list,
                      int direction,
                      boolean fixReverseID) {
        // assert(trans == 0);

        // Allocate array
        int count = list.size();
        trans = new Transliterator[count];

        // Move the transliterators from the vector into an array.
        // Reverse the order if necessary.
        for (int i = 0; i < count; ++i) {
            int j = (direction == FORWARD) ? i : count - 1 - i;
            trans[i] = list.get(j);
        }

        // If the direction is UTRANS_REVERSE then we may need to fix the
        // ID.
        if (direction == REVERSE && fixReverseID) {
            StringBuilder newID = new StringBuilder();
            for (int i = 0; i < count; ++i) {
                if (i > 0) {
                    newID.append(ID_DELIM);
                }
                newID.append(trans[i].getID());
            }
            setID(newID.toString());
        }

        computeMaximumContextLength();
    }

    /**
     * Return the IDs of the given list of transliterators, concatenated
     * with ';' delimiting them.  Equivalent to the perlish expression
     * join(';', map($_.getID(), transliterators).
     */
    /*private static String joinIDs(Transliterator[] transliterators) {
        StringBuffer id = new StringBuffer();
        for (int i=0; i<transliterators.length; ++i) {
            if (i > 0) {
                id.append(';');
            }
            id.append(transliterators[i].getID());
        }
        return id.toString();
    }*/

    /**
     * Returns the number of transliterators in this chain.
     * @return number of transliterators in this chain.
     */
    public int getCount() {
        return trans.length;
    }

    /**
     * Returns the transliterator at the given index in this chain.
     * @param index index into chain, from 0 to <code>getCount() - 1</code>
     * @return transliterator at the given index
     */
    public Transliterator getTransliterator(int index) {
        return trans[index];
    }

    /**
     * Append c to buf, unless buf is empty or buf already ends in c.
     */
    private static void _smartAppend(StringBuilder buf, char c) {
        if (buf.length() != 0 &&
            buf.charAt(buf.length() - 1) != c) {
            buf.append(c);
        }
    }

    /**
     * Override Transliterator:
     * Create a rule string that can be passed to createFromRules()
     * to recreate this transliterator.
     * @param escapeUnprintable if TRUE then convert unprintable
     * character to their hex escape representations, \\uxxxx or
     * \\Uxxxxxxxx.  Unprintable characters are those other than
     * U+000A, U+0020..U+007E.
     * @return the rule string
     */
    @Override
    public String toRules(boolean escapeUnprintable) {
        // We do NOT call toRules() on our component transliterators, in
        // general.  If we have several rule-based transliterators, this
        // yields a concatenation of the rules -- not what we want.  The
        // exceptions are components whose ID begins with "%Pass" and
        // components whose ID contains the ID delimiter; for those we do
        // call toRules() recursively (see the loop below).
        StringBuilder rulesSource = new StringBuilder();
        if (numAnonymousRBTs >= 1 && getFilter() != null) {
            // If we are a compound RBT and if we have a global
            // filter, then emit it at the top.
            rulesSource.append("::").append(getFilter().toPattern(escapeUnprintable)).append(ID_DELIM);
        }
        for (int i = 0; i < trans.length; ++i) {
            Transliterator component = trans[i];
            String rule;

            if (component.getID().startsWith("%Pass")) {
                // Anonymous RuleBasedTransliterators (inline rules and
                // ::BEGIN/::END blocks) are given IDs that begin with
                // "%Pass": use toRules() to write all the rules to the output
                // (and insert "::Null;" if we have two in a row)
                rule = component.toRules(escapeUnprintable);
                if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1].getID().startsWith("%Pass")) {
                    rule = "::Null;" + rule;
                }
            } else if (component.getID().indexOf(ID_DELIM) >= 0) {
                // we also use toRules() on CompoundTransliterators (which we
                // check for by looking for the ID delimiter in the ID)-- this
                // gets the list of their child transliterators output in the
                // right format
                rule = component.toRules(escapeUnprintable);
            } else {
                // for everything else, use baseToRules()
                rule = component.baseToRules(escapeUnprintable);
            }
            _smartAppend(rulesSource, '\n');
            rulesSource.append(rule);
            _smartAppend(rulesSource, ID_DELIM);
        }
        return rulesSource.toString();
    }

    /**
     * Adds the source and target sets of every component, in chain order,
     * to <code>sourceSet</code> and <code>targetSet</code>.  The filter
     * passed to each component is a private copy that is widened with the
     * targets produced by the components before it.
     * @param filter the incoming filter; it is copied, not modified
     * @param sourceSet receives the source characters of all components
     * @param targetSet receives the target characters of all components
     * @internal
     */
    @Override
    public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet myFilter = new UnicodeSet(getFilterAsUnicodeSet(filter));
        UnicodeSet tempTargetSet = new UnicodeSet();
        for (int i = 0; i < trans.length; ++i) {
            // each time we produce targets, those can be used by subsequent items, despite the filter.
            // so we get just those items, and add them to the filter each time.
            tempTargetSet.clear();
            trans[i].addSourceTargetSet(myFilter, sourceSet, tempTargetSet);
            targetSet.addAll(tempTargetSet);
            myFilter.addAll(tempTargetSet);
        }
    }

//    /**
//     * Returns the set of all characters that may be generated as
//     * replacement text by this transliterator.
//     */
//    public UnicodeSet getTargetSet() {
//        UnicodeSet set = new UnicodeSet();
//        for (int i=0; i<trans.length; ++i) {
//            // This is a heuristic, and not 100% reliable.
//            set.addAll(trans[i].getTargetSet());
//        }
//        return set;
//    }

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     */
    @Override
    protected void handleTransliterate(Replaceable text,
                                       Position index, boolean incremental) {
        /* Call each transliterator with the same start value and
         * initial cursor index, but with the limit index as modified
         * by preceding transliterators.  The cursor index must be
         * reset for each transliterator to give each a chance to
         * transliterate the text.  The initial cursor index is known
         * to still point to the same place after each transliterator
         * is called because each transliterator will not change the
         * text between start and the initial value of cursor.
         *
         * IMPORTANT: After the first transliterator, each subsequent
         * transliterator only gets to transliterate text committed by
         * preceding transliterators; that is, the cursor (output
         * value) of transliterator i becomes the limit (input value)
         * of transliterator i+1.  Finally, the overall limit is fixed
         * up before we return.
         *
         * Assumptions we make here:
         * (1) contextStart <= start <= limit <= contextLimit <= text.length()
         * (2) start <= start' <= limit'  ;cursor doesn't move back
         * (3) start <= limit'            ;text before cursor unchanged
         * - start' is the value of start after calling handleKT
         * - limit' is the value of limit after calling handleKT
         */

        /*
         * Example: 3 transliterators.  This example illustrates the
         * mechanics we need to implement.  C, S, and L are the contextStart,
         * start, and limit.  gl is the globalLimit.  contextLimit is
         * equal to limit throughout.
         *
         * 1. h-u, changes hex to Unicode
         *
         *    4  7  a  d  0      4  7  a
         *    abc/u0061/u    =>  abca/u
         *    C  S       L       C   S L   gl=f->a
         *
         * 2. upup, changes "x" to "XX"
         *
         *    4  7  a       4  7  a
         *    abca/u    =>  abcAA/u
         *    C  SL         C    S
         *                       L    gl=a->b
         * 3. u-h, changes Unicode to hex
         *
         *    4  7  a        4  7  a  d  0  3
         *    abcAA/u    =>  abc/u0041/u0041/u
         *    C  S L         C              S
         *                                  L   gl=b->15
         * 4. return
         *
         *    4  7  a  d  0  3
         *    abc/u0041/u0041/u
         *    C S L
         */

        if (trans.length < 1) {
            index.start = index.limit;
            return; // Short circuit for empty compound transliterators
        }

        // compoundLimit is the limit value for the entire compound
        // operation.  We overwrite index.limit with the previous
        // index.start.  After each transliteration, we update
        // compoundLimit for insertions or deletions that have happened.
        int compoundLimit = index.limit;

        // compoundStart is the start for the entire compound
        // operation.
        int compoundStart = index.start;

        int delta = 0; // delta in length

        StringBuffer log = null;
        ///CLOVER:OFF
        if (DEBUG) {
            log = new StringBuffer("CompoundTransliterator{" + getID() +
                                   (incremental ? "}i: IN=" : "}: IN="));
            UtilityExtensions.formatInput(log, text, index);
            System.out.println(Utility.escape(log.toString()));
        }
        ///CLOVER:ON

        // Give each transliterator a crack at the run of characters.
        // See comments at the top of the method for more detail.
        for (int i = 0; i < trans.length; ++i) {
            index.start = compoundStart; // Reset start
            int limit = index.limit;

            if (index.start == index.limit) {
                // Short circuit for empty range
                ///CLOVER:OFF
                if (DEBUG) {
                    System.out.println("CompoundTransliterator[" + i +
                                       ".." + (trans.length-1) +
                                       (incremental ? "]i: " : "]: ") +
                                       UtilityExtensions.formatInput(text, index) +
                                       " (NOTHING TO DO)");
                }
                ///CLOVER:ON
                break;
            }

            ///CLOVER:OFF
            if (DEBUG) {
                log.setLength(0);
                log.append("CompoundTransliterator[" + i + "=" +
                           trans[i].getID() +
                           (incremental ? "]i: " : "]: "));
                UtilityExtensions.formatInput(log, text, index);
            }
            ///CLOVER:ON

            trans[i].filteredTransliterate(text, index, incremental);

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this, and throw
            // an exception.  (Just pinning start to limit is a bad idea,
            // because what's probably happening is that the subclass
            // isn't transliterating all the way to the end, and it should
            // in non-incremental mode.)
            if (!incremental && index.start != index.limit) {
                throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + trans[i].getID());
            }

            ///CLOVER:OFF
            if (DEBUG) {
                log.append(" => ");
                UtilityExtensions.formatInput(log, text, index);
                System.out.println(Utility.escape(log.toString()));
            }
            ///CLOVER:ON

            // Cumulative delta for insertions/deletions
            delta += index.limit - limit;

            if (incremental) {
                // In the incremental case, only allow subsequent
                // transliterators to modify what has already been
                // completely processed by prior transliterators.  In the
                // non-incremental case, allow each transliterator to
                // process the entire text.
                index.limit = index.start;
            }
        }

        compoundLimit += delta;

        // Start is good where it is -- where the last transliterator left
        // it.  Limit needs to be put back where it was, modulo
        // adjustments for deletions/insertions.
        index.limit = compoundLimit;

        ///CLOVER:OFF
        if (DEBUG) {
            log.setLength(0);
            log.append("CompoundTransliterator{" + getID() +
                       (incremental ? "}i: OUT=" : "}: OUT="));
            UtilityExtensions.formatInput(log, text, index);
            System.out.println(Utility.escape(log.toString()));
        }
        ///CLOVER:ON
    }

    /**
     * Compute and set the length of the longest context required by this transliterator.
     * This is <em>preceding</em> context.  The result is the largest value
     * reported by any component, or 0 when there are no components.
     */
    private void computeMaximumContextLength() {
        int max = 0;
        for (Transliterator component : trans) {
            max = Math.max(max, component.getMaximumContextLength());
        }
        setMaximumContextLength(max);
    }

    /**
     * Temporary hack for registry problem. Needs to be replaced by better architecture.
     * Returns a new compound transliterator with the same ID, component
     * array and anonymous-RBT count as this one.  A filter that is a
     * {@link UnicodeSet} is copied; any other filter is passed through as is.
     * @return the new transliterator
     */
    public Transliterator safeClone() {
        UnicodeFilter filter = getFilter();
        // instanceof is false for null, so no separate null check is needed.
        if (filter instanceof UnicodeSet) {
            filter = new UnicodeSet((UnicodeSet) filter);
        }
        return new CompoundTransliterator(getID(), filter, trans, numAnonymousRBTs);
    }
}