Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2010, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import java.util.List;
     12 
     13 import com.ibm.icu.impl.Utility;
     14 import com.ibm.icu.impl.UtilityExtensions;
     15 
     16 /**
     17  * A transliterator that is composed of two or more other
     18  * transliterator objects linked together.  For example, if one
     19  * transliterator transliterates from script A to script B, and
     20  * another transliterates from script B to script C, the two may be
     21  * combined to form a new transliterator from A to C.
     22  *
     23  * <p>Composed transliterators may not behave as expected.  For
     24  * example, inverses may not combine to form the identity
     25  * transliterator.  See the class documentation for {@link
     26  * Transliterator} for details.
     27  *
     28  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
     29  *
     30  * @author Alan Liu
     31  */
     32 class CompoundTransliterator extends Transliterator {
     33 
     34     private Transliterator[] trans;
     35 
     36     private int numAnonymousRBTs = 0;
     37 
     38     /**
     39      * Constructs a new compound transliterator given an array of
     40      * transliterators.  The array of transliterators may be of any
     41      * length, including zero or one, however, useful compound
     42      * transliterators have at least two components.
     43      * @param transliterators array of <code>Transliterator</code>
     44      * objects
     45      * @param filter the filter.  Any character for which
     46      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
     47      * altered by this transliterator.  If <tt>filter</tt> is
     48      * <tt>null</tt> then no filtering is applied.
     49      */
     50     /*public CompoundTransliterator(Transliterator[] transliterators,
     51                                   UnicodeFilter filter) {
     52         super(joinIDs(transliterators), filter);
     53         trans = new Transliterator[transliterators.length];
     54         System.arraycopy(transliterators, 0, trans, 0, trans.length);
     55         computeMaximumContextLength();
     56     }*/
     57 
     58     /**
     59      * Constructs a new compound transliterator given an array of
     60      * transliterators.  The array of transliterators may be of any
     61      * length, including zero or one, however, useful compound
     62      * transliterators have at least two components.
     63      * @param transliterators array of <code>Transliterator</code>
     64      * objects
     65      */
     66     /*public CompoundTransliterator(Transliterator[] transliterators) {
     67         this(transliterators, null);
     68     }*/
     69 
     70     /**
     71      * Constructs a new compound transliterator.
     72      * @param ID compound ID
     73      * @param direction either Transliterator.FORWARD or Transliterator.REVERSE
     74      * @param filter a global filter for this compound transliterator
     75      * or null
     76      */
     77     /*public CompoundTransliterator(String ID, int direction,
     78                                   UnicodeFilter filter) {
     79         super(ID, filter);
     80         init(ID, direction, true);
     81     }*/
     82 
     83     /**
     84      * Constructs a new compound transliterator with no filter.
     85      * @param ID compound ID
     86      * @param direction either Transliterator.FORWARD or Transliterator.REVERSE
     87      */
     88     /*public CompoundTransliterator(String ID, int direction) {
     89         this(ID, direction, null);
     90     }*/
     91 
     92     /**
     93      * Constructs a new forward compound transliterator with no filter.
     94      * @param ID compound ID
     95      */
     96     /*public CompoundTransliterator(String ID) {
     97         this(ID, FORWARD, null);
     98     }*/
     99 
    /**
     * Package-private constructor that builds a compound transliterator
     * from a list of component transliterators, recording zero anonymous
     * rule-based components.  The ID is left empty; the caller is
     * responsible for fixing it up afterwards.
     *
     * @param list the component transliterators, in forward order
     */
    CompoundTransliterator(List<Transliterator> list) {
        this(list, 0);
    }
    108 
    /**
     * Package-private constructor that builds a compound transliterator
     * from a list of component transliterators and records how many of
     * them are anonymous rule-based transliterators.
     *
     * @param list the component transliterators, in forward order
     * @param numAnonymousRBTs the number of anonymous rule-based
     * components; stored as given and used by {@code toRules()}
     */
    CompoundTransliterator(List<Transliterator> list, int numAnonymousRBTs) {
        // Start with an empty ID and no filter; the caller is expected to
        // fix up the ID once construction has finished.
        super("", null);
        init(list, FORWARD, false);
        this.numAnonymousRBTs = numAnonymousRBTs;
    }
    116 
    /**
     * Internal constructor used by {@code safeClone()}: wraps an existing
     * component array under the given ID and filter.
     *
     * <p>The array is adopted as-is (not copied).  The maximum context
     * length is recomputed from the components so that an instance built
     * this way reports the same value as one built through
     * {@code init()}.
     *
     * @param id the ID for the new transliterator
     * @param filter2 the filter for the new transliterator, or null
     * @param trans2 the component transliterators, in application order
     * @param numAnonymousRBTs2 the number of anonymous rule-based
     * components among {@code trans2}
     */
    CompoundTransliterator(String id, UnicodeFilter filter2, Transliterator[] trans2, int numAnonymousRBTs2) {
        super(id, filter2);
        trans = trans2;
        numAnonymousRBTs = numAnonymousRBTs2;
        // The List-based constructors derive the maximum context length in
        // init(); this path used to skip that step, so clones never had
        // the value set.  Guard against a null array so a caller that
        // passed null before still gets an object rather than an exception.
        if (trans != null) {
            computeMaximumContextLength();
        }
    }
    129 
    130     /**
    131      * Finish constructing a transliterator: only to be called by
    132      * constructors.  Before calling init(), set trans and filter to NULL.
    133      * @param id the id containing ';'-separated entries
    134      * @param direction either FORWARD or REVERSE
    135      * @param idSplitPoint the index into id at which the
    136      * splitTrans should be inserted, if there is one, or
    137      * -1 if there is none.
    138      * @param splitTrans a transliterator to be inserted
    139      * before the entry at offset idSplitPoint in the id string.  May be
    140      * NULL to insert no entry.
    141      * @param fixReverseID if TRUE, then reconstruct the ID of reverse
    142      * entries by calling getID() of component entries.  Some constructors
    143      * do not require this because they apply a facade ID anyway.
    144      */
    145     /*private void init(String id,
    146                       int direction,
    147                       boolean fixReverseID) {
    148         // assert(trans == 0);
    149 
    150         Vector list = new Vector();
    151         UnicodeSet[] compoundFilter = new UnicodeSet[1];
    152         StringBuffer regenID = new StringBuffer();
    153         if (!TransliteratorIDParser.parseCompoundID(id, direction,
    154                  regenID, list, compoundFilter)) {
    155             throw new IllegalArgumentException("Invalid ID " + id);
    156         }
    157 
    158         TransliteratorIDParser.instantiateList(list);
    159 
    160         init(list, direction, fixReverseID);
    161 
    162         if (compoundFilter[0] != null) {
    163             setFilter(compoundFilter[0]);
    164         }
    165     }*/
    166 
    167 
    168     /**
    169      * Finish constructing a transliterator: only to be called by
    170      * constructors.  Before calling init(), set trans and filter to NULL.
    171      * @param list a vector of transliterator objects to be adopted.  It
    172      * should NOT be empty.  The list should be in declared order.  That
    173      * is, it should be in the FORWARD order; if direction is REVERSE then
    174      * the list order will be reversed.
    175      * @param direction either FORWARD or REVERSE
    176      * @param fixReverseID if TRUE, then reconstruct the ID of reverse
    177      * entries by calling getID() of component entries.  Some constructors
    178      * do not require this because they apply a facade ID anyway.
    179      */
    180     private void init(List<Transliterator> list,
    181                       int direction,
    182                       boolean fixReverseID) {
    183         // assert(trans == 0);
    184 
    185         // Allocate array
    186         int count = list.size();
    187         trans = new Transliterator[count];
    188 
    189         // Move the transliterators from the vector into an array.
    190         // Reverse the order if necessary.
    191         int i;
    192         for (i=0; i<count; ++i) {
    193             int j = (direction == FORWARD) ? i : count - 1 - i;
    194             trans[i] = list.get(j);
    195         }
    196 
    197         // If the direction is UTRANS_REVERSE then we may need to fix the
    198         // ID.
    199         if (direction == REVERSE && fixReverseID) {
    200             StringBuilder newID = new StringBuilder();
    201             for (i=0; i<count; ++i) {
    202                 if (i > 0) {
    203                     newID.append(ID_DELIM);
    204                 }
    205                 newID.append(trans[i].getID());
    206             }
    207             setID(newID.toString());
    208         }
    209 
    210         computeMaximumContextLength();
    211     }
    212 
    213     /**
    214      * Return the IDs of the given list of transliterators, concatenated
    215      * with ';' delimiting them.  Equivalent to the perlish expression
    216      * join(';', map($_.getID(), transliterators).
    217      */
    218     /*private static String joinIDs(Transliterator[] transliterators) {
    219         StringBuffer id = new StringBuffer();
    220         for (int i=0; i<transliterators.length; ++i) {
    221             if (i > 0) {
    222                 id.append(';');
    223             }
    224             id.append(transliterators[i].getID());
    225         }
    226         return id.toString();
    227     }*/
    228 
    229     /**
    230      * Returns the number of transliterators in this chain.
    231      * @return number of transliterators in this chain.
    232      */
    233     public int getCount() {
    234         return trans.length;
    235     }
    236 
    237     /**
    238      * Returns the transliterator at the given index in this chain.
    239      * @param index index into chain, from 0 to <code>getCount() - 1</code>
    240      * @return transliterator at the given index
    241      */
    242     public Transliterator getTransliterator(int index) {
    243         return trans[index];
    244     }
    245 
    246     /**
    247      * Append c to buf, unless buf is empty or buf already ends in c.
    248      */
    249     private static void _smartAppend(StringBuilder buf, char c) {
    250         if (buf.length() != 0 &&
    251             buf.charAt(buf.length() - 1) != c) {
    252             buf.append(c);
    253         }
    254     }
    255 
    256     /**
    257      * Override Transliterator:
    258      * Create a rule string that can be passed to createFromRules()
    259      * to recreate this transliterator.
    260      * @param escapeUnprintable if TRUE then convert unprintable
    261      * character to their hex escape representations, \\uxxxx or
    262      * \\Uxxxxxxxx.  Unprintable characters are those other than
    263      * U+000A, U+0020..U+007E.
    264      * @return the rule string
    265      */
    266     @Override
    267     public String toRules(boolean escapeUnprintable) {
    268         // We do NOT call toRules() on our component transliterators, in
    269         // general.  If we have several rule-based transliterators, this
    270         // yields a concatenation of the rules -- not what we want.  We do
    271         // handle compound RBT transliterators specially -- those for which
    272         // compoundRBTIndex >= 0.  For the transliterator at compoundRBTIndex,
    273         // we do call toRules() recursively.
    274         StringBuilder rulesSource = new StringBuilder();
    275         if (numAnonymousRBTs >= 1 && getFilter() != null) {
    276             // If we are a compound RBT and if we have a global
    277             // filter, then emit it at the top.
    278             rulesSource.append("::").append(getFilter().toPattern(escapeUnprintable)).append(ID_DELIM);
    279         }
    280         for (int i=0; i<trans.length; ++i) {
    281             String rule;
    282 
    283             // Anonymous RuleBasedTransliterators (inline rules and
    284             // ::BEGIN/::END blocks) are given IDs that begin with
    285             // "%Pass": use toRules() to write all the rules to the output
    286             // (and insert "::Null;" if we have two in a row)
    287             if (trans[i].getID().startsWith("%Pass")) {
    288                 rule = trans[i].toRules(escapeUnprintable);
    289                 if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1].getID().startsWith("%Pass"))
    290                     rule = "::Null;" + rule;
    291 
    292             // we also use toRules() on CompoundTransliterators (which we
    293             // check for by looking for a semicolon in the ID)-- this gets
    294             // the list of their child transliterators output in the right
    295             // format
    296             } else if (trans[i].getID().indexOf(';') >= 0) {
    297                 rule = trans[i].toRules(escapeUnprintable);
    298 
    299             // for everything else, use baseToRules()
    300             } else {
    301                 rule = trans[i].baseToRules(escapeUnprintable);
    302             }
    303             _smartAppend(rulesSource, '\n');
    304             rulesSource.append(rule);
    305             _smartAppend(rulesSource, ID_DELIM);
    306         }
    307         return rulesSource.toString();
    308     }
    309 
    310     /**
    311      * @internal
    312      */
    313     @Override
    314     public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    315         UnicodeSet myFilter = new UnicodeSet(getFilterAsUnicodeSet(filter));
    316         UnicodeSet tempTargetSet = new UnicodeSet();
    317         for (int i=0; i<trans.length; ++i) {
    318             // each time we produce targets, those can be used by subsequent items, despite the filter.
    319             // so we get just those items, and add them to the filter each time.
    320             tempTargetSet.clear();
    321             trans[i].addSourceTargetSet(myFilter, sourceSet, tempTargetSet);
    322             targetSet.addAll(tempTargetSet);
    323             myFilter.addAll(tempTargetSet);
    324         }
    325     }
    326 
    327 //    /**
    328 //     * Returns the set of all characters that may be generated as
    329 //     * replacement text by this transliterator.
    330 //     */
    331 //    public UnicodeSet getTargetSet() {
    332 //        UnicodeSet set = new UnicodeSet();
    333 //        for (int i=0; i<trans.length; ++i) {
    334 //            // This is a heuristic, and not 100% reliable.
    335 //            set.addAll(trans[i].getTargetSet());
    336 //        }
    337 //        return set;
    338 //    }
    339 
    340     /**
    341      * Implements {@link Transliterator#handleTransliterate}.
    342      */
    343     @Override
    344     protected void handleTransliterate(Replaceable text,
    345                                        Position index, boolean incremental) {
    346         /* Call each transliterator with the same start value and
    347          * initial cursor index, but with the limit index as modified
    348          * by preceding transliterators.  The cursor index must be
    349          * reset for each transliterator to give each a chance to
    350          * transliterate the text.  The initial cursor index is known
    351          * to still point to the same place after each transliterator
    352          * is called because each transliterator will not change the
    353          * text between start and the initial value of cursor.
    354          *
    355          * IMPORTANT: After the first transliterator, each subsequent
    356          * transliterator only gets to transliterate text committed by
    357          * preceding transliterators; that is, the cursor (output
    358          * value) of transliterator i becomes the limit (input value)
    359          * of transliterator i+1.  Finally, the overall limit is fixed
    360          * up before we return.
    361          *
    362          * Assumptions we make here:
    363          * (1) contextStart <= start <= limit <= contextLimit <= text.length()
    364          * (2) start <= start' <= limit'  ;cursor doesn't move back
    365          * (3) start <= limit'            ;text before cursor unchanged
    366          * - start' is the value of start after calling handleKT
    367          * - limit' is the value of limit after calling handleKT
    368          */
    369 
    370         /**
    371          * Example: 3 transliterators.  This example illustrates the
    372          * mechanics we need to implement.  C, S, and L are the contextStart,
    373          * start, and limit.  gl is the globalLimit.  contextLimit is
    374          * equal to limit throughout.
    375          *
    376          * 1. h-u, changes hex to Unicode
    377          *
    378          *    4  7  a  d  0      4  7  a
    379          *    abc/u0061/u    =>  abca/u
    380          *    C  S       L       C   S L   gl=f->a
    381          *
    382          * 2. upup, changes "x" to "XX"
    383          *
    384          *    4  7  a       4  7  a
    385          *    abca/u    =>  abcAA/u
    386          *    C  SL         C    S
    387          *                       L    gl=a->b
    388          * 3. u-h, changes Unicode to hex
    389          *
    390          *    4  7  a        4  7  a  d  0  3
    391          *    abcAA/u    =>  abc/u0041/u0041/u
    392          *    C  S L         C              S
    393          *                                  L   gl=b->15
    394          * 4. return
    395          *
    396          *    4  7  a  d  0  3
    397          *    abc/u0041/u0041/u
    398          *    C S L
    399          */
    400 
    401         if (trans.length < 1) {
    402             index.start = index.limit;
    403             return; // Short circuit for empty compound transliterators
    404         }
    405 
    406         // compoundLimit is the limit value for the entire compound
    407         // operation.  We overwrite index.limit with the previous
    408         // index.start.  After each transliteration, we update
    409         // compoundLimit for insertions or deletions that have happened.
    410         int compoundLimit = index.limit;
    411 
    412         // compoundStart is the start for the entire compound
    413         // operation.
    414         int compoundStart = index.start;
    415 
    416         int delta = 0; // delta in length
    417 
    418         StringBuffer log = null;
    419         ///CLOVER:OFF
    420         if (DEBUG) {
    421             log = new StringBuffer("CompoundTransliterator{" + getID() +
    422                                    (incremental ? "}i: IN=" : "}: IN="));
    423             UtilityExtensions.formatInput(log, text, index);
    424             System.out.println(Utility.escape(log.toString()));
    425         }
    426         ///CLOVER:ON
    427 
    428         // Give each transliterator a crack at the run of characters.
    429         // See comments at the top of the method for more detail.
    430         for (int i=0; i<trans.length; ++i) {
    431             index.start = compoundStart; // Reset start
    432             int limit = index.limit;
    433 
    434             if (index.start == index.limit) {
    435                 // Short circuit for empty range
    436                 ///CLOVER:OFF
    437                 if (DEBUG) {
    438                     System.out.println("CompoundTransliterator[" + i +
    439                                        ".." + (trans.length-1) +
    440                                        (incremental ? "]i: " : "]: ") +
    441                                        UtilityExtensions.formatInput(text, index) +
    442                                        " (NOTHING TO DO)");
    443                 }
    444                 ///CLOVER:ON
    445                 break;
    446             }
    447 
    448             ///CLOVER:OFF
    449             if (DEBUG) {
    450                 log.setLength(0);
    451                 log.append("CompoundTransliterator[" + i + "=" +
    452                            trans[i].getID() +
    453                            (incremental ? "]i: " : "]: "));
    454                 UtilityExtensions.formatInput(log, text, index);
    455             }
    456             ///CLOVER:ON
    457 
    458             trans[i].filteredTransliterate(text, index, incremental);
    459 
    460             // In a properly written transliterator, start == limit after
    461             // handleTransliterate() returns when incremental is false.
    462             // Catch cases where the subclass doesn't do this, and throw
    463             // an exception.  (Just pinning start to limit is a bad idea,
    464             // because what's probably happening is that the subclass
    465             // isn't transliterating all the way to the end, and it should
    466             // in non-incremental mode.)
    467             if (!incremental && index.start != index.limit) {
    468                 throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + trans[i].getID());
    469             }
    470 
    471             ///CLOVER:OFF
    472             if (DEBUG) {
    473                 log.append(" => ");
    474                 UtilityExtensions.formatInput(log, text, index);
    475                 System.out.println(Utility.escape(log.toString()));
    476             }
    477             ///CLOVER:ON
    478 
    479             // Cumulative delta for insertions/deletions
    480             delta += index.limit - limit;
    481 
    482             if (incremental) {
    483                 // In the incremental case, only allow subsequent
    484                 // transliterators to modify what has already been
    485                 // completely processed by prior transliterators.  In the
    486                 // non-incrmental case, allow each transliterator to
    487                 // process the entire text.
    488                 index.limit = index.start;
    489             }
    490         }
    491 
    492         compoundLimit += delta;
    493 
    494         // Start is good where it is -- where the last transliterator left
    495         // it.  Limit needs to be put back where it was, modulo
    496         // adjustments for deletions/insertions.
    497         index.limit = compoundLimit;
    498 
    499         ///CLOVER:OFF
    500         if (DEBUG) {
    501             log.setLength(0);
    502             log.append("CompoundTransliterator{" + getID() +
    503                        (incremental ? "}i: OUT=" : "}: OUT="));
    504             UtilityExtensions.formatInput(log, text, index);
    505             System.out.println(Utility.escape(log.toString()));
    506         }
    507         ///CLOVER:ON
    508     }
    509 
    510     /**
    511      * Compute and set the length of the longest context required by this transliterator.
    512      * This is <em>preceding</em> context.
    513      */
    514     private void computeMaximumContextLength() {
    515         int max = 0;
    516         for (int i=0; i<trans.length; ++i) {
    517             int len = trans[i].getMaximumContextLength();
    518             if (len > max) {
    519                 max = len;
    520             }
    521         }
    522         setMaximumContextLength(max);
    523     }
    524 
    525     /**
    526      * Temporary hack for registry problem. Needs to be replaced by better architecture.
    527      */
    528     public Transliterator safeClone() {
    529         UnicodeFilter filter = getFilter();
    530         if (filter != null && filter instanceof UnicodeSet) {
    531             filter = new UnicodeSet((UnicodeSet)filter);
    532         }
    533         return new CompoundTransliterator(getID(), filter, trans, numAnonymousRBTs);
    534     }
    535 }
    536