Home | History | Annotate | Download | only in text
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2002-2007, International Business Machines Corporation
      6 *   and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   01/14/2002  aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 package com.ibm.icu.text;
     14 import com.ibm.icu.impl.Utility;
     15 
     16 /**
     17  * A replacer that produces static text as its output.  The text may
     18  * contain transliterator stand-in characters that represent nested
     19  * UnicodeReplacer objects, making it possible to encode a tree of
     20  * replacers in a StringReplacer.  A StringReplacer that contains such
     21  * stand-ins is called a <em>complex</em> StringReplacer.  A complex
     22  * StringReplacer has a slower processing loop than a non-complex one.
     23  * @author Alan Liu
     24  */
     25 class StringReplacer implements UnicodeReplacer {
     26 
     27     /**
     28      * Output text, possibly containing stand-in characters that
     29      * represent nested UnicodeReplacers.
     30      */
     31     private String output;
     32 
     33     /**
     34      * Cursor position.  Value is ignored if hasCursor is false.
     35      */
     36     private int cursorPos;
     37 
     38     /**
     39      * True if this object outputs a cursor position.
     40      */
     41     private boolean hasCursor;
     42 
     43     /**
     44      * A complex object contains nested replacers and requires more
     45      * complex processing.  StringReplacers are initially assumed to
     46      * be complex.  If no nested replacers are seen during processing,
     47      * then isComplex is set to false, and future replacements are
     48      * short circuited for better performance.
     49      */
     50     private boolean isComplex;
     51 
     52     /**
     53      * Object that translates stand-in characters in 'output' to
     54      * UnicodeReplacer objects.
     55      */
     56     private final RuleBasedTransliterator.Data data;
     57 
     58     /**
     59      * Construct a StringReplacer that sets the emits the given output
     60      * text and sets the cursor to the given position.
     61      * @param theOutput text that will replace input text when the
     62      * replace() method is called.  May contain stand-in characters
     63      * that represent nested replacers.
     64      * @param theCursorPos cursor position that will be returned by
     65      * the replace() method
     66      * @param theData transliterator context object that translates
     67      * stand-in characters to UnicodeReplacer objects
     68      */
     69     public StringReplacer(String theOutput,
     70                           int theCursorPos,
     71                           RuleBasedTransliterator.Data theData) {
     72         output = theOutput;
     73         cursorPos = theCursorPos;
     74         hasCursor = true;
     75         data = theData;
     76         isComplex = true;
     77     }
     78 
     79     /**
     80      * Construct a StringReplacer that sets the emits the given output
     81      * text and does not modify the cursor.
     82      * @param theOutput text that will replace input text when the
     83      * replace() method is called.  May contain stand-in characters
     84      * that represent nested replacers.
     85      * @param theData transliterator context object that translates
     86      * stand-in characters to UnicodeReplacer objects
     87      */
     88     public StringReplacer(String theOutput,
     89                           RuleBasedTransliterator.Data theData) {
     90         output = theOutput;
     91         cursorPos = 0;
     92         hasCursor = false;
     93         data = theData;
     94         isComplex = true;
     95     }
     96 
     97 //=    public static UnicodeReplacer valueOf(String output,
     98 //=                                          int cursorPos,
     99 //=                                          RuleBasedTransliterator.Data data) {
    100 //=        if (output.length() == 1) {
    101 //=            char c = output.charAt(0);
    102 //=            UnicodeReplacer r = data.lookupReplacer(c);
    103 //=            if (r != null) {
    104 //=                return r;
    105 //=            }
    106 //=        }
    107 //=        return new StringReplacer(output, cursorPos, data);
    108 //=    }
    109 
    110     /**
    111      * UnicodeReplacer API
    112      */
    113     @Override
    114     public int replace(Replaceable text,
    115                        int start,
    116                        int limit,
    117                        int[] cursor) {
    118         int outLen;
    119         int newStart = 0;
    120 
    121         // NOTE: It should be possible to _always_ run the complex
    122         // processing code; just slower.  If not, then there is a bug
    123         // in the complex processing code.
    124 
    125         // Simple (no nested replacers) Processing Code :
    126         if (!isComplex) {
    127             text.replace(start, limit, output);
    128             outLen = output.length();
    129 
    130             // Setup default cursor position (for cursorPos within output)
    131             newStart = cursorPos;
    132         }
    133 
    134         // Complex (nested replacers) Processing Code :
    135         else {
    136             /* When there are segments to be copied, use the Replaceable.copy()
    137              * API in order to retain out-of-band data.  Copy everything to the
    138              * end of the string, then copy them back over the key.  This preserves
    139              * the integrity of indices into the key and surrounding context while
    140              * generating the output text.
    141              */
    142             StringBuffer buf = new StringBuffer();
    143             int oOutput; // offset into 'output'
    144             isComplex = false;
    145 
    146             // The temporary buffer starts at tempStart, and extends
    147             // to destLimit + tempExtra.  The start of the buffer has a single
    148             // character from before the key.  This provides style
    149             // data when addition characters are filled into the
    150             // temporary buffer.  If there is nothing to the left, use
    151             // the non-character U+FFFF, which Replaceable subclasses
    152             // should treat specially as a "no-style character."
    153             // destStart points to the point after the style context
    154             // character, so it is tempStart+1 or tempStart+2.
    155             int tempStart = text.length(); // start of temp buffer
    156             int destStart = tempStart; // copy new text to here
    157             if (start > 0) {
    158                 int len = UTF16.getCharCount(text.char32At(start-1));
    159                 text.copy(start-len, start, tempStart);
    160                 destStart += len;
    161             } else {
    162                 text.replace(tempStart, tempStart, "\uFFFF");
    163                 destStart++;
    164             }
    165             int destLimit = destStart;
    166             int tempExtra = 0; // temp chars after destLimit
    167 
    168             for (oOutput=0; oOutput<output.length(); ) {
    169                 if (oOutput == cursorPos) {
    170                     // Record the position of the cursor
    171                     newStart = buf.length() + destLimit - destStart; // relative to start
    172                     // the buf.length() was inserted for bug 5789
    173                     // the problem is that if we are accumulating into a buffer (when r == null below)
    174                     // then the actual length of the text at that point needs to add the buf length.
    175                     // there was an alternative suggested in #5789, but that looks like it won't work
    176                     // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
    177                 }
    178                 int c = UTF16.charAt(output, oOutput);
    179 
    180                 // When we are at the last position copy the right style
    181                 // context character into the temporary buffer.  We don't
    182                 // do this before because it will provide an incorrect
    183                 // right context for previous replace() operations.
    184                 int nextIndex = oOutput + UTF16.getCharCount(c);
    185                 if (nextIndex == output.length()) {
    186                     tempExtra = UTF16.getCharCount(text.char32At(limit));
    187                     text.copy(limit, limit+tempExtra, destLimit);
    188                 }
    189 
    190                 UnicodeReplacer r = data.lookupReplacer(c);
    191                 if (r == null) {
    192                     // Accumulate straight (non-segment) text.
    193                     UTF16.append(buf, c);
    194                 } else {
    195                     isComplex = true;
    196 
    197                     // Insert any accumulated straight text.
    198                     if (buf.length() > 0) {
    199                         text.replace(destLimit, destLimit, buf.toString());
    200                         destLimit += buf.length();
    201                         buf.setLength(0);
    202                     }
    203 
    204                     // Delegate output generation to replacer object
    205                     int len = r.replace(text, destLimit, destLimit, cursor);
    206                     destLimit += len;
    207                 }
    208                 oOutput = nextIndex;
    209             }
    210             // Insert any accumulated straight text.
    211             if (buf.length() > 0) {
    212                 text.replace(destLimit, destLimit, buf.toString());
    213                 destLimit += buf.length();
    214             }
    215             if (oOutput == cursorPos) {
    216                 // Record the position of the cursor
    217                 newStart = destLimit - destStart; // relative to start
    218             }
    219 
    220             outLen = destLimit - destStart;
    221 
    222             // Copy new text to start, and delete it
    223             text.copy(destStart, destLimit, start);
    224             text.replace(tempStart + outLen, destLimit + tempExtra + outLen, "");
    225 
    226             // Delete the old text (the key)
    227             text.replace(start + outLen, limit + outLen, "");
    228         }
    229 
    230         if (hasCursor) {
    231             // Adjust the cursor for positions outside the key.  These
    232             // refer to code points rather than code units.  If cursorPos
    233             // is within the output string, then use newStart, which has
    234             // already been set above.
    235             if (cursorPos < 0) {
    236                 newStart = start;
    237                 int n = cursorPos;
    238                 // Outside the output string, cursorPos counts code points
    239                 while (n < 0 && newStart > 0) {
    240                     newStart -= UTF16.getCharCount(text.char32At(newStart-1));
    241                     ++n;
    242                 }
    243                 newStart += n;
    244             } else if (cursorPos > output.length()) {
    245                 newStart = start + outLen;
    246                 int n = cursorPos - output.length();
    247                 // Outside the output string, cursorPos counts code points
    248                 while (n > 0 && newStart < text.length()) {
    249                     newStart += UTF16.getCharCount(text.char32At(newStart));
    250                     --n;
    251                 }
    252                 newStart += n;
    253             } else {
    254                 // Cursor is within output string.  It has been set up above
    255                 // to be relative to start.
    256                 newStart += start;
    257             }
    258 
    259             cursor[0] = newStart;
    260         }
    261 
    262         return outLen;
    263     }
    264 
    265     /**
    266      * UnicodeReplacer API
    267      */
    268     @Override
    269     public String toReplacerPattern(boolean escapeUnprintable) {
    270         StringBuffer rule = new StringBuffer();
    271         StringBuffer quoteBuf = new StringBuffer();
    272 
    273         int cursor = cursorPos;
    274 
    275         // Handle a cursor preceding the output
    276         if (hasCursor && cursor < 0) {
    277             while (cursor++ < 0) {
    278                 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);
    279             }
    280             // Fall through and append '|' below
    281         }
    282 
    283         for (int i=0; i<output.length(); ++i) {
    284             if (hasCursor && i == cursor) {
    285                 Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
    286             }
    287             char c = output.charAt(i); // Ok to use 16-bits here
    288 
    289             UnicodeReplacer r = data.lookupReplacer(c);
    290             if (r == null) {
    291                 Utility.appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
    292             } else {
    293                 StringBuffer buf = new StringBuffer(" ");
    294                 buf.append(r.toReplacerPattern(escapeUnprintable));
    295                 buf.append(' ');
    296                 Utility.appendToRule(rule, buf.toString(),
    297                                      true, escapeUnprintable, quoteBuf);
    298             }
    299         }
    300 
    301         // Handle a cursor after the output.  Use > rather than >= because
    302         // if cursor == output.length() it is at the end of the output,
    303         // which is the default position, so we need not emit it.
    304         if (hasCursor && cursor > output.length()) {
    305             cursor -= output.length();
    306             while (cursor-- > 0) {
    307                 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);
    308             }
    309             Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
    310         }
    311         // Flush quoteBuf out to result
    312         Utility.appendToRule(rule, -1,
    313                              true, escapeUnprintable, quoteBuf);
    314 
    315         return rule.toString();
    316     }
    317 
    318     /**
    319      * Union the set of all characters that may output by this object
    320      * into the given set.
    321      * @param toUnionTo the set into which to union the output characters
    322      */
    323     @Override
    324     public void addReplacementSetTo(UnicodeSet toUnionTo) {
    325         int ch;
    326         for (int i=0; i<output.length(); i+=UTF16.getCharCount(ch)) {
    327             ch = UTF16.charAt(output, i);
    328             UnicodeReplacer r = data.lookupReplacer(ch);
    329             if (r == null) {
    330                 toUnionTo.add(ch);
    331             } else {
    332                 r.addReplacementSetTo(toUnionTo);
    333             }
    334         }
    335     }
    336 }
    337 
    338 //eof
    339