Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (c) 2002-2012, International Business Machines Corporation
      4 *   and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   01/21/2002  aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/uniset.h"
     16 #include "unicode/utf16.h"
     17 #include "strrepl.h"
     18 #include "rbt_data.h"
     19 #include "util.h"
     20 
     21 U_NAMESPACE_BEGIN
     22 
// Out-of-line (empty) definition of the UnicodeReplacer destructor.
UnicodeReplacer::~UnicodeReplacer() {}
// RTTI boilerplate for StringReplacer; the macro is defined outside this
// file (presumably it supplies the ICU class-ID functions -- TODO confirm).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
     25 
     26 /**
     27  * Construct a StringReplacer that sets the emits the given output
     28  * text and sets the cursor to the given position.
     29  * @param theOutput text that will replace input text when the
     30  * replace() method is called.  May contain stand-in characters
     31  * that represent nested replacers.
     32  * @param theCursorPos cursor position that will be returned by
     33  * the replace() method
     34  * @param theData transliterator context object that translates
     35  * stand-in characters to UnicodeReplacer objects
     36  */
     37 StringReplacer::StringReplacer(const UnicodeString& theOutput,
     38                                int32_t theCursorPos,
     39                                const TransliterationRuleData* theData) {
     40     output = theOutput;
     41     cursorPos = theCursorPos;
     42     hasCursor = TRUE;
     43     data = theData;
     44     isComplex = TRUE;
     45 }
     46 
     47 /**
     48  * Construct a StringReplacer that sets the emits the given output
     49  * text and does not modify the cursor.
     50  * @param theOutput text that will replace input text when the
     51  * replace() method is called.  May contain stand-in characters
     52  * that represent nested replacers.
     53  * @param theData transliterator context object that translates
     54  * stand-in characters to UnicodeReplacer objects
     55  */
     56 StringReplacer::StringReplacer(const UnicodeString& theOutput,
     57                                const TransliterationRuleData* theData) {
     58     output = theOutput;
     59     cursorPos = 0;
     60     hasCursor = FALSE;
     61     data = theData;
     62     isComplex = TRUE;
     63 }
     64 
     65 /**
     66  * Copy constructor.
     67  */
     68 StringReplacer::StringReplacer(const StringReplacer& other) :
     69     UnicodeFunctor(other),
     70     UnicodeReplacer(other)
     71 {
     72     output = other.output;
     73     cursorPos = other.cursorPos;
     74     hasCursor = other.hasCursor;
     75     data = other.data;
     76     isComplex = other.isComplex;
     77 }
     78 
     79 /**
     80  * Destructor
     81  */
     82 StringReplacer::~StringReplacer() {
     83 }
     84 
     85 /**
     86  * Implement UnicodeFunctor
     87  */
     88 UnicodeFunctor* StringReplacer::clone() const {
     89     return new StringReplacer(*this);
     90 }
     91 
     92 /**
     93  * Implement UnicodeFunctor
     94  */
     95 UnicodeReplacer* StringReplacer::toReplacer() const {
     96   return const_cast<StringReplacer *>(this);
     97 }
     98 
     99 /**
    100  * UnicodeReplacer API
    101  */
    102 int32_t StringReplacer::replace(Replaceable& text,
    103                                 int32_t start,
    104                                 int32_t limit,
    105                                 int32_t& cursor) {
    106     int32_t outLen;
    107     int32_t newStart = 0;
    108 
    109     // NOTE: It should be possible to _always_ run the complex
    110     // processing code; just slower.  If not, then there is a bug
    111     // in the complex processing code.
    112 
    113     // Simple (no nested replacers) Processing Code :
    114     if (!isComplex) {
    115         text.handleReplaceBetween(start, limit, output);
    116         outLen = output.length();
    117 
    118         // Setup default cursor position (for cursorPos within output)
    119         newStart = cursorPos;
    120     }
    121 
    122     // Complex (nested replacers) Processing Code :
    123     else {
    124         /* When there are segments to be copied, use the Replaceable.copy()
    125          * API in order to retain out-of-band data.  Copy everything to the
    126          * end of the string, then copy them back over the key.  This preserves
    127          * the integrity of indices into the key and surrounding context while
    128          * generating the output text.
    129          */
    130         UnicodeString buf;
    131         int32_t oOutput; // offset into 'output'
    132         isComplex = FALSE;
    133 
    134         // The temporary buffer starts at tempStart, and extends
    135         // to destLimit.  The start of the buffer has a single
    136         // character from before the key.  This provides style
    137         // data when addition characters are filled into the
    138         // temporary buffer.  If there is nothing to the left, use
    139         // the non-character U+FFFF, which Replaceable subclasses
    140         // should treat specially as a "no-style character."
    141         // destStart points to the point after the style context
    142         // character, so it is tempStart+1 or tempStart+2.
    143         int32_t tempStart = text.length(); // start of temp buffer
    144         int32_t destStart = tempStart; // copy new text to here
    145         if (start > 0) {
    146             int32_t len = U16_LENGTH(text.char32At(start-1));
    147             text.copy(start-len, start, tempStart);
    148             destStart += len;
    149         } else {
    150             UnicodeString str((UChar) 0xFFFF);
    151             text.handleReplaceBetween(tempStart, tempStart, str);
    152             destStart++;
    153         }
    154         int32_t destLimit = destStart;
    155 
    156         for (oOutput=0; oOutput<output.length(); ) {
    157             if (oOutput == cursorPos) {
    158                 // Record the position of the cursor
    159                 newStart = destLimit - destStart; // relative to start
    160             }
    161             UChar32 c = output.char32At(oOutput);
    162             UnicodeReplacer* r = data->lookupReplacer(c);
    163             if (r == NULL) {
    164                 // Accumulate straight (non-segment) text.
    165                 buf.append(c);
    166             } else {
    167                 isComplex = TRUE;
    168 
    169                 // Insert any accumulated straight text.
    170                 if (buf.length() > 0) {
    171                     text.handleReplaceBetween(destLimit, destLimit, buf);
    172                     destLimit += buf.length();
    173                     buf.truncate(0);
    174                 }
    175 
    176                 // Delegate output generation to replacer object
    177                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
    178                 destLimit += len;
    179             }
    180             oOutput += U16_LENGTH(c);
    181         }
    182         // Insert any accumulated straight text.
    183         if (buf.length() > 0) {
    184             text.handleReplaceBetween(destLimit, destLimit, buf);
    185             destLimit += buf.length();
    186         }
    187         if (oOutput == cursorPos) {
    188             // Record the position of the cursor
    189             newStart = destLimit - destStart; // relative to start
    190         }
    191 
    192         outLen = destLimit - destStart;
    193 
    194         // Copy new text to start, and delete it
    195         text.copy(destStart, destLimit, start);
    196         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
    197 
    198         // Delete the old text (the key)
    199         text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
    200     }
    201 
    202     if (hasCursor) {
    203         // Adjust the cursor for positions outside the key.  These
    204         // refer to code points rather than code units.  If cursorPos
    205         // is within the output string, then use newStart, which has
    206         // already been set above.
    207         if (cursorPos < 0) {
    208             newStart = start;
    209             int32_t n = cursorPos;
    210             // Outside the output string, cursorPos counts code points
    211             while (n < 0 && newStart > 0) {
    212                 newStart -= U16_LENGTH(text.char32At(newStart-1));
    213                 ++n;
    214             }
    215             newStart += n;
    216         } else if (cursorPos > output.length()) {
    217             newStart = start + outLen;
    218             int32_t n = cursorPos - output.length();
    219             // Outside the output string, cursorPos counts code points
    220             while (n > 0 && newStart < text.length()) {
    221                 newStart += U16_LENGTH(text.char32At(newStart));
    222                 --n;
    223             }
    224             newStart += n;
    225         } else {
    226             // Cursor is within output string.  It has been set up above
    227             // to be relative to start.
    228             newStart += start;
    229         }
    230 
    231         cursor = newStart;
    232     }
    233 
    234     return outLen;
    235 }
    236 
/**
 * UnicodeReplacer API.
 *
 * Rebuilds a rule-pattern representation of this replacer in 'rule'.
 * Output characters that map to a nested replacer are rendered by that
 * replacer's own toReplacerPattern(), surrounded by spaces; other
 * characters are appended through ICU_Utility::appendToRule().  When this
 * object has a cursor, '@' placeholders and a '|' marker are emitted for it.
 *
 * @param rule receives the pattern; any previous contents are discarded
 * @param escapeUnprintable forwarded unchanged to appendToRule() and to
 * nested replacers
 * @return rule
 */
UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
                                                 UBool escapeUnprintable) const {
    rule.truncate(0);
    // Scratch buffer shared by every appendToRule() call below; flushed
    // into 'rule' by the final call.
    UnicodeString quoteBuf;

    // Working copy of the cursor position; modified below.
    int32_t cursor = cursorPos;

    // Handle a cursor preceding the output
    if (hasCursor && cursor < 0) {
        // Emits one '@' per position that the cursor lies before the output.
        // NOTE(review): because of the post-increment in the loop test,
        // cursor is 1 (not 0) once this loop exits, so the '|' below is
        // emitted before output[1] rather than output[0], and for an
        // output shorter than two code units the trailing-cursor branch
        // can fire instead.  Looks like an off-by-one -- confirm against
        // the rule parser before changing.
        while (cursor++ < 0) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        // Fall through and append '|' below
    }

    for (int32_t i=0; i<output.length(); ++i) {
        // Emit the cursor marker just before the code unit it precedes.
        if (hasCursor && i == cursor) {
            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        }
        UChar c = output.charAt(i); // Ok to use 16-bits here

        UnicodeReplacer* r = data->lookupReplacer(c);
        if (r == NULL) {
            // Plain character: append it directly.
            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
            // Stand-in for a nested replacer: append that replacer's own
            // pattern, padded with a space (U+0020) on each side.
            UnicodeString buf;
            r->toReplacerPattern(buf, escapeUnprintable);
            buf.insert(0, (UChar)0x20);
            buf.append((UChar)0x20);
            ICU_Utility::appendToRule(rule, buf,
                                      TRUE, escapeUnprintable, quoteBuf);
        }
    }

    // Handle a cursor after the output.  Use > rather than >= because
    // if cursor == output.length() it is at the end of the output,
    // which is the default position, so we need not emit it.
    if (hasCursor && cursor > output.length()) {
        // Number of positions the cursor lies beyond the end of the output.
        cursor -= output.length();
        while (cursor-- > 0) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    }
    // Flush quoteBuf out to result
    ICU_Utility::appendToRule(rule, -1,
                              TRUE, escapeUnprintable, quoteBuf);

    return rule;
}
    290 
    291 /**
    292  * Implement UnicodeReplacer
    293  */
    294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    295     UChar32 ch;
    296     for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
    297     ch = output.char32At(i);
    298     UnicodeReplacer* r = data->lookupReplacer(ch);
    299     if (r == NULL) {
    300         toUnionTo.add(ch);
    301     } else {
    302         r->addReplacementSetTo(toUnionTo);
    303     }
    304     }
    305 }
    306 
    307 /**
    308  * UnicodeFunctor API
    309  */
    310 void StringReplacer::setData(const TransliterationRuleData* d) {
    311     data = d;
    312     int32_t i = 0;
    313     while (i<output.length()) {
    314         UChar32 c = output.char32At(i);
    315         UnicodeFunctor* f = data->lookup(c);
    316         if (f != NULL) {
    317             f->setData(data);
    318         }
    319         i += U16_LENGTH(c);
    320     }
    321 }
    322 
    323 U_NAMESPACE_END
    324 
    325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    326 
    327 //eof
    328