Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (c) 2002-2004, International Business Machines Corporation
      4 *   and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   01/21/2002  aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "strrepl.h"
     16 #include "rbt_data.h"
     17 #include "util.h"
     18 #include "unicode/uniset.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 static const UChar EMPTY[] = { 0 }; // empty string: ""
     23 
     24 UnicodeReplacer::~UnicodeReplacer() {}
     25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
     26 
     27 /**
     28  * Construct a StringReplacer that sets the emits the given output
     29  * text and sets the cursor to the given position.
     30  * @param theOutput text that will replace input text when the
     31  * replace() method is called.  May contain stand-in characters
     32  * that represent nested replacers.
     33  * @param theCursorPos cursor position that will be returned by
     34  * the replace() method
     35  * @param theData transliterator context object that translates
     36  * stand-in characters to UnicodeReplacer objects
     37  */
     38 StringReplacer::StringReplacer(const UnicodeString& theOutput,
     39                                int32_t theCursorPos,
     40                                const TransliterationRuleData* theData) {
     41     output = theOutput;
     42     cursorPos = theCursorPos;
     43     hasCursor = TRUE;
     44     data = theData;
     45     isComplex = TRUE;
     46 }
     47 
     48 /**
     49  * Construct a StringReplacer that sets the emits the given output
     50  * text and does not modify the cursor.
     51  * @param theOutput text that will replace input text when the
     52  * replace() method is called.  May contain stand-in characters
     53  * that represent nested replacers.
     54  * @param theData transliterator context object that translates
     55  * stand-in characters to UnicodeReplacer objects
     56  */
     57 StringReplacer::StringReplacer(const UnicodeString& theOutput,
     58                                const TransliterationRuleData* theData) {
     59     output = theOutput;
     60     cursorPos = 0;
     61     hasCursor = FALSE;
     62     data = theData;
     63     isComplex = TRUE;
     64 }
     65 
     66 /**
     67  * Copy constructor.
     68  */
     69 StringReplacer::StringReplacer(const StringReplacer& other) :
     70     UnicodeFunctor(other),
     71     UnicodeReplacer(other)
     72 {
     73     output = other.output;
     74     cursorPos = other.cursorPos;
     75     hasCursor = other.hasCursor;
     76     data = other.data;
     77     isComplex = other.isComplex;
     78 }
     79 
     80 /**
     81  * Destructor
     82  */
     83 StringReplacer::~StringReplacer() {
     84 }
     85 
     86 /**
     87  * Implement UnicodeFunctor
     88  */
     89 UnicodeFunctor* StringReplacer::clone() const {
     90     return new StringReplacer(*this);
     91 }
     92 
     93 /**
     94  * Implement UnicodeFunctor
     95  */
     96 UnicodeReplacer* StringReplacer::toReplacer() const {
     97     return (UnicodeReplacer*) this;
     98 }
     99 
    100 /**
    101  * UnicodeReplacer API
    102  */
    103 int32_t StringReplacer::replace(Replaceable& text,
    104                                 int32_t start,
    105                                 int32_t limit,
    106                                 int32_t& cursor) {
    107     int32_t outLen;
    108     int32_t newStart = 0;
    109 
    110     // NOTE: It should be possible to _always_ run the complex
    111     // processing code; just slower.  If not, then there is a bug
    112     // in the complex processing code.
    113 
    114     // Simple (no nested replacers) Processing Code :
    115     if (!isComplex) {
    116         text.handleReplaceBetween(start, limit, output);
    117         outLen = output.length();
    118 
    119         // Setup default cursor position (for cursorPos within output)
    120         newStart = cursorPos;
    121     }
    122 
    123     // Complex (nested replacers) Processing Code :
    124     else {
    125         /* When there are segments to be copied, use the Replaceable.copy()
    126          * API in order to retain out-of-band data.  Copy everything to the
    127          * end of the string, then copy them back over the key.  This preserves
    128          * the integrity of indices into the key and surrounding context while
    129          * generating the output text.
    130          */
    131         UnicodeString buf;
    132         int32_t oOutput; // offset into 'output'
    133         isComplex = FALSE;
    134 
    135         // The temporary buffer starts at tempStart, and extends
    136         // to destLimit.  The start of the buffer has a single
    137         // character from before the key.  This provides style
    138         // data when addition characters are filled into the
    139         // temporary buffer.  If there is nothing to the left, use
    140         // the non-character U+FFFF, which Replaceable subclasses
    141         // should treat specially as a "no-style character."
    142         // destStart points to the point after the style context
    143         // character, so it is tempStart+1 or tempStart+2.
    144         int32_t tempStart = text.length(); // start of temp buffer
    145         int32_t destStart = tempStart; // copy new text to here
    146         if (start > 0) {
    147             int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
    148             text.copy(start-len, start, tempStart);
    149             destStart += len;
    150         } else {
    151             UnicodeString str((UChar) 0xFFFF);
    152             text.handleReplaceBetween(tempStart, tempStart, str);
    153             destStart++;
    154         }
    155         int32_t destLimit = destStart;
    156 
    157         for (oOutput=0; oOutput<output.length(); ) {
    158             if (oOutput == cursorPos) {
    159                 // Record the position of the cursor
    160                 newStart = destLimit - destStart; // relative to start
    161             }
    162             UChar32 c = output.char32At(oOutput);
    163             UnicodeReplacer* r = data->lookupReplacer(c);
    164             if (r == NULL) {
    165                 // Accumulate straight (non-segment) text.
    166                 buf.append(c);
    167             } else {
    168                 isComplex = TRUE;
    169 
    170                 // Insert any accumulated straight text.
    171                 if (buf.length() > 0) {
    172                     text.handleReplaceBetween(destLimit, destLimit, buf);
    173                     destLimit += buf.length();
    174                     buf.truncate(0);
    175                 }
    176 
    177                 // Delegate output generation to replacer object
    178                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
    179                 destLimit += len;
    180             }
    181             oOutput += UTF_CHAR_LENGTH(c);
    182         }
    183         // Insert any accumulated straight text.
    184         if (buf.length() > 0) {
    185             text.handleReplaceBetween(destLimit, destLimit, buf);
    186             destLimit += buf.length();
    187         }
    188         if (oOutput == cursorPos) {
    189             // Record the position of the cursor
    190             newStart = destLimit - destStart; // relative to start
    191         }
    192 
    193         outLen = destLimit - destStart;
    194 
    195         // Copy new text to start, and delete it
    196         text.copy(destStart, destLimit, start);
    197         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);
    198 
    199         // Delete the old text (the key)
    200         text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
    201     }
    202 
    203     if (hasCursor) {
    204         // Adjust the cursor for positions outside the key.  These
    205         // refer to code points rather than code units.  If cursorPos
    206         // is within the output string, then use newStart, which has
    207         // already been set above.
    208         if (cursorPos < 0) {
    209             newStart = start;
    210             int32_t n = cursorPos;
    211             // Outside the output string, cursorPos counts code points
    212             while (n < 0 && newStart > 0) {
    213                 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
    214                 ++n;
    215             }
    216             newStart += n;
    217         } else if (cursorPos > output.length()) {
    218             newStart = start + outLen;
    219             int32_t n = cursorPos - output.length();
    220             // Outside the output string, cursorPos counts code points
    221             while (n > 0 && newStart < text.length()) {
    222                 newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
    223                 --n;
    224             }
    225             newStart += n;
    226         } else {
    227             // Cursor is within output string.  It has been set up above
    228             // to be relative to start.
    229             newStart += start;
    230         }
    231 
    232         cursor = newStart;
    233     }
    234 
    235     return outLen;
    236 }
    237 
    238 /**
    239  * UnicodeReplacer API
    240  */
    241 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
    242                                                  UBool escapeUnprintable) const {
    243     rule.truncate(0);
    244     UnicodeString quoteBuf;
    245 
    246     int32_t cursor = cursorPos;
    247 
    248     // Handle a cursor preceding the output
    249     if (hasCursor && cursor < 0) {
    250         while (cursor++ < 0) {
    251             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
    252         }
    253         // Fall through and append '|' below
    254     }
    255 
    256     for (int32_t i=0; i<output.length(); ++i) {
    257         if (hasCursor && i == cursor) {
    258             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    259         }
    260         UChar c = output.charAt(i); // Ok to use 16-bits here
    261 
    262         UnicodeReplacer* r = data->lookupReplacer(c);
    263         if (r == NULL) {
    264             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
    265         } else {
    266             UnicodeString buf;
    267             r->toReplacerPattern(buf, escapeUnprintable);
    268             buf.insert(0, (UChar)0x20);
    269             buf.append((UChar)0x20);
    270             ICU_Utility::appendToRule(rule, buf,
    271                                       TRUE, escapeUnprintable, quoteBuf);
    272         }
    273     }
    274 
    275     // Handle a cursor after the output.  Use > rather than >= because
    276     // if cursor == output.length() it is at the end of the output,
    277     // which is the default position, so we need not emit it.
    278     if (hasCursor && cursor > output.length()) {
    279         cursor -= output.length();
    280         while (cursor-- > 0) {
    281             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
    282         }
    283         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    284     }
    285     // Flush quoteBuf out to result
    286     ICU_Utility::appendToRule(rule, -1,
    287                               TRUE, escapeUnprintable, quoteBuf);
    288 
    289     return rule;
    290 }
    291 
    292 /**
    293  * Implement UnicodeReplacer
    294  */
    295 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    296     UChar32 ch;
    297     for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
    298     ch = output.char32At(i);
    299     UnicodeReplacer* r = data->lookupReplacer(ch);
    300     if (r == NULL) {
    301         toUnionTo.add(ch);
    302     } else {
    303         r->addReplacementSetTo(toUnionTo);
    304     }
    305     }
    306 }
    307 
    308 /**
    309  * UnicodeFunctor API
    310  */
    311 void StringReplacer::setData(const TransliterationRuleData* d) {
    312     data = d;
    313     int32_t i = 0;
    314     while (i<output.length()) {
    315         UChar32 c = output.char32At(i);
    316         UnicodeFunctor* f = data->lookup(c);
    317         if (f != NULL) {
    318             f->setData(data);
    319         }
    320         i += UTF_CHAR_LENGTH(c);
    321     }
    322 }
    323 
    324 U_NAMESPACE_END
    325 
    326 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    327 
    328 //eof
    329