Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 1999-2008, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  *   Date        Name        Description
      7  *   11/17/99    aliu        Creation.
      8  **********************************************************************
      9  */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/unistr.h"
     16 #include "unicode/uniset.h"
     17 #include "rbt_set.h"
     18 #include "rbt_rule.h"
     19 #include "cmemory.h"
     20 #include "putilimp.h"
     21 
     22 U_CDECL_BEGIN
     23 static void U_CALLCONV _deleteRule(void *rule) {
     24     delete (U_NAMESPACE_QUALIFIER TransliterationRule *)rule;
     25 }
     26 U_CDECL_END
     27 
     28 //----------------------------------------------------------------------
     29 // BEGIN Debugging support
     30 //----------------------------------------------------------------------
     31 
     32 // #define DEBUG_RBT
     33 
     34 #ifdef DEBUG_RBT
     35 #include <stdio.h>
     36 #include "charstr.h"
     37 
     38 /**
     39  * @param appendTo result is appended to this param.
     40  * @param input the string being transliterated
     41  * @param pos the index struct
     42  */
     43 static UnicodeString& _formatInput(UnicodeString &appendTo,
     44                                    const UnicodeString& input,
     45                                    const UTransPosition& pos) {
     46     // Output a string of the form aaa{bbb|ccc|ddd}eee, where
     47     // the {} indicate the context start and limit, and the ||
     48     // indicate the start and limit.
     49     if (0 <= pos.contextStart &&
     50         pos.contextStart <= pos.start &&
     51         pos.start <= pos.limit &&
     52         pos.limit <= pos.contextLimit &&
     53         pos.contextLimit <= input.length()) {
     54 
     55         UnicodeString a, b, c, d, e;
     56         input.extractBetween(0, pos.contextStart, a);
     57         input.extractBetween(pos.contextStart, pos.start, b);
     58         input.extractBetween(pos.start, pos.limit, c);
     59         input.extractBetween(pos.limit, pos.contextLimit, d);
     60         input.extractBetween(pos.contextLimit, input.length(), e);
     61         appendTo.append(a).append((UChar)123/*{*/).append(b).
     62             append((UChar)124/*|*/).append(c).append((UChar)124/*|*/).append(d).
     63             append((UChar)125/*}*/).append(e);
     64     } else {
     65         appendTo.append("INVALID UTransPosition");
     66         //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
     67         //                pos.contextStart + ", s=" + pos.start + ", l=" +
     68         //                pos.limit + ", cl=" + pos.contextLimit + "} on " +
     69         //                input);
     70     }
     71     return appendTo;
     72 }
     73 
     74 // Append a hex string to the target
     75 UnicodeString& _appendHex(uint32_t number,
     76                           int32_t digits,
     77                           UnicodeString& target) {
     78     static const UChar digitString[] = {
     79         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
     80         0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
     81     };
     82     while (digits--) {
     83         target += digitString[(number >> (digits*4)) & 0xF];
     84     }
     85     return target;
     86 }
     87 
     88 // Replace nonprintable characters with unicode escapes
     89 UnicodeString& _escape(const UnicodeString &source,
     90                        UnicodeString &target) {
     91     for (int32_t i = 0; i < source.length(); ) {
     92         UChar32 ch = source.char32At(i);
     93         i += UTF_CHAR_LENGTH(ch);
     94         if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) {
     95             if (ch <= 0xFFFF) {
     96                 target += "\\u";
     97                 _appendHex(ch, 4, target);
     98             } else {
     99                 target += "\\U";
    100                 _appendHex(ch, 8, target);
    101             }
    102         } else {
    103             target += ch;
    104         }
    105     }
    106     return target;
    107 }
    108 
    109 inline void _debugOut(const char* msg, TransliterationRule* rule,
    110                       const Replaceable& theText, UTransPosition& pos) {
    111     UnicodeString buf(msg, "");
    112     if (rule) {
    113         UnicodeString r;
    114         rule->toRule(r, TRUE);
    115         buf.append((UChar)32).append(r);
    116     }
    117     buf.append(UnicodeString(" => ", ""));
    118     UnicodeString* text = (UnicodeString*)&theText;
    119     _formatInput(buf, *text, pos);
    120     UnicodeString esc;
    121     _escape(buf, esc);
    122     CharString cbuf(esc);
    123     printf("%s\n", (const char*) cbuf);
    124 }
    125 
    126 #else
    127 #define _debugOut(msg, rule, theText, pos)
    128 #endif
    129 
    130 //----------------------------------------------------------------------
    131 // END Debugging support
    132 //----------------------------------------------------------------------
    133 
    134 // Fill the precontext and postcontext with the patterns of the rules
    135 // that are masking one another.
    136 static void maskingError(const U_NAMESPACE_QUALIFIER TransliterationRule& rule1,
    137                          const U_NAMESPACE_QUALIFIER TransliterationRule& rule2,
    138                          UParseError& parseError) {
    139     U_NAMESPACE_QUALIFIER UnicodeString r;
    140     int32_t len;
    141 
    142     parseError.line = parseError.offset = -1;
    143 
    144     // for pre-context
    145     rule1.toRule(r, FALSE);
    146     len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
    147     r.extract(0, len, parseError.preContext);
    148     parseError.preContext[len] = 0;
    149 
    150     //for post-context
    151     r.truncate(0);
    152     rule2.toRule(r, FALSE);
    153     len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
    154     r.extract(0, len, parseError.postContext);
    155     parseError.postContext[len] = 0;
    156 }
    157 
    158 U_NAMESPACE_BEGIN
    159 
    160 /**
    161  * Construct a new empty rule set.
    162  */
    163 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() {
    164     ruleVector = new UVector(&_deleteRule, NULL, status);
    165     if (U_FAILURE(status)) {
    166         return;
    167     }
    168     if (ruleVector == NULL) {
    169         status = U_MEMORY_ALLOCATION_ERROR;
    170     }
    171     rules = NULL;
    172     maxContextLength = 0;
    173 }
    174 
    175 /**
    176  * Copy constructor.
    177  */
    178 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :
    179     UMemory(other),
    180     ruleVector(0),
    181     rules(0),
    182     maxContextLength(other.maxContextLength) {
    183 
    184     int32_t i, len;
    185     uprv_memcpy(index, other.index, sizeof(index));
    186     UErrorCode status = U_ZERO_ERROR;
    187     ruleVector = new UVector(&_deleteRule, NULL, status);
    188     if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) {
    189         len = other.ruleVector->size();
    190         for (i=0; i<len && U_SUCCESS(status); ++i) {
    191             TransliterationRule *tempTranslitRule = new TransliterationRule(*(TransliterationRule*)other.ruleVector->elementAt(i));
    192             // Null pointer test
    193             if (tempTranslitRule == NULL) {
    194                 status = U_MEMORY_ALLOCATION_ERROR;
    195                 break;
    196             }
    197             ruleVector->addElement(tempTranslitRule, status);
    198             if (U_FAILURE(status)) {
    199                 break;
    200             }
    201         }
    202     }
    203     if (other.rules != 0 && U_SUCCESS(status)) {
    204         UParseError p;
    205         freeze(p, status);
    206     }
    207 }
    208 
    209 /**
    210  * Destructor.
    211  */
    212 TransliterationRuleSet::~TransliterationRuleSet() {
    213     delete ruleVector; // This deletes the contained rules
    214     uprv_free(rules);
    215 }
    216 
    217 void TransliterationRuleSet::setData(const TransliterationRuleData* d) {
    218     /**
    219      * We assume that the ruleset has already been frozen.
    220      */
    221     int32_t len = index[256]; // see freeze()
    222     for (int32_t i=0; i<len; ++i) {
    223         rules[i]->setData(d);
    224     }
    225 }
    226 
    227 /**
    228  * Return the maximum context length.
    229  * @return the length of the longest preceding context.
    230  */
    231 int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
    232     return maxContextLength;
    233 }
    234 
    235 /**
    236  * Add a rule to this set.  Rules are added in order, and order is
    237  * significant.  The last call to this method must be followed by
    238  * a call to <code>freeze()</code> before the rule set is used.
    239  *
    240  * <p>If freeze() has already been called, calling addRule()
    241  * unfreezes the rules, and freeze() must be called again.
    242  *
    243  * @param adoptedRule the rule to add
    244  */
    245 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
    246                                      UErrorCode& status) {
    247     if (U_FAILURE(status)) {
    248         delete adoptedRule;
    249         return;
    250     }
    251     ruleVector->addElement(adoptedRule, status);
    252 
    253     int32_t len;
    254     if ((len = adoptedRule->getContextLength()) > maxContextLength) {
    255         maxContextLength = len;
    256     }
    257 
    258     uprv_free(rules);
    259     rules = 0;
    260 }
    261 
    262 /**
    263  * Check this for masked rules and index it to optimize performance.
    264  * The sequence of operations is: (1) add rules to a set using
    265  * <code>addRule()</code>; (2) freeze the set using
    266  * <code>freeze()</code>; (3) use the rule set.  If
    267  * <code>addRule()</code> is called after calling this method, it
    268  * invalidates this object, and this method must be called again.
    269  * That is, <code>freeze()</code> may be called multiple times,
    270  * although for optimal performance it shouldn't be.
    271  */
    272 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {
    273     /* Construct the rule array and index table.  We reorder the
    274      * rules by sorting them into 256 bins.  Each bin contains all
    275      * rules matching the index value for that bin.  A rule
    276      * matches an index value if string whose first key character
    277      * has a low byte equal to the index value can match the rule.
    278      *
    279      * Each bin contains zero or more rules, in the same order
    280      * they were found originally.  However, the total rules in
    281      * the bins may exceed the number in the original vector,
    282      * since rules that have a variable as their first key
    283      * character will generally fall into more than one bin.
    284      *
    285      * That is, each bin contains all rules that either have that
    286      * first index value as their first key character, or have
    287      * a set containing the index value as their first character.
    288      */
    289     int32_t n = ruleVector->size();
    290     int32_t j;
    291     int16_t x;
    292     UVector v(2*n, status); // heuristic; adjust as needed
    293 
    294     if (U_FAILURE(status)) {
    295         return;
    296     }
    297 
    298     /* Precompute the index values.  This saves a LOT of time.
    299      * Be careful not to call malloc(0).
    300      */
    301     int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) );
    302     /* test for NULL */
    303     if (indexValue == 0) {
    304         status = U_MEMORY_ALLOCATION_ERROR;
    305         return;
    306     }
    307     for (j=0; j<n; ++j) {
    308         TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
    309         indexValue[j] = r->getIndexValue();
    310     }
    311     for (x=0; x<256; ++x) {
    312         index[x] = v.size();
    313         for (j=0; j<n; ++j) {
    314             if (indexValue[j] >= 0) {
    315                 if (indexValue[j] == x) {
    316                     v.addElement(ruleVector->elementAt(j), status);
    317                 }
    318             } else {
    319                 // If the indexValue is < 0, then the first key character is
    320                 // a set, and we must use the more time-consuming
    321                 // matchesIndexValue check.  In practice this happens
    322                 // rarely, so we seldom tread this code path.
    323                 TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
    324                 if (r->matchesIndexValue((uint8_t)x)) {
    325                     v.addElement(r, status);
    326                 }
    327             }
    328         }
    329     }
    330     uprv_free(indexValue);
    331     index[256] = v.size();
    332 
    333     /* Freeze things into an array.
    334      */
    335     uprv_free(rules); // Contains alias pointers
    336 
    337     /* You can't do malloc(0)! */
    338     if (v.size() == 0) {
    339         rules = NULL;
    340         return;
    341     }
    342     rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *));
    343     /* test for NULL */
    344     if (rules == 0) {
    345         status = U_MEMORY_ALLOCATION_ERROR;
    346         return;
    347     }
    348     for (j=0; j<v.size(); ++j) {
    349         rules[j] = (TransliterationRule*) v.elementAt(j);
    350     }
    351 
    352     // TODO Add error reporting that indicates the rules that
    353     //      are being masked.
    354     //UnicodeString errors;
    355 
    356     /* Check for masking.  This is MUCH faster than our old check,
    357      * which was each rule against each following rule, since we
    358      * only have to check for masking within each bin now.  It's
    359      * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
    360      * count, and n2 is the per-bin rule count.  But n2<<n1, so
    361      * it's a big win.
    362      */
    363     for (x=0; x<256; ++x) {
    364         for (j=index[x]; j<index[x+1]-1; ++j) {
    365             TransliterationRule* r1 = rules[j];
    366             for (int32_t k=j+1; k<index[x+1]; ++k) {
    367                 TransliterationRule* r2 = rules[k];
    368                 if (r1->masks(*r2)) {
    369 //|                 if (errors == null) {
    370 //|                     errors = new StringBuffer();
    371 //|                 } else {
    372 //|                     errors.append("\n");
    373 //|                 }
    374 //|                 errors.append("Rule " + r1 + " masks " + r2);
    375                     status = U_RULE_MASK_ERROR;
    376                     maskingError(*r1, *r2, parseError);
    377                     return;
    378                 }
    379             }
    380         }
    381     }
    382 
    383     //if (errors != null) {
    384     //    throw new IllegalArgumentException(errors.toString());
    385     //}
    386 }
    387 
    388 /**
    389  * Transliterate the given text with the given UTransPosition
    390  * indices.  Return TRUE if the transliteration should continue
    391  * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
    392  * Note that FALSE is only ever returned if isIncremental is TRUE.
    393  * @param text the text to be transliterated
    394  * @param pos the position indices, which will be updated
    395  * @param incremental if TRUE, assume new text may be inserted
    396  * at index.limit, and return FALSE if thre is a partial match.
    397  * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
    398  * indicating that transliteration should stop until more text
    399  * arrives.
    400  */
    401 UBool TransliterationRuleSet::transliterate(Replaceable& text,
    402                                             UTransPosition& pos,
    403                                             UBool incremental) {
    404     int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
    405     for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
    406         UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
    407         switch (m) {
    408         case U_MATCH:
    409             _debugOut("match", rules[i], text, pos);
    410             return TRUE;
    411         case U_PARTIAL_MATCH:
    412             _debugOut("partial match", rules[i], text, pos);
    413             return FALSE;
    414         default: /* Ram: added default to make GCC happy */
    415             break;
    416         }
    417     }
    418     // No match or partial match from any rule
    419     pos.start += UTF_CHAR_LENGTH(text.char32At(pos.start));
    420     _debugOut("no match", NULL, text, pos);
    421     return TRUE;
    422 }
    423 
    424 /**
    425  * Create rule strings that represents this rule set.
    426  */
    427 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
    428                                                UBool escapeUnprintable) const {
    429     int32_t i;
    430     int32_t count = ruleVector->size();
    431     ruleSource.truncate(0);
    432     for (i=0; i<count; ++i) {
    433         if (i != 0) {
    434             ruleSource.append((UChar) 0x000A /*\n*/);
    435         }
    436         TransliterationRule *r =
    437             (TransliterationRule*) ruleVector->elementAt(i);
    438         r->toRule(ruleSource, escapeUnprintable);
    439     }
    440     return ruleSource;
    441 }
    442 
    443 /**
    444  * Return the set of all characters that may be modified
    445  * (getTarget=false) or emitted (getTarget=true) by this set.
    446  */
    447 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
    448                                UBool getTarget) const
    449 {
    450     result.clear();
    451     int32_t count = ruleVector->size();
    452     for (int32_t i=0; i<count; ++i) {
    453         TransliterationRule* r =
    454             (TransliterationRule*) ruleVector->elementAt(i);
    455         if (getTarget) {
    456             r->addTargetSetTo(result);
    457         } else {
    458             r->addSourceSetTo(result);
    459         }
    460     }
    461     return result;
    462 }
    463 
    464 U_NAMESPACE_END
    465 
    466 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    467