Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (c) 2001-2008, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  *   Date        Name        Description
      7  *   11/19/2001  aliu        Creation.
      8  **********************************************************************
      9  */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/uchar.h"
     16 #include "unesctrn.h"
     17 #include "util.h"
     18 
     19 #include "cmemory.h"
     20 
     21 U_NAMESPACE_BEGIN
     22 
/**
 * Special character marking the end of the spec[] array.
 * Both copySpec() and handleTransliterate() stop scanning a spec array
 * when they reach this value.
 */
static const UChar END = 0xFFFF;
     27 
     28 // Unicode: "U+10FFFF" hex, min=4, max=6
     29 static const UChar SPEC_Unicode[] = {
     30     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
     31     END
     32 };
     33 
     34 // Java: "\\uFFFF" hex, min=4, max=4
     35 static const UChar SPEC_Java[] = {
     36     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     37     END
     38 };
     39 
     40 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
     41 static const UChar SPEC_C[] = {
     42     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     43     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
     44     END
     45 };
     46 
     47 // XML: "" hex, min=1, max=6
     48 static const UChar SPEC_XML[] = {
     49     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
     50     END
     51 };
     52 
     53 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
     54 static const UChar SPEC_XML10[] = {
     55     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
     56     END
     57 };
     58 
     59 // Perl: "\\x{263A}" hex, min=1, max=6
     60 static const UChar SPEC_Perl[] = {
     61     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
     62     END
     63 };
     64 
     65 // All: Java, C, Perl, XML, XML10, Unicode
     66 static const UChar SPEC_Any[] = {
     67     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
     68     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
     69     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
     70     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
     71     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
     72     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
     73     END
     74 };
     75 
// Expands the RTTI implementation macro for UnescapeTransliterator.
// The macro is not defined in this file; presumably it supplies the
// class-ID boilerplate -- confirm against its definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
     77 
     78 static UChar* copySpec(const UChar* spec) {
     79     int32_t len = 0;
     80     while (spec[len] != END) {
     81         ++len;
     82     }
     83     ++len;
     84     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
     85     // Check for memory allocation error.
     86     if (result != NULL) {
     87     	uprv_memcpy(result, spec, len*sizeof(result[0]));
     88     }
     89     return result;
     90 }
     91 
     92 /**
     93  * Factory methods.  Ignore the context.
     94  */
     95 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
     96     return new UnescapeTransliterator(ID, SPEC_Unicode);
     97 }
     98 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
     99     return new UnescapeTransliterator(ID, SPEC_Java);
    100 }
    101 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
    102     return new UnescapeTransliterator(ID, SPEC_C);
    103 }
    104 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
    105     return new UnescapeTransliterator(ID, SPEC_XML);
    106 }
    107 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
    108     return new UnescapeTransliterator(ID, SPEC_XML10);
    109 }
    110 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
    111     return new UnescapeTransliterator(ID, SPEC_Perl);
    112 }
    113 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
    114     return new UnescapeTransliterator(ID, SPEC_Any);
    115 }
    116 
    117 /**
    118  * Registers standard variants with the system.  Called by
    119  * Transliterator during initialization.
    120  */
    121 void UnescapeTransliterator::registerIDs() {
    122     Token t = integerToken(0);
    123 
    124     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
    125 
    126     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
    127 
    128     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
    129 
    130     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
    131 
    132     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
    133 
    134     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
    135 
    136     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
    137 }
    138 
    139 /**
    140  * Constructor.  Takes the encoded spec array.
    141  */
    142 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
    143                                                const UChar *newSpec) :
    144     Transliterator(newID, NULL)
    145 {
    146     this->spec = copySpec(newSpec);
    147 }
    148 
    149 /**
    150  * Copy constructor.
    151  */
    152 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
    153     Transliterator(o) {
    154     this->spec = copySpec(o.spec);
    155 }
    156 
/**
 * Destructor.  Releases the private spec array that the constructors
 * obtained from copySpec().
 */
UnescapeTransliterator::~UnescapeTransliterator() {
    uprv_free(spec);
}
    160 
    161 /**
    162  * Transliterator API.
    163  */
    164 Transliterator* UnescapeTransliterator::clone() const {
    165     return new UnescapeTransliterator(*this);
    166 }
    167 
    168 /**
    169  * Implements {@link Transliterator#handleTransliterate}.
    170  */
    171 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
    172                                                  UBool isIncremental) const {
    173     int32_t start = pos.start;
    174     int32_t limit = pos.limit;
    175     int32_t i, j, ipat;
    176 
    177     while (start < limit) {
    178         // Loop over the forms in spec[].  Exit this loop when we
    179         // match one of the specs.  Exit the outer loop if a
    180         // partial match is detected and isIncremental is true.
    181         for (j=0, ipat=0; spec[ipat] != END; ++j) {
    182 
    183             // Read the header
    184             int32_t prefixLen = spec[ipat++];
    185             int32_t suffixLen = spec[ipat++];
    186             int8_t  radix     = (int8_t) spec[ipat++];
    187             int32_t minDigits = spec[ipat++];
    188             int32_t maxDigits = spec[ipat++];
    189 
    190             // s is a copy of start that is advanced over the
    191             // characters as we parse them.
    192             int32_t s = start;
    193             UBool match = TRUE;
    194 
    195             for (i=0; i<prefixLen; ++i) {
    196                 if (s >= limit) {
    197                     if (i > 0) {
    198                         // We've already matched a character.  This is
    199                         // a partial match, so we return if in
    200                         // incremental mode.  In non-incremental mode,
    201                         // go to the next spec.
    202                         if (isIncremental) {
    203                             goto exit;
    204                         }
    205                         match = FALSE;
    206                         break;
    207                     }
    208                 }
    209                 UChar c = text.charAt(s++);
    210                 if (c != spec[ipat + i]) {
    211                     match = FALSE;
    212                     break;
    213                 }
    214             }
    215 
    216             if (match) {
    217                 UChar32 u = 0;
    218                 int32_t digitCount = 0;
    219                 for (;;) {
    220                     if (s >= limit) {
    221                         // Check for partial match in incremental mode.
    222                         if (s > start && isIncremental) {
    223                             goto exit;
    224                         }
    225                         break;
    226                     }
    227                     UChar32 ch = text.char32At(s);
    228                     int32_t digit = u_digit(ch, radix);
    229                     if (digit < 0) {
    230                         break;
    231                     }
    232                     s += UTF_CHAR_LENGTH(ch);
    233                     u = (u * radix) + digit;
    234                     if (++digitCount == maxDigits) {
    235                         break;
    236                     }
    237                 }
    238 
    239                 match = (digitCount >= minDigits);
    240 
    241                 if (match) {
    242                     for (i=0; i<suffixLen; ++i) {
    243                         if (s >= limit) {
    244                             // Check for partial match in incremental mode.
    245                             if (s > start && isIncremental) {
    246                                 goto exit;
    247                             }
    248                             match = FALSE;
    249                             break;
    250                         }
    251                         UChar c = text.charAt(s++);
    252                         if (c != spec[ipat + prefixLen + i]) {
    253                             match = FALSE;
    254                             break;
    255                         }
    256                     }
    257 
    258                     if (match) {
    259                         // At this point, we have a match
    260                         UnicodeString str(u);
    261                         text.handleReplaceBetween(start, s, str);
    262                         limit -= s - start - str.length();
    263                         // The following break statement leaves the
    264                         // loop that is traversing the forms in
    265                         // spec[].  We then parse the next input
    266                         // character.
    267                         break;
    268                     }
    269                 }
    270             }
    271 
    272             ipat += prefixLen + suffixLen;
    273         }
    274 
    275         if (start < limit) {
    276             start += UTF_CHAR_LENGTH(text.char32At(start));
    277         }
    278     }
    279 
    280   exit:
    281     pos.contextLimit += limit - pos.limit;
    282     pos.limit = limit;
    283     pos.start = start;
    284 }
    285 
    286 U_NAMESPACE_END
    287 
    288 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    289 
    290 //eof
    291