Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (c) 2001-2011, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  *   Date        Name        Description
      7  *   11/19/2001  aliu        Creation.
      8  **********************************************************************
      9  */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/uchar.h"
     16 #include "unicode/utf16.h"
     17 #include "unesctrn.h"
     18 #include "util.h"
     19 
     20 #include "cmemory.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 /**
     25  * Special character marking the end of the spec[] array.
     26  */
     27 static const UChar END = 0xFFFF;
     28 
     29 // Unicode: "U+10FFFF" hex, min=4, max=6
     30 static const UChar SPEC_Unicode[] = {
     31     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
     32     END
     33 };
     34 
     35 // Java: "\\uFFFF" hex, min=4, max=4
     36 static const UChar SPEC_Java[] = {
     37     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     38     END
     39 };
     40 
     41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
     42 static const UChar SPEC_C[] = {
     43     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     44     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
     45     END
     46 };
     47 
     48 // XML: "" hex, min=1, max=6
     49 static const UChar SPEC_XML[] = {
     50     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
     51     END
     52 };
     53 
     54 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
     55 static const UChar SPEC_XML10[] = {
     56     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
     57     END
     58 };
     59 
     60 // Perl: "\\x{263A}" hex, min=1, max=6
     61 static const UChar SPEC_Perl[] = {
     62     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
     63     END
     64 };
     65 
     66 // All: Java, C, Perl, XML, XML10, Unicode
     67 static const UChar SPEC_Any[] = {
     68     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
     69     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
     70     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
     71     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
     72     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
     73     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
     74     END
     75 };
     76 
// Emits the RTTI boilerplate for UnescapeTransliterator.
// NOTE(review): presumably this defines getStaticClassID() and
// getDynamicClassID(); the macro body is not visible here -- confirm
// against its definition before relying on that.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
     78 
     79 static UChar* copySpec(const UChar* spec) {
     80     int32_t len = 0;
     81     while (spec[len] != END) {
     82         ++len;
     83     }
     84     ++len;
     85     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
     86     // Check for memory allocation error.
     87     if (result != NULL) {
     88     	uprv_memcpy(result, spec, len*sizeof(result[0]));
     89     }
     90     return result;
     91 }
     92 
     93 /**
     94  * Factory methods.  Ignore the context.
     95  */
     96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
     97     return new UnescapeTransliterator(ID, SPEC_Unicode);
     98 }
     99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
    100     return new UnescapeTransliterator(ID, SPEC_Java);
    101 }
    102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
    103     return new UnescapeTransliterator(ID, SPEC_C);
    104 }
    105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
    106     return new UnescapeTransliterator(ID, SPEC_XML);
    107 }
    108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
    109     return new UnescapeTransliterator(ID, SPEC_XML10);
    110 }
    111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
    112     return new UnescapeTransliterator(ID, SPEC_Perl);
    113 }
    114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
    115     return new UnescapeTransliterator(ID, SPEC_Any);
    116 }
    117 
    118 /**
    119  * Registers standard variants with the system.  Called by
    120  * Transliterator during initialization.
    121  */
    122 void UnescapeTransliterator::registerIDs() {
    123     Token t = integerToken(0);
    124 
    125     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
    126 
    127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
    128 
    129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
    130 
    131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
    132 
    133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
    134 
    135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
    136 
    137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
    138 }
    139 
    140 /**
    141  * Constructor.  Takes the encoded spec array.
    142  */
    143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
    144                                                const UChar *newSpec) :
    145     Transliterator(newID, NULL)
    146 {
    147     this->spec = copySpec(newSpec);
    148 }
    149 
    150 /**
    151  * Copy constructor.
    152  */
    153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
    154     Transliterator(o) {
    155     this->spec = copySpec(o.spec);
    156 }
    157 
    158 UnescapeTransliterator::~UnescapeTransliterator() {
    159     uprv_free(spec);
    160 }
    161 
    162 /**
    163  * Transliterator API.
    164  */
    165 Transliterator* UnescapeTransliterator::clone() const {
    166     return new UnescapeTransliterator(*this);
    167 }
    168 
    169 /**
    170  * Implements {@link Transliterator#handleTransliterate}.
    171  */
    172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
    173                                                  UBool isIncremental) const {
    174     int32_t start = pos.start;
    175     int32_t limit = pos.limit;
    176     int32_t i, j, ipat;
    177 
    178     while (start < limit) {
    179         // Loop over the forms in spec[].  Exit this loop when we
    180         // match one of the specs.  Exit the outer loop if a
    181         // partial match is detected and isIncremental is true.
    182         for (j=0, ipat=0; spec[ipat] != END; ++j) {
    183 
    184             // Read the header
    185             int32_t prefixLen = spec[ipat++];
    186             int32_t suffixLen = spec[ipat++];
    187             int8_t  radix     = (int8_t) spec[ipat++];
    188             int32_t minDigits = spec[ipat++];
    189             int32_t maxDigits = spec[ipat++];
    190 
    191             // s is a copy of start that is advanced over the
    192             // characters as we parse them.
    193             int32_t s = start;
    194             UBool match = TRUE;
    195 
    196             for (i=0; i<prefixLen; ++i) {
    197                 if (s >= limit) {
    198                     if (i > 0) {
    199                         // We've already matched a character.  This is
    200                         // a partial match, so we return if in
    201                         // incremental mode.  In non-incremental mode,
    202                         // go to the next spec.
    203                         if (isIncremental) {
    204                             goto exit;
    205                         }
    206                         match = FALSE;
    207                         break;
    208                     }
    209                 }
    210                 UChar c = text.charAt(s++);
    211                 if (c != spec[ipat + i]) {
    212                     match = FALSE;
    213                     break;
    214                 }
    215             }
    216 
    217             if (match) {
    218                 UChar32 u = 0;
    219                 int32_t digitCount = 0;
    220                 for (;;) {
    221                     if (s >= limit) {
    222                         // Check for partial match in incremental mode.
    223                         if (s > start && isIncremental) {
    224                             goto exit;
    225                         }
    226                         break;
    227                     }
    228                     UChar32 ch = text.char32At(s);
    229                     int32_t digit = u_digit(ch, radix);
    230                     if (digit < 0) {
    231                         break;
    232                     }
    233                     s += U16_LENGTH(ch);
    234                     u = (u * radix) + digit;
    235                     if (++digitCount == maxDigits) {
    236                         break;
    237                     }
    238                 }
    239 
    240                 match = (digitCount >= minDigits);
    241 
    242                 if (match) {
    243                     for (i=0; i<suffixLen; ++i) {
    244                         if (s >= limit) {
    245                             // Check for partial match in incremental mode.
    246                             if (s > start && isIncremental) {
    247                                 goto exit;
    248                             }
    249                             match = FALSE;
    250                             break;
    251                         }
    252                         UChar c = text.charAt(s++);
    253                         if (c != spec[ipat + prefixLen + i]) {
    254                             match = FALSE;
    255                             break;
    256                         }
    257                     }
    258 
    259                     if (match) {
    260                         // At this point, we have a match
    261                         UnicodeString str(u);
    262                         text.handleReplaceBetween(start, s, str);
    263                         limit -= s - start - str.length();
    264                         // The following break statement leaves the
    265                         // loop that is traversing the forms in
    266                         // spec[].  We then parse the next input
    267                         // character.
    268                         break;
    269                     }
    270                 }
    271             }
    272 
    273             ipat += prefixLen + suffixLen;
    274         }
    275 
    276         if (start < limit) {
    277             start += U16_LENGTH(text.char32At(start));
    278         }
    279     }
    280 
    281   exit:
    282     pos.contextLimit += limit - pos.limit;
    283     pos.limit = limit;
    284     pos.start = start;
    285 }
    286 
    287 U_NAMESPACE_END
    288 
    289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    290 
    291 //eof
    292