Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   07/03/01    aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/normalizer2.h"
     16 #include "cstring.h"
     17 #include "nortrans.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
     22 
     23 static inline Transliterator::Token cstrToken(const char *s) {
     24     return Transliterator::pointerToken((void *)s);
     25 }
     26 
     27 /**
     28  * System registration hook.
     29  */
     30 void NormalizationTransliterator::registerIDs() {
     31     // In the Token, the byte after the NUL is the UNormalization2Mode.
     32     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
     33                                      _create, cstrToken("nfc\0\0"));
     34     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
     35                                      _create, cstrToken("nfkc\0\0"));
     36     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
     37                                      _create, cstrToken("nfc\0\1"));
     38     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
     39                                      _create, cstrToken("nfkc\0\1"));
     40     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
     41                                      _create, cstrToken("nfc\0\2"));
     42     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
     43                                      _create, cstrToken("nfc\0\3"));
     44     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
     45                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
     46     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
     47                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
     48     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
     49                                             UNICODE_STRING_SIMPLE("NFD"), FALSE);
     50     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
     51                                             UNICODE_STRING_SIMPLE("FCD"), FALSE);
     52 }
     53 
     54 /**
     55  * Factory methods
     56  */
     57 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
     58                                                      Token context) {
     59     const char *name = (const char *)context.pointer;
     60     UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
     61     UErrorCode errorCode = U_ZERO_ERROR;
     62     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
     63     if(U_SUCCESS(errorCode)) {
     64         return new NormalizationTransliterator(ID, *norm2);
     65     } else {
     66         return NULL;
     67     }
     68 }
     69 
     70 /**
     71  * Constructs a transliterator.
     72  */
     73 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
     74                                                          const Normalizer2 &norm2) :
     75     Transliterator(id, 0), fNorm2(norm2) {}
     76 
     77 /**
     78  * Destructor.
     79  */
     80 NormalizationTransliterator::~NormalizationTransliterator() {
     81 }
     82 
     83 /**
     84  * Copy constructor.
     85  */
     86 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
     87     Transliterator(o), fNorm2(o.fNorm2) {}
     88 
     89 /**
     90  * Transliterator API.
     91  */
     92 Transliterator* NormalizationTransliterator::clone(void) const {
     93     return new NormalizationTransliterator(*this);
     94 }
     95 
     96 /**
     97  * Implements {@link Transliterator#handleTransliterate}.
     98  */
     99 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
    100                                                       UBool isIncremental) const {
    101     // start and limit of the input range
    102     int32_t start = offsets.start;
    103     int32_t limit = offsets.limit;
    104     if(start >= limit) {
    105         return;
    106     }
    107 
    108     /*
    109      * Normalize as short chunks at a time as possible even in
    110      * bulk mode, so that styled text is minimally disrupted.
    111      * In incremental mode, a chunk that ends with offsets.limit
    112      * must not be normalized.
    113      *
    114      * If it was known that the input text is not styled, then
    115      * a bulk mode normalization could look like this:
    116 
    117     UnicodeString input, normalized;
    118     int32_t length = limit - start;
    119     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    120     input.releaseBuffer(length);
    121 
    122     UErrorCode status = U_ZERO_ERROR;
    123     fNorm2.normalize(input, normalized, status);
    124 
    125     text.handleReplaceBetween(start, limit, normalized);
    126 
    127     int32_t delta = normalized.length() - length;
    128     offsets.contextLimit += delta;
    129     offsets.limit += delta;
    130     offsets.start = limit + delta;
    131 
    132      */
    133     UErrorCode errorCode = U_ZERO_ERROR;
    134     UnicodeString segment;
    135     UnicodeString normalized;
    136     UChar32 c = text.char32At(start);
    137     do {
    138         int32_t prev = start;
    139         // Skip at least one character so we make progress.
    140         // c holds the character at start.
    141         segment.remove();
    142         do {
    143             segment.append(c);
    144             start += U16_LENGTH(c);
    145         } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
    146         if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
    147             // stop in incremental mode when we reach the input limit
    148             // in case there are additional characters that could change the
    149             // normalization result
    150             start=prev;
    151             break;
    152         }
    153         fNorm2.normalize(segment, normalized, errorCode);
    154         if(U_FAILURE(errorCode)) {
    155             break;
    156         }
    157         if(segment != normalized) {
    158             // replace the input chunk with its normalized form
    159             text.handleReplaceBetween(prev, start, normalized);
    160 
    161             // update all necessary indexes accordingly
    162             int32_t delta = normalized.length() - (start - prev);
    163             start += delta;
    164             limit += delta;
    165         }
    166     } while(start < limit);
    167 
    168     offsets.start = start;
    169     offsets.contextLimit += limit - offsets.limit;
    170     offsets.limit = limit;
    171 }
    172 
    173 U_NAMESPACE_END
    174 
    175 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    176