Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   07/03/01    aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/normalizer2.h"
     16 #include "unicode/utf16.h"
     17 #include "cstring.h"
     18 #include "nortrans.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
     23 
     24 static inline Transliterator::Token cstrToken(const char *s) {
     25     return Transliterator::pointerToken((void *)s);
     26 }
     27 
     28 /**
     29  * System registration hook.
     30  */
     31 void NormalizationTransliterator::registerIDs() {
     32     // In the Token, the byte after the NUL is the UNormalization2Mode.
     33     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
     34                                      _create, cstrToken("nfc\0\0"));
     35     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
     36                                      _create, cstrToken("nfkc\0\0"));
     37     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
     38                                      _create, cstrToken("nfc\0\1"));
     39     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
     40                                      _create, cstrToken("nfkc\0\1"));
     41     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
     42                                      _create, cstrToken("nfc\0\2"));
     43     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
     44                                      _create, cstrToken("nfc\0\3"));
     45     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
     46                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
     47     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
     48                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
     49     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
     50                                             UNICODE_STRING_SIMPLE("NFD"), FALSE);
     51     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
     52                                             UNICODE_STRING_SIMPLE("FCD"), FALSE);
     53 }
     54 
     55 /**
     56  * Factory methods
     57  */
     58 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
     59                                                      Token context) {
     60     const char *name = (const char *)context.pointer;
     61     UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
     62     UErrorCode errorCode = U_ZERO_ERROR;
     63     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
     64     if(U_SUCCESS(errorCode)) {
     65         return new NormalizationTransliterator(ID, *norm2);
     66     } else {
     67         return NULL;
     68     }
     69 }
     70 
     71 /**
     72  * Constructs a transliterator.
     73  */
     74 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
     75                                                          const Normalizer2 &norm2) :
     76     Transliterator(id, 0), fNorm2(norm2) {}
     77 
     78 /**
     79  * Destructor.
     80  */
     81 NormalizationTransliterator::~NormalizationTransliterator() {
     82 }
     83 
     84 /**
     85  * Copy constructor.
     86  */
     87 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
     88     Transliterator(o), fNorm2(o.fNorm2) {}
     89 
     90 /**
     91  * Transliterator API.
     92  */
     93 Transliterator* NormalizationTransliterator::clone(void) const {
     94     return new NormalizationTransliterator(*this);
     95 }
     96 
     97 /**
     98  * Implements {@link Transliterator#handleTransliterate}.
     99  */
    100 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
    101                                                       UBool isIncremental) const {
    102     // start and limit of the input range
    103     int32_t start = offsets.start;
    104     int32_t limit = offsets.limit;
    105     if(start >= limit) {
    106         return;
    107     }
    108 
    109     /*
    110      * Normalize as short chunks at a time as possible even in
    111      * bulk mode, so that styled text is minimally disrupted.
    112      * In incremental mode, a chunk that ends with offsets.limit
    113      * must not be normalized.
    114      *
    115      * If it was known that the input text is not styled, then
    116      * a bulk mode normalization could look like this:
    117 
    118     UnicodeString input, normalized;
    119     int32_t length = limit - start;
    120     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    121     input.releaseBuffer(length);
    122 
    123     UErrorCode status = U_ZERO_ERROR;
    124     fNorm2.normalize(input, normalized, status);
    125 
    126     text.handleReplaceBetween(start, limit, normalized);
    127 
    128     int32_t delta = normalized.length() - length;
    129     offsets.contextLimit += delta;
    130     offsets.limit += delta;
    131     offsets.start = limit + delta;
    132 
    133      */
    134     UErrorCode errorCode = U_ZERO_ERROR;
    135     UnicodeString segment;
    136     UnicodeString normalized;
    137     UChar32 c = text.char32At(start);
    138     do {
    139         int32_t prev = start;
    140         // Skip at least one character so we make progress.
    141         // c holds the character at start.
    142         segment.remove();
    143         do {
    144             segment.append(c);
    145             start += U16_LENGTH(c);
    146         } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
    147         if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
    148             // stop in incremental mode when we reach the input limit
    149             // in case there are additional characters that could change the
    150             // normalization result
    151             start=prev;
    152             break;
    153         }
    154         fNorm2.normalize(segment, normalized, errorCode);
    155         if(U_FAILURE(errorCode)) {
    156             break;
    157         }
    158         if(segment != normalized) {
    159             // replace the input chunk with its normalized form
    160             text.handleReplaceBetween(prev, start, normalized);
    161 
    162             // update all necessary indexes accordingly
    163             int32_t delta = normalized.length() - (start - prev);
    164             start += delta;
    165             limit += delta;
    166         }
    167     } while(start < limit);
    168 
    169     offsets.start = start;
    170     offsets.contextLimit += limit - offsets.limit;
    171     offsets.limit = limit;
    172 }
    173 
    174 U_NAMESPACE_END
    175 
    176 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    177