1 /* 2 ********************************************************************** 3 * Copyright (C) 2001-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 07/03/01 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/normalizer2.h" 16 #include "unicode/utf16.h" 17 #include "cstring.h" 18 #include "nortrans.h" 19 20 U_NAMESPACE_BEGIN 21 22 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 23 24 static inline Transliterator::Token cstrToken(const char *s) { 25 return Transliterator::pointerToken((void *)s); 26 } 27 28 /** 29 * System registration hook. 30 */ 31 void NormalizationTransliterator::registerIDs() { 32 // In the Token, the byte after the NUL is the UNormalization2Mode. 33 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 34 _create, cstrToken("nfc\0\0")); 35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 36 _create, cstrToken("nfkc\0\0")); 37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 38 _create, cstrToken("nfc\0\1")); 39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 40 _create, cstrToken("nfkc\0\1")); 41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), 42 _create, cstrToken("nfc\0\2")); 43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), 44 _create, cstrToken("nfc\0\3")); 45 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 46 UNICODE_STRING_SIMPLE("NFD"), TRUE); 47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 48 UNICODE_STRING_SIMPLE("NFKD"), TRUE); 49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), 50 UNICODE_STRING_SIMPLE("NFD"), FALSE); 51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), 52 UNICODE_STRING_SIMPLE("FCD"), FALSE); 53 } 54 55 /** 56 * Factory methods 57 */ 58 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 59 Token context) { 60 const char *name = (const char *)context.pointer; 61 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; 62 UErrorCode errorCode = U_ZERO_ERROR; 63 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode); 64 if(U_SUCCESS(errorCode)) { 65 return new NormalizationTransliterator(ID, *norm2); 66 } else { 67 return NULL; 68 } 69 } 70 71 /** 72 * Constructs a transliterator. 73 */ 74 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, 75 const Normalizer2 &norm2) : 76 Transliterator(id, 0), fNorm2(norm2) {} 77 78 /** 79 * Destructor. 80 */ 81 NormalizationTransliterator::~NormalizationTransliterator() { 82 } 83 84 /** 85 * Copy constructor. 86 */ 87 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 88 Transliterator(o), fNorm2(o.fNorm2) {} 89 90 /** 91 * Transliterator API. 92 */ 93 Transliterator* NormalizationTransliterator::clone(void) const { 94 return new NormalizationTransliterator(*this); 95 } 96 97 /** 98 * Implements {@link Transliterator#handleTransliterate}. 99 */ 100 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 101 UBool isIncremental) const { 102 // start and limit of the input range 103 int32_t start = offsets.start; 104 int32_t limit = offsets.limit; 105 if(start >= limit) { 106 return; 107 } 108 109 /* 110 * Normalize as short chunks at a time as possible even in 111 * bulk mode, so that styled text is minimally disrupted. 112 * In incremental mode, a chunk that ends with offsets.limit 113 * must not be normalized. 114 * 115 * If it was known that the input text is not styled, then 116 * a bulk mode normalization could look like this: 117 118 UnicodeString input, normalized; 119 int32_t length = limit - start; 120 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 121 input.releaseBuffer(length); 122 123 UErrorCode status = U_ZERO_ERROR; 124 fNorm2.normalize(input, normalized, status); 125 126 text.handleReplaceBetween(start, limit, normalized); 127 128 int32_t delta = normalized.length() - length; 129 offsets.contextLimit += delta; 130 offsets.limit += delta; 131 offsets.start = limit + delta; 132 133 */ 134 UErrorCode errorCode = U_ZERO_ERROR; 135 UnicodeString segment; 136 UnicodeString normalized; 137 UChar32 c = text.char32At(start); 138 do { 139 int32_t prev = start; 140 // Skip at least one character so we make progress. 141 // c holds the character at start. 142 segment.remove(); 143 do { 144 segment.append(c); 145 start += U16_LENGTH(c); 146 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); 147 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { 148 // stop in incremental mode when we reach the input limit 149 // in case there are additional characters that could change the 150 // normalization result 151 start=prev; 152 break; 153 } 154 fNorm2.normalize(segment, normalized, errorCode); 155 if(U_FAILURE(errorCode)) { 156 break; 157 } 158 if(segment != normalized) { 159 // replace the input chunk with its normalized form 160 text.handleReplaceBetween(prev, start, normalized); 161 162 // update all necessary indexes accordingly 163 int32_t delta = normalized.length() - (start - prev); 164 start += delta; 165 limit += delta; 166 } 167 } while(start < limit); 168 169 offsets.start = start; 170 offsets.contextLimit += limit - offsets.limit; 171 offsets.limit = limit; 172 } 173 174 U_NAMESPACE_END 175 176 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 177