1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2001-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 07/03/01 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/normalizer2.h" 18 #include "unicode/utf16.h" 19 #include "cstring.h" 20 #include "nortrans.h" 21 22 U_NAMESPACE_BEGIN 23 24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 25 26 static inline Transliterator::Token cstrToken(const char *s) { 27 return Transliterator::pointerToken((void *)s); 28 } 29 30 /** 31 * System registration hook. 32 */ 33 void NormalizationTransliterator::registerIDs() { 34 // In the Token, the byte after the NUL is the UNormalization2Mode. 35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 36 _create, cstrToken("nfc\0\0")); 37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 38 _create, cstrToken("nfkc\0\0")); 39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 40 _create, cstrToken("nfc\0\1")); 41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 42 _create, cstrToken("nfkc\0\1")); 43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), 44 _create, cstrToken("nfc\0\2")); 45 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), 46 _create, cstrToken("nfc\0\3")); 47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 48 UNICODE_STRING_SIMPLE("NFD"), TRUE); 49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 50 UNICODE_STRING_SIMPLE("NFKD"), TRUE); 51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), 52 UNICODE_STRING_SIMPLE("NFD"), FALSE); 53 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), 54 UNICODE_STRING_SIMPLE("FCD"), FALSE); 55 } 56 57 /** 58 * Factory methods 59 */ 60 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 61 Token context) { 62 const char *name = (const char *)context.pointer; 63 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; 64 UErrorCode errorCode = U_ZERO_ERROR; 65 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode); 66 if(U_SUCCESS(errorCode)) { 67 return new NormalizationTransliterator(ID, *norm2); 68 } else { 69 return NULL; 70 } 71 } 72 73 /** 74 * Constructs a transliterator. 75 */ 76 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, 77 const Normalizer2 &norm2) : 78 Transliterator(id, 0), fNorm2(norm2) {} 79 80 /** 81 * Destructor. 82 */ 83 NormalizationTransliterator::~NormalizationTransliterator() { 84 } 85 86 /** 87 * Copy constructor. 88 */ 89 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 90 Transliterator(o), fNorm2(o.fNorm2) {} 91 92 /** 93 * Transliterator API. 94 */ 95 Transliterator* NormalizationTransliterator::clone(void) const { 96 return new NormalizationTransliterator(*this); 97 } 98 99 /** 100 * Implements {@link Transliterator#handleTransliterate}. 101 */ 102 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 103 UBool isIncremental) const { 104 // start and limit of the input range 105 int32_t start = offsets.start; 106 int32_t limit = offsets.limit; 107 if(start >= limit) { 108 return; 109 } 110 111 /* 112 * Normalize as short chunks at a time as possible even in 113 * bulk mode, so that styled text is minimally disrupted. 114 * In incremental mode, a chunk that ends with offsets.limit 115 * must not be normalized. 116 * 117 * If it was known that the input text is not styled, then 118 * a bulk mode normalization could look like this: 119 120 UnicodeString input, normalized; 121 int32_t length = limit - start; 122 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 123 input.releaseBuffer(length); 124 125 UErrorCode status = U_ZERO_ERROR; 126 fNorm2.normalize(input, normalized, status); 127 128 text.handleReplaceBetween(start, limit, normalized); 129 130 int32_t delta = normalized.length() - length; 131 offsets.contextLimit += delta; 132 offsets.limit += delta; 133 offsets.start = limit + delta; 134 135 */ 136 UErrorCode errorCode = U_ZERO_ERROR; 137 UnicodeString segment; 138 UnicodeString normalized; 139 UChar32 c = text.char32At(start); 140 do { 141 int32_t prev = start; 142 // Skip at least one character so we make progress. 143 // c holds the character at start. 144 segment.remove(); 145 do { 146 segment.append(c); 147 start += U16_LENGTH(c); 148 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); 149 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { 150 // stop in incremental mode when we reach the input limit 151 // in case there are additional characters that could change the 152 // normalization result 153 start=prev; 154 break; 155 } 156 fNorm2.normalize(segment, normalized, errorCode); 157 if(U_FAILURE(errorCode)) { 158 break; 159 } 160 if(segment != normalized) { 161 // replace the input chunk with its normalized form 162 text.handleReplaceBetween(prev, start, normalized); 163 164 // update all necessary indexes accordingly 165 int32_t delta = normalized.length() - (start - prev); 166 start += delta; 167 limit += delta; 168 } 169 } while(start < limit); 170 171 offsets.start = start; 172 offsets.contextLimit += limit - offsets.limit; 173 offsets.limit = limit; 174 } 175 176 U_NAMESPACE_END 177 178 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 179