1 // 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // norms.cpp 5 // created: 2017jun04 Markus W. Scherer 6 // (pulled out of n2builder.cpp) 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_NORMALIZATION 11 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include "unicode/errorcode.h" 15 #include "unicode/unistr.h" 16 #include "unicode/utf16.h" 17 #include "normalizer2impl.h" 18 #include "norms.h" 19 #include "toolutil.h" 20 #include "utrie2.h" 21 #include "uvectr32.h" 22 23 U_NAMESPACE_BEGIN 24 25 void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) { 26 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 27 if(cc==0) { 28 fLastStarterIndex=fLength; 29 } 30 fArray[fLength++]=(c<<8)|cc; 31 return; 32 } 33 // Let this character bubble back to its canonical order. 34 int32_t i=fLength-1; 35 while(i>fLastStarterIndex && ccAt(i)>cc) { 36 --i; 37 } 38 ++i; // after the last starter or prevCC<=cc 39 // Move this and the following characters forward one to make space. 40 for(int32_t j=fLength; i<j; --j) { 41 fArray[j]=fArray[j-1]; 42 } 43 fArray[i]=(c<<8)|cc; 44 ++fLength; 45 fDidReorder=TRUE; 46 } 47 48 void BuilderReorderingBuffer::toString(UnicodeString &dest) const { 49 dest.remove(); 50 for(int32_t i=0; i<fLength; ++i) { 51 dest.append(charAt(i)); 52 } 53 } 54 55 UChar32 Norm::combine(UChar32 trail) const { 56 int32_t length; 57 const CompositionPair *pairs=getCompositionPairs(length); 58 for(int32_t i=0; i<length; ++i) { 59 if(trail==pairs[i].trail) { 60 return pairs[i].composite; 61 } 62 if(trail<pairs[i].trail) { 63 break; 64 } 65 } 66 return U_SENTINEL; 67 } 68 69 Norms::Norms(UErrorCode &errorCode) { 70 normTrie=utrie2_open(0, 0, &errorCode); 71 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 72 // Default "inert" Norm struct at index 0. Practically immutable. 73 norms=allocNorm(); 74 norms->type=Norm::INERT; 75 } 76 77 Norms::~Norms() { 78 utrie2_close(normTrie); 79 int32_t normsLength=utm_countItems(normMem); 80 for(int32_t i=1; i<normsLength; ++i) { 81 delete norms[i].mapping; 82 delete norms[i].rawMapping; 83 delete norms[i].compositions; 84 } 85 utm_close(normMem); 86 } 87 88 Norm *Norms::allocNorm() { 89 Norm *p=(Norm *)utm_alloc(normMem); 90 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 91 return p; 92 } 93 94 Norm *Norms::getNorm(UChar32 c) { 95 uint32_t i=utrie2_get32(normTrie, c); 96 if(i==0) { 97 return nullptr; 98 } 99 return norms+i; 100 } 101 102 const Norm *Norms::getNorm(UChar32 c) const { 103 uint32_t i=utrie2_get32(normTrie, c); 104 if(i==0) { 105 return nullptr; 106 } 107 return norms+i; 108 } 109 110 const Norm &Norms::getNormRef(UChar32 c) const { 111 return norms[utrie2_get32(normTrie, c)]; 112 } 113 114 Norm *Norms::createNorm(UChar32 c) { 115 uint32_t i=utrie2_get32(normTrie, c); 116 if(i!=0) { 117 return norms+i; 118 } else { 119 /* allocate Norm */ 120 Norm *p=allocNorm(); 121 IcuToolErrorCode errorCode("gennorm2/createNorm()"); 122 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 123 return p; 124 } 125 } 126 127 void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const { 128 int32_t length=mapping.length(); 129 U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK); 130 const char16_t *s=mapping.getBuffer(); 131 int32_t i=0; 132 UChar32 c; 133 while(i<length) { 134 U16_NEXT(s, i, length, c); 135 buffer.append(c, getCC(c)); 136 } 137 if(buffer.didReorder()) { 138 buffer.toString(mapping); 139 } 140 } 141 142 UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const { 143 if((highCC-lowCC)>=2) { 144 int32_t length; 145 const CompositionPair *pairs=norm.getCompositionPairs(length); 146 for(int32_t i=0; i<length; ++i) { 147 uint8_t trailCC=getCC(pairs[i].trail); 148 if(lowCC<trailCC && trailCC<highCC) { 149 return TRUE; 150 } 151 } 152 } 153 return FALSE; 154 } 155 156 U_CDECL_BEGIN 157 158 static UBool U_CALLCONV 159 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 160 return ((Norms::Enumerator *)context)->rangeHandler(start, end, value); 161 } 162 163 U_CDECL_END 164 165 void Norms::enumRanges(Enumerator &e) { 166 utrie2_enum(normTrie, nullptr, enumRangeHandler, &e); 167 } 168 169 Norms::Enumerator::~Enumerator() {} 170 171 UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 172 if(value!=0) { 173 rangeHandler(start, end, norms.getNormRefByIndex(value)); 174 } 175 return TRUE; 176 } 177 178 void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { 179 if(norm.mappingType!=Norm::ROUND_TRIP) { return; } 180 if(start!=end) { 181 fprintf(stderr, 182 "gennorm2 error: same round-trip mapping for " 183 "more than 1 code point U+%04lX..U+%04lX\n", 184 (long)start, (long)end); 185 exit(U_INVALID_FORMAT_ERROR); 186 } 187 if(norm.cc!=0) { 188 fprintf(stderr, 189 "gennorm2 error: " 190 "U+%04lX has a round-trip mapping and ccc!=0, " 191 "not possible in Unicode normalization\n", 192 (long)start); 193 exit(U_INVALID_FORMAT_ERROR); 194 } 195 // setRoundTripMapping() ensured that there are exactly two code points. 196 const UnicodeString &m=*norm.mapping; 197 UChar32 lead=m.char32At(0); 198 UChar32 trail=m.char32At(m.length()-1); 199 if(norms.getCC(lead)!=0) { 200 fprintf(stderr, 201 "gennorm2 error: " 202 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 203 "not possible in Unicode normalization\n", 204 (long)start, (long)lead); 205 exit(U_INVALID_FORMAT_ERROR); 206 } 207 // Flag for trailing character. 208 norms.createNorm(trail)->combinesBack=TRUE; 209 // Insert (trail, composite) pair into compositions list for the lead character. 210 IcuToolErrorCode errorCode("gennorm2/addComposition()"); 211 Norm *leadNorm=norms.createNorm(lead); 212 UVector32 *compositions=leadNorm->compositions; 213 int32_t i; 214 if(compositions==nullptr) { 215 compositions=leadNorm->compositions=new UVector32(errorCode); 216 i=0; // "insert" the first pair at index 0 217 } else { 218 // Insertion sort, and check for duplicate trail characters. 219 int32_t length; 220 const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 221 for(i=0; i<length; ++i) { 222 if(trail==pairs[i].trail) { 223 fprintf(stderr, 224 "gennorm2 error: same round-trip mapping for " 225 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 226 (long)start, (long)lead, (long)trail); 227 exit(U_INVALID_FORMAT_ERROR); 228 } 229 if(trail<pairs[i].trail) { 230 break; 231 } 232 } 233 } 234 compositions->insertElementAt(trail, 2*i, errorCode); 235 compositions->insertElementAt(start, 2*i+1, errorCode); 236 } 237 238 void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { 239 if(!norm.hasMapping()) { return; } 240 const UnicodeString &m=*norm.mapping; 241 UnicodeString *decomposed=nullptr; 242 const UChar *s=toUCharPtr(m.getBuffer()); 243 int32_t length=m.length(); 244 int32_t prev, i=0; 245 UChar32 c; 246 while(i<length) { 247 prev=i; 248 U16_NEXT(s, i, length, c); 249 if(start<=c && c<=end) { 250 fprintf(stderr, 251 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 252 (long)c); 253 exit(U_INVALID_FORMAT_ERROR); 254 } 255 const Norm &cNorm=norms.getNormRef(c); 256 if(cNorm.hasMapping()) { 257 if(norm.mappingType==Norm::ROUND_TRIP) { 258 if(prev==0) { 259 if(cNorm.mappingType!=Norm::ROUND_TRIP) { 260 fprintf(stderr, 261 "gennorm2 error: " 262 "U+%04lX's round-trip mapping's starter " 263 "U+%04lX one-way-decomposes, " 264 "not possible in Unicode normalization\n", 265 (long)start, (long)c); 266 exit(U_INVALID_FORMAT_ERROR); 267 } 268 uint8_t myTrailCC=norms.getCC(m.char32At(i)); 269 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 270 uint8_t cTrailCC=norms.getCC(cTrailChar); 271 if(cTrailCC>myTrailCC) { 272 fprintf(stderr, 273 "gennorm2 error: " 274 "U+%04lX's round-trip mapping's starter " 275 "U+%04lX decomposes and the " 276 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 277 "not possible in Unicode normalization\n", 278 (long)start, (long)c, 279 (short)cTrailCC, (short)myTrailCC); 280 exit(U_INVALID_FORMAT_ERROR); 281 } 282 } else { 283 fprintf(stderr, 284 "gennorm2 error: " 285 "U+%04lX's round-trip mapping's non-starter " 286 "U+%04lX decomposes, " 287 "not possible in Unicode normalization\n", 288 (long)start, (long)c); 289 exit(U_INVALID_FORMAT_ERROR); 290 } 291 } 292 if(decomposed==nullptr) { 293 decomposed=new UnicodeString(m, 0, prev); 294 } 295 decomposed->append(*cNorm.mapping); 296 } else if(Hangul::isHangul(c)) { 297 UChar buffer[3]; 298 int32_t hangulLength=Hangul::decompose(c, buffer); 299 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { 300 fprintf(stderr, 301 "gennorm2 error: " 302 "U+%04lX's round-trip mapping's non-starter " 303 "U+%04lX decomposes, " 304 "not possible in Unicode normalization\n", 305 (long)start, (long)c); 306 exit(U_INVALID_FORMAT_ERROR); 307 } 308 if(decomposed==nullptr) { 309 decomposed=new UnicodeString(m, 0, prev); 310 } 311 decomposed->append(buffer, hangulLength); 312 } else if(decomposed!=nullptr) { 313 decomposed->append(m, prev, i-prev); 314 } 315 } 316 if(decomposed!=nullptr) { 317 if(norm.rawMapping==nullptr) { 318 // Remember the original mapping when decomposing recursively. 319 norm.rawMapping=norm.mapping; 320 } else { 321 delete norm.mapping; 322 } 323 norm.mapping=decomposed; 324 // Not norm.setMappingCP(); because the original mapping 325 // is most likely to be encodable as a delta. 326 didDecompose|=TRUE; 327 } 328 } 329 330 U_NAMESPACE_END 331 332 #endif // #if !UCONFIG_NO_NORMALIZATION 333