1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: n2builder.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov25 14 * created by: Markus W. Scherer 15 * 16 * Builds Normalizer2 data and writes a binary .nrm file. 17 * For the file format see source/common/normalizer2impl.h. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "unicode/std_string.h" // U_HAVE_STD_STRING, #include <string> 22 #include "n2builder.h" // UCONFIG_NO_NORMALIZATION=1 if !U_HAVE_STD_STRING 23 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #if U_HAVE_STD_STRING 28 #include <vector> 29 #endif 30 #include "unicode/errorcode.h" 31 #include "unicode/localpointer.h" 32 #include "unicode/putil.h" 33 #include "unicode/udata.h" 34 #include "unicode/uniset.h" 35 #include "unicode/unistr.h" 36 #include "unicode/ustring.h" 37 #include "hash.h" 38 #include "normalizer2impl.h" 39 #include "toolutil.h" 40 #include "unewdata.h" 41 #include "utrie2.h" 42 43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 44 45 #if !UCONFIG_NO_NORMALIZATION 46 47 /* UDataInfo cf. udata.h */ 48 static UDataInfo dataInfo={ 49 sizeof(UDataInfo), 50 0, 51 52 U_IS_BIG_ENDIAN, 53 U_CHARSET_FAMILY, 54 U_SIZEOF_UCHAR, 55 0, 56 57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 58 { 1, 0, 0, 0 }, /* formatVersion */ 59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 60 }; 61 62 U_NAMESPACE_BEGIN 63 64 class HangulIterator { 65 public: 66 struct Range { 67 UChar32 start, limit; 68 uint16_t norm16; 69 }; 70 71 HangulIterator() : rangeIndex(0) {} 72 const Range *nextRange() { 73 if(rangeIndex<LENGTHOF(ranges)) { 74 return ranges+rangeIndex++; 75 } else { 76 return NULL; 77 } 78 } 79 void reset() { rangeIndex=0; } 80 private: 81 static const Range ranges[4]; 82 int32_t rangeIndex; 83 }; 84 85 const HangulIterator::Range HangulIterator::ranges[4]={ 86 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 87 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 88 // JAMO_T_BASE+1: not U+11A7 89 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 90 { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 91 }; 92 93 struct CompositionPair { 94 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 95 UChar32 trail, composite; 96 }; 97 98 struct Norm { 99 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 100 101 UBool hasMapping() const { return mappingType>REMOVED; } 102 103 // Requires hasMapping() and well-formed mapping. 104 void setMappingCP() { 105 UChar32 c; 106 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 107 mappingCP=c; 108 } else { 109 mappingCP=U_SENTINEL; 110 } 111 } 112 113 UnicodeString *mapping; 114 UChar32 mappingCP; // >=0 if mapping to 1 code point 115 int32_t mappingPhase; 116 MappingType mappingType; 117 118 U_STD_NSQ vector<CompositionPair> *compositions; 119 uint8_t cc; 120 UBool combinesBack; 121 UBool hasNoCompBoundaryAfter; 122 123 enum OffsetType { 124 OFFSET_NONE, OFFSET_MAYBE_YES, 125 OFFSET_YES_YES, OFFSET_YES_NO, OFFSET_NO_NO, 126 OFFSET_DELTA 127 }; 128 enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 129 int32_t offset; 130 }; 131 132 class Normalizer2DBEnumerator { 133 public: 134 Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 135 virtual ~Normalizer2DBEnumerator() {} 136 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 137 Normalizer2DBEnumerator *ptr() { return this; } 138 protected: 139 Normalizer2DataBuilder &builder; 140 }; 141 142 U_CDECL_BEGIN 143 144 static UBool U_CALLCONV 145 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 146 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 147 } 148 149 U_CDECL_END 150 151 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 152 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) { 153 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 154 normTrie=utrie2_open(0, 0, &errorCode); 155 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 156 norms=allocNorm(); // unused Norm struct at index 0 157 memset(indexes, 0, sizeof(indexes)); 158 } 159 160 Normalizer2DataBuilder::~Normalizer2DataBuilder() { 161 utrie2_close(normTrie); 162 int32_t normsLength=utm_countItems(normMem); 163 for(int32_t i=1; i<normsLength; ++i) { 164 delete norms[i].mapping; 165 delete norms[i].compositions; 166 } 167 utm_close(normMem); 168 utrie2_close(norm16Trie); 169 } 170 171 void 172 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 173 u_versionFromString(unicodeVersion, v); 174 } 175 176 Norm *Normalizer2DataBuilder::allocNorm() { 177 Norm *p=(Norm *)utm_alloc(normMem); 178 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 179 return p; 180 } 181 182 /* get an existing Norm unit */ 183 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { 184 uint32_t i=utrie2_get32(normTrie, c); 185 if(i==0) { 186 return NULL; 187 } 188 return norms+i; 189 } 190 191 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 192 return norms[utrie2_get32(normTrie, c)]; 193 } 194 195 /* 196 * get or create a Norm unit; 197 * get or create the intermediate trie entries for it as well 198 */ 199 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { 200 uint32_t i=utrie2_get32(normTrie, c); 201 if(i!=0) { 202 return norms+i; 203 } else { 204 /* allocate Norm */ 205 Norm *p=allocNorm(); 206 IcuToolErrorCode errorCode("gennorm2/createNorm()"); 207 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 208 return p; 209 } 210 } 211 212 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 213 if(p!=NULL) { 214 if(p->mappingType!=Norm::NONE) { 215 if( overrideHandling==OVERRIDE_NONE || 216 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 217 ) { 218 fprintf(stderr, 219 "error in gennorm2 phase %d: " 220 "not permitted to override mapping for U+%04lX from phase %d\n", 221 (int)phase, (long)c, (int)p->mappingPhase); 222 exit(U_INVALID_FORMAT_ERROR); 223 } 224 delete p->mapping; 225 p->mapping=NULL; 226 } 227 p->mappingPhase=phase; 228 } 229 return p; 230 } 231 232 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 233 overrideHandling=oh; 234 ++phase; 235 } 236 237 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 238 createNorm(c)->cc=cc; 239 } 240 241 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { 242 return getNormRef(c).cc; 243 } 244 245 static UBool isWellFormed(const UnicodeString &s) { 246 UErrorCode errorCode=U_ZERO_ERROR; 247 u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 248 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 249 } 250 251 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 252 if(!isWellFormed(m)) { 253 fprintf(stderr, 254 "error in gennorm2 phase %d: " 255 "illegal one-way mapping from U+%04lX to malformed string\n", 256 (int)phase, (long)c); 257 exit(U_INVALID_FORMAT_ERROR); 258 } 259 Norm *p=checkNormForMapping(createNorm(c), c); 260 p->mapping=new UnicodeString(m); 261 p->mappingType=Norm::ONE_WAY; 262 p->setMappingCP(); 263 } 264 265 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 266 if(U_IS_SURROGATE(c)) { 267 fprintf(stderr, 268 "error in gennorm2 phase %d: " 269 "illegal round-trip mapping from surrogate code point U+%04lX\n", 270 (int)phase, (long)c); 271 exit(U_INVALID_FORMAT_ERROR); 272 } 273 if(!isWellFormed(m)) { 274 fprintf(stderr, 275 "error in gennorm2 phase %d: " 276 "illegal round-trip mapping from U+%04lX to malformed string\n", 277 (int)phase, (long)c); 278 exit(U_INVALID_FORMAT_ERROR); 279 } 280 int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 281 if(numCP!=2) { 282 fprintf(stderr, 283 "error in gennorm2 phase %d: " 284 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 285 (int)phase, (long)c, (int)numCP); 286 exit(U_INVALID_FORMAT_ERROR); 287 } 288 Norm *p=checkNormForMapping(createNorm(c), c); 289 p->mapping=new UnicodeString(m); 290 p->mappingType=Norm::ROUND_TRIP; 291 p->mappingCP=U_SENTINEL; 292 } 293 294 void Normalizer2DataBuilder::removeMapping(UChar32 c) { 295 Norm *p=checkNormForMapping(getNorm(c), c); 296 if(p!=NULL) { 297 p->mappingType=Norm::REMOVED; 298 } 299 } 300 301 class CompositionBuilder : public Normalizer2DBEnumerator { 302 public: 303 CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 304 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 305 builder.addComposition(start, end, value); 306 return TRUE; 307 } 308 }; 309 310 void 311 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 312 if(norms[value].mappingType==Norm::ROUND_TRIP) { 313 if(start!=end) { 314 fprintf(stderr, 315 "gennorm2 error: same round-trip mapping for " 316 "more than 1 code point U+%04lX..U+%04lX\n", 317 (long)start, (long)end); 318 exit(U_INVALID_FORMAT_ERROR); 319 } 320 if(norms[value].cc!=0) { 321 fprintf(stderr, 322 "gennorm2 error: " 323 "U+%04lX has a round-trip mapping and ccc!=0, " 324 "not possible in Unicode normalization\n", 325 (long)start); 326 exit(U_INVALID_FORMAT_ERROR); 327 } 328 // setRoundTripMapping() ensured that there are exactly two code points. 329 const UnicodeString &m=*norms[value].mapping; 330 UChar32 lead=m.char32At(0); 331 UChar32 trail=m.char32At(m.length()-1); 332 if(getCC(lead)!=0) { 333 fprintf(stderr, 334 "gennorm2 error: " 335 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 336 "not possible in Unicode normalization\n", 337 (long)start, (long)lead); 338 exit(U_INVALID_FORMAT_ERROR); 339 } 340 // Flag for trailing character. 341 createNorm(trail)->combinesBack=TRUE; 342 // Insert (trail, composite) pair into compositions list for the lead character. 343 CompositionPair pair(trail, start); 344 Norm *leadNorm=createNorm(lead); 345 U_STD_NSQ vector<CompositionPair> *compositions=leadNorm->compositions; 346 if(compositions==NULL) { 347 compositions=leadNorm->compositions=new U_STD_NSQ vector<CompositionPair>; 348 compositions->push_back(pair); 349 } else { 350 // Insertion sort, and check for duplicate trail characters. 351 U_STD_NSQ vector<CompositionPair>::iterator it; 352 for(it=compositions->begin(); it!=compositions->end(); ++it) { 353 if(trail==it->trail) { 354 fprintf(stderr, 355 "gennorm2 error: same round-trip mapping for " 356 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 357 (long)start, (long)lead, (long)trail); 358 exit(U_INVALID_FORMAT_ERROR); 359 } 360 if(trail<it->trail) { 361 break; 362 } 363 } 364 compositions->insert(it, pair); 365 } 366 } 367 } 368 369 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 370 uint8_t lowCC, uint8_t highCC) const { 371 const U_STD_NSQ vector<CompositionPair> *compositions=norm.compositions; 372 if(compositions!=NULL && (highCC-lowCC)>=2) { 373 U_STD_NSQ vector<CompositionPair>::const_iterator it; 374 for(it=compositions->begin(); it!=compositions->end(); ++it) { 375 uint8_t trailCC=getCC(it->trail); 376 if(lowCC<trailCC && trailCC<highCC) { 377 return TRUE; 378 } 379 } 380 } 381 return FALSE; 382 } 383 384 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 385 const U_STD_NSQ vector<CompositionPair> *compositions=norm.compositions; 386 if(compositions!=NULL) { 387 U_STD_NSQ vector<CompositionPair>::const_iterator it; 388 for(it=compositions->begin(); it!=compositions->end(); ++it) { 389 if(trail==it->trail) { 390 return it->composite; 391 } 392 if(trail<it->trail) { 393 break; 394 } 395 } 396 } 397 return U_SENTINEL; 398 } 399 400 class Decomposer : public Normalizer2DBEnumerator { 401 public: 402 Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 403 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 404 didDecompose|=builder.decompose(start, end, value); 405 return TRUE; 406 } 407 UBool didDecompose; 408 }; 409 410 UBool 411 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 412 if(norms[value].hasMapping()) { 413 const UnicodeString &m=*norms[value].mapping; 414 UnicodeString *decomposed=NULL; 415 const UChar *s=m.getBuffer(); 416 int32_t length=m.length(); 417 int32_t prev, i=0; 418 UChar32 c; 419 while(i<length) { 420 prev=i; 421 U16_NEXT(s, i, length, c); 422 if(start<=c && c<=end) { 423 fprintf(stderr, 424 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 425 (long)c); 426 exit(U_INVALID_FORMAT_ERROR); 427 } 428 const Norm &cNorm=getNormRef(c); 429 if(cNorm.hasMapping()) { 430 if(norms[value].mappingType==Norm::ROUND_TRIP) { 431 if(prev==0) { 432 if(cNorm.mappingType!=Norm::ROUND_TRIP) { 433 fprintf(stderr, 434 "gennorm2 error: " 435 "U+%04lX's round-trip mapping's starter " 436 "U+%04lX one-way-decomposes, " 437 "not possible in Unicode normalization\n", 438 (long)start, (long)c); 439 exit(U_INVALID_FORMAT_ERROR); 440 } 441 uint8_t myTrailCC=getCC(m.char32At(i)); 442 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 443 uint8_t cTrailCC=getCC(cTrailChar); 444 if(cTrailCC>myTrailCC) { 445 fprintf(stderr, 446 "gennorm2 error: " 447 "U+%04lX's round-trip mapping's starter " 448 "U+%04lX decomposes and the " 449 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 450 "not possible in Unicode normalization\n", 451 (long)start, (long)c, 452 (short)cTrailCC, (short)myTrailCC); 453 exit(U_INVALID_FORMAT_ERROR); 454 } 455 } else { 456 fprintf(stderr, 457 "gennorm2 error: " 458 "U+%04lX's round-trip mapping's non-starter " 459 "U+%04lX decomposes, " 460 "not possible in Unicode normalization\n", 461 (long)start, (long)c); 462 exit(U_INVALID_FORMAT_ERROR); 463 } 464 } 465 if(decomposed==NULL) { 466 decomposed=new UnicodeString(m, 0, prev); 467 } 468 decomposed->append(*cNorm.mapping); 469 } else if(Hangul::isHangul(c)) { 470 UChar buffer[3]; 471 int32_t hangulLength=Hangul::decompose(c, buffer); 472 if(norms[value].mappingType==Norm::ROUND_TRIP && prev!=0) { 473 fprintf(stderr, 474 "gennorm2 error: " 475 "U+%04lX's round-trip mapping's non-starter " 476 "U+%04lX decomposes, " 477 "not possible in Unicode normalization\n", 478 (long)start, (long)c); 479 exit(U_INVALID_FORMAT_ERROR); 480 } 481 if(decomposed==NULL) { 482 decomposed=new UnicodeString(m, 0, prev); 483 } 484 decomposed->append(buffer, hangulLength); 485 } else if(decomposed!=NULL) { 486 decomposed->append(m, prev, i-prev); 487 } 488 } 489 if(decomposed!=NULL) { 490 delete norms[value].mapping; 491 norms[value].mapping=decomposed; 492 // Not norms[value].setMappingCP(); because the original mapping 493 // is most likely to be encodable as a delta. 494 return TRUE; 495 } 496 } 497 return FALSE; 498 } 499 500 class BuilderReorderingBuffer { 501 public: 502 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} 503 void reset() { 504 fLength=0; 505 fLastStarterIndex=-1; 506 fDidReorder=FALSE; 507 } 508 int32_t length() const { return fLength; } 509 UBool isEmpty() const { return fLength==0; } 510 int32_t lastStarterIndex() const { return fLastStarterIndex; } 511 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 512 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 513 UBool didReorder() const { return fDidReorder; } 514 void append(UChar32 c, uint8_t cc) { 515 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 516 if(cc==0) { 517 fLastStarterIndex=fLength; 518 } 519 fArray[fLength++]=(c<<8)|cc; 520 return; 521 } 522 // Let this character bubble back to its canonical order. 523 int32_t i=fLength-1; 524 while(i>fLastStarterIndex && ccAt(i)>cc) { 525 --i; 526 } 527 ++i; // after the last starter or prevCC<=cc 528 // Move this and the following characters forward one to make space. 529 for(int32_t j=fLength; i<j; --j) { 530 fArray[j]=fArray[j-1]; 531 } 532 fArray[i]=(c<<8)|cc; 533 ++fLength; 534 fDidReorder=TRUE; 535 } 536 void toString(UnicodeString &dest) { 537 dest.remove(); 538 for(int32_t i=0; i<fLength; ++i) { 539 dest.append(charAt(i)); 540 } 541 } 542 void setComposite(UChar32 composite, int32_t combMarkIndex) { 543 fArray[fLastStarterIndex]=composite<<8; 544 // Remove the combining mark that contributed to the composite. 545 --fLength; 546 while(combMarkIndex<fLength) { 547 fArray[combMarkIndex]=fArray[combMarkIndex+1]; 548 ++combMarkIndex; 549 } 550 } 551 private: 552 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 553 int32_t fLength; 554 int32_t fLastStarterIndex; 555 UBool fDidReorder; 556 }; 557 558 void 559 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 560 UnicodeString &m=*p->mapping; 561 int32_t length=m.length(); 562 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 563 return; // writeMapping() will complain about it and print the code point. 564 } 565 const UChar *s=m.getBuffer(); 566 int32_t i=0; 567 UChar32 c; 568 while(i<length) { 569 U16_NEXT(s, i, length, c); 570 buffer.append(c, getCC(c)); 571 } 572 if(buffer.didReorder()) { 573 buffer.toString(m); 574 } 575 } 576 577 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 578 if(buffer.isEmpty()) { 579 return TRUE; // maps-to-empty string is no boundary of any kind 580 } 581 int32_t lastStarterIndex=buffer.lastStarterIndex(); 582 if(lastStarterIndex<0) { 583 return TRUE; // no starter 584 } 585 UChar32 starter=buffer.charAt(lastStarterIndex); 586 if( Hangul::isJamoL(starter) || 587 (Hangul::isJamoV(starter) && 588 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 589 ) { 590 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 591 // otherwise it is blocked. 592 return lastStarterIndex==buffer.length()-1; 593 } 594 // no Hangul in fully decomposed mapping 595 const Norm *starterNorm=&getNormRef(starter); 596 if(starterNorm->compositions==NULL) { 597 return FALSE; // the last starter does not combine forward 598 } 599 // Compose as far as possible, and see if further compositions are possible. 600 uint8_t prevCC=0; 601 for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 602 uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 603 if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 604 return TRUE; 605 } 606 if( prevCC<cc && 607 (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 608 ) { 609 buffer.setComposite(starter, combMarkIndex); 610 starterNorm=&getNormRef(starter); 611 if(starterNorm->compositions==NULL) { 612 return FALSE; // the composite does not combine further 613 } 614 } else { 615 prevCC=cc; 616 ++combMarkIndex; 617 } 618 } 619 // TRUE if the final, forward-combining starter is at the end. 620 return prevCC==0; 621 } 622 623 // Requires p->hasMapping(). 624 void Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { 625 UnicodeString &m=*p->mapping; 626 int32_t length=m.length(); 627 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 628 fprintf(stderr, 629 "gennorm2 error: " 630 "mapping for U+%04lX longer than maximum of %d\n", 631 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 632 exit(U_INVALID_FORMAT_ERROR); 633 } 634 int32_t leadCC, trailCC; 635 if(length==0) { 636 leadCC=trailCC=0; 637 } else { 638 leadCC=getCC(m.char32At(0)); 639 trailCC=getCC(m.char32At(length-1)); 640 } 641 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { 642 fprintf(stderr, 643 "gennorm2 error: " 644 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", 645 (long)c); 646 exit(U_INVALID_FORMAT_ERROR); 647 } 648 int32_t firstUnit=length|(trailCC<<8); 649 int32_t secondUnit=p->cc|(leadCC<<8); 650 if(secondUnit!=0) { 651 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 652 } 653 if(p->compositions!=NULL) { 654 firstUnit|=Normalizer2Impl::MAPPING_PLUS_COMPOSITION_LIST; 655 } 656 if(p->hasNoCompBoundaryAfter) { 657 firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; 658 } 659 dataString.append((UChar)firstUnit); 660 if(secondUnit!=0) { 661 dataString.append((UChar)secondUnit); 662 } 663 dataString.append(m); 664 } 665 666 // Requires p->compositions!=NULL. 667 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { 668 if(p->cc!=0) { 669 fprintf(stderr, 670 "gennorm2 error: " 671 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 672 (long)c); 673 exit(U_INVALID_FORMAT_ERROR); 674 } 675 int32_t length=p->compositions->size(); 676 for(int32_t i=0; i<length; ++i) { 677 CompositionPair &pair=p->compositions->at(i); 678 // 22 bits for the composite character and whether it combines forward. 679 UChar32 compositeAndFwd=pair.composite<<1; 680 if(getNormRef(pair.composite).compositions!=NULL) { 681 compositeAndFwd|=1; // The composite character also combines-forward. 682 } 683 // Encode most pairs in two units and some in three. 684 int32_t firstUnit, secondUnit, thirdUnit; 685 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 686 if(compositeAndFwd<=0xffff) { 687 firstUnit=pair.trail<<1; 688 secondUnit=compositeAndFwd; 689 thirdUnit=-1; 690 } else { 691 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 692 secondUnit=compositeAndFwd>>16; 693 thirdUnit=compositeAndFwd; 694 } 695 } else { 696 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 697 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 698 Normalizer2Impl::COMP_1_TRIPLE; 699 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 700 (compositeAndFwd>>16); 701 thirdUnit=compositeAndFwd; 702 } 703 // Set the high bit of the first unit if this is the last composition pair. 704 if(i==(length-1)) { 705 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 706 } 707 dataString.append((UChar)firstUnit).append((UChar)secondUnit); 708 if(thirdUnit>=0) { 709 dataString.append((UChar)thirdUnit); 710 } 711 } 712 } 713 714 class ExtraDataWriter : public Normalizer2DBEnumerator { 715 public: 716 ExtraDataWriter(Normalizer2DataBuilder &b) : 717 Normalizer2DBEnumerator(b), 718 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions 719 yesNoData(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data 720 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 721 if(value!=0) { 722 if(start!=end) { 723 fprintf(stderr, 724 "gennorm2 error: unexpected shared data for " 725 "multiple code points U+%04lX..U+%04lX\n", 726 (long)start, (long)end); 727 exit(U_INTERNAL_PROGRAM_ERROR); 728 } 729 builder.writeExtraData(start, value, *this); 730 } 731 return TRUE; 732 } 733 UnicodeString maybeYesCompositions; 734 UnicodeString yesYesCompositions; 735 UnicodeString yesNoData; 736 UnicodeString noNoMappings; 737 Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. 738 }; 739 740 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { 741 Norm *p=norms+value; 742 if(p->combinesBack) { 743 if(p->hasMapping()) { 744 fprintf(stderr, 745 "gennorm2 error: " 746 "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", 747 (long)c); 748 exit(U_INVALID_FORMAT_ERROR); 749 } 750 if(p->compositions!=NULL) { 751 p->offset= 752 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| 753 Norm::OFFSET_MAYBE_YES; 754 writeCompositions(c, p, writer.maybeYesCompositions); 755 } 756 } else if(!p->hasMapping()) { 757 if(p->compositions!=NULL) { 758 p->offset= 759 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| 760 Norm::OFFSET_YES_YES; 761 writeCompositions(c, p, writer.yesYesCompositions); 762 } 763 } else if(p->mappingType==Norm::ROUND_TRIP) { 764 p->offset= 765 (writer.yesNoData.length()<<Norm::OFFSET_SHIFT)| 766 Norm::OFFSET_YES_NO; 767 writeMapping(c, p, writer.yesNoData); 768 if(p->compositions!=NULL) { 769 writeCompositions(c, p, writer.yesNoData); 770 } 771 } else /* one-way */ { 772 if(p->compositions!=NULL) { 773 fprintf(stderr, 774 "gennorm2 error: " 775 "U+%04lX combines-forward and has a one-way mapping, " 776 "not possible in Unicode normalization\n", 777 (long)c); 778 exit(U_INVALID_FORMAT_ERROR); 779 } 780 if(p->cc==0 && optimization!=OPTIMIZE_FAST) { 781 // Try a compact, algorithmic encoding. 782 // Only for ccc=0, because we can't store additional information. 783 if(p->mappingCP>=0) { 784 int32_t delta=p->mappingCP-c; 785 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 786 p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; 787 } 788 } 789 } 790 if(p->offset==0) { 791 int32_t oldNoNoLength=writer.noNoMappings.length(); 792 writeMapping(c, p, writer.noNoMappings); 793 UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); 794 int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); 795 if(previousOffset!=0) { 796 // Duplicate, remove the new units and point to the old ones. 797 writer.noNoMappings.truncate(oldNoNoLength); 798 p->offset= 799 ((previousOffset-1)<<Norm::OFFSET_SHIFT)| 800 Norm::OFFSET_NO_NO; 801 } else { 802 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found". 803 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); 804 writer.previousNoNoMappings.puti(newMapping, oldNoNoLength+1, errorCode); 805 p->offset= 806 (oldNoNoLength<<Norm::OFFSET_SHIFT)| 807 Norm::OFFSET_NO_NO; 808 } 809 } 810 } 811 } 812 813 class Norm16Writer : public Normalizer2DBEnumerator { 814 public: 815 Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 816 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 817 builder.writeNorm16(start, end, value); 818 return TRUE; 819 } 820 }; 821 822 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) { 823 if(value!=0) { 824 const Norm *p=norms+value; 825 int32_t offset=p->offset>>Norm::OFFSET_SHIFT; 826 int32_t norm16=0; 827 UBool isDecompNo=FALSE; 828 UBool isCompNoMaybe=FALSE; 829 switch(p->offset&Norm::OFFSET_MASK) { 830 case Norm::OFFSET_NONE: 831 // No mapping, no compositions list. 832 if(p->combinesBack) { 833 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; 834 isDecompNo=(UBool)(p->cc!=0); 835 isCompNoMaybe=TRUE; 836 } else if(p->cc!=0) { 837 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; 838 isDecompNo=isCompNoMaybe=TRUE; 839 } 840 break; 841 case Norm::OFFSET_MAYBE_YES: 842 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; 843 isCompNoMaybe=TRUE; 844 break; 845 case Norm::OFFSET_YES_YES: 846 norm16=offset; 847 break; 848 case Norm::OFFSET_YES_NO: 849 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; 850 isDecompNo=TRUE; 851 break; 852 case Norm::OFFSET_NO_NO: 853 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; 854 isDecompNo=isCompNoMaybe=TRUE; 855 break; 856 case Norm::OFFSET_DELTA: 857 norm16=getCenterNoNoDelta()+offset; 858 isDecompNo=isCompNoMaybe=TRUE; 859 break; 860 default: // Should not occur. 861 exit(U_INTERNAL_PROGRAM_ERROR); 862 } 863 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 864 utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); 865 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 866 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 867 } 868 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 869 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 870 } 871 } 872 } 873 874 void Normalizer2DataBuilder::setHangulData() { 875 HangulIterator hi; 876 const HangulIterator::Range *range; 877 // Check that none of the Hangul/Jamo code points have data. 878 while((range=hi.nextRange())!=NULL) { 879 for(UChar32 c=range->start; c<range->limit; ++c) { 880 if(utrie2_get32(norm16Trie, c)!=0) { 881 fprintf(stderr, 882 "gennorm2 error: " 883 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 884 (long)c); 885 exit(U_INVALID_FORMAT_ERROR); 886 } 887 } 888 } 889 // Set data for algorithmic runtime handling. 890 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 891 hi.reset(); 892 while((range=hi.nextRange())!=NULL) { 893 uint16_t norm16=range->norm16; 894 if(norm16==0) { 895 norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo 896 if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 897 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; 898 } 899 } else { 900 if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes 901 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; 902 } 903 } 904 utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); 905 errorCode.assertSuccess(); 906 } 907 } 908 909 U_CDECL_BEGIN 910 911 static UBool U_CALLCONV 912 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 913 uint32_t *pMaxValue=(uint32_t *)context; 914 if(value>*pMaxValue) { 915 *pMaxValue=value; 916 } 917 return TRUE; 918 } 919 920 U_CDECL_END 921 922 void Normalizer2DataBuilder::processData() { 923 IcuToolErrorCode errorCode("gennorm2/processData()"); 924 norm16Trie=utrie2_open(0, 0, errorCode); 925 errorCode.assertSuccess(); 926 927 utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); 928 929 Decomposer decomposer(*this); 930 do { 931 decomposer.didDecompose=FALSE; 932 utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); 933 } while(decomposer.didDecompose); 934 935 BuilderReorderingBuffer buffer; 936 int32_t normsLength=utm_countItems(normMem); 937 for(int32_t i=1; i<normsLength; ++i) { 938 if(norms[i].hasMapping()) { 939 buffer.reset(); 940 reorder(norms+i, buffer); 941 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 942 } 943 } 944 945 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 946 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 947 948 ExtraDataWriter extraDataWriter(*this); 949 utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 950 951 extraData=extraDataWriter.maybeYesCompositions; 952 extraData.append(extraDataWriter.yesYesCompositions). 953 append(extraDataWriter.yesNoData). 954 append(extraDataWriter.noNoMappings); 955 // Pad to even length for 4-byte alignment of following data. 956 if(extraData.length()&1) { 957 extraData.append((UChar)0); 958 } 959 960 indexes[Normalizer2Impl::IX_MIN_YES_NO]= 961 extraDataWriter.yesYesCompositions.length(); 962 indexes[Normalizer2Impl::IX_MIN_NO_NO]= 963 indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 964 extraDataWriter.yesNoData.length(); 965 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 966 indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 967 extraDataWriter.noNoMappings.length(); 968 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 969 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 970 extraDataWriter.maybeYesCompositions.length(); 971 972 int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 973 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 974 fprintf(stderr, 975 "gennorm2 error: " 976 "data structure overflow, too much mapping composition data\n"); 977 exit(U_BUFFER_OVERFLOW_ERROR); 978 } 979 980 utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); 981 982 setHangulData(); 983 984 // Look for the "worst" norm16 value of any supplementary code point 985 // corresponding to a lead surrogate, and set it as that surrogate's value. 986 // Enables quick check inner loops to look at only code units. 987 // 988 // We could be more sophisticated: 989 // We could collect a bit set for whether there are values in the different 990 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 991 // and select the best value that only breaks the composition and/or decomposition 992 // inner loops if necessary. 993 // However, that seems like overkill for an optimization for supplementary characters. 994 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 995 uint32_t maxValue=utrie2_get32(norm16Trie, lead); 996 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 997 if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 998 maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 999 ) { 1000 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 1001 // Otherwise it might end up at something like JAMO_VT which stays in 1002 // the inner decomposition quick check loop. 1003 maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 1004 } 1005 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 1006 } 1007 1008 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 1009 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 1010 // which is harmless. 1011 // As a result, the minimum code points are always BMP code points. 1012 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 1013 if(minCP>=0x10000) { 1014 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 1015 } 1016 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 1017 if(minCP>=0x10000) { 1018 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 1019 } 1020 } 1021 1022 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1023 processData(); 1024 1025 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1026 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1027 int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 1028 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 1029 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 1030 errorCode.errorName()); 1031 exit(errorCode.reset()); 1032 } 1033 errorCode.reset(); 1034 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1035 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1036 errorCode.assertSuccess(); 1037 1038 int32_t offset=(int32_t)sizeof(indexes); 1039 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 1040 offset+=norm16TrieLength; 1041 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 1042 int32_t totalSize=offset+=extraData.length()*2; 1043 for(int32_t i=Normalizer2Impl::IX_RESERVED2_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 1044 indexes[i]=totalSize; 1045 } 1046 1047 if(beVerbose) { 1048 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 1049 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 1050 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 1051 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 1052 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 1053 printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 1054 printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 1055 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 1056 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 1057 } 1058 1059 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1060 UNewDataMemory *pData= 1061 udata_create(NULL, NULL, filename, &dataInfo, 1062 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 1063 if(errorCode.isFailure()) { 1064 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 1065 filename, errorCode.errorName()); 1066 exit(errorCode.reset()); 1067 } 1068 udata_writeBlock(pData, indexes, sizeof(indexes)); 1069 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 1070 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 1071 1072 int32_t writtenSize=udata_finish(pData, errorCode); 1073 if(errorCode.isFailure()) { 1074 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 1075 exit(errorCode.reset()); 1076 } 1077 if(writtenSize!=totalSize) { 1078 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 1079 (long)writtenSize, (long)totalSize); 1080 exit(U_INTERNAL_PROGRAM_ERROR); 1081 } 1082 } 1083 1084 U_NAMESPACE_END 1085 1086 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1087 1088 /* 1089 * Hey, Emacs, please set the following: 1090 * 1091 * Local Variables: 1092 * indent-tabs-mode: nil 1093 * End: 1094 */ 1095