1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: n2builder.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov25 14 * created by: Markus W. Scherer 15 * 16 * Builds Normalizer2 data and writes a binary .nrm file. 17 * For the file format see source/common/normalizer2impl.h. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "n2builder.h" 22 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <string.h> 26 #if U_HAVE_STD_STRING 27 #include <vector> 28 #endif 29 #include "unicode/errorcode.h" 30 #include "unicode/localpointer.h" 31 #include "unicode/putil.h" 32 #include "unicode/udata.h" 33 #include "unicode/uniset.h" 34 #include "unicode/unistr.h" 35 #include "unicode/ustring.h" 36 #include "hash.h" 37 #include "normalizer2impl.h" 38 #include "toolutil.h" 39 #include "unewdata.h" 40 #include "utrie2.h" 41 #include "uvectr32.h" 42 43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 44 45 #if !UCONFIG_NO_NORMALIZATION 46 47 /* UDataInfo cf. udata.h */ 48 static UDataInfo dataInfo={ 49 sizeof(UDataInfo), 50 0, 51 52 U_IS_BIG_ENDIAN, 53 U_CHARSET_FAMILY, 54 U_SIZEOF_UCHAR, 55 0, 56 57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 58 { 1, 0, 0, 0 }, /* formatVersion */ 59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 60 }; 61 62 U_NAMESPACE_BEGIN 63 64 class HangulIterator { 65 public: 66 struct Range { 67 UChar32 start, limit; 68 uint16_t norm16; 69 }; 70 71 HangulIterator() : rangeIndex(0) {} 72 const Range *nextRange() { 73 if(rangeIndex<LENGTHOF(ranges)) { 74 return ranges+rangeIndex++; 75 } else { 76 return NULL; 77 } 78 } 79 void reset() { rangeIndex=0; } 80 private: 81 static const Range ranges[4]; 82 int32_t rangeIndex; 83 }; 84 85 const HangulIterator::Range HangulIterator::ranges[4]={ 86 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 87 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 88 // JAMO_T_BASE+1: not U+11A7 89 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 90 { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 91 }; 92 93 struct CompositionPair { 94 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 95 UChar32 trail, composite; 96 }; 97 98 struct Norm { 99 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 100 101 UBool hasMapping() const { return mappingType>REMOVED; } 102 103 // Requires hasMapping() and well-formed mapping. 104 void setMappingCP() { 105 UChar32 c; 106 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 107 mappingCP=c; 108 } else { 109 mappingCP=U_SENTINEL; 110 } 111 } 112 113 const CompositionPair *getCompositionPairs(int32_t &length) const { 114 if(compositions==NULL) { 115 length=0; 116 return NULL; 117 } else { 118 length=compositions->size()/2; 119 return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 120 } 121 } 122 123 UnicodeString *mapping; 124 UChar32 mappingCP; // >=0 if mapping to 1 code point 125 int32_t mappingPhase; 126 MappingType mappingType; 127 128 UVector32 *compositions; // (trail, composite) pairs 129 uint8_t cc; 130 UBool combinesBack; 131 UBool hasNoCompBoundaryAfter; 132 133 enum OffsetType { 134 OFFSET_NONE, OFFSET_MAYBE_YES, 135 OFFSET_YES_YES, OFFSET_YES_NO, OFFSET_NO_NO, 136 OFFSET_DELTA 137 }; 138 enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 139 int32_t offset; 140 }; 141 142 class Normalizer2DBEnumerator { 143 public: 144 Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 145 virtual ~Normalizer2DBEnumerator() {} 146 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 147 Normalizer2DBEnumerator *ptr() { return this; } 148 protected: 149 Normalizer2DataBuilder &builder; 150 }; 151 152 U_CDECL_BEGIN 153 154 static UBool U_CALLCONV 155 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 156 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 157 } 158 159 U_CDECL_END 160 161 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 162 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) { 163 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 164 normTrie=utrie2_open(0, 0, &errorCode); 165 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 166 norms=allocNorm(); // unused Norm struct at index 0 167 memset(indexes, 0, sizeof(indexes)); 168 } 169 170 Normalizer2DataBuilder::~Normalizer2DataBuilder() { 171 utrie2_close(normTrie); 172 int32_t normsLength=utm_countItems(normMem); 173 for(int32_t i=1; i<normsLength; ++i) { 174 delete norms[i].mapping; 175 delete norms[i].compositions; 176 } 177 utm_close(normMem); 178 utrie2_close(norm16Trie); 179 } 180 181 void 182 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 183 u_versionFromString(unicodeVersion, v); 184 } 185 186 Norm *Normalizer2DataBuilder::allocNorm() { 187 Norm *p=(Norm *)utm_alloc(normMem); 188 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 189 return p; 190 } 191 192 /* get an existing Norm unit */ 193 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { 194 uint32_t i=utrie2_get32(normTrie, c); 195 if(i==0) { 196 return NULL; 197 } 198 return norms+i; 199 } 200 201 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 202 return norms[utrie2_get32(normTrie, c)]; 203 } 204 205 /* 206 * get or create a Norm unit; 207 * get or create the intermediate trie entries for it as well 208 */ 209 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { 210 uint32_t i=utrie2_get32(normTrie, c); 211 if(i!=0) { 212 return norms+i; 213 } else { 214 /* allocate Norm */ 215 Norm *p=allocNorm(); 216 IcuToolErrorCode errorCode("gennorm2/createNorm()"); 217 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 218 return p; 219 } 220 } 221 222 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 223 if(p!=NULL) { 224 if(p->mappingType!=Norm::NONE) { 225 if( overrideHandling==OVERRIDE_NONE || 226 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 227 ) { 228 fprintf(stderr, 229 "error in gennorm2 phase %d: " 230 "not permitted to override mapping for U+%04lX from phase %d\n", 231 (int)phase, (long)c, (int)p->mappingPhase); 232 exit(U_INVALID_FORMAT_ERROR); 233 } 234 delete p->mapping; 235 p->mapping=NULL; 236 } 237 p->mappingPhase=phase; 238 } 239 return p; 240 } 241 242 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 243 overrideHandling=oh; 244 ++phase; 245 } 246 247 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 248 createNorm(c)->cc=cc; 249 } 250 251 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { 252 return getNormRef(c).cc; 253 } 254 255 static UBool isWellFormed(const UnicodeString &s) { 256 UErrorCode errorCode=U_ZERO_ERROR; 257 u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 258 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 259 } 260 261 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 262 if(!isWellFormed(m)) { 263 fprintf(stderr, 264 "error in gennorm2 phase %d: " 265 "illegal one-way mapping from U+%04lX to malformed string\n", 266 (int)phase, (long)c); 267 exit(U_INVALID_FORMAT_ERROR); 268 } 269 Norm *p=checkNormForMapping(createNorm(c), c); 270 p->mapping=new UnicodeString(m); 271 p->mappingType=Norm::ONE_WAY; 272 p->setMappingCP(); 273 } 274 275 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 276 if(U_IS_SURROGATE(c)) { 277 fprintf(stderr, 278 "error in gennorm2 phase %d: " 279 "illegal round-trip mapping from surrogate code point U+%04lX\n", 280 (int)phase, (long)c); 281 exit(U_INVALID_FORMAT_ERROR); 282 } 283 if(!isWellFormed(m)) { 284 fprintf(stderr, 285 "error in gennorm2 phase %d: " 286 "illegal round-trip mapping from U+%04lX to malformed string\n", 287 (int)phase, (long)c); 288 exit(U_INVALID_FORMAT_ERROR); 289 } 290 int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 291 if(numCP!=2) { 292 fprintf(stderr, 293 "error in gennorm2 phase %d: " 294 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 295 (int)phase, (long)c, (int)numCP); 296 exit(U_INVALID_FORMAT_ERROR); 297 } 298 Norm *p=checkNormForMapping(createNorm(c), c); 299 p->mapping=new UnicodeString(m); 300 p->mappingType=Norm::ROUND_TRIP; 301 p->mappingCP=U_SENTINEL; 302 } 303 304 void Normalizer2DataBuilder::removeMapping(UChar32 c) { 305 Norm *p=checkNormForMapping(getNorm(c), c); 306 if(p!=NULL) { 307 p->mappingType=Norm::REMOVED; 308 } 309 } 310 311 class CompositionBuilder : public Normalizer2DBEnumerator { 312 public: 313 CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 314 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 315 builder.addComposition(start, end, value); 316 return TRUE; 317 } 318 }; 319 320 void 321 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 322 if(norms[value].mappingType==Norm::ROUND_TRIP) { 323 if(start!=end) { 324 fprintf(stderr, 325 "gennorm2 error: same round-trip mapping for " 326 "more than 1 code point U+%04lX..U+%04lX\n", 327 (long)start, (long)end); 328 exit(U_INVALID_FORMAT_ERROR); 329 } 330 if(norms[value].cc!=0) { 331 fprintf(stderr, 332 "gennorm2 error: " 333 "U+%04lX has a round-trip mapping and ccc!=0, " 334 "not possible in Unicode normalization\n", 335 (long)start); 336 exit(U_INVALID_FORMAT_ERROR); 337 } 338 // setRoundTripMapping() ensured that there are exactly two code points. 339 const UnicodeString &m=*norms[value].mapping; 340 UChar32 lead=m.char32At(0); 341 UChar32 trail=m.char32At(m.length()-1); 342 if(getCC(lead)!=0) { 343 fprintf(stderr, 344 "gennorm2 error: " 345 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 346 "not possible in Unicode normalization\n", 347 (long)start, (long)lead); 348 exit(U_INVALID_FORMAT_ERROR); 349 } 350 // Flag for trailing character. 351 createNorm(trail)->combinesBack=TRUE; 352 // Insert (trail, composite) pair into compositions list for the lead character. 353 IcuToolErrorCode errorCode("gennorm2/addComposition()"); 354 Norm *leadNorm=createNorm(lead); 355 UVector32 *compositions=leadNorm->compositions; 356 int32_t i; 357 if(compositions==NULL) { 358 compositions=leadNorm->compositions=new UVector32(errorCode); 359 i=0; // "insert" the first pair at index 0 360 } else { 361 // Insertion sort, and check for duplicate trail characters. 362 int32_t length; 363 const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 364 for(i=0; i<length; ++i) { 365 if(trail==pairs[i].trail) { 366 fprintf(stderr, 367 "gennorm2 error: same round-trip mapping for " 368 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 369 (long)start, (long)lead, (long)trail); 370 exit(U_INVALID_FORMAT_ERROR); 371 } 372 if(trail<pairs[i].trail) { 373 break; 374 } 375 } 376 } 377 compositions->insertElementAt(trail, 2*i, errorCode); 378 compositions->insertElementAt(start, 2*i+1, errorCode); 379 } 380 } 381 382 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 383 uint8_t lowCC, uint8_t highCC) const { 384 if((highCC-lowCC)>=2) { 385 int32_t length; 386 const CompositionPair *pairs=norm.getCompositionPairs(length); 387 for(int32_t i=0; i<length; ++i) { 388 uint8_t trailCC=getCC(pairs[i].trail); 389 if(lowCC<trailCC && trailCC<highCC) { 390 return TRUE; 391 } 392 } 393 } 394 return FALSE; 395 } 396 397 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 398 int32_t length; 399 const CompositionPair *pairs=norm.getCompositionPairs(length); 400 for(int32_t i=0; i<length; ++i) { 401 if(trail==pairs[i].trail) { 402 return pairs[i].composite; 403 } 404 if(trail<pairs[i].trail) { 405 break; 406 } 407 } 408 return U_SENTINEL; 409 } 410 411 class Decomposer : public Normalizer2DBEnumerator { 412 public: 413 Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 414 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 415 didDecompose|=builder.decompose(start, end, value); 416 return TRUE; 417 } 418 UBool didDecompose; 419 }; 420 421 UBool 422 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 423 if(norms[value].hasMapping()) { 424 const UnicodeString &m=*norms[value].mapping; 425 UnicodeString *decomposed=NULL; 426 const UChar *s=m.getBuffer(); 427 int32_t length=m.length(); 428 int32_t prev, i=0; 429 UChar32 c; 430 while(i<length) { 431 prev=i; 432 U16_NEXT(s, i, length, c); 433 if(start<=c && c<=end) { 434 fprintf(stderr, 435 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 436 (long)c); 437 exit(U_INVALID_FORMAT_ERROR); 438 } 439 const Norm &cNorm=getNormRef(c); 440 if(cNorm.hasMapping()) { 441 if(norms[value].mappingType==Norm::ROUND_TRIP) { 442 if(prev==0) { 443 if(cNorm.mappingType!=Norm::ROUND_TRIP) { 444 fprintf(stderr, 445 "gennorm2 error: " 446 "U+%04lX's round-trip mapping's starter " 447 "U+%04lX one-way-decomposes, " 448 "not possible in Unicode normalization\n", 449 (long)start, (long)c); 450 exit(U_INVALID_FORMAT_ERROR); 451 } 452 uint8_t myTrailCC=getCC(m.char32At(i)); 453 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 454 uint8_t cTrailCC=getCC(cTrailChar); 455 if(cTrailCC>myTrailCC) { 456 fprintf(stderr, 457 "gennorm2 error: " 458 "U+%04lX's round-trip mapping's starter " 459 "U+%04lX decomposes and the " 460 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 461 "not possible in Unicode normalization\n", 462 (long)start, (long)c, 463 (short)cTrailCC, (short)myTrailCC); 464 exit(U_INVALID_FORMAT_ERROR); 465 } 466 } else { 467 fprintf(stderr, 468 "gennorm2 error: " 469 "U+%04lX's round-trip mapping's non-starter " 470 "U+%04lX decomposes, " 471 "not possible in Unicode normalization\n", 472 (long)start, (long)c); 473 exit(U_INVALID_FORMAT_ERROR); 474 } 475 } 476 if(decomposed==NULL) { 477 decomposed=new UnicodeString(m, 0, prev); 478 } 479 decomposed->append(*cNorm.mapping); 480 } else if(Hangul::isHangul(c)) { 481 UChar buffer[3]; 482 int32_t hangulLength=Hangul::decompose(c, buffer); 483 if(norms[value].mappingType==Norm::ROUND_TRIP && prev!=0) { 484 fprintf(stderr, 485 "gennorm2 error: " 486 "U+%04lX's round-trip mapping's non-starter " 487 "U+%04lX decomposes, " 488 "not possible in Unicode normalization\n", 489 (long)start, (long)c); 490 exit(U_INVALID_FORMAT_ERROR); 491 } 492 if(decomposed==NULL) { 493 decomposed=new UnicodeString(m, 0, prev); 494 } 495 decomposed->append(buffer, hangulLength); 496 } else if(decomposed!=NULL) { 497 decomposed->append(m, prev, i-prev); 498 } 499 } 500 if(decomposed!=NULL) { 501 delete norms[value].mapping; 502 norms[value].mapping=decomposed; 503 // Not norms[value].setMappingCP(); because the original mapping 504 // is most likely to be encodable as a delta. 505 return TRUE; 506 } 507 } 508 return FALSE; 509 } 510 511 class BuilderReorderingBuffer { 512 public: 513 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} 514 void reset() { 515 fLength=0; 516 fLastStarterIndex=-1; 517 fDidReorder=FALSE; 518 } 519 int32_t length() const { return fLength; } 520 UBool isEmpty() const { return fLength==0; } 521 int32_t lastStarterIndex() const { return fLastStarterIndex; } 522 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 523 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 524 UBool didReorder() const { return fDidReorder; } 525 void append(UChar32 c, uint8_t cc) { 526 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 527 if(cc==0) { 528 fLastStarterIndex=fLength; 529 } 530 fArray[fLength++]=(c<<8)|cc; 531 return; 532 } 533 // Let this character bubble back to its canonical order. 534 int32_t i=fLength-1; 535 while(i>fLastStarterIndex && ccAt(i)>cc) { 536 --i; 537 } 538 ++i; // after the last starter or prevCC<=cc 539 // Move this and the following characters forward one to make space. 540 for(int32_t j=fLength; i<j; --j) { 541 fArray[j]=fArray[j-1]; 542 } 543 fArray[i]=(c<<8)|cc; 544 ++fLength; 545 fDidReorder=TRUE; 546 } 547 void toString(UnicodeString &dest) { 548 dest.remove(); 549 for(int32_t i=0; i<fLength; ++i) { 550 dest.append(charAt(i)); 551 } 552 } 553 void setComposite(UChar32 composite, int32_t combMarkIndex) { 554 fArray[fLastStarterIndex]=composite<<8; 555 // Remove the combining mark that contributed to the composite. 556 --fLength; 557 while(combMarkIndex<fLength) { 558 fArray[combMarkIndex]=fArray[combMarkIndex+1]; 559 ++combMarkIndex; 560 } 561 } 562 private: 563 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 564 int32_t fLength; 565 int32_t fLastStarterIndex; 566 UBool fDidReorder; 567 }; 568 569 void 570 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 571 UnicodeString &m=*p->mapping; 572 int32_t length=m.length(); 573 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 574 return; // writeMapping() will complain about it and print the code point. 575 } 576 const UChar *s=m.getBuffer(); 577 int32_t i=0; 578 UChar32 c; 579 while(i<length) { 580 U16_NEXT(s, i, length, c); 581 buffer.append(c, getCC(c)); 582 } 583 if(buffer.didReorder()) { 584 buffer.toString(m); 585 } 586 } 587 588 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 589 if(buffer.isEmpty()) { 590 return TRUE; // maps-to-empty string is no boundary of any kind 591 } 592 int32_t lastStarterIndex=buffer.lastStarterIndex(); 593 if(lastStarterIndex<0) { 594 return TRUE; // no starter 595 } 596 UChar32 starter=buffer.charAt(lastStarterIndex); 597 if( Hangul::isJamoL(starter) || 598 (Hangul::isJamoV(starter) && 599 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 600 ) { 601 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 602 // otherwise it is blocked. 603 return lastStarterIndex==buffer.length()-1; 604 } 605 // no Hangul in fully decomposed mapping 606 const Norm *starterNorm=&getNormRef(starter); 607 if(starterNorm->compositions==NULL) { 608 return FALSE; // the last starter does not combine forward 609 } 610 // Compose as far as possible, and see if further compositions are possible. 611 uint8_t prevCC=0; 612 for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 613 uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 614 if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 615 return TRUE; 616 } 617 if( prevCC<cc && 618 (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 619 ) { 620 buffer.setComposite(starter, combMarkIndex); 621 starterNorm=&getNormRef(starter); 622 if(starterNorm->compositions==NULL) { 623 return FALSE; // the composite does not combine further 624 } 625 } else { 626 prevCC=cc; 627 ++combMarkIndex; 628 } 629 } 630 // TRUE if the final, forward-combining starter is at the end. 631 return prevCC==0; 632 } 633 634 // Requires p->hasMapping(). 635 void Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { 636 UnicodeString &m=*p->mapping; 637 int32_t length=m.length(); 638 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 639 fprintf(stderr, 640 "gennorm2 error: " 641 "mapping for U+%04lX longer than maximum of %d\n", 642 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 643 exit(U_INVALID_FORMAT_ERROR); 644 } 645 int32_t leadCC, trailCC; 646 if(length==0) { 647 leadCC=trailCC=0; 648 } else { 649 leadCC=getCC(m.char32At(0)); 650 trailCC=getCC(m.char32At(length-1)); 651 } 652 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { 653 fprintf(stderr, 654 "gennorm2 error: " 655 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", 656 (long)c); 657 exit(U_INVALID_FORMAT_ERROR); 658 } 659 int32_t firstUnit=length|(trailCC<<8); 660 int32_t secondUnit=p->cc|(leadCC<<8); 661 if(secondUnit!=0) { 662 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 663 } 664 if(p->compositions!=NULL) { 665 firstUnit|=Normalizer2Impl::MAPPING_PLUS_COMPOSITION_LIST; 666 } 667 if(p->hasNoCompBoundaryAfter) { 668 firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; 669 } 670 dataString.append((UChar)firstUnit); 671 if(secondUnit!=0) { 672 dataString.append((UChar)secondUnit); 673 } 674 dataString.append(m); 675 } 676 677 // Requires p->compositions!=NULL. 678 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { 679 if(p->cc!=0) { 680 fprintf(stderr, 681 "gennorm2 error: " 682 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 683 (long)c); 684 exit(U_INVALID_FORMAT_ERROR); 685 } 686 int32_t length; 687 const CompositionPair *pairs=p->getCompositionPairs(length); 688 for(int32_t i=0; i<length; ++i) { 689 const CompositionPair &pair=pairs[i]; 690 // 22 bits for the composite character and whether it combines forward. 691 UChar32 compositeAndFwd=pair.composite<<1; 692 if(getNormRef(pair.composite).compositions!=NULL) { 693 compositeAndFwd|=1; // The composite character also combines-forward. 694 } 695 // Encode most pairs in two units and some in three. 696 int32_t firstUnit, secondUnit, thirdUnit; 697 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 698 if(compositeAndFwd<=0xffff) { 699 firstUnit=pair.trail<<1; 700 secondUnit=compositeAndFwd; 701 thirdUnit=-1; 702 } else { 703 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 704 secondUnit=compositeAndFwd>>16; 705 thirdUnit=compositeAndFwd; 706 } 707 } else { 708 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 709 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 710 Normalizer2Impl::COMP_1_TRIPLE; 711 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 712 (compositeAndFwd>>16); 713 thirdUnit=compositeAndFwd; 714 } 715 // Set the high bit of the first unit if this is the last composition pair. 716 if(i==(length-1)) { 717 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 718 } 719 dataString.append((UChar)firstUnit).append((UChar)secondUnit); 720 if(thirdUnit>=0) { 721 dataString.append((UChar)thirdUnit); 722 } 723 } 724 } 725 726 class ExtraDataWriter : public Normalizer2DBEnumerator { 727 public: 728 ExtraDataWriter(Normalizer2DataBuilder &b) : 729 Normalizer2DBEnumerator(b), 730 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions 731 yesNoData(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data 732 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 733 if(value!=0) { 734 if(start!=end) { 735 fprintf(stderr, 736 "gennorm2 error: unexpected shared data for " 737 "multiple code points U+%04lX..U+%04lX\n", 738 (long)start, (long)end); 739 exit(U_INTERNAL_PROGRAM_ERROR); 740 } 741 builder.writeExtraData(start, value, *this); 742 } 743 return TRUE; 744 } 745 UnicodeString maybeYesCompositions; 746 UnicodeString yesYesCompositions; 747 UnicodeString yesNoData; 748 UnicodeString noNoMappings; 749 Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. 750 }; 751 752 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { 753 Norm *p=norms+value; 754 if(p->combinesBack) { 755 if(p->hasMapping()) { 756 fprintf(stderr, 757 "gennorm2 error: " 758 "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", 759 (long)c); 760 exit(U_INVALID_FORMAT_ERROR); 761 } 762 if(p->compositions!=NULL) { 763 p->offset= 764 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| 765 Norm::OFFSET_MAYBE_YES; 766 writeCompositions(c, p, writer.maybeYesCompositions); 767 } 768 } else if(!p->hasMapping()) { 769 if(p->compositions!=NULL) { 770 p->offset= 771 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| 772 Norm::OFFSET_YES_YES; 773 writeCompositions(c, p, writer.yesYesCompositions); 774 } 775 } else if(p->mappingType==Norm::ROUND_TRIP) { 776 p->offset= 777 (writer.yesNoData.length()<<Norm::OFFSET_SHIFT)| 778 Norm::OFFSET_YES_NO; 779 writeMapping(c, p, writer.yesNoData); 780 if(p->compositions!=NULL) { 781 writeCompositions(c, p, writer.yesNoData); 782 } 783 } else /* one-way */ { 784 if(p->compositions!=NULL) { 785 fprintf(stderr, 786 "gennorm2 error: " 787 "U+%04lX combines-forward and has a one-way mapping, " 788 "not possible in Unicode normalization\n", 789 (long)c); 790 exit(U_INVALID_FORMAT_ERROR); 791 } 792 if(p->cc==0 && optimization!=OPTIMIZE_FAST) { 793 // Try a compact, algorithmic encoding. 794 // Only for ccc=0, because we can't store additional information. 795 if(p->mappingCP>=0) { 796 int32_t delta=p->mappingCP-c; 797 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 798 p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; 799 } 800 } 801 } 802 if(p->offset==0) { 803 int32_t oldNoNoLength=writer.noNoMappings.length(); 804 writeMapping(c, p, writer.noNoMappings); 805 UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); 806 int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); 807 if(previousOffset!=0) { 808 // Duplicate, remove the new units and point to the old ones. 809 writer.noNoMappings.truncate(oldNoNoLength); 810 p->offset= 811 ((previousOffset-1)<<Norm::OFFSET_SHIFT)| 812 Norm::OFFSET_NO_NO; 813 } else { 814 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found". 815 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); 816 writer.previousNoNoMappings.puti(newMapping, oldNoNoLength+1, errorCode); 817 p->offset= 818 (oldNoNoLength<<Norm::OFFSET_SHIFT)| 819 Norm::OFFSET_NO_NO; 820 } 821 } 822 } 823 } 824 825 class Norm16Writer : public Normalizer2DBEnumerator { 826 public: 827 Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 828 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 829 builder.writeNorm16(start, end, value); 830 return TRUE; 831 } 832 }; 833 834 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) { 835 if(value!=0) { 836 const Norm *p=norms+value; 837 int32_t offset=p->offset>>Norm::OFFSET_SHIFT; 838 int32_t norm16=0; 839 UBool isDecompNo=FALSE; 840 UBool isCompNoMaybe=FALSE; 841 switch(p->offset&Norm::OFFSET_MASK) { 842 case Norm::OFFSET_NONE: 843 // No mapping, no compositions list. 844 if(p->combinesBack) { 845 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; 846 isDecompNo=(UBool)(p->cc!=0); 847 isCompNoMaybe=TRUE; 848 } else if(p->cc!=0) { 849 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; 850 isDecompNo=isCompNoMaybe=TRUE; 851 } 852 break; 853 case Norm::OFFSET_MAYBE_YES: 854 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; 855 isCompNoMaybe=TRUE; 856 break; 857 case Norm::OFFSET_YES_YES: 858 norm16=offset; 859 break; 860 case Norm::OFFSET_YES_NO: 861 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; 862 isDecompNo=TRUE; 863 break; 864 case Norm::OFFSET_NO_NO: 865 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; 866 isDecompNo=isCompNoMaybe=TRUE; 867 break; 868 case Norm::OFFSET_DELTA: 869 norm16=getCenterNoNoDelta()+offset; 870 isDecompNo=isCompNoMaybe=TRUE; 871 break; 872 default: // Should not occur. 873 exit(U_INTERNAL_PROGRAM_ERROR); 874 } 875 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 876 utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); 877 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 878 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 879 } 880 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 881 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 882 } 883 } 884 } 885 886 void Normalizer2DataBuilder::setHangulData() { 887 HangulIterator hi; 888 const HangulIterator::Range *range; 889 // Check that none of the Hangul/Jamo code points have data. 890 while((range=hi.nextRange())!=NULL) { 891 for(UChar32 c=range->start; c<range->limit; ++c) { 892 if(utrie2_get32(norm16Trie, c)!=0) { 893 fprintf(stderr, 894 "gennorm2 error: " 895 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 896 (long)c); 897 exit(U_INVALID_FORMAT_ERROR); 898 } 899 } 900 } 901 // Set data for algorithmic runtime handling. 902 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 903 hi.reset(); 904 while((range=hi.nextRange())!=NULL) { 905 uint16_t norm16=range->norm16; 906 if(norm16==0) { 907 norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo 908 if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 909 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; 910 } 911 } else { 912 if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes 913 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; 914 } 915 } 916 utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); 917 errorCode.assertSuccess(); 918 } 919 } 920 921 U_CDECL_BEGIN 922 923 static UBool U_CALLCONV 924 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 925 uint32_t *pMaxValue=(uint32_t *)context; 926 if(value>*pMaxValue) { 927 *pMaxValue=value; 928 } 929 return TRUE; 930 } 931 932 U_CDECL_END 933 934 void Normalizer2DataBuilder::processData() { 935 IcuToolErrorCode errorCode("gennorm2/processData()"); 936 norm16Trie=utrie2_open(0, 0, errorCode); 937 errorCode.assertSuccess(); 938 939 utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); 940 941 Decomposer decomposer(*this); 942 do { 943 decomposer.didDecompose=FALSE; 944 utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); 945 } while(decomposer.didDecompose); 946 947 BuilderReorderingBuffer buffer; 948 int32_t normsLength=utm_countItems(normMem); 949 for(int32_t i=1; i<normsLength; ++i) { 950 if(norms[i].hasMapping()) { 951 buffer.reset(); 952 reorder(norms+i, buffer); 953 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 954 } 955 } 956 957 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 958 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 959 960 ExtraDataWriter extraDataWriter(*this); 961 utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 962 963 extraData=extraDataWriter.maybeYesCompositions; 964 extraData.append(extraDataWriter.yesYesCompositions). 965 append(extraDataWriter.yesNoData). 966 append(extraDataWriter.noNoMappings); 967 // Pad to even length for 4-byte alignment of following data. 968 if(extraData.length()&1) { 969 extraData.append((UChar)0); 970 } 971 972 indexes[Normalizer2Impl::IX_MIN_YES_NO]= 973 extraDataWriter.yesYesCompositions.length(); 974 indexes[Normalizer2Impl::IX_MIN_NO_NO]= 975 indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 976 extraDataWriter.yesNoData.length(); 977 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 978 indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 979 extraDataWriter.noNoMappings.length(); 980 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 981 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 982 extraDataWriter.maybeYesCompositions.length(); 983 984 int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 985 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 986 fprintf(stderr, 987 "gennorm2 error: " 988 "data structure overflow, too much mapping composition data\n"); 989 exit(U_BUFFER_OVERFLOW_ERROR); 990 } 991 992 utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); 993 994 setHangulData(); 995 996 // Look for the "worst" norm16 value of any supplementary code point 997 // corresponding to a lead surrogate, and set it as that surrogate's value. 998 // Enables quick check inner loops to look at only code units. 999 // 1000 // We could be more sophisticated: 1001 // We could collect a bit set for whether there are values in the different 1002 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 1003 // and select the best value that only breaks the composition and/or decomposition 1004 // inner loops if necessary. 1005 // However, that seems like overkill for an optimization for supplementary characters. 1006 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1007 uint32_t maxValue=utrie2_get32(norm16Trie, lead); 1008 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 1009 if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 1010 maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 1011 ) { 1012 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 1013 // Otherwise it might end up at something like JAMO_VT which stays in 1014 // the inner decomposition quick check loop. 1015 maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 1016 } 1017 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 1018 } 1019 1020 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 1021 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 1022 // which is harmless. 1023 // As a result, the minimum code points are always BMP code points. 1024 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 1025 if(minCP>=0x10000) { 1026 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 1027 } 1028 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 1029 if(minCP>=0x10000) { 1030 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 1031 } 1032 } 1033 1034 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1035 processData(); 1036 1037 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1038 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1039 int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 1040 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 1041 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 1042 errorCode.errorName()); 1043 exit(errorCode.reset()); 1044 } 1045 errorCode.reset(); 1046 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1047 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1048 errorCode.assertSuccess(); 1049 1050 int32_t offset=(int32_t)sizeof(indexes); 1051 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 1052 offset+=norm16TrieLength; 1053 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 1054 int32_t totalSize=offset+=extraData.length()*2; 1055 for(int32_t i=Normalizer2Impl::IX_RESERVED2_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 1056 indexes[i]=totalSize; 1057 } 1058 1059 if(beVerbose) { 1060 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 1061 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 1062 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 1063 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 1064 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 1065 printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 1066 printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 1067 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 1068 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 1069 } 1070 1071 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1072 UNewDataMemory *pData= 1073 udata_create(NULL, NULL, filename, &dataInfo, 1074 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 1075 if(errorCode.isFailure()) { 1076 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 1077 filename, errorCode.errorName()); 1078 exit(errorCode.reset()); 1079 } 1080 udata_writeBlock(pData, indexes, sizeof(indexes)); 1081 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 1082 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 1083 1084 int32_t writtenSize=udata_finish(pData, errorCode); 1085 if(errorCode.isFailure()) { 1086 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 1087 exit(errorCode.reset()); 1088 } 1089 if(writtenSize!=totalSize) { 1090 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 1091 (long)writtenSize, (long)totalSize); 1092 exit(U_INTERNAL_PROGRAM_ERROR); 1093 } 1094 } 1095 1096 U_NAMESPACE_END 1097 1098 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1099 1100 /* 1101 * Hey, Emacs, please set the following: 1102 * 1103 * Local Variables: 1104 * indent-tabs-mode: nil 1105 * End: 1106 */ 1107