1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: n2builder.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov25 14 * created by: Markus W. Scherer 15 * 16 * Builds Normalizer2 data and writes a binary .nrm file. 17 * For the file format see source/common/normalizer2impl.h. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "n2builder.h" 22 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <string.h> 26 #if U_HAVE_STD_STRING 27 #include <vector> 28 #endif 29 #include "unicode/errorcode.h" 30 #include "unicode/localpointer.h" 31 #include "unicode/putil.h" 32 #include "unicode/udata.h" 33 #include "unicode/uniset.h" 34 #include "unicode/unistr.h" 35 #include "unicode/ustring.h" 36 #include "hash.h" 37 #include "normalizer2impl.h" 38 #include "toolutil.h" 39 #include "unewdata.h" 40 #include "utrie2.h" 41 #include "uvectr32.h" 42 43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 44 45 #if !UCONFIG_NO_NORMALIZATION 46 47 /* UDataInfo cf. udata.h */ 48 static UDataInfo dataInfo={ 49 sizeof(UDataInfo), 50 0, 51 52 U_IS_BIG_ENDIAN, 53 U_CHARSET_FAMILY, 54 U_SIZEOF_UCHAR, 55 0, 56 57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 58 { 2, 0, 0, 0 }, /* formatVersion */ 59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 60 }; 61 62 U_NAMESPACE_BEGIN 63 64 class HangulIterator { 65 public: 66 struct Range { 67 UChar32 start, limit; 68 uint16_t norm16; 69 }; 70 71 HangulIterator() : rangeIndex(0) {} 72 const Range *nextRange() { 73 if(rangeIndex<LENGTHOF(ranges)) { 74 return ranges+rangeIndex++; 75 } else { 76 return NULL; 77 } 78 } 79 void reset() { rangeIndex=0; } 80 private: 81 static const Range ranges[4]; 82 int32_t rangeIndex; 83 }; 84 85 const HangulIterator::Range HangulIterator::ranges[4]={ 86 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 87 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 88 // JAMO_T_BASE+1: not U+11A7 89 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 90 { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 91 }; 92 93 struct CompositionPair { 94 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 95 UChar32 trail, composite; 96 }; 97 98 struct Norm { 99 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 100 101 UBool hasMapping() const { return mappingType>REMOVED; } 102 103 // Requires hasMapping() and well-formed mapping. 104 void setMappingCP() { 105 UChar32 c; 106 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 107 mappingCP=c; 108 } else { 109 mappingCP=U_SENTINEL; 110 } 111 } 112 113 const CompositionPair *getCompositionPairs(int32_t &length) const { 114 if(compositions==NULL) { 115 length=0; 116 return NULL; 117 } else { 118 length=compositions->size()/2; 119 return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 120 } 121 } 122 123 UnicodeString *mapping; 124 UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed 125 UChar32 mappingCP; // >=0 if mapping to 1 code point 126 int32_t mappingPhase; 127 MappingType mappingType; 128 129 UVector32 *compositions; // (trail, composite) pairs 130 uint8_t cc; 131 UBool combinesBack; 132 UBool hasNoCompBoundaryAfter; 133 134 enum OffsetType { 135 OFFSET_NONE, 136 // Composition for back-combining character. Allowed, but not normally used. 137 OFFSET_MAYBE_YES, 138 // Composition for a starter that does not have a decomposition mapping. 139 OFFSET_YES_YES, 140 // Round-trip mapping & composition for a starter. 141 OFFSET_YES_NO_MAPPING_AND_COMPOSITION, 142 // Round-trip mapping for a starter that itself does not combine-forward. 143 OFFSET_YES_NO_MAPPING_ONLY, 144 // One-way mapping. 145 OFFSET_NO_NO, 146 // Delta for an algorithmic one-way mapping. 147 OFFSET_DELTA 148 }; 149 enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 150 int32_t offset; 151 }; 152 153 class Normalizer2DBEnumerator { 154 public: 155 Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 156 virtual ~Normalizer2DBEnumerator() {} 157 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 158 Normalizer2DBEnumerator *ptr() { return this; } 159 protected: 160 Normalizer2DataBuilder &builder; 161 }; 162 163 U_CDECL_BEGIN 164 165 static UBool U_CALLCONV 166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 167 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 168 } 169 170 U_CDECL_END 171 172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 173 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) { 174 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 175 normTrie=utrie2_open(0, 0, &errorCode); 176 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 177 norms=allocNorm(); // unused Norm struct at index 0 178 memset(indexes, 0, sizeof(indexes)); 179 memset(smallFCD, 0, sizeof(smallFCD)); 180 } 181 182 Normalizer2DataBuilder::~Normalizer2DataBuilder() { 183 utrie2_close(normTrie); 184 int32_t normsLength=utm_countItems(normMem); 185 for(int32_t i=1; i<normsLength; ++i) { 186 delete norms[i].mapping; 187 delete norms[i].rawMapping; 188 delete norms[i].compositions; 189 } 190 utm_close(normMem); 191 utrie2_close(norm16Trie); 192 } 193 194 void 195 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 196 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 197 UVersionInfo version; 198 u_versionFromString(version, v); 199 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 200 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 201 ) { 202 char buffer[U_MAX_VERSION_STRING_LENGTH]; 203 u_versionToString(unicodeVersion, buffer); 204 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", 205 buffer, v); 206 exit(U_ILLEGAL_ARGUMENT_ERROR); 207 } 208 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 209 } 210 211 Norm *Normalizer2DataBuilder::allocNorm() { 212 Norm *p=(Norm *)utm_alloc(normMem); 213 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 214 return p; 215 } 216 217 /* get an existing Norm unit */ 218 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { 219 uint32_t i=utrie2_get32(normTrie, c); 220 if(i==0) { 221 return NULL; 222 } 223 return norms+i; 224 } 225 226 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 227 return norms[utrie2_get32(normTrie, c)]; 228 } 229 230 /* 231 * get or create a Norm unit; 232 * get or create the intermediate trie entries for it as well 233 */ 234 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { 235 uint32_t i=utrie2_get32(normTrie, c); 236 if(i!=0) { 237 return norms+i; 238 } else { 239 /* allocate Norm */ 240 Norm *p=allocNorm(); 241 IcuToolErrorCode errorCode("gennorm2/createNorm()"); 242 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 243 return p; 244 } 245 } 246 247 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 248 if(p!=NULL) { 249 if(p->mappingType!=Norm::NONE) { 250 if( overrideHandling==OVERRIDE_NONE || 251 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 252 ) { 253 fprintf(stderr, 254 "error in gennorm2 phase %d: " 255 "not permitted to override mapping for U+%04lX from phase %d\n", 256 (int)phase, (long)c, (int)p->mappingPhase); 257 exit(U_INVALID_FORMAT_ERROR); 258 } 259 delete p->mapping; 260 p->mapping=NULL; 261 } 262 p->mappingPhase=phase; 263 } 264 return p; 265 } 266 267 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 268 overrideHandling=oh; 269 ++phase; 270 } 271 272 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 273 createNorm(c)->cc=cc; 274 } 275 276 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { 277 return getNormRef(c).cc; 278 } 279 280 static UBool isWellFormed(const UnicodeString &s) { 281 UErrorCode errorCode=U_ZERO_ERROR; 282 u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 283 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 284 } 285 286 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 287 if(!isWellFormed(m)) { 288 fprintf(stderr, 289 "error in gennorm2 phase %d: " 290 "illegal one-way mapping from U+%04lX to malformed string\n", 291 (int)phase, (long)c); 292 exit(U_INVALID_FORMAT_ERROR); 293 } 294 Norm *p=checkNormForMapping(createNorm(c), c); 295 p->mapping=new UnicodeString(m); 296 p->mappingType=Norm::ONE_WAY; 297 p->setMappingCP(); 298 } 299 300 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 301 if(U_IS_SURROGATE(c)) { 302 fprintf(stderr, 303 "error in gennorm2 phase %d: " 304 "illegal round-trip mapping from surrogate code point U+%04lX\n", 305 (int)phase, (long)c); 306 exit(U_INVALID_FORMAT_ERROR); 307 } 308 if(!isWellFormed(m)) { 309 fprintf(stderr, 310 "error in gennorm2 phase %d: " 311 "illegal round-trip mapping from U+%04lX to malformed string\n", 312 (int)phase, (long)c); 313 exit(U_INVALID_FORMAT_ERROR); 314 } 315 int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 316 if(numCP!=2) { 317 fprintf(stderr, 318 "error in gennorm2 phase %d: " 319 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 320 (int)phase, (long)c, (int)numCP); 321 exit(U_INVALID_FORMAT_ERROR); 322 } 323 Norm *p=checkNormForMapping(createNorm(c), c); 324 p->mapping=new UnicodeString(m); 325 p->mappingType=Norm::ROUND_TRIP; 326 p->mappingCP=U_SENTINEL; 327 } 328 329 void Normalizer2DataBuilder::removeMapping(UChar32 c) { 330 Norm *p=checkNormForMapping(getNorm(c), c); 331 if(p!=NULL) { 332 p->mappingType=Norm::REMOVED; 333 } 334 } 335 336 class CompositionBuilder : public Normalizer2DBEnumerator { 337 public: 338 CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 339 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 340 builder.addComposition(start, end, value); 341 return TRUE; 342 } 343 }; 344 345 void 346 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 347 if(norms[value].mappingType==Norm::ROUND_TRIP) { 348 if(start!=end) { 349 fprintf(stderr, 350 "gennorm2 error: same round-trip mapping for " 351 "more than 1 code point U+%04lX..U+%04lX\n", 352 (long)start, (long)end); 353 exit(U_INVALID_FORMAT_ERROR); 354 } 355 if(norms[value].cc!=0) { 356 fprintf(stderr, 357 "gennorm2 error: " 358 "U+%04lX has a round-trip mapping and ccc!=0, " 359 "not possible in Unicode normalization\n", 360 (long)start); 361 exit(U_INVALID_FORMAT_ERROR); 362 } 363 // setRoundTripMapping() ensured that there are exactly two code points. 364 const UnicodeString &m=*norms[value].mapping; 365 UChar32 lead=m.char32At(0); 366 UChar32 trail=m.char32At(m.length()-1); 367 if(getCC(lead)!=0) { 368 fprintf(stderr, 369 "gennorm2 error: " 370 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 371 "not possible in Unicode normalization\n", 372 (long)start, (long)lead); 373 exit(U_INVALID_FORMAT_ERROR); 374 } 375 // Flag for trailing character. 376 createNorm(trail)->combinesBack=TRUE; 377 // Insert (trail, composite) pair into compositions list for the lead character. 378 IcuToolErrorCode errorCode("gennorm2/addComposition()"); 379 Norm *leadNorm=createNorm(lead); 380 UVector32 *compositions=leadNorm->compositions; 381 int32_t i; 382 if(compositions==NULL) { 383 compositions=leadNorm->compositions=new UVector32(errorCode); 384 i=0; // "insert" the first pair at index 0 385 } else { 386 // Insertion sort, and check for duplicate trail characters. 387 int32_t length; 388 const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 389 for(i=0; i<length; ++i) { 390 if(trail==pairs[i].trail) { 391 fprintf(stderr, 392 "gennorm2 error: same round-trip mapping for " 393 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 394 (long)start, (long)lead, (long)trail); 395 exit(U_INVALID_FORMAT_ERROR); 396 } 397 if(trail<pairs[i].trail) { 398 break; 399 } 400 } 401 } 402 compositions->insertElementAt(trail, 2*i, errorCode); 403 compositions->insertElementAt(start, 2*i+1, errorCode); 404 } 405 } 406 407 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 408 uint8_t lowCC, uint8_t highCC) const { 409 if((highCC-lowCC)>=2) { 410 int32_t length; 411 const CompositionPair *pairs=norm.getCompositionPairs(length); 412 for(int32_t i=0; i<length; ++i) { 413 uint8_t trailCC=getCC(pairs[i].trail); 414 if(lowCC<trailCC && trailCC<highCC) { 415 return TRUE; 416 } 417 } 418 } 419 return FALSE; 420 } 421 422 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 423 int32_t length; 424 const CompositionPair *pairs=norm.getCompositionPairs(length); 425 for(int32_t i=0; i<length; ++i) { 426 if(trail==pairs[i].trail) { 427 return pairs[i].composite; 428 } 429 if(trail<pairs[i].trail) { 430 break; 431 } 432 } 433 return U_SENTINEL; 434 } 435 436 class Decomposer : public Normalizer2DBEnumerator { 437 public: 438 Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 439 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 440 didDecompose|=builder.decompose(start, end, value); 441 return TRUE; 442 } 443 UBool didDecompose; 444 }; 445 446 UBool 447 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 448 if(norms[value].hasMapping()) { 449 Norm &norm=norms[value]; 450 const UnicodeString &m=*norm.mapping; 451 UnicodeString *decomposed=NULL; 452 const UChar *s=m.getBuffer(); 453 int32_t length=m.length(); 454 int32_t prev, i=0; 455 UChar32 c; 456 while(i<length) { 457 prev=i; 458 U16_NEXT(s, i, length, c); 459 if(start<=c && c<=end) { 460 fprintf(stderr, 461 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 462 (long)c); 463 exit(U_INVALID_FORMAT_ERROR); 464 } 465 const Norm &cNorm=getNormRef(c); 466 if(cNorm.hasMapping()) { 467 if(norm.mappingType==Norm::ROUND_TRIP) { 468 if(prev==0) { 469 if(cNorm.mappingType!=Norm::ROUND_TRIP) { 470 fprintf(stderr, 471 "gennorm2 error: " 472 "U+%04lX's round-trip mapping's starter " 473 "U+%04lX one-way-decomposes, " 474 "not possible in Unicode normalization\n", 475 (long)start, (long)c); 476 exit(U_INVALID_FORMAT_ERROR); 477 } 478 uint8_t myTrailCC=getCC(m.char32At(i)); 479 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 480 uint8_t cTrailCC=getCC(cTrailChar); 481 if(cTrailCC>myTrailCC) { 482 fprintf(stderr, 483 "gennorm2 error: " 484 "U+%04lX's round-trip mapping's starter " 485 "U+%04lX decomposes and the " 486 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 487 "not possible in Unicode normalization\n", 488 (long)start, (long)c, 489 (short)cTrailCC, (short)myTrailCC); 490 exit(U_INVALID_FORMAT_ERROR); 491 } 492 } else { 493 fprintf(stderr, 494 "gennorm2 error: " 495 "U+%04lX's round-trip mapping's non-starter " 496 "U+%04lX decomposes, " 497 "not possible in Unicode normalization\n", 498 (long)start, (long)c); 499 exit(U_INVALID_FORMAT_ERROR); 500 } 501 } 502 if(decomposed==NULL) { 503 decomposed=new UnicodeString(m, 0, prev); 504 } 505 decomposed->append(*cNorm.mapping); 506 } else if(Hangul::isHangul(c)) { 507 UChar buffer[3]; 508 int32_t hangulLength=Hangul::decompose(c, buffer); 509 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { 510 fprintf(stderr, 511 "gennorm2 error: " 512 "U+%04lX's round-trip mapping's non-starter " 513 "U+%04lX decomposes, " 514 "not possible in Unicode normalization\n", 515 (long)start, (long)c); 516 exit(U_INVALID_FORMAT_ERROR); 517 } 518 if(decomposed==NULL) { 519 decomposed=new UnicodeString(m, 0, prev); 520 } 521 decomposed->append(buffer, hangulLength); 522 } else if(decomposed!=NULL) { 523 decomposed->append(m, prev, i-prev); 524 } 525 } 526 if(decomposed!=NULL) { 527 if(norm.rawMapping==NULL) { 528 // Remember the original mapping when decomposing recursively. 529 norm.rawMapping=norm.mapping; 530 } else { 531 delete norm.mapping; 532 } 533 norm.mapping=decomposed; 534 // Not norm.setMappingCP(); because the original mapping 535 // is most likely to be encodable as a delta. 536 return TRUE; 537 } 538 } 539 return FALSE; 540 } 541 542 class BuilderReorderingBuffer { 543 public: 544 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} 545 void reset() { 546 fLength=0; 547 fLastStarterIndex=-1; 548 fDidReorder=FALSE; 549 } 550 int32_t length() const { return fLength; } 551 UBool isEmpty() const { return fLength==0; } 552 int32_t lastStarterIndex() const { return fLastStarterIndex; } 553 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 554 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 555 UBool didReorder() const { return fDidReorder; } 556 void append(UChar32 c, uint8_t cc) { 557 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 558 if(cc==0) { 559 fLastStarterIndex=fLength; 560 } 561 fArray[fLength++]=(c<<8)|cc; 562 return; 563 } 564 // Let this character bubble back to its canonical order. 565 int32_t i=fLength-1; 566 while(i>fLastStarterIndex && ccAt(i)>cc) { 567 --i; 568 } 569 ++i; // after the last starter or prevCC<=cc 570 // Move this and the following characters forward one to make space. 571 for(int32_t j=fLength; i<j; --j) { 572 fArray[j]=fArray[j-1]; 573 } 574 fArray[i]=(c<<8)|cc; 575 ++fLength; 576 fDidReorder=TRUE; 577 } 578 void toString(UnicodeString &dest) { 579 dest.remove(); 580 for(int32_t i=0; i<fLength; ++i) { 581 dest.append(charAt(i)); 582 } 583 } 584 void setComposite(UChar32 composite, int32_t combMarkIndex) { 585 fArray[fLastStarterIndex]=composite<<8; 586 // Remove the combining mark that contributed to the composite. 587 --fLength; 588 while(combMarkIndex<fLength) { 589 fArray[combMarkIndex]=fArray[combMarkIndex+1]; 590 ++combMarkIndex; 591 } 592 } 593 private: 594 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 595 int32_t fLength; 596 int32_t fLastStarterIndex; 597 UBool fDidReorder; 598 }; 599 600 void 601 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 602 UnicodeString &m=*p->mapping; 603 int32_t length=m.length(); 604 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 605 return; // writeMapping() will complain about it and print the code point. 606 } 607 const UChar *s=m.getBuffer(); 608 int32_t i=0; 609 UChar32 c; 610 while(i<length) { 611 U16_NEXT(s, i, length, c); 612 buffer.append(c, getCC(c)); 613 } 614 if(buffer.didReorder()) { 615 buffer.toString(m); 616 } 617 } 618 619 /* 620 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter(). 621 * A starter character with a mapping does not have a composition boundary after it 622 * if the character itself combines-forward (which is tested by the caller of this function), 623 * or it is deleted (mapped to the empty string), 624 * or its mapping contains no starter, 625 * or the last starter combines-forward. 626 */ 627 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 628 if(buffer.isEmpty()) { 629 return TRUE; // maps-to-empty-string is no boundary of any kind 630 } 631 int32_t lastStarterIndex=buffer.lastStarterIndex(); 632 if(lastStarterIndex<0) { 633 return TRUE; // no starter 634 } 635 UChar32 starter=buffer.charAt(lastStarterIndex); 636 if( Hangul::isJamoL(starter) || 637 (Hangul::isJamoV(starter) && 638 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 639 ) { 640 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 641 // otherwise it is blocked. 642 return lastStarterIndex==buffer.length()-1; 643 } 644 // Note: There can be no Hangul syllable in the fully decomposed mapping. 645 const Norm *starterNorm=&getNormRef(starter); 646 if(starterNorm->compositions==NULL) { 647 return FALSE; // the last starter does not combine forward 648 } 649 // Compose as far as possible, and see if further compositions are possible. 650 uint8_t prevCC=0; 651 for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 652 uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 653 if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 654 return TRUE; 655 } 656 if( prevCC<cc && 657 (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 658 ) { 659 buffer.setComposite(starter, combMarkIndex); 660 starterNorm=&getNormRef(starter); 661 if(starterNorm->compositions==NULL) { 662 return FALSE; // the composite does not combine further 663 } 664 } else { 665 prevCC=cc; 666 ++combMarkIndex; 667 } 668 } 669 // TRUE if the final, forward-combining starter is at the end. 670 return prevCC==0; 671 } 672 673 // Requires p->hasMapping(). 674 // Returns the offset of the "first unit" from the beginning of the extraData for c. 675 // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word. 676 int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { 677 UnicodeString &m=*p->mapping; 678 int32_t length=m.length(); 679 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 680 fprintf(stderr, 681 "gennorm2 error: " 682 "mapping for U+%04lX longer than maximum of %d\n", 683 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 684 exit(U_INVALID_FORMAT_ERROR); 685 } 686 int32_t leadCC, trailCC; 687 if(length==0) { 688 leadCC=trailCC=0; 689 } else { 690 leadCC=getCC(m.char32At(0)); 691 trailCC=getCC(m.char32At(length-1)); 692 } 693 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { 694 fprintf(stderr, 695 "gennorm2 error: " 696 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", 697 (long)c); 698 exit(U_INVALID_FORMAT_ERROR); 699 } 700 // Write small-FCD data. 701 if((leadCC|trailCC)!=0) { 702 UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 703 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 704 } 705 // Write the mapping & raw mapping extraData. 706 int32_t firstUnit=length|(trailCC<<8); 707 int32_t preMappingLength=0; 708 if(p->rawMapping!=NULL) { 709 UnicodeString &rm=*p->rawMapping; 710 int32_t rmLength=rm.length(); 711 if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { 712 fprintf(stderr, 713 "gennorm2 error: " 714 "raw mapping for U+%04lX longer than maximum of %d\n", 715 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 716 exit(U_INVALID_FORMAT_ERROR); 717 } 718 UChar rm0=rm.charAt(0); 719 if( rmLength==length-1 && 720 // 99: overlong substring lengths get pinned to remainder lengths anyway 721 0==rm.compare(1, 99, m, 2, 99) && 722 rm0>Normalizer2Impl::MAPPING_LENGTH_MASK 723 ) { 724 // Compression: 725 // rawMapping=rm0+mapping.substring(2) -> store only rm0 726 // 727 // The raw mapping is the same as the final mapping after replacing 728 // the final mapping's first two code units with the raw mapping's first one. 729 // In this case, we store only that first unit, rm0. 730 // This helps with a few hundred mappings. 731 dataString.append(rm0); 732 preMappingLength=1; 733 } else { 734 // Store the raw mapping with its length. 735 dataString.append(rm); 736 dataString.append((UChar)rmLength); 737 preMappingLength=rmLength+1; 738 } 739 firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; 740 } 741 int32_t cccLccc=p->cc|(leadCC<<8); 742 if(cccLccc!=0) { 743 dataString.append((UChar)cccLccc); 744 ++preMappingLength; 745 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 746 } 747 if(p->hasNoCompBoundaryAfter) { 748 firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; 749 } 750 dataString.append((UChar)firstUnit); 751 dataString.append(m); 752 return preMappingLength; 753 } 754 755 // Requires p->compositions!=NULL. 756 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { 757 if(p->cc!=0) { 758 fprintf(stderr, 759 "gennorm2 error: " 760 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 761 (long)c); 762 exit(U_INVALID_FORMAT_ERROR); 763 } 764 int32_t length; 765 const CompositionPair *pairs=p->getCompositionPairs(length); 766 for(int32_t i=0; i<length; ++i) { 767 const CompositionPair &pair=pairs[i]; 768 // 22 bits for the composite character and whether it combines forward. 769 UChar32 compositeAndFwd=pair.composite<<1; 770 if(getNormRef(pair.composite).compositions!=NULL) { 771 compositeAndFwd|=1; // The composite character also combines-forward. 772 } 773 // Encode most pairs in two units and some in three. 774 int32_t firstUnit, secondUnit, thirdUnit; 775 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 776 if(compositeAndFwd<=0xffff) { 777 firstUnit=pair.trail<<1; 778 secondUnit=compositeAndFwd; 779 thirdUnit=-1; 780 } else { 781 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 782 secondUnit=compositeAndFwd>>16; 783 thirdUnit=compositeAndFwd; 784 } 785 } else { 786 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 787 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 788 Normalizer2Impl::COMP_1_TRIPLE; 789 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 790 (compositeAndFwd>>16); 791 thirdUnit=compositeAndFwd; 792 } 793 // Set the high bit of the first unit if this is the last composition pair. 794 if(i==(length-1)) { 795 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 796 } 797 dataString.append((UChar)firstUnit).append((UChar)secondUnit); 798 if(thirdUnit>=0) { 799 dataString.append((UChar)thirdUnit); 800 } 801 } 802 } 803 804 class ExtraDataWriter : public Normalizer2DBEnumerator { 805 public: 806 ExtraDataWriter(Normalizer2DataBuilder &b) : 807 Normalizer2DBEnumerator(b), 808 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions 809 yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data 810 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 811 if(value!=0) { 812 if(start!=end) { 813 fprintf(stderr, 814 "gennorm2 error: unexpected shared data for " 815 "multiple code points U+%04lX..U+%04lX\n", 816 (long)start, (long)end); 817 exit(U_INTERNAL_PROGRAM_ERROR); 818 } 819 builder.writeExtraData(start, value, *this); 820 } 821 return TRUE; 822 } 823 UnicodeString maybeYesCompositions; 824 UnicodeString yesYesCompositions; 825 UnicodeString yesNoMappingsAndCompositions; 826 UnicodeString yesNoMappingsOnly; 827 UnicodeString noNoMappings; 828 Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. 829 }; 830 831 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { 832 Norm *p=norms+value; 833 if(!p->hasMapping()) { 834 // Write small-FCD data. 835 // There is similar code in writeMapping() for characters that do have a mapping. 836 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) { 837 fprintf(stderr, 838 "gennorm2 error: " 839 "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n", 840 (long)c); 841 exit(U_INVALID_FORMAT_ERROR); 842 } 843 if(p->cc!=0) { 844 UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 845 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 846 } 847 } 848 if(p->combinesBack) { 849 if(p->hasMapping()) { 850 fprintf(stderr, 851 "gennorm2 error: " 852 "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", 853 (long)c); 854 exit(U_INVALID_FORMAT_ERROR); 855 } 856 if(p->compositions!=NULL) { 857 p->offset= 858 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| 859 Norm::OFFSET_MAYBE_YES; 860 writeCompositions(c, p, writer.maybeYesCompositions); 861 } 862 } else if(!p->hasMapping()) { 863 if(p->compositions!=NULL) { 864 p->offset= 865 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| 866 Norm::OFFSET_YES_YES; 867 writeCompositions(c, p, writer.yesYesCompositions); 868 } 869 } else if(p->mappingType==Norm::ROUND_TRIP) { 870 if(p->compositions!=NULL) { 871 int32_t offset=writer.yesNoMappingsAndCompositions.length()+ 872 writeMapping(c, p, writer.yesNoMappingsAndCompositions); 873 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION; 874 writeCompositions(c, p, writer.yesNoMappingsAndCompositions); 875 } else { 876 int32_t offset=writer.yesNoMappingsOnly.length()+ 877 writeMapping(c, p, writer.yesNoMappingsOnly); 878 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY; 879 } 880 } else /* one-way */ { 881 if(p->compositions!=NULL) { 882 fprintf(stderr, 883 "gennorm2 error: " 884 "U+%04lX combines-forward and has a one-way mapping, " 885 "not possible in Unicode normalization\n", 886 (long)c); 887 exit(U_INVALID_FORMAT_ERROR); 888 } 889 if(p->cc==0 && optimization!=OPTIMIZE_FAST) { 890 // Try a compact, algorithmic encoding. 891 // Only for ccc=0, because we can't store additional information 892 // and we do not recursively follow an algorithmic encoding for access to the ccc. 893 // 894 // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding 895 // if the mappingCP decomposes further, to ensure that there is a place to store it. 896 // We want to see that the final mapping does not have exactly 1 code point, 897 // or else we would have to recursively ensure that the final mapping is stored 898 // in normal extraData. 899 if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) { 900 int32_t delta=p->mappingCP-c; 901 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 902 p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; 903 } 904 } 905 } 906 if(p->offset==0) { 907 int32_t oldNoNoLength=writer.noNoMappings.length(); 908 int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings); 909 UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); 910 int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); 911 if(previousOffset!=0) { 912 // Duplicate, remove the new units and point to the old ones. 913 writer.noNoMappings.truncate(oldNoNoLength); 914 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 915 } else { 916 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found". 917 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); 918 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode); 919 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 920 } 921 } 922 } 923 } 924 925 class Norm16Writer : public Normalizer2DBEnumerator { 926 public: 927 Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 928 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 929 builder.writeNorm16(start, end, value); 930 return TRUE; 931 } 932 }; 933 934 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) { 935 if(value!=0) { 936 const Norm *p=norms+value; 937 int32_t offset=p->offset>>Norm::OFFSET_SHIFT; 938 int32_t norm16=0; 939 UBool isDecompNo=FALSE; 940 UBool isCompNoMaybe=FALSE; 941 switch(p->offset&Norm::OFFSET_MASK) { 942 case Norm::OFFSET_NONE: 943 // No mapping, no compositions list. 944 if(p->combinesBack) { 945 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; 946 isDecompNo=(UBool)(p->cc!=0); 947 isCompNoMaybe=TRUE; 948 } else if(p->cc!=0) { 949 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; 950 isDecompNo=isCompNoMaybe=TRUE; 951 } 952 break; 953 case Norm::OFFSET_MAYBE_YES: 954 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; 955 isCompNoMaybe=TRUE; 956 break; 957 case Norm::OFFSET_YES_YES: 958 norm16=offset; 959 break; 960 case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION: 961 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; 962 isDecompNo=TRUE; 963 break; 964 case Norm::OFFSET_YES_NO_MAPPING_ONLY: 965 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset; 966 isDecompNo=TRUE; 967 break; 968 case Norm::OFFSET_NO_NO: 969 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; 970 isDecompNo=isCompNoMaybe=TRUE; 971 break; 972 case Norm::OFFSET_DELTA: 973 norm16=getCenterNoNoDelta()+offset; 974 isDecompNo=isCompNoMaybe=TRUE; 975 break; 976 default: // Should not occur. 977 exit(U_INTERNAL_PROGRAM_ERROR); 978 } 979 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 980 utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); 981 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 982 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 983 } 984 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 985 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 986 } 987 } 988 } 989 990 void Normalizer2DataBuilder::setHangulData() { 991 HangulIterator hi; 992 const HangulIterator::Range *range; 993 // Check that none of the Hangul/Jamo code points have data. 994 while((range=hi.nextRange())!=NULL) { 995 for(UChar32 c=range->start; c<range->limit; ++c) { 996 if(utrie2_get32(norm16Trie, c)!=0) { 997 fprintf(stderr, 998 "gennorm2 error: " 999 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 1000 (long)c); 1001 exit(U_INVALID_FORMAT_ERROR); 1002 } 1003 } 1004 } 1005 // Set data for algorithmic runtime handling. 1006 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 1007 hi.reset(); 1008 while((range=hi.nextRange())!=NULL) { 1009 uint16_t norm16=range->norm16; 1010 if(norm16==0) { 1011 norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo 1012 if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 1013 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; 1014 } 1015 } else { 1016 if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes 1017 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; 1018 } 1019 } 1020 utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); 1021 errorCode.assertSuccess(); 1022 } 1023 } 1024 1025 U_CDECL_BEGIN 1026 1027 static UBool U_CALLCONV 1028 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1029 uint32_t *pMaxValue=(uint32_t *)context; 1030 if(value>*pMaxValue) { 1031 *pMaxValue=value; 1032 } 1033 return TRUE; 1034 } 1035 1036 U_CDECL_END 1037 1038 void Normalizer2DataBuilder::processData() { 1039 IcuToolErrorCode errorCode("gennorm2/processData()"); 1040 norm16Trie=utrie2_open(0, 0, errorCode); 1041 errorCode.assertSuccess(); 1042 1043 utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); 1044 1045 Decomposer decomposer(*this); 1046 do { 1047 decomposer.didDecompose=FALSE; 1048 utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); 1049 } while(decomposer.didDecompose); 1050 1051 BuilderReorderingBuffer buffer; 1052 int32_t normsLength=utm_countItems(normMem); 1053 for(int32_t i=1; i<normsLength; ++i) { 1054 // Set the hasNoCompBoundaryAfter flag for use by the last code branch 1055 // in Normalizer2Impl::hasCompBoundaryAfter(). 1056 // For details see the comments on hasNoCompBoundaryAfter(buffer). 1057 const Norm &norm=norms[i]; 1058 if(norm.hasMapping()) { 1059 if(norm.compositions!=NULL) { 1060 norms[i].hasNoCompBoundaryAfter=TRUE; 1061 } else { 1062 buffer.reset(); 1063 reorder(norms+i, buffer); 1064 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 1065 } 1066 } 1067 } 1068 1069 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 1070 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 1071 1072 ExtraDataWriter extraDataWriter(*this); 1073 utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 1074 1075 extraData=extraDataWriter.maybeYesCompositions; 1076 extraData.append(extraDataWriter.yesYesCompositions). 1077 append(extraDataWriter.yesNoMappingsAndCompositions). 1078 append(extraDataWriter.yesNoMappingsOnly). 1079 append(extraDataWriter.noNoMappings); 1080 // Pad to even length for 4-byte alignment of following data. 1081 if(extraData.length()&1) { 1082 extraData.append((UChar)0); 1083 } 1084 1085 indexes[Normalizer2Impl::IX_MIN_YES_NO]= 1086 extraDataWriter.yesYesCompositions.length(); 1087 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]= 1088 indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 1089 extraDataWriter.yesNoMappingsAndCompositions.length(); 1090 indexes[Normalizer2Impl::IX_MIN_NO_NO]= 1091 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+ 1092 extraDataWriter.yesNoMappingsOnly.length(); 1093 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 1094 indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 1095 extraDataWriter.noNoMappings.length(); 1096 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 1097 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 1098 extraDataWriter.maybeYesCompositions.length(); 1099 1100 int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 1101 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 1102 fprintf(stderr, 1103 "gennorm2 error: " 1104 "data structure overflow, too much mapping composition data\n"); 1105 exit(U_BUFFER_OVERFLOW_ERROR); 1106 } 1107 1108 utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); 1109 1110 setHangulData(); 1111 1112 // Look for the "worst" norm16 value of any supplementary code point 1113 // corresponding to a lead surrogate, and set it as that surrogate's value. 1114 // Enables quick check inner loops to look at only code units. 1115 // 1116 // We could be more sophisticated: 1117 // We could collect a bit set for whether there are values in the different 1118 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 1119 // and select the best value that only breaks the composition and/or decomposition 1120 // inner loops if necessary. 1121 // However, that seems like overkill for an optimization for supplementary characters. 1122 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1123 uint32_t maxValue=utrie2_get32(norm16Trie, lead); 1124 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 1125 if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 1126 maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 1127 ) { 1128 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 1129 // Otherwise it might end up at something like JAMO_VT which stays in 1130 // the inner decomposition quick check loop. 1131 maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 1132 } 1133 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 1134 } 1135 1136 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 1137 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 1138 // which is harmless. 1139 // As a result, the minimum code points are always BMP code points. 1140 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 1141 if(minCP>=0x10000) { 1142 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 1143 } 1144 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 1145 if(minCP>=0x10000) { 1146 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 1147 } 1148 } 1149 1150 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1151 processData(); 1152 1153 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1154 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1155 int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 1156 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 1157 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 1158 errorCode.errorName()); 1159 exit(errorCode.reset()); 1160 } 1161 errorCode.reset(); 1162 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1163 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1164 errorCode.assertSuccess(); 1165 1166 int32_t offset=(int32_t)sizeof(indexes); 1167 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 1168 offset+=norm16TrieLength; 1169 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 1170 offset+=extraData.length()*2; 1171 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 1172 offset+=sizeof(smallFCD); 1173 int32_t totalSize=offset; 1174 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 1175 indexes[i]=totalSize; 1176 } 1177 1178 if(beVerbose) { 1179 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 1180 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 1181 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); 1182 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 1183 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 1184 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 1185 printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 1186 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); 1187 printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 1188 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 1189 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 1190 } 1191 1192 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 1193 if(0==memcmp(nullVersion, unicodeVersion, 4)) { 1194 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 1195 } 1196 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1197 UNewDataMemory *pData= 1198 udata_create(NULL, NULL, filename, &dataInfo, 1199 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 1200 if(errorCode.isFailure()) { 1201 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 1202 filename, errorCode.errorName()); 1203 exit(errorCode.reset()); 1204 } 1205 udata_writeBlock(pData, indexes, sizeof(indexes)); 1206 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 1207 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 1208 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 1209 int32_t writtenSize=udata_finish(pData, errorCode); 1210 if(errorCode.isFailure()) { 1211 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 1212 exit(errorCode.reset()); 1213 } 1214 if(writtenSize!=totalSize) { 1215 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 1216 (long)writtenSize, (long)totalSize); 1217 exit(U_INTERNAL_PROGRAM_ERROR); 1218 } 1219 } 1220 1221 U_NAMESPACE_END 1222 1223 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1224 1225 /* 1226 * Hey, Emacs, please set the following: 1227 * 1228 * Local Variables: 1229 * indent-tabs-mode: nil 1230 * End: 1231 */ 1232