1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: n2builder.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov25 14 * created by: Markus W. Scherer 15 * 16 * Builds Normalizer2 data and writes a binary .nrm file. 17 * For the file format see source/common/normalizer2impl.h. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "n2builder.h" 22 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <string.h> 26 #if U_HAVE_STD_STRING 27 #include <vector> 28 #endif 29 #include "unicode/errorcode.h" 30 #include "unicode/localpointer.h" 31 #include "unicode/putil.h" 32 #include "unicode/udata.h" 33 #include "unicode/uniset.h" 34 #include "unicode/unistr.h" 35 #include "unicode/ustring.h" 36 #include "charstr.h" 37 #include "hash.h" 38 #include "normalizer2impl.h" 39 #include "toolutil.h" 40 #include "unewdata.h" 41 #include "utrie2.h" 42 #include "uvectr32.h" 43 #include "writesrc.h" 44 45 #if !UCONFIG_NO_NORMALIZATION 46 47 /* UDataInfo cf. 
udata.h */ 48 static UDataInfo dataInfo={ 49 sizeof(UDataInfo), 50 0, 51 52 U_IS_BIG_ENDIAN, 53 U_CHARSET_FAMILY, 54 U_SIZEOF_UCHAR, 55 0, 56 57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 58 { 2, 0, 0, 0 }, /* formatVersion */ 59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 60 }; 61 62 U_NAMESPACE_BEGIN 63 64 class HangulIterator { 65 public: 66 struct Range { 67 UChar32 start, limit; 68 uint16_t norm16; 69 }; 70 71 HangulIterator() : rangeIndex(0) {} 72 const Range *nextRange() { 73 if(rangeIndex<UPRV_LENGTHOF(ranges)) { 74 return ranges+rangeIndex++; 75 } else { 76 return NULL; 77 } 78 } 79 void reset() { rangeIndex=0; } 80 private: 81 static const Range ranges[4]; 82 int32_t rangeIndex; 83 }; 84 85 const HangulIterator::Range HangulIterator::ranges[4]={ 86 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 87 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 88 // JAMO_T_BASE+1: not U+11A7 89 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 90 { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 91 }; 92 93 struct CompositionPair { 94 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 95 UChar32 trail, composite; 96 }; 97 98 struct Norm { 99 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 100 101 UBool hasMapping() const { return mappingType>REMOVED; } 102 103 // Requires hasMapping() and well-formed mapping. 
104 void setMappingCP() { 105 UChar32 c; 106 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 107 mappingCP=c; 108 } else { 109 mappingCP=U_SENTINEL; 110 } 111 } 112 113 const CompositionPair *getCompositionPairs(int32_t &length) const { 114 if(compositions==NULL) { 115 length=0; 116 return NULL; 117 } else { 118 length=compositions->size()/2; 119 return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 120 } 121 } 122 123 UnicodeString *mapping; 124 UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed 125 UChar32 mappingCP; // >=0 if mapping to 1 code point 126 int32_t mappingPhase; 127 MappingType mappingType; 128 129 UVector32 *compositions; // (trail, composite) pairs 130 uint8_t cc; 131 UBool combinesBack; 132 UBool hasNoCompBoundaryAfter; 133 134 enum OffsetType { 135 OFFSET_NONE, 136 // Composition for back-combining character. Allowed, but not normally used. 137 OFFSET_MAYBE_YES, 138 // Composition for a starter that does not have a decomposition mapping. 139 OFFSET_YES_YES, 140 // Round-trip mapping & composition for a starter. 141 OFFSET_YES_NO_MAPPING_AND_COMPOSITION, 142 // Round-trip mapping for a starter that itself does not combine-forward. 143 OFFSET_YES_NO_MAPPING_ONLY, 144 // One-way mapping. 145 OFFSET_NO_NO, 146 // Delta for an algorithmic one-way mapping. 
147 OFFSET_DELTA 148 }; 149 enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 150 int32_t offset; 151 }; 152 153 class Normalizer2DBEnumerator { 154 public: 155 Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 156 virtual ~Normalizer2DBEnumerator() {} 157 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 158 Normalizer2DBEnumerator *ptr() { return this; } 159 protected: 160 Normalizer2DataBuilder &builder; 161 }; 162 163 U_CDECL_BEGIN 164 165 static UBool U_CALLCONV 166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 167 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 168 } 169 170 U_CDECL_END 171 172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 173 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), 174 norm16TrieLength(0) { 175 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 176 normTrie=utrie2_open(0, 0, &errorCode); 177 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 178 norms=allocNorm(); // unused Norm struct at index 0 179 memset(indexes, 0, sizeof(indexes)); 180 memset(smallFCD, 0, sizeof(smallFCD)); 181 } 182 183 Normalizer2DataBuilder::~Normalizer2DataBuilder() { 184 utrie2_close(normTrie); 185 int32_t normsLength=utm_countItems(normMem); 186 for(int32_t i=1; i<normsLength; ++i) { 187 delete norms[i].mapping; 188 delete norms[i].rawMapping; 189 delete norms[i].compositions; 190 } 191 utm_close(normMem); 192 utrie2_close(norm16Trie); 193 } 194 195 void 196 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 197 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 198 UVersionInfo version; 199 u_versionFromString(version, v); 200 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 201 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 202 ) { 203 char buffer[U_MAX_VERSION_STRING_LENGTH]; 204 u_versionToString(unicodeVersion, buffer); 
205 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", 206 buffer, v); 207 exit(U_ILLEGAL_ARGUMENT_ERROR); 208 } 209 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 210 } 211 212 Norm *Normalizer2DataBuilder::allocNorm() { 213 Norm *p=(Norm *)utm_alloc(normMem); 214 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 215 return p; 216 } 217 218 /* get an existing Norm unit */ 219 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { 220 uint32_t i=utrie2_get32(normTrie, c); 221 if(i==0) { 222 return NULL; 223 } 224 return norms+i; 225 } 226 227 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 228 return norms[utrie2_get32(normTrie, c)]; 229 } 230 231 /* 232 * get or create a Norm unit; 233 * get or create the intermediate trie entries for it as well 234 */ 235 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { 236 uint32_t i=utrie2_get32(normTrie, c); 237 if(i!=0) { 238 return norms+i; 239 } else { 240 /* allocate Norm */ 241 Norm *p=allocNorm(); 242 IcuToolErrorCode errorCode("gennorm2/createNorm()"); 243 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 244 return p; 245 } 246 } 247 248 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 249 if(p!=NULL) { 250 if(p->mappingType!=Norm::NONE) { 251 if( overrideHandling==OVERRIDE_NONE || 252 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 253 ) { 254 fprintf(stderr, 255 "error in gennorm2 phase %d: " 256 "not permitted to override mapping for U+%04lX from phase %d\n", 257 (int)phase, (long)c, (int)p->mappingPhase); 258 exit(U_INVALID_FORMAT_ERROR); 259 } 260 delete p->mapping; 261 p->mapping=NULL; 262 } 263 p->mappingPhase=phase; 264 } 265 return p; 266 } 267 268 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 269 overrideHandling=oh; 270 ++phase; 271 } 272 273 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 274 createNorm(c)->cc=cc; 275 } 276 277 uint8_t 
Normalizer2DataBuilder::getCC(UChar32 c) const { 278 return getNormRef(c).cc; 279 } 280 281 static UBool isWellFormed(const UnicodeString &s) { 282 UErrorCode errorCode=U_ZERO_ERROR; 283 u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 284 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 285 } 286 287 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 288 if(!isWellFormed(m)) { 289 fprintf(stderr, 290 "error in gennorm2 phase %d: " 291 "illegal one-way mapping from U+%04lX to malformed string\n", 292 (int)phase, (long)c); 293 exit(U_INVALID_FORMAT_ERROR); 294 } 295 Norm *p=checkNormForMapping(createNorm(c), c); 296 p->mapping=new UnicodeString(m); 297 p->mappingType=Norm::ONE_WAY; 298 p->setMappingCP(); 299 } 300 301 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 302 if(U_IS_SURROGATE(c)) { 303 fprintf(stderr, 304 "error in gennorm2 phase %d: " 305 "illegal round-trip mapping from surrogate code point U+%04lX\n", 306 (int)phase, (long)c); 307 exit(U_INVALID_FORMAT_ERROR); 308 } 309 if(!isWellFormed(m)) { 310 fprintf(stderr, 311 "error in gennorm2 phase %d: " 312 "illegal round-trip mapping from U+%04lX to malformed string\n", 313 (int)phase, (long)c); 314 exit(U_INVALID_FORMAT_ERROR); 315 } 316 int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 317 if(numCP!=2) { 318 fprintf(stderr, 319 "error in gennorm2 phase %d: " 320 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 321 (int)phase, (long)c, (int)numCP); 322 exit(U_INVALID_FORMAT_ERROR); 323 } 324 Norm *p=checkNormForMapping(createNorm(c), c); 325 p->mapping=new UnicodeString(m); 326 p->mappingType=Norm::ROUND_TRIP; 327 p->mappingCP=U_SENTINEL; 328 } 329 330 void Normalizer2DataBuilder::removeMapping(UChar32 c) { 331 Norm *p=checkNormForMapping(getNorm(c), c); 332 if(p!=NULL) { 333 p->mappingType=Norm::REMOVED; 334 } 335 } 336 337 class CompositionBuilder : public 
Normalizer2DBEnumerator { 338 public: 339 CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 340 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 341 builder.addComposition(start, end, value); 342 return TRUE; 343 } 344 }; 345 346 void 347 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 348 if(norms[value].mappingType==Norm::ROUND_TRIP) { 349 if(start!=end) { 350 fprintf(stderr, 351 "gennorm2 error: same round-trip mapping for " 352 "more than 1 code point U+%04lX..U+%04lX\n", 353 (long)start, (long)end); 354 exit(U_INVALID_FORMAT_ERROR); 355 } 356 if(norms[value].cc!=0) { 357 fprintf(stderr, 358 "gennorm2 error: " 359 "U+%04lX has a round-trip mapping and ccc!=0, " 360 "not possible in Unicode normalization\n", 361 (long)start); 362 exit(U_INVALID_FORMAT_ERROR); 363 } 364 // setRoundTripMapping() ensured that there are exactly two code points. 365 const UnicodeString &m=*norms[value].mapping; 366 UChar32 lead=m.char32At(0); 367 UChar32 trail=m.char32At(m.length()-1); 368 if(getCC(lead)!=0) { 369 fprintf(stderr, 370 "gennorm2 error: " 371 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 372 "not possible in Unicode normalization\n", 373 (long)start, (long)lead); 374 exit(U_INVALID_FORMAT_ERROR); 375 } 376 // Flag for trailing character. 377 createNorm(trail)->combinesBack=TRUE; 378 // Insert (trail, composite) pair into compositions list for the lead character. 379 IcuToolErrorCode errorCode("gennorm2/addComposition()"); 380 Norm *leadNorm=createNorm(lead); 381 UVector32 *compositions=leadNorm->compositions; 382 int32_t i; 383 if(compositions==NULL) { 384 compositions=leadNorm->compositions=new UVector32(errorCode); 385 i=0; // "insert" the first pair at index 0 386 } else { 387 // Insertion sort, and check for duplicate trail characters. 
388 int32_t length; 389 const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 390 for(i=0; i<length; ++i) { 391 if(trail==pairs[i].trail) { 392 fprintf(stderr, 393 "gennorm2 error: same round-trip mapping for " 394 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 395 (long)start, (long)lead, (long)trail); 396 exit(U_INVALID_FORMAT_ERROR); 397 } 398 if(trail<pairs[i].trail) { 399 break; 400 } 401 } 402 } 403 compositions->insertElementAt(trail, 2*i, errorCode); 404 compositions->insertElementAt(start, 2*i+1, errorCode); 405 } 406 } 407 408 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 409 uint8_t lowCC, uint8_t highCC) const { 410 if((highCC-lowCC)>=2) { 411 int32_t length; 412 const CompositionPair *pairs=norm.getCompositionPairs(length); 413 for(int32_t i=0; i<length; ++i) { 414 uint8_t trailCC=getCC(pairs[i].trail); 415 if(lowCC<trailCC && trailCC<highCC) { 416 return TRUE; 417 } 418 } 419 } 420 return FALSE; 421 } 422 423 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 424 int32_t length; 425 const CompositionPair *pairs=norm.getCompositionPairs(length); 426 for(int32_t i=0; i<length; ++i) { 427 if(trail==pairs[i].trail) { 428 return pairs[i].composite; 429 } 430 if(trail<pairs[i].trail) { 431 break; 432 } 433 } 434 return U_SENTINEL; 435 } 436 437 class Decomposer : public Normalizer2DBEnumerator { 438 public: 439 Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 440 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 441 didDecompose|=builder.decompose(start, end, value); 442 return TRUE; 443 } 444 UBool didDecompose; 445 }; 446 447 UBool 448 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 449 if(norms[value].hasMapping()) { 450 Norm &norm=norms[value]; 451 const UnicodeString &m=*norm.mapping; 452 UnicodeString *decomposed=NULL; 453 const UChar *s=m.getBuffer(); 454 int32_t 
length=m.length(); 455 int32_t prev, i=0; 456 UChar32 c; 457 while(i<length) { 458 prev=i; 459 U16_NEXT(s, i, length, c); 460 if(start<=c && c<=end) { 461 fprintf(stderr, 462 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 463 (long)c); 464 exit(U_INVALID_FORMAT_ERROR); 465 } 466 const Norm &cNorm=getNormRef(c); 467 if(cNorm.hasMapping()) { 468 if(norm.mappingType==Norm::ROUND_TRIP) { 469 if(prev==0) { 470 if(cNorm.mappingType!=Norm::ROUND_TRIP) { 471 fprintf(stderr, 472 "gennorm2 error: " 473 "U+%04lX's round-trip mapping's starter " 474 "U+%04lX one-way-decomposes, " 475 "not possible in Unicode normalization\n", 476 (long)start, (long)c); 477 exit(U_INVALID_FORMAT_ERROR); 478 } 479 uint8_t myTrailCC=getCC(m.char32At(i)); 480 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 481 uint8_t cTrailCC=getCC(cTrailChar); 482 if(cTrailCC>myTrailCC) { 483 fprintf(stderr, 484 "gennorm2 error: " 485 "U+%04lX's round-trip mapping's starter " 486 "U+%04lX decomposes and the " 487 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 488 "not possible in Unicode normalization\n", 489 (long)start, (long)c, 490 (short)cTrailCC, (short)myTrailCC); 491 exit(U_INVALID_FORMAT_ERROR); 492 } 493 } else { 494 fprintf(stderr, 495 "gennorm2 error: " 496 "U+%04lX's round-trip mapping's non-starter " 497 "U+%04lX decomposes, " 498 "not possible in Unicode normalization\n", 499 (long)start, (long)c); 500 exit(U_INVALID_FORMAT_ERROR); 501 } 502 } 503 if(decomposed==NULL) { 504 decomposed=new UnicodeString(m, 0, prev); 505 } 506 decomposed->append(*cNorm.mapping); 507 } else if(Hangul::isHangul(c)) { 508 UChar buffer[3]; 509 int32_t hangulLength=Hangul::decompose(c, buffer); 510 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { 511 fprintf(stderr, 512 "gennorm2 error: " 513 "U+%04lX's round-trip mapping's non-starter " 514 "U+%04lX decomposes, " 515 "not possible in Unicode normalization\n", 516 (long)start, (long)c); 517 exit(U_INVALID_FORMAT_ERROR); 
518 } 519 if(decomposed==NULL) { 520 decomposed=new UnicodeString(m, 0, prev); 521 } 522 decomposed->append(buffer, hangulLength); 523 } else if(decomposed!=NULL) { 524 decomposed->append(m, prev, i-prev); 525 } 526 } 527 if(decomposed!=NULL) { 528 if(norm.rawMapping==NULL) { 529 // Remember the original mapping when decomposing recursively. 530 norm.rawMapping=norm.mapping; 531 } else { 532 delete norm.mapping; 533 } 534 norm.mapping=decomposed; 535 // Not norm.setMappingCP(); because the original mapping 536 // is most likely to be encodable as a delta. 537 return TRUE; 538 } 539 } 540 return FALSE; 541 } 542 543 class BuilderReorderingBuffer { 544 public: 545 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} 546 void reset() { 547 fLength=0; 548 fLastStarterIndex=-1; 549 fDidReorder=FALSE; 550 } 551 int32_t length() const { return fLength; } 552 UBool isEmpty() const { return fLength==0; } 553 int32_t lastStarterIndex() const { return fLastStarterIndex; } 554 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 555 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 556 UBool didReorder() const { return fDidReorder; } 557 void append(UChar32 c, uint8_t cc) { 558 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 559 if(cc==0) { 560 fLastStarterIndex=fLength; 561 } 562 fArray[fLength++]=(c<<8)|cc; 563 return; 564 } 565 // Let this character bubble back to its canonical order. 566 int32_t i=fLength-1; 567 while(i>fLastStarterIndex && ccAt(i)>cc) { 568 --i; 569 } 570 ++i; // after the last starter or prevCC<=cc 571 // Move this and the following characters forward one to make space. 
572 for(int32_t j=fLength; i<j; --j) { 573 fArray[j]=fArray[j-1]; 574 } 575 fArray[i]=(c<<8)|cc; 576 ++fLength; 577 fDidReorder=TRUE; 578 } 579 void toString(UnicodeString &dest) { 580 dest.remove(); 581 for(int32_t i=0; i<fLength; ++i) { 582 dest.append(charAt(i)); 583 } 584 } 585 void setComposite(UChar32 composite, int32_t combMarkIndex) { 586 fArray[fLastStarterIndex]=composite<<8; 587 // Remove the combining mark that contributed to the composite. 588 --fLength; 589 while(combMarkIndex<fLength) { 590 fArray[combMarkIndex]=fArray[combMarkIndex+1]; 591 ++combMarkIndex; 592 } 593 } 594 private: 595 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 596 int32_t fLength; 597 int32_t fLastStarterIndex; 598 UBool fDidReorder; 599 }; 600 601 void 602 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 603 UnicodeString &m=*p->mapping; 604 int32_t length=m.length(); 605 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 606 return; // writeMapping() will complain about it and print the code point. 607 } 608 const UChar *s=m.getBuffer(); 609 int32_t i=0; 610 UChar32 c; 611 while(i<length) { 612 U16_NEXT(s, i, length, c); 613 buffer.append(c, getCC(c)); 614 } 615 if(buffer.didReorder()) { 616 buffer.toString(m); 617 } 618 } 619 620 /* 621 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter(). 622 * A starter character with a mapping does not have a composition boundary after it 623 * if the character itself combines-forward (which is tested by the caller of this function), 624 * or it is deleted (mapped to the empty string), 625 * or its mapping contains no starter, 626 * or the last starter combines-forward. 
627 */ 628 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 629 if(buffer.isEmpty()) { 630 return TRUE; // maps-to-empty-string is no boundary of any kind 631 } 632 int32_t lastStarterIndex=buffer.lastStarterIndex(); 633 if(lastStarterIndex<0) { 634 return TRUE; // no starter 635 } 636 UChar32 starter=buffer.charAt(lastStarterIndex); 637 if( Hangul::isJamoL(starter) || 638 (Hangul::isJamoV(starter) && 639 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 640 ) { 641 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 642 // otherwise it is blocked. 643 return lastStarterIndex==buffer.length()-1; 644 } 645 // Note: There can be no Hangul syllable in the fully decomposed mapping. 646 const Norm *starterNorm=&getNormRef(starter); 647 if(starterNorm->compositions==NULL) { 648 return FALSE; // the last starter does not combine forward 649 } 650 // Compose as far as possible, and see if further compositions are possible. 651 uint8_t prevCC=0; 652 for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 653 uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 654 if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 655 return TRUE; 656 } 657 if( prevCC<cc && 658 (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 659 ) { 660 buffer.setComposite(starter, combMarkIndex); 661 starterNorm=&getNormRef(starter); 662 if(starterNorm->compositions==NULL) { 663 return FALSE; // the composite does not combine further 664 } 665 } else { 666 prevCC=cc; 667 ++combMarkIndex; 668 } 669 } 670 // TRUE if the final, forward-combining starter is at the end. 671 return prevCC==0; 672 } 673 674 // Requires p->hasMapping(). 675 // Returns the offset of the "first unit" from the beginning of the extraData for c. 676 // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word. 
int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
    // Units appended to dataString, in this order:
    //   [raw mapping data] [ccc/lccc word] firstUnit mapping
    // where the two bracketed parts are optional and flagged in firstUnit.
    // c is used for error messages, the ccc/lccc restriction check and the small-FCD bits.
    // Exits with U_INVALID_FORMAT_ERROR if the mapping or raw mapping is too long,
    // or if c is below MIN_CCC_LCCC_CP and has ccc!=0 or lccc!=0.
    UnicodeString &m=*p->mapping;
    int32_t length=m.length();
    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
        fprintf(stderr,
                "gennorm2 error: "
                "mapping for U+%04lX longer than maximum of %d\n",
                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Combining classes of the first and last code points of the mapping.
    int32_t leadCC, trailCC;
    if(length==0) {
        leadCC=trailCC=0;
    } else {
        leadCC=getCC(m.char32At(0));
        trailCC=getCC(m.char32At(length-1));
    }
    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Write small-FCD data.
    // One bit per block of 32 code units, indexed by c itself or,
    // for supplementary c, by its lead surrogate.
    if((leadCC|trailCC)!=0) {
        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
    }
    // Write the mapping & raw mapping extraData.
    int32_t firstUnit=length|(trailCC<<8);
    int32_t preMappingLength=0;
    if(p->rawMapping!=NULL) {
        UnicodeString &rm=*p->rawMapping;
        int32_t rmLength=rm.length();
        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "raw mapping for U+%04lX longer than maximum of %d\n",
                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
            exit(U_INVALID_FORMAT_ERROR);
        }
        UChar rm0=rm.charAt(0);
        if( rmLength==length-1 &&
            // 99: overlong substring lengths get pinned to remainder lengths anyway
            0==rm.compare(1, 99, m, 2, 99) &&
            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
        ) {
            // Compression:
            // rawMapping=rm0+mapping.substring(2) -> store only rm0
            //
            // The raw mapping is the same as the final mapping after replacing
            // the final mapping's first two code units with the raw mapping's first one.
            // In this case, we store only that first unit, rm0.
            // This helps with a few hundred mappings.
            dataString.append(rm0);
            preMappingLength=1;
        } else {
            // Store the raw mapping with its length.
            dataString.append(rm);
            dataString.append((UChar)rmLength);
            preMappingLength=rmLength+1;
        }
        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
    }
    // ccc of c in the low byte, lccc of the mapping in the high byte;
    // the word is omitted when both are 0.
    int32_t cccLccc=p->cc|(leadCC<<8);
    if(cccLccc!=0) {
        dataString.append((UChar)cccLccc);
        ++preMappingLength;
        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
    }
    if(p->hasNoCompBoundaryAfter) {
        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
    }
    dataString.append((UChar)firstUnit);
    dataString.append(m);
    return preMappingLength;
}

// Requires p->compositions!=NULL.
// Appends the compositions list of p to dataString, two or three units per pair,
// with COMP_1_LAST_TUPLE set in the first unit of the last pair.
// c is only used for the error message.
// Exits with U_INVALID_FORMAT_ERROR if p->cc!=0.
void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
    if(p->cc!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    int32_t length;
    const CompositionPair *pairs=p->getCompositionPairs(length);
    for(int32_t i=0; i<length; ++i) {
        const CompositionPair &pair=pairs[i];
        // 22 bits for the composite character and whether it combines forward.
        UChar32 compositeAndFwd=pair.composite<<1;
        if(getNormRef(pair.composite).compositions!=NULL) {
            compositeAndFwd|=1;  // The composite character also combines-forward.
        }
        // Encode most pairs in two units and some in three.
        // thirdUnit<0 means that there is no third unit.
        int32_t firstUnit, secondUnit, thirdUnit;
        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
            if(compositeAndFwd<=0xffff) {
                firstUnit=pair.trail<<1;
                secondUnit=compositeAndFwd;
                thirdUnit=-1;
            } else {
                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
                secondUnit=compositeAndFwd>>16;
                thirdUnit=compositeAndFwd;
            }
        } else {
            // Large trail: its bits are split between the first and second units.
            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
                      Normalizer2Impl::COMP_1_TRIPLE;
            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
                       (compositeAndFwd>>16);
            thirdUnit=compositeAndFwd;
        }
        // Set the high bit of the first unit if this is the last composition pair.
        if(i==(length-1)) {
            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
        }
        // The (UChar) casts keep only the low 16 bits of each unit.
        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
        if(thirdUnit>=0) {
            dataString.append((UChar)thirdUnit);
        }
    }
}

/*
 * Trie enumerator that calls builder.writeExtraData() for every code point
 * with a non-zero trie value, and holds the extra-data strings that
 * writeExtraData() fills, one per kind of data.
 * Exits with U_INTERNAL_PROGRAM_ERROR if a non-zero value is shared by
 * a range of more than one code point.
 */
class ExtraDataWriter : public Normalizer2DBEnumerator {
public:
    ExtraDataWriter(Normalizer2DataBuilder &b) :
        Normalizer2DBEnumerator(b),
        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
        yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
        if(value!=0) {
            if(start!=end) {
                fprintf(stderr,
                        "gennorm2 error: unexpected shared data for "
                        "multiple code points U+%04lX..U+%04lX\n",
                        (long)start, (long)end);
                exit(U_INTERNAL_PROGRAM_ERROR);
            }
            builder.writeExtraData(start, value, *this);
        }
        return TRUE;
    }
    UnicodeString maybeYesCompositions;
    UnicodeString yesYesCompositions;
    UnicodeString yesNoMappingsAndCompositions;
    UnicodeString yesNoMappingsOnly;
    UnicodeString noNoMappings;
    // Maps the units written for a one-way mapping to their offset+1
    // in noNoMappings, for sharing duplicates (see writeExtraData()).
    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
};

/*
 * Writes the mapping and/or compositions of code point c (Norm index value)
 * into the matching string of writer, and stores the resulting offset
 * together with its Norm::OffsetType in p->offset.
 * For characters without a mapping, also sets the small-FCD bit if ccc!=0.
 * Exits with U_INVALID_FORMAT_ERROR for the inconsistent property
 * combinations reported by the error messages below.
 */
void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
    Norm *p=norms+value;
    if(!p->hasMapping()) {
        // Write small-FCD data.
        // There is similar code in writeMapping() for characters that do have a mapping.
        if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
                    (long)c);
            exit(U_INVALID_FORMAT_ERROR);
        }
        if(p->cc!=0) {
            UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
            smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
        }
    }
    if(p->combinesBack) {
        if(p->hasMapping()) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
                    (long)c);
            exit(U_INVALID_FORMAT_ERROR);
        }
        if(p->compositions!=NULL) {
            p->offset=
                (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
                Norm::OFFSET_MAYBE_YES;
            writeCompositions(c, p, writer.maybeYesCompositions);
        }
    } else if(!p->hasMapping()) {
        if(p->compositions!=NULL) {
            p->offset=
                (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
                Norm::OFFSET_YES_YES;
            writeCompositions(c, p, writer.yesYesCompositions);
        }
    } else if(p->mappingType==Norm::ROUND_TRIP) {
        // The offset points at the mapping's first unit, that is, after
        // the optional units that writeMapping() puts before it.
        if(p->compositions!=NULL) {
            int32_t offset=writer.yesNoMappingsAndCompositions.length()+
                           writeMapping(c, p, writer.yesNoMappingsAndCompositions);
            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
            writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
        } else {
            int32_t offset=writer.yesNoMappingsOnly.length()+
                           writeMapping(c, p, writer.yesNoMappingsOnly);
            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
        }
    } else /* one-way */ {
        if(p->compositions!=NULL) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "U+%04lX combines-forward and has a one-way mapping, "
                    "not possible in Unicode normalization\n",
                    (long)c);
            exit(U_INVALID_FORMAT_ERROR);
        }
        if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
            // Try a compact, algorithmic encoding.
            // Only for ccc=0, because we can't store additional information
            // and we do not recursively follow an algorithmic encoding for access to the ccc.
            //
            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
            // if the mappingCP decomposes further, to ensure that there is a place to store it.
            // We want to see that the final mapping does not have exactly 1 code point,
            // or else we would have to recursively ensure that the final mapping is stored
            // in normal extraData.
            if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
                int32_t delta=p->mappingCP-c;
                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
                    p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
                }
            }
        }
        // offset is still 0 here unless the delta encoding was chosen just above.
        if(p->offset==0) {
            int32_t oldNoNoLength=writer.noNoMappings.length();
            int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
            // The units that writeMapping() just appended.
            UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
            int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
            if(previousOffset!=0) {
                // Duplicate, remove the new units and point to the old ones.
                writer.noNoMappings.truncate(oldNoNoLength);
                p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
            } else {
                // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
                IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
                writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
                p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
            }
        }
    }
}

// Trie enumerator that forwards every range to builder.writeNorm16().
class Norm16Writer : public Normalizer2DBEnumerator {
public:
    Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
        builder.writeNorm16(start, end, value);
        return TRUE;
    }
};

/*
 * Computes the norm16 value for the Norm at index value from its offset type,
 * offset and flags, stores it in norm16Trie for start..end, and lowers
 * indexes[IX_MIN_DECOMP_NO_CP] and indexes[IX_MIN_COMP_NO_MAYBE_CP] to start
 * where applicable. Ranges with value 0 are ignored.
 * Reads several indexes[] thresholds, which must have been set before this is called.
 */
void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
    if(value!=0) {
        const Norm *p=norms+value;
        int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
        int32_t norm16=0;
        UBool isDecompNo=FALSE;
        UBool isCompNoMaybe=FALSE;
        switch(p->offset&Norm::OFFSET_MASK) {
        case Norm::OFFSET_NONE:
            // No mapping, no compositions list.
            if(p->combinesBack) {
                norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
                isDecompNo=(UBool)(p->cc!=0);
                isCompNoMaybe=TRUE;
            } else if(p->cc!=0) {
                norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
                isDecompNo=isCompNoMaybe=TRUE;
            }
            break;
        case Norm::OFFSET_MAYBE_YES:
            norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
            isCompNoMaybe=TRUE;
            break;
        case Norm::OFFSET_YES_YES:
            norm16=offset;
            break;
        case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
            isDecompNo=TRUE;
            break;
        case Norm::OFFSET_YES_NO_MAPPING_ONLY:
            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
            isDecompNo=TRUE;
            break;
        case Norm::OFFSET_NO_NO:
            norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
            isDecompNo=isCompNoMaybe=TRUE;
            break;
        case Norm::OFFSET_DELTA:
            norm16=getCenterNoNoDelta()+offset;
            isDecompNo=isCompNoMaybe=TRUE;
            break;
        default:  // Should not occur.
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
        IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
        utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
        if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
            indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
        }
        if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
            indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
        }
    }
}

/*
 * Stores the fixed norm16 values for the Jamo and Hangul syllable ranges
 * of HangulIterator in norm16Trie and lowers the minimum-code-point indexes
 * accordingly.
 * Exits with U_INVALID_FORMAT_ERROR if any of these code points already has
 * a non-zero norm16 value.
 */
void Normalizer2DataBuilder::setHangulData() {
    HangulIterator hi;
    const HangulIterator::Range *range;
    // Check that none of the Hangul/Jamo code points have data.
    while((range=hi.nextRange())!=NULL) {
        for(UChar32 c=range->start; c<range->limit; ++c) {
            if(utrie2_get32(norm16Trie, c)!=0) {
                fprintf(stderr,
                        "gennorm2 error: "
                        "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
                        (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
        }
    }
    // Set data for algorithmic runtime handling.
    IcuToolErrorCode errorCode("gennorm2/setHangulData()");
    hi.reset();
    while((range=hi.nextRange())!=NULL) {
        uint16_t norm16=range->norm16;
        if(norm16==0) {
            norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
            if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
                indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
            }
        } else {
            // Note: this branch is also taken for the Jamo L range (norm16==1).
            if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
                indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
            }
        }
        // range->limit is exclusive, utrie2_setRange32() takes an inclusive end.
        utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
        errorCode.assertSuccess();
    }
}

U_CDECL_BEGIN

// utrie2_enum() callback: context points to a uint32_t that is raised
// to the largest value seen so far. Always continues the enumeration.
static UBool U_CALLCONV
enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
    uint32_t *pMaxValue=(uint32_t *)context;
    if(value>*pMaxValue) {
        *pMaxValue=value;
    }
    return TRUE;
}

U_CDECL_END

void Normalizer2DataBuilder::processData() {
    IcuToolErrorCode errorCode("gennorm2/processData()");
    norm16Trie=utrie2_open(0, 0, errorCode);
    errorCode.assertSuccess();

    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());

    Decomposer decomposer(*this);
    do {
        decomposer.didDecompose=FALSE;
        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
    } while(decomposer.didDecompose);

    BuilderReorderingBuffer buffer;
    int32_t normsLength=utm_countItems(normMem);
    for(int32_t i=1; i<normsLength; ++i) {
        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
        // in Normalizer2Impl::hasCompBoundaryAfter().
        // For details see the comments on hasNoCompBoundaryAfter(buffer).
1058 const Norm &norm=norms[i]; 1059 if(norm.hasMapping()) { 1060 if(norm.compositions!=NULL) { 1061 norms[i].hasNoCompBoundaryAfter=TRUE; 1062 } else { 1063 buffer.reset(); 1064 reorder(norms+i, buffer); 1065 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 1066 } 1067 } 1068 } 1069 1070 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 1071 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 1072 1073 ExtraDataWriter extraDataWriter(*this); 1074 utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 1075 1076 extraData=extraDataWriter.maybeYesCompositions; 1077 extraData.append(extraDataWriter.yesYesCompositions). 1078 append(extraDataWriter.yesNoMappingsAndCompositions). 1079 append(extraDataWriter.yesNoMappingsOnly). 1080 append(extraDataWriter.noNoMappings); 1081 // Pad to even length for 4-byte alignment of following data. 1082 if(extraData.length()&1) { 1083 extraData.append((UChar)0); 1084 } 1085 1086 indexes[Normalizer2Impl::IX_MIN_YES_NO]= 1087 extraDataWriter.yesYesCompositions.length(); 1088 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]= 1089 indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 1090 extraDataWriter.yesNoMappingsAndCompositions.length(); 1091 indexes[Normalizer2Impl::IX_MIN_NO_NO]= 1092 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+ 1093 extraDataWriter.yesNoMappingsOnly.length(); 1094 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 1095 indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 1096 extraDataWriter.noNoMappings.length(); 1097 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 1098 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 1099 extraDataWriter.maybeYesCompositions.length(); 1100 1101 int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 1102 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 1103 fprintf(stderr, 1104 "gennorm2 error: " 1105 "data structure overflow, too much mapping composition data\n"); 1106 exit(U_BUFFER_OVERFLOW_ERROR); 1107 } 1108 1109 utrie2_enum(normTrie, NULL, 
enumRangeHandler, Norm16Writer(*this).ptr()); 1110 1111 setHangulData(); 1112 1113 // Look for the "worst" norm16 value of any supplementary code point 1114 // corresponding to a lead surrogate, and set it as that surrogate's value. 1115 // Enables quick check inner loops to look at only code units. 1116 // 1117 // We could be more sophisticated: 1118 // We could collect a bit set for whether there are values in the different 1119 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 1120 // and select the best value that only breaks the composition and/or decomposition 1121 // inner loops if necessary. 1122 // However, that seems like overkill for an optimization for supplementary characters. 1123 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1124 uint32_t maxValue=utrie2_get32(norm16Trie, lead); 1125 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 1126 if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 1127 maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 1128 ) { 1129 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 1130 // Otherwise it might end up at something like JAMO_VT which stays in 1131 // the inner decomposition quick check loop. 1132 maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 1133 } 1134 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 1135 } 1136 1137 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 1138 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 1139 // which is harmless. 1140 // As a result, the minimum code points are always BMP code points. 
1141 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 1142 if(minCP>=0x10000) { 1143 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 1144 } 1145 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 1146 if(minCP>=0x10000) { 1147 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 1148 } 1149 1150 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1151 norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 1152 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 1153 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 1154 errorCode.errorName()); 1155 exit(errorCode.reset()); 1156 } 1157 errorCode.reset(); 1158 1159 int32_t offset=(int32_t)sizeof(indexes); 1160 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 1161 offset+=norm16TrieLength; 1162 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 1163 offset+=extraData.length()*2; 1164 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 1165 offset+=sizeof(smallFCD); 1166 int32_t totalSize=offset; 1167 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 1168 indexes[i]=totalSize; 1169 } 1170 1171 if(beVerbose) { 1172 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 1173 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 1174 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); 1175 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 1176 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 1177 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 1178 printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 1179 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); 1180 printf("minNoNo: 0x%04x\n", 
(int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 1181 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 1182 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 1183 } 1184 1185 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 1186 if(0==memcmp(nullVersion, unicodeVersion, 4)) { 1187 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 1188 } 1189 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1190 } 1191 1192 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1193 processData(); 1194 1195 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1196 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1197 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1198 errorCode.assertSuccess(); 1199 1200 UNewDataMemory *pData= 1201 udata_create(NULL, NULL, filename, &dataInfo, 1202 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 1203 if(errorCode.isFailure()) { 1204 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 1205 filename, errorCode.errorName()); 1206 exit(errorCode.reset()); 1207 } 1208 udata_writeBlock(pData, indexes, sizeof(indexes)); 1209 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 1210 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 1211 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 1212 int32_t writtenSize=udata_finish(pData, errorCode); 1213 if(errorCode.isFailure()) { 1214 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 1215 exit(errorCode.reset()); 1216 } 1217 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 1218 if(writtenSize!=totalSize) { 1219 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 1220 (long)writtenSize, (long)totalSize); 1221 exit(U_INTERNAL_PROGRAM_ERROR); 1222 } 1223 } 1224 1225 void 1226 Normalizer2DataBuilder::writeCSourceFile(const char *filename) { 
1227 processData(); 1228 1229 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); 1230 const char *basename=findBasename(filename); 1231 CharString path(filename, (int32_t)(basename-filename), errorCode); 1232 CharString dataName(basename, errorCode); 1233 const char *extension=strrchr(basename, '.'); 1234 if(extension!=NULL) { 1235 dataName.truncate((int32_t)(extension-basename)); 1236 } 1237 errorCode.assertSuccess(); 1238 1239 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1240 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1241 errorCode.assertSuccess(); 1242 1243 FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp"); 1244 if(f==NULL) { 1245 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", 1246 filename); 1247 exit(U_FILE_ACCESS_ERROR); 1248 return; 1249 } 1250 char line[100]; 1251 sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data()); 1252 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); 1253 sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data()); 1254 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); 1255 sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", 1256 dataName.data()); 1257 usrc_writeArray(f, 1258 line, 1259 indexes, 32, Normalizer2Impl::IX_COUNT, 1260 "\n};\n\n"); 1261 sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data()); 1262 usrc_writeUTrie2Arrays(f, 1263 line, NULL, 1264 norm16Trie, 1265 "\n};\n\n"); 1266 sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data()); 1267 usrc_writeArray(f, 1268 line, 1269 extraData.getBuffer(), 16, extraData.length(), 1270 "\n};\n\n"); 1271 sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data()); 1272 usrc_writeArray(f, 1273 line, 1274 smallFCD, 8, sizeof(smallFCD), 1275 "\n};\n\n"); 1276 /*fputs( // TODO 
1277 "static const UCaseProps %s_singleton={\n" 1278 " NULL,\n" 1279 " %s_indexes,\n" 1280 " %s_extraData,\n" 1281 " %s_smallFCD,\n", 1282 f);*/ 1283 sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data()); 1284 char line2[100]; 1285 sprintf(line2, "%s_trieIndex", dataName.data()); 1286 usrc_writeUTrie2Struct(f, 1287 line, 1288 norm16Trie, line2, NULL, 1289 "};\n"); 1290 fclose(f); 1291 } 1292 1293 U_NAMESPACE_END 1294 1295 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1296 1297 /* 1298 * Hey, Emacs, please set the following: 1299 * 1300 * Local Variables: 1301 * indent-tabs-mode: nil 1302 * End: 1303 */ 1304