1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: n2builder.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov25 16 * created by: Markus W. Scherer 17 * 18 * Builds Normalizer2 data and writes a binary .nrm file. 19 * For the file format see source/common/normalizer2impl.h. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "n2builder.h" 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <vector> 29 #include "unicode/errorcode.h" 30 #include "unicode/localpointer.h" 31 #include "unicode/putil.h" 32 #include "unicode/ucptrie.h" 33 #include "unicode/udata.h" 34 #include "unicode/umutablecptrie.h" 35 #include "unicode/uniset.h" 36 #include "unicode/unistr.h" 37 #include "unicode/usetiter.h" 38 #include "unicode/ustring.h" 39 #include "charstr.h" 40 #include "extradata.h" 41 #include "hash.h" 42 #include "normalizer2impl.h" 43 #include "norms.h" 44 #include "toolutil.h" 45 #include "unewdata.h" 46 #include "uvectr32.h" 47 #include "writesrc.h" 48 49 #if !UCONFIG_NO_NORMALIZATION 50 51 /* UDataInfo cf. udata.h */ 52 static UDataInfo dataInfo={ 53 sizeof(UDataInfo), 54 0, 55 56 U_IS_BIG_ENDIAN, 57 U_CHARSET_FAMILY, 58 U_SIZEOF_UCHAR, 59 0, 60 61 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 62 { 4, 0, 0, 0 }, /* formatVersion */ 63 { 11, 0, 0, 0 } /* dataVersion (Unicode version) */ 64 }; 65 66 U_NAMESPACE_BEGIN 67 68 class HangulIterator { 69 public: 70 struct Range { 71 UChar32 start, end; 72 }; 73 74 HangulIterator() : rangeIndex(0) {} 75 const Range *nextRange() { 76 if(rangeIndex<UPRV_LENGTHOF(ranges)) { 77 return ranges+rangeIndex++; 78 } else { 79 return NULL; 80 } 81 } 82 private: 83 static const Range ranges[4]; 84 int32_t rangeIndex; 85 }; 86 87 const HangulIterator::Range HangulIterator::ranges[4]={ 88 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END }, 89 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END }, 90 // JAMO_T_BASE+1: not U+11A7 91 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END }, 92 { Hangul::HANGUL_BASE, Hangul::HANGUL_END }, 93 }; 94 95 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 96 norms(errorCode), 97 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), 98 norm16TrieBytes(nullptr), norm16TrieLength(0) { 99 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 100 memset(indexes, 0, sizeof(indexes)); 101 memset(smallFCD, 0, sizeof(smallFCD)); 102 } 103 104 Normalizer2DataBuilder::~Normalizer2DataBuilder() { 105 delete[] norm16TrieBytes; 106 } 107 108 void 109 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 110 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 111 UVersionInfo version; 112 u_versionFromString(version, v); 113 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 114 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 115 ) { 116 char buffer[U_MAX_VERSION_STRING_LENGTH]; 117 u_versionToString(unicodeVersion, buffer); 118 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", 119 buffer, v); 120 exit(U_ILLEGAL_ARGUMENT_ERROR); 121 } 122 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 123 } 124 125 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 126 if(p!=NULL) { 127 if(p->mappingType!=Norm::NONE) { 128 if( overrideHandling==OVERRIDE_NONE || 129 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 130 ) { 131 fprintf(stderr, 132 "error in gennorm2 phase %d: " 133 "not permitted to override mapping for U+%04lX from phase %d\n", 134 (int)phase, (long)c, (int)p->mappingPhase); 135 exit(U_INVALID_FORMAT_ERROR); 136 } 137 delete p->mapping; 138 p->mapping=NULL; 139 } 140 p->mappingPhase=phase; 141 } 142 return p; 143 } 144 145 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 146 overrideHandling=oh; 147 ++phase; 148 } 149 150 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 151 norms.createNorm(c)->cc=cc; 152 norms.ccSet.add(c); 153 } 154 155 static UBool isWellFormed(const UnicodeString &s) { 156 UErrorCode errorCode=U_ZERO_ERROR; 157 u_strToUTF8(NULL, 0, NULL, toUCharPtr(s.getBuffer()), s.length(), &errorCode); 158 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 159 } 160 161 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 162 if(!isWellFormed(m)) { 163 fprintf(stderr, 164 "error in gennorm2 phase %d: " 165 "illegal one-way mapping from U+%04lX to malformed string\n", 166 (int)phase, (long)c); 167 exit(U_INVALID_FORMAT_ERROR); 168 } 169 Norm *p=checkNormForMapping(norms.createNorm(c), c); 170 p->mapping=new UnicodeString(m); 171 p->mappingType=Norm::ONE_WAY; 172 p->setMappingCP(); 173 norms.mappingSet.add(c); 174 } 175 176 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 177 if(U_IS_SURROGATE(c)) { 178 fprintf(stderr, 179 "error in gennorm2 phase %d: " 180 "illegal round-trip mapping from surrogate code point U+%04lX\n", 181 (int)phase, (long)c); 182 exit(U_INVALID_FORMAT_ERROR); 183 } 184 if(!isWellFormed(m)) { 185 fprintf(stderr, 186 "error in gennorm2 phase %d: " 187 "illegal round-trip mapping from U+%04lX to malformed string\n", 188 (int)phase, (long)c); 189 exit(U_INVALID_FORMAT_ERROR); 190 } 191 int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length()); 192 if(numCP!=2) { 193 fprintf(stderr, 194 "error in gennorm2 phase %d: " 195 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 196 (int)phase, (long)c, (int)numCP); 197 exit(U_INVALID_FORMAT_ERROR); 198 } 199 Norm *p=checkNormForMapping(norms.createNorm(c), c); 200 p->mapping=new UnicodeString(m); 201 p->mappingType=Norm::ROUND_TRIP; 202 p->mappingCP=U_SENTINEL; 203 norms.mappingSet.add(c); 204 } 205 206 void Normalizer2DataBuilder::removeMapping(UChar32 c) { 207 // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data. 208 Norm *p=checkNormForMapping(norms.createNorm(c), c); 209 p->mappingType=Norm::REMOVED; 210 norms.mappingSet.add(c); 211 } 212 213 UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, 214 Norm::MappingType mappingType) const { 215 if(buffer.isEmpty()) { 216 return FALSE; // Maps-to-empty-string is no boundary of any kind. 217 } 218 int32_t lastStarterIndex=buffer.lastStarterIndex(); 219 if(lastStarterIndex<0) { 220 return FALSE; // no starter 221 } 222 const int32_t lastIndex=buffer.length()-1; 223 if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) { 224 // One-way mapping where after the last starter is at least one combining mark 225 // with a combining class greater than 1, 226 // which means that another combining mark can reorder before it. 227 // By contrast, in a round-trip mapping this does not prevent a boundary as long as 228 // the starter or composite does not combine-forward with a following combining mark. 229 return FALSE; 230 } 231 UChar32 starter=buffer.charAt(lastStarterIndex); 232 if(lastStarterIndex==0 && norms.combinesBack(starter)) { 233 // The last starter is at the beginning of the mapping and combines backward. 234 return FALSE; 235 } 236 if(Hangul::isJamoL(starter) || 237 (Hangul::isJamoV(starter) && 238 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) { 239 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 240 // otherwise it is blocked. 241 return lastStarterIndex!=lastIndex; 242 } 243 // Note: There can be no Hangul syllable in the fully decomposed mapping. 244 245 // Multiple starters can combine into one. 246 // Look for the first of the last sequence of starters, excluding Jamos. 247 int32_t i=lastStarterIndex; 248 UChar32 c; 249 while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) { 250 starter=c; 251 --i; 252 } 253 // Compose as far as possible, and see if further compositions with 254 // characters following this mapping are possible. 255 const Norm *starterNorm=norms.getNorm(starter); 256 if(i==lastStarterIndex && 257 (starterNorm==nullptr || starterNorm->compositions==nullptr)) { 258 return TRUE; // The last starter does not combine forward. 259 } 260 uint8_t prevCC=0; 261 while(++i<buffer.length()) { 262 uint8_t cc=buffer.ccAt(i); // !=0 if after last starter 263 if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) { 264 // The starter combines with a mark that reorders before the current one. 265 return FALSE; 266 } 267 UChar32 c=buffer.charAt(i); 268 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && 269 norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) { 270 // The starter combines with c into a composite replacement starter. 271 starterNorm=norms.getNorm(starter); 272 if(i>=lastStarterIndex && 273 (starterNorm==nullptr || starterNorm->compositions==nullptr)) { 274 return TRUE; // The composite does not combine further. 275 } 276 // Keep prevCC because we "removed" the combining mark. 277 } else if(cc==0) { 278 starterNorm=norms.getNorm(c); 279 if(i==lastStarterIndex && 280 (starterNorm==nullptr || starterNorm->compositions==nullptr)) { 281 return TRUE; // The new starter does not combine forward. 282 } 283 prevCC=0; 284 } else { 285 prevCC=cc; 286 } 287 } 288 if(prevCC==0) { 289 return FALSE; // forward-combining starter at the very end 290 } 291 if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) { 292 // The starter combines with another mark. 293 return FALSE; 294 } 295 return TRUE; 296 } 297 298 UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const { 299 if(buffer.lastStarterIndex()<0) { 300 return FALSE; // no starter 301 } 302 const Norm *starterNorm=nullptr; 303 uint8_t prevCC=0; 304 for(int32_t i=0; i<buffer.length(); ++i) { 305 UChar32 c=buffer.charAt(i); 306 uint8_t cc=buffer.ccAt(i); 307 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && 308 norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) { 309 return TRUE; // normal composite 310 } else if(cc==0) { 311 if(Hangul::isJamoL(c)) { 312 if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) { 313 return TRUE; // Hangul syllable 314 } 315 starterNorm=nullptr; 316 } else { 317 starterNorm=norms.getNorm(c); 318 } 319 } 320 prevCC=cc; 321 } 322 return FALSE; 323 } 324 325 void Normalizer2DataBuilder::postProcess(Norm &norm) { 326 // Prerequisites: Compositions are built, mappings are recursively decomposed. 327 // Mappings are not yet in canonical order. 328 // 329 // This function works on a Norm struct. We do not know which code point(s) map(s) to it. 330 // Therefore, we cannot compute algorithmic mapping deltas here. 331 // Error conditions are checked, but printed later when we do know the offending code point. 332 if(norm.hasMapping()) { 333 if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) { 334 norm.error="mapping longer than maximum of 31"; 335 return; 336 } 337 // Ensure canonical order. 338 BuilderReorderingBuffer buffer; 339 if(norm.rawMapping!=nullptr) { 340 norms.reorder(*norm.rawMapping, buffer); 341 buffer.reset(); 342 } 343 norms.reorder(*norm.mapping, buffer); 344 if(buffer.isEmpty()) { 345 // A character that is deleted (maps to an empty string) must 346 // get the worst-case lccc and tccc values because arbitrary 347 // characters on both sides will become adjacent. 348 norm.leadCC=1; 349 norm.trailCC=0xff; 350 } else { 351 norm.leadCC=buffer.ccAt(0); 352 norm.trailCC=buffer.ccAt(buffer.length()-1); 353 } 354 355 norm.hasCompBoundaryBefore= 356 !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0)); 357 norm.hasCompBoundaryAfter= 358 norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType); 359 360 if(norm.combinesBack) { 361 norm.error="combines-back and decomposes, not possible in Unicode normalization"; 362 } else if(norm.mappingType==Norm::ROUND_TRIP) { 363 if(norm.compositions!=NULL) { 364 norm.type=Norm::YES_NO_COMBINES_FWD; 365 } else { 366 norm.type=Norm::YES_NO_MAPPING_ONLY; 367 } 368 } else { // one-way mapping 369 if(norm.compositions!=NULL) { 370 norm.error="combines-forward and has a one-way mapping, " 371 "not possible in Unicode normalization"; 372 } else if(buffer.isEmpty()) { 373 norm.type=Norm::NO_NO_EMPTY; 374 } else if(!norm.hasCompBoundaryBefore) { 375 norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC; 376 } else if(mappingRecomposes(buffer)) { 377 norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE; 378 } else { 379 // The mapping is comp-normalized. 380 norm.type=Norm::NO_NO_COMP_YES; 381 } 382 } 383 } else { // no mapping 384 norm.leadCC=norm.trailCC=norm.cc; 385 386 norm.hasCompBoundaryBefore= 387 norm.cc==0 && !norm.combinesBack; 388 norm.hasCompBoundaryAfter= 389 norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr; 390 391 if(norm.combinesBack) { 392 if(norm.compositions!=nullptr) { 393 // Earlier code checked ccc=0. 394 norm.type=Norm::MAYBE_YES_COMBINES_FWD; 395 } else { 396 norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc 397 } 398 } else if(norm.compositions!=nullptr) { 399 // Earlier code checked ccc=0. 400 norm.type=Norm::YES_YES_COMBINES_FWD; 401 } else if(norm.cc!=0) { 402 norm.type=Norm::YES_YES_WITH_CC; 403 } else { 404 norm.type=Norm::INERT; 405 } 406 } 407 } 408 409 class Norm16Writer : public Norms::Enumerator { 410 public: 411 Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) : 412 Norms::Enumerator(n), builder(b), norm16Trie(trie) {} 413 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE { 414 builder.writeNorm16(norm16Trie, start, end, norm); 415 } 416 Normalizer2DataBuilder &builder; 417 UMutableCPTrie *norm16Trie; 418 }; 419 420 void Normalizer2DataBuilder::setSmallFCD(UChar32 c) { 421 UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 422 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 423 } 424 425 void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) { 426 if((norm.leadCC|norm.trailCC)!=0) { 427 for(UChar32 c=start; c<=end; ++c) { 428 setSmallFCD(c); 429 } 430 } 431 432 int32_t norm16; 433 switch(norm.type) { 434 case Norm::INERT: 435 norm16=Normalizer2Impl::INERT; 436 break; 437 case Norm::YES_YES_COMBINES_FWD: 438 norm16=norm.offset*2; 439 break; 440 case Norm::YES_NO_COMBINES_FWD: 441 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2; 442 break; 443 case Norm::YES_NO_MAPPING_ONLY: 444 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2; 445 break; 446 case Norm::NO_NO_COMP_YES: 447 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2; 448 break; 449 case Norm::NO_NO_COMP_BOUNDARY_BEFORE: 450 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2; 451 break; 452 case Norm::NO_NO_COMP_NO_MAYBE_CC: 453 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2; 454 break; 455 case Norm::NO_NO_EMPTY: 456 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2; 457 break; 458 case Norm::NO_NO_DELTA: 459 { 460 // Positive offset from minNoNoDelta, shifted left for additional bits. 461 int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT; 462 if(norm.trailCC==0) { 463 // DELTA_TCCC_0==0 464 } else if(norm.trailCC==1) { 465 offset|=Normalizer2Impl::DELTA_TCCC_1; 466 } else { 467 offset|=Normalizer2Impl::DELTA_TCCC_GT_1; 468 } 469 norm16=getMinNoNoDelta()+offset; 470 break; 471 } 472 case Norm::MAYBE_YES_COMBINES_FWD: 473 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2; 474 break; 475 case Norm::MAYBE_YES_SIMPLE: 476 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255 477 break; 478 case Norm::YES_YES_WITH_CC: 479 U_ASSERT(norm.cc!=0); 480 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255 481 break; 482 default: // Should not occur. 483 exit(U_INTERNAL_PROGRAM_ERROR); 484 } 485 U_ASSERT((norm16&1)==0); 486 if(norm.hasCompBoundaryAfter) { 487 norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; 488 } 489 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 490 umutablecptrie_setRange(norm16Trie, start, end, (uint32_t)norm16, errorCode); 491 492 // Set the minimum code points for real data lookups in the quick check loops. 493 UBool isDecompNo= 494 (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) || 495 norm.cc!=0; 496 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 497 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 498 } 499 UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES; 500 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 501 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 502 } 503 if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) { 504 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start; 505 } 506 } 507 508 void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) { 509 HangulIterator hi; 510 const HangulIterator::Range *range; 511 // Check that none of the Hangul/Jamo code points have data. 512 while((range=hi.nextRange())!=NULL) { 513 for(UChar32 c=range->start; c<=range->end; ++c) { 514 if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) { 515 fprintf(stderr, 516 "gennorm2 error: " 517 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 518 (long)c); 519 exit(U_INVALID_FORMAT_ERROR); 520 } 521 } 522 } 523 // Set data for algorithmic runtime handling. 524 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 525 526 // Jamo V/T are maybeYes 527 if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 528 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE; 529 } 530 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END, 531 Normalizer2Impl::JAMO_L, errorCode); 532 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END, 533 Normalizer2Impl::JAMO_VT, errorCode); 534 // JAMO_T_BASE+1: not U+11A7 535 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END, 536 Normalizer2Impl::JAMO_VT, errorCode); 537 538 // Hangul LV encoded as minYesNo 539 uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO]; 540 // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER 541 uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]| 542 Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; 543 if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 544 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE; 545 } 546 // Set the first LV, then write all other Hangul syllables as LVT, 547 // then overwrite the remaining LV. 548 umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode); 549 umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode); 550 UChar32 c=Hangul::HANGUL_BASE; 551 while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) { 552 umutablecptrie_set(norm16Trie, c, lv, errorCode); 553 } 554 errorCode.assertSuccess(); 555 } 556 557 LocalUCPTriePointer Normalizer2DataBuilder::processData() { 558 // Build composition lists before recursive decomposition, 559 // so that we still have the raw, pair-wise mappings. 560 CompositionBuilder compBuilder(norms); 561 norms.enumRanges(compBuilder); 562 563 // Recursively decompose all mappings. 564 Decomposer decomposer(norms); 565 do { 566 decomposer.didDecompose=FALSE; 567 norms.enumRanges(decomposer); 568 } while(decomposer.didDecompose); 569 570 // Set the Norm::Type and other properties. 571 int32_t normsLength=norms.length(); 572 for(int32_t i=1; i<normsLength; ++i) { 573 postProcess(norms.getNormRefByIndex(i)); 574 } 575 576 // Write the properties, mappings and composition lists to 577 // appropriate parts of the "extra data" array. 578 ExtraData extra(norms, optimization==OPTIMIZE_FAST); 579 norms.enumRanges(extra); 580 581 extraData=extra.yesYesCompositions; 582 indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2; 583 extraData.append(extra.yesNoMappingsAndCompositions); 584 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2; 585 extraData.append(extra.yesNoMappingsOnly); 586 indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2; 587 extraData.append(extra.noNoMappingsCompYes); 588 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2; 589 extraData.append(extra.noNoMappingsCompBoundaryBefore); 590 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2; 591 extraData.append(extra.noNoMappingsCompNoMaybeCC); 592 indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2; 593 extraData.append(extra.noNoMappingsEmpty); 594 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2; 595 596 // Pad the maybeYesCompositions length to a multiple of 4, 597 // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center. 598 while(extra.maybeYesCompositions.length()&3) { 599 extra.maybeYesCompositions.append((UChar)0); 600 } 601 extraData.insert(0, extra.maybeYesCompositions); 602 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 603 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 604 extra.maybeYesCompositions.length()*2; 605 606 // Pad to even length for 4-byte alignment of following data. 607 if(extraData.length()&1) { 608 extraData.append((UChar)0); 609 } 610 611 int32_t minNoNoDelta=getMinNoNoDelta(); 612 U_ASSERT((minNoNoDelta&7)==0); 613 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 614 fprintf(stderr, 615 "gennorm2 error: " 616 "data structure overflow, too much mapping composition data\n"); 617 exit(U_BUFFER_OVERFLOW_ERROR); 618 } 619 620 // writeNorm16() and setHangulData() reduce these as needed. 621 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 622 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 623 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000; 624 625 IcuToolErrorCode errorCode("gennorm2/processData()"); 626 UMutableCPTrie *norm16Trie = umutablecptrie_open( 627 Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode); 628 errorCode.assertSuccess(); 629 630 // Map each code point to its norm16 value, 631 // including the properties that fit directly, 632 // and the offset to the "extra data" if necessary. 633 Norm16Writer norm16Writer(norm16Trie, norms, *this); 634 norms.enumRanges(norm16Writer); 635 // TODO: iterate via getRange() instead of callback? 636 637 setHangulData(norm16Trie); 638 639 // Look for the "worst" norm16 value of any supplementary code point 640 // corresponding to a lead surrogate, and set it as that surrogate's value. 641 // Enables UTF-16 quick check inner loops to look at only code units. 642 // 643 // We could be more sophisticated: 644 // We could collect a bit set for whether there are values in the different 645 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 646 // and select the best value that only breaks the composition and/or decomposition 647 // inner loops if necessary. 648 // However, that seems like overkill for an optimization for supplementary characters. 649 // 650 // First check that surrogate code *points* are inert. 651 // The parser should have rejected values/mappings for them. 652 uint32_t value; 653 UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0, 654 nullptr, nullptr, &value); 655 if (value != Normalizer2Impl::INERT || end < 0xdfff) { 656 fprintf(stderr, 657 "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n", 658 (int)end, (long)value); 659 exit(U_INTERNAL_PROGRAM_ERROR); 660 } 661 uint32_t maxNorm16 = 0; 662 // ANDing values yields 0 bits where any value has a 0. 663 // Used for worst-case HAS_COMP_BOUNDARY_AFTER. 664 uint32_t andedNorm16 = 0; 665 end = 0; 666 for (UChar32 start = 0x10000;;) { 667 if (start > end) { 668 end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0, 669 nullptr, nullptr, &value); 670 if (end < 0) { break; } 671 } 672 if ((start & 0x3ff) == 0) { 673 // Data for a new lead surrogate. 674 maxNorm16 = andedNorm16 = value; 675 } else { 676 if (value > maxNorm16) { 677 maxNorm16 = value; 678 } 679 andedNorm16 &= value; 680 } 681 // Intersect each range with the code points for one lead surrogate. 682 UChar32 leadEnd = start | 0x3ff; 683 if (leadEnd <= end) { 684 // End of the supplementary block for a lead surrogate. 685 if (maxNorm16 >= (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]) { 686 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 687 // Otherwise it might end up at something like JAMO_VT which stays in 688 // the inner decomposition quick check loop. 689 maxNorm16 = (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]; 690 } 691 maxNorm16 = 692 (maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)| 693 (andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER); 694 if (maxNorm16 != Normalizer2Impl::INERT) { 695 umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode); 696 } 697 if (value == Normalizer2Impl::INERT) { 698 // Potentially skip inert supplementary blocks for several lead surrogates. 699 start = (end + 1) & ~0x3ff; 700 } else { 701 start = leadEnd + 1; 702 } 703 } else { 704 start = end + 1; 705 } 706 } 707 708 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 709 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 710 // which is harmless. 711 // As a result, the minimum code points are always BMP code points. 712 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 713 if(minCP>=0x10000) { 714 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 715 } 716 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 717 if(minCP>=0x10000) { 718 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 719 } 720 minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP]; 721 if(minCP>=0x10000) { 722 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP); 723 } 724 725 LocalUCPTriePointer builtTrie( 726 umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode)); 727 norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode); 728 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 729 fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n", 730 errorCode.errorName()); 731 exit(errorCode.reset()); 732 } 733 umutablecptrie_close(norm16Trie); 734 errorCode.reset(); 735 norm16TrieBytes=new uint8_t[norm16TrieLength]; 736 ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode); 737 errorCode.assertSuccess(); 738 739 int32_t offset=(int32_t)sizeof(indexes); 740 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 741 offset+=norm16TrieLength; 742 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 743 offset+=extraData.length()*2; 744 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 745 offset+=sizeof(smallFCD); 746 int32_t totalSize=offset; 747 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 748 indexes[i]=totalSize; 749 } 750 751 if(beVerbose) { 752 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 753 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 754 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); 755 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 756 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 757 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 758 printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]); 759 printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 760 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); 761 printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 762 printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]); 763 printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); 764 printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]); 765 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 766 printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta); 767 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 768 } 769 770 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 771 if(0==memcmp(nullVersion, unicodeVersion, 4)) { 772 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 773 } 774 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 775 return builtTrie; 776 } 777 778 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 779 processData(); 780 781 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 782 UNewDataMemory *pData= 783 udata_create(NULL, NULL, filename, &dataInfo, 784 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 785 if(errorCode.isFailure()) { 786 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 787 filename, errorCode.errorName()); 788 exit(errorCode.reset()); 789 } 790 udata_writeBlock(pData, indexes, sizeof(indexes)); 791 udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength); 792 udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length()); 793 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 794 int32_t writtenSize=udata_finish(pData, errorCode); 795 if(errorCode.isFailure()) { 796 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 797 exit(errorCode.reset()); 798 } 799 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 800 if(writtenSize!=totalSize) { 801 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 802 (long)writtenSize, (long)totalSize); 803 exit(U_INTERNAL_PROGRAM_ERROR); 804 } 805 } 806 807 void 808 Normalizer2DataBuilder::writeCSourceFile(const char *filename) { 809 LocalUCPTriePointer norm16Trie = processData(); 810 811 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); 812 const char *basename=findBasename(filename); 813 CharString path(filename, (int32_t)(basename-filename), errorCode); 814 CharString dataName(basename, errorCode); 815 const char *extension=strrchr(basename, '.'); 816 if(extension!=NULL) { 817 dataName.truncate((int32_t)(extension-basename)); 818 } 819 const char *name=dataName.data(); 820 errorCode.assertSuccess(); 821 822 FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp"); 823 if(f==NULL) { 824 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", 825 filename); 826 exit(U_FILE_ACCESS_ERROR); 827 } 828 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f); 829 830 char line[100]; 831 sprintf(line, "static const UVersionInfo %s_formatVersion={", name); 832 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); 833 sprintf(line, "static const UVersionInfo %s_dataVersion={", name); 834 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); 835 sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name); 836 usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "\n};\n\n"); 837 838 usrc_writeUCPTrie(f, name, norm16Trie.getAlias()); 839 840 sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", name); 841 usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "\n};\n\n"); 842 sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", name); 843 usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "\n};\n\n"); 844 845 fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f); 846 fclose(f); 847 } 848 849 namespace { 850 851 bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) { 852 if(s1 == nullptr) { 853 return s2 == nullptr; 854 } else if(s2 == nullptr) { 855 return false; 856 } else { 857 return *s1 == *s2; 858 } 859 } 860 861 const char *typeChars = "?-=>"; 862 863 void writeMapping(FILE *f, const UnicodeString *m) { 864 if(m != nullptr && !m->isEmpty()) { 865 int32_t i = 0; 866 UChar32 c = m->char32At(i); 867 fprintf(f, "%04lX", (long)c); 868 while((i += U16_LENGTH(c)) < m->length()) { 869 c = m->char32At(i); 870 fprintf(f, " %04lX", (long)c); 871 } 872 } 873 fputs("\n", f); 874 } 875 876 } // namespace 877 878 void 879 Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const { 880 // Do not processData() before writing the input-syntax data file. 881 FILE *f = fopen(filename, "w"); 882 if(f == nullptr) { 883 fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n", 884 filename); 885 exit(U_FILE_ACCESS_ERROR); 886 return; 887 } 888 889 if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 || 890 unicodeVersion[2] != 0 || unicodeVersion[3] != 0) { 891 char uv[U_MAX_VERSION_STRING_LENGTH]; 892 u_versionToString(unicodeVersion, uv); 893 fprintf(f, "* Unicode %s\n\n", uv); 894 } 895 896 UnicodeSetIterator ccIter(norms.ccSet); 897 UChar32 start = U_SENTINEL; 898 UChar32 end = U_SENTINEL; 899 uint8_t prevCC = 0; 900 bool done = false; 901 bool didWrite = false; 902 do { 903 UChar32 c; 904 uint8_t cc; 905 if(ccIter.next() && !ccIter.isString()) { 906 c = ccIter.getCodepoint(); 907 cc = norms.getCC(c); 908 } else { 909 c = 0x110000; 910 cc = 0; 911 done = true; 912 } 913 if(cc == prevCC && c == (end + 1)) { 914 end = c; 915 } else { 916 if(prevCC != 0) { 917 if(start == end) { 918 fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC); 919 } else { 920 fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC); 921 } 922 didWrite = true; 923 } 924 start = end = c; 925 prevCC = cc; 926 } 927 } while(!done); 928 if(didWrite) { 929 fputs("\n", f); 930 } 931 932 UnicodeSetIterator mIter(norms.mappingSet); 933 start = U_SENTINEL; 934 end = U_SENTINEL; 935 const UnicodeString *prevMapping = nullptr; 936 Norm::MappingType prevType = Norm::NONE; 937 done = false; 938 do { 939 UChar32 c; 940 const Norm *norm; 941 if(mIter.next() && !mIter.isString()) { 942 c = mIter.getCodepoint(); 943 norm = norms.getNorm(c); 944 } else { 945 c = 0x110000; 946 norm = nullptr; 947 done = true; 948 } 949 const UnicodeString *mapping; 950 Norm::MappingType type; 951 if(norm == nullptr) { 952 mapping = nullptr; 953 type = Norm::NONE; 954 } else { 955 type = norm->mappingType; 956 if(type == Norm::NONE) { 957 mapping = nullptr; 958 } else { 959 mapping = norm->mapping; 960 } 961 } 962 if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) { 963 end = c; 964 } else { 965 if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) { 966 if(start == end) { 967 fprintf(f, "%04lX%c", (long)start, typeChars[prevType]); 968 } else { 969 fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]); 970 } 971 writeMapping(f, prevMapping); 972 } 973 start = end = c; 974 prevMapping = mapping; 975 prevType = type; 976 } 977 } while(!done); 978 979 fclose(f); 980 } 981 982 void 983 Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1, 984 const Normalizer2DataBuilder &b2, 985 Normalizer2DataBuilder &diff) { 986 // Compute diff = b1 - b2 987 // so that we should be able to get b1 = b2 + diff. 988 if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) { 989 memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH); 990 } 991 992 UnicodeSet ccSet(b1.norms.ccSet); 993 ccSet.addAll(b2.norms.ccSet); 994 UnicodeSetIterator ccIter(ccSet); 995 while(ccIter.next() && !ccIter.isString()) { 996 UChar32 c = ccIter.getCodepoint(); 997 uint8_t cc1 = b1.norms.getCC(c); 998 uint8_t cc2 = b2.norms.getCC(c); 999 if(cc1 != cc2) { 1000 diff.setCC(c, cc1); 1001 } 1002 } 1003 1004 UnicodeSet mSet(b1.norms.mappingSet); 1005 mSet.addAll(b2.norms.mappingSet); 1006 UnicodeSetIterator mIter(mSet); 1007 while(mIter.next() && !mIter.isString()) { 1008 UChar32 c = mIter.getCodepoint(); 1009 const Norm *norm1 = b1.norms.getNorm(c); 1010 const Norm *norm2 = b2.norms.getNorm(c); 1011 const UnicodeString *mapping1; 1012 Norm::MappingType type1; 1013 if(norm1 == nullptr || !norm1->hasMapping()) { 1014 mapping1 = nullptr; 1015 type1 = Norm::NONE; 1016 } else { 1017 mapping1 = norm1->mapping; 1018 type1 = norm1->mappingType; 1019 } 1020 const UnicodeString *mapping2; 1021 Norm::MappingType type2; 1022 if(norm2 == nullptr || !norm2->hasMapping()) { 1023 mapping2 = nullptr; 1024 type2 = Norm::NONE; 1025 } else { 1026 mapping2 = norm2->mapping; 1027 type2 = norm2->mappingType; 1028 } 1029 if(type1 == type2 && equalStrings(mapping1, mapping2)) { 1030 // Nothing to do. 1031 } else if(type1 == Norm::NONE) { 1032 diff.removeMapping(c); 1033 } else if(type1 == Norm::ROUND_TRIP) { 1034 diff.setRoundTripMapping(c, *mapping1); 1035 } else if(type1 == Norm::ONE_WAY) { 1036 diff.setOneWayMapping(c, *mapping1); 1037 } 1038 } 1039 } 1040 1041 U_NAMESPACE_END 1042 1043 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1044 1045 /* 1046 * Hey, Emacs, please set the following: 1047 * 1048 * Local Variables: 1049 * indent-tabs-mode: nil 1050 * End: 1051 */ 1052