1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2009-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: normalizer2impl.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009nov22 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/normalizer2.h" 22 #include "unicode/udata.h" 23 #include "unicode/ustring.h" 24 #include "cmemory.h" 25 #include "mutex.h" 26 #include "normalizer2impl.h" 27 #include "uassert.h" 28 #include "uset_imp.h" 29 #include "utrie2.h" 30 31 U_NAMESPACE_BEGIN 32 33 // ReorderingBuffer -------------------------------------------------------- *** 34 35 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 36 int32_t length=str.length(); 37 start=str.getBuffer(destCapacity); 38 if(start==NULL) { 39 // getBuffer() already did str.setToBogus() 40 errorCode=U_MEMORY_ALLOCATION_ERROR; 41 return FALSE; 42 } 43 limit=start+length; 44 remainingCapacity=str.getCapacity()-length; 45 reorderStart=start; 46 if(start==limit) { 47 lastCC=0; 48 } else { 49 setIterator(); 50 lastCC=previousCC(); 51 // Set reorderStart after the last code point with cc<=1 if there is one. 52 if(lastCC>1) { 53 while(previousCC()>1) {} 54 } 55 reorderStart=codePointLimit; 56 } 57 return TRUE; 58 } 59 60 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 61 int32_t length=(int32_t)(limit-start); 62 return 63 length==(int32_t)(otherLimit-otherStart) && 64 0==u_memcmp(start, otherStart, length); 65 } 66 67 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 68 if(remainingCapacity<2 && !resize(2, errorCode)) { 69 return FALSE; 70 } 71 if(lastCC<=cc || cc==0) { 72 limit[0]=U16_LEAD(c); 73 limit[1]=U16_TRAIL(c); 74 limit+=2; 75 lastCC=cc; 76 if(cc<=1) { 77 reorderStart=limit; 78 } 79 } else { 80 insert(c, cc); 81 } 82 remainingCapacity-=2; 83 return TRUE; 84 } 85 86 UBool ReorderingBuffer::append(const UChar *s, int32_t length, 87 uint8_t leadCC, uint8_t trailCC, 88 UErrorCode &errorCode) { 89 if(length==0) { 90 return TRUE; 91 } 92 if(remainingCapacity<length && !resize(length, errorCode)) { 93 return FALSE; 94 } 95 remainingCapacity-=length; 96 if(lastCC<=leadCC || leadCC==0) { 97 if(trailCC<=1) { 98 reorderStart=limit+length; 99 } else if(leadCC<=1) { 100 reorderStart=limit+1; // Ok if not a code point boundary. 101 } 102 const UChar *sLimit=s+length; 103 do { *limit++=*s++; } while(s!=sLimit); 104 lastCC=trailCC; 105 } else { 106 int32_t i=0; 107 UChar32 c; 108 U16_NEXT(s, i, length, c); 109 insert(c, leadCC); // insert first code point 110 while(i<length) { 111 U16_NEXT(s, i, length, c); 112 if(i<length) { 113 // s must be in NFD, otherwise we need to use getCC(). 114 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 115 } else { 116 leadCC=trailCC; 117 } 118 append(c, leadCC, errorCode); 119 } 120 } 121 return TRUE; 122 } 123 124 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 125 int32_t cpLength=U16_LENGTH(c); 126 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 127 return FALSE; 128 } 129 remainingCapacity-=cpLength; 130 if(cpLength==1) { 131 *limit++=(UChar)c; 132 } else { 133 limit[0]=U16_LEAD(c); 134 limit[1]=U16_TRAIL(c); 135 limit+=2; 136 } 137 lastCC=0; 138 reorderStart=limit; 139 return TRUE; 140 } 141 142 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 143 if(s==sLimit) { 144 return TRUE; 145 } 146 int32_t length=(int32_t)(sLimit-s); 147 if(remainingCapacity<length && !resize(length, errorCode)) { 148 return FALSE; 149 } 150 u_memcpy(limit, s, length); 151 limit+=length; 152 remainingCapacity-=length; 153 lastCC=0; 154 reorderStart=limit; 155 return TRUE; 156 } 157 158 void ReorderingBuffer::remove() { 159 reorderStart=limit=start; 160 remainingCapacity=str.getCapacity(); 161 lastCC=0; 162 } 163 164 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 165 if(suffixLength<(limit-start)) { 166 limit-=suffixLength; 167 remainingCapacity+=suffixLength; 168 } else { 169 limit=start; 170 remainingCapacity=str.getCapacity(); 171 } 172 lastCC=0; 173 reorderStart=limit; 174 } 175 176 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 177 int32_t reorderStartIndex=(int32_t)(reorderStart-start); 178 int32_t length=(int32_t)(limit-start); 179 str.releaseBuffer(length); 180 int32_t newCapacity=length+appendLength; 181 int32_t doubleCapacity=2*str.getCapacity(); 182 if(newCapacity<doubleCapacity) { 183 newCapacity=doubleCapacity; 184 } 185 if(newCapacity<256) { 186 newCapacity=256; 187 } 188 start=str.getBuffer(newCapacity); 189 if(start==NULL) { 190 // getBuffer() already did str.setToBogus() 191 errorCode=U_MEMORY_ALLOCATION_ERROR; 192 return FALSE; 193 } 194 reorderStart=start+reorderStartIndex; 195 limit=start+length; 196 remainingCapacity=str.getCapacity()-length; 197 return TRUE; 198 } 199 200 void ReorderingBuffer::skipPrevious() { 201 codePointLimit=codePointStart; 202 UChar c=*--codePointStart; 203 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 204 --codePointStart; 205 } 206 } 207 208 uint8_t ReorderingBuffer::previousCC() { 209 codePointLimit=codePointStart; 210 if(reorderStart>=codePointStart) { 211 return 0; 212 } 213 UChar32 c=*--codePointStart; 214 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 215 return 0; 216 } 217 218 UChar c2; 219 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 220 --codePointStart; 221 c=U16_GET_SUPPLEMENTARY(c2, c); 222 } 223 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 224 } 225 226 // Inserts c somewhere before the last character. 227 // Requires 0<cc<lastCC which implies reorderStart<limit. 228 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 229 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 230 // insert c at codePointLimit, after the character with prevCC<=cc 231 UChar *q=limit; 232 UChar *r=limit+=U16_LENGTH(c); 233 do { 234 *--r=*--q; 235 } while(codePointLimit!=q); 236 writeCodePoint(q, c); 237 if(cc<=1) { 238 reorderStart=r; 239 } 240 } 241 242 // Normalizer2Impl --------------------------------------------------------- *** 243 244 Normalizer2Impl::~Normalizer2Impl() { 245 udata_close(memory); 246 utrie2_close(normTrie); 247 UTrie2Singleton(fcdTrieSingleton).deleteInstance(); 248 } 249 250 UBool U_CALLCONV 251 Normalizer2Impl::isAcceptable(void *context, 252 const char *type, const char *name, 253 const UDataInfo *pInfo) { 254 if( 255 pInfo->size>=20 && 256 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 257 pInfo->charsetFamily==U_CHARSET_FAMILY && 258 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 259 pInfo->dataFormat[1]==0x72 && 260 pInfo->dataFormat[2]==0x6d && 261 pInfo->dataFormat[3]==0x32 && 262 pInfo->formatVersion[0]==1 263 ) { 264 Normalizer2Impl *me=(Normalizer2Impl *)context; 265 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); 266 return TRUE; 267 } else { 268 return FALSE; 269 } 270 } 271 272 void 273 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { 274 if(U_FAILURE(errorCode)) { 275 return; 276 } 277 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); 278 if(U_FAILURE(errorCode)) { 279 return; 280 } 281 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); 282 const int32_t *inIndexes=(const int32_t *)inBytes; 283 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; 284 if(indexesLength<=IX_MIN_MAYBE_YES) { 285 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. 286 return; 287 } 288 289 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 290 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 291 292 minYesNo=inIndexes[IX_MIN_YES_NO]; 293 minNoNo=inIndexes[IX_MIN_NO_NO]; 294 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 295 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 296 297 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; 298 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 299 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 300 inBytes+offset, nextOffset-offset, NULL, 301 &errorCode); 302 if(U_FAILURE(errorCode)) { 303 return; 304 } 305 306 offset=nextOffset; 307 maybeYesCompositions=(const uint16_t *)(inBytes+offset); 308 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); 309 } 310 311 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { 312 UChar32 c; 313 if(cpStart==(cpLimit-1)) { 314 c=*cpStart; 315 } else { 316 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); 317 } 318 uint16_t prevNorm16=getNorm16(c); 319 if(prevNorm16<=minYesNo) { 320 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 321 } else { 322 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo 323 } 324 } 325 326 U_CDECL_BEGIN 327 328 static UBool U_CALLCONV 329 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 330 /* add the start code point to the USet */ 331 const USetAdder *sa=(const USetAdder *)context; 332 sa->add(sa->set, start); 333 return TRUE; 334 } 335 336 U_CDECL_END 337 338 void 339 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 340 /* add the start code point of each same-value range of each trie */ 341 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); 342 343 /* add Hangul LV syllables and LV+1 because of skippables */ 344 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 345 sa->add(sa->set, c); 346 sa->add(sa->set, c+1); 347 } 348 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 349 } 350 351 const UChar * 352 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, 353 UChar32 minNeedDataCP, 354 ReorderingBuffer *buffer, 355 UErrorCode &errorCode) const { 356 // Make some effort to support NUL-terminated strings reasonably. 357 // Take the part of the fast quick check loop that does not look up 358 // data and check the first part of the string. 359 // After this prefix, determine the string length to simplify the rest 360 // of the code. 361 const UChar *prevSrc=src; 362 UChar c; 363 while((c=*src++)<minNeedDataCP && c!=0) {} 364 // Back out the last character for full processing. 365 // Copy this prefix. 366 if(--src!=prevSrc) { 367 if(buffer!=NULL) { 368 buffer->appendZeroCC(prevSrc, src, errorCode); 369 } 370 } 371 return src; 372 } 373 374 // Dual functionality: 375 // buffer!=NULL: normalize 376 // buffer==NULL: isNormalized/spanQuickCheckYes 377 const UChar * 378 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, 379 ReorderingBuffer *buffer, 380 UErrorCode &errorCode) const { 381 UChar32 minNoCP=minDecompNoCP; 382 if(limit==NULL) { 383 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 384 if(U_FAILURE(errorCode)) { 385 return src; 386 } 387 limit=u_strchr(src, 0); 388 } 389 390 const UChar *prevSrc; 391 UChar32 c=0; 392 uint16_t norm16=0; 393 394 // only for quick check 395 const UChar *prevBoundary=src; 396 uint8_t prevCC=0; 397 398 for(;;) { 399 // count code units below the minimum or with irrelevant data for the quick check 400 for(prevSrc=src; src!=limit;) { 401 if( (c=*src)<minNoCP || 402 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 403 ) { 404 ++src; 405 } else if(!U16_IS_SURROGATE(c)) { 406 break; 407 } else { 408 UChar c2; 409 if(U16_IS_SURROGATE_LEAD(c)) { 410 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 411 c=U16_GET_SUPPLEMENTARY(c, c2); 412 } 413 } else /* trail surrogate */ { 414 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 415 --src; 416 c=U16_GET_SUPPLEMENTARY(c2, c); 417 } 418 } 419 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 420 src+=U16_LENGTH(c); 421 } else { 422 break; 423 } 424 } 425 } 426 // copy these code units all at once 427 if(src!=prevSrc) { 428 if(buffer!=NULL) { 429 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 430 break; 431 } 432 } else { 433 prevCC=0; 434 prevBoundary=src; 435 } 436 } 437 if(src==limit) { 438 break; 439 } 440 441 // Check one above-minimum, relevant code point. 442 src+=U16_LENGTH(c); 443 if(buffer!=NULL) { 444 if(!decompose(c, norm16, *buffer, errorCode)) { 445 break; 446 } 447 } else { 448 if(isDecompYes(norm16)) { 449 uint8_t cc=getCCFromYesOrMaybe(norm16); 450 if(prevCC<=cc || cc==0) { 451 prevCC=cc; 452 if(cc<=1) { 453 prevBoundary=src; 454 } 455 continue; 456 } 457 } 458 return prevBoundary; // "no" or cc out of order 459 } 460 } 461 return src; 462 } 463 464 // Decompose a short piece of text which is likely to contain characters that 465 // fail the quick check loop and/or where the quick check loop's overhead 466 // is unlikely to be amortized. 467 // Called by the compose() and makeFCD() implementations. 468 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, 469 ReorderingBuffer &buffer, 470 UErrorCode &errorCode) const { 471 while(src<limit) { 472 UChar32 c; 473 uint16_t norm16; 474 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); 475 if(!decompose(c, norm16, buffer, errorCode)) { 476 return FALSE; 477 } 478 } 479 return TRUE; 480 } 481 482 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 483 ReorderingBuffer &buffer, 484 UErrorCode &errorCode) const { 485 // Only loops for 1:1 algorithmic mappings. 486 for(;;) { 487 // get the decomposition and the lead and trail cc's 488 if(isDecompYes(norm16)) { 489 // c does not decompose 490 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); 491 } else if(isHangul(norm16)) { 492 // Hangul syllable: decompose algorithmically 493 UChar jamos[3]; 494 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 495 } else if(isDecompNoAlgorithmic(norm16)) { 496 c=mapAlgorithmic(c, norm16); 497 norm16=getNorm16(c); 498 } else { 499 // c decomposes, get everything from the variable-length extra data 500 const uint16_t *mapping=getMapping(norm16); 501 uint16_t firstUnit=*mapping++; 502 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 503 uint8_t leadCC, trailCC; 504 trailCC=(uint8_t)(firstUnit>>8); 505 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 506 leadCC=(uint8_t)(*mapping++>>8); 507 } else { 508 leadCC=0; 509 } 510 return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode); 511 } 512 } 513 } 514 515 const UChar * 516 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { 517 const UChar *decomp=NULL; 518 uint16_t norm16; 519 for(;;) { 520 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 521 // c does not decompose 522 return decomp; 523 } else if(isHangul(norm16)) { 524 // Hangul syllable: decompose algorithmically 525 length=Hangul::decompose(c, buffer); 526 return buffer; 527 } else if(isDecompNoAlgorithmic(norm16)) { 528 c=mapAlgorithmic(c, norm16); 529 decomp=buffer; 530 length=0; 531 U16_APPEND_UNSAFE(buffer, length, c); 532 } else { 533 // c decomposes, get everything from the variable-length extra data 534 const uint16_t *mapping=getMapping(norm16); 535 uint16_t firstUnit=*mapping++; 536 length=firstUnit&MAPPING_LENGTH_MASK; 537 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 538 ++mapping; 539 } 540 return (const UChar *)mapping; 541 } 542 } 543 } 544 545 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 546 UBool doDecompose, 547 ReorderingBuffer &buffer, 548 UErrorCode &errorCode) const { 549 if(doDecompose) { 550 decompose(src, limit, &buffer, errorCode); 551 return; 552 } 553 // Just merge the strings at the boundary. 554 ForwardUTrie2StringIterator iter(normTrie, src, limit); 555 uint8_t firstCC, prevCC, cc; 556 firstCC=prevCC=cc=getCC(iter.next16()); 557 while(cc!=0) { 558 prevCC=cc; 559 cc=getCC(iter.next16()); 560 }; 561 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) && 562 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 563 } 564 565 // Note: hasDecompBoundary() could be implemented as aliases to 566 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 567 // at the cost of building the FCD trie for a decomposition normalizer. 568 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 569 for(;;) { 570 if(c<minDecompNoCP) { 571 return TRUE; 572 } 573 uint16_t norm16=getNorm16(c); 574 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 575 return TRUE; 576 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 577 return FALSE; // ccc!=0 578 } else if(isDecompNoAlgorithmic(norm16)) { 579 c=mapAlgorithmic(c, norm16); 580 } else { 581 // c decomposes, get everything from the variable-length extra data 582 const uint16_t *mapping=getMapping(norm16); 583 uint16_t firstUnit=*mapping++; 584 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 585 return FALSE; 586 } 587 if(!before) { 588 // decomp after-boundary: same as hasFCDBoundaryAfter(), 589 // fcd16<=1 || trailCC==0 590 if(firstUnit>0x1ff) { 591 return FALSE; // trailCC>1 592 } 593 if(firstUnit<=0xff) { 594 return TRUE; // trailCC==0 595 } 596 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 597 } 598 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 599 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0; 600 } 601 } 602 } 603 604 /* 605 * Finds the recomposition result for 606 * a forward-combining "lead" character, 607 * specified with a pointer to its compositions list, 608 * and a backward-combining "trail" character. 609 * 610 * If the lead and trail characters combine, then this function returns 611 * the following "compositeAndFwd" value: 612 * Bits 21..1 composite character 613 * Bit 0 set if the composite is a forward-combining starter 614 * otherwise it returns -1. 615 * 616 * The compositions list has (trail, compositeAndFwd) pair entries, 617 * encoded as either pairs or triples of 16-bit units. 618 * The last entry has the high bit of its first unit set. 619 * 620 * The list is sorted by ascending trail characters (there are no duplicates). 621 * A linear search is used. 622 * 623 * See normalizer2impl.h for a more detailed description 624 * of the compositions list format. 625 */ 626 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 627 uint16_t key1, firstUnit; 628 if(trail<COMP_1_TRAIL_LIMIT) { 629 // trail character is 0..33FF 630 // result entry may have 2 or 3 units 631 key1=(uint16_t)(trail<<1); 632 while(key1>(firstUnit=*list)) { 633 list+=2+(firstUnit&COMP_1_TRIPLE); 634 } 635 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 636 if(firstUnit&COMP_1_TRIPLE) { 637 return ((int32_t)list[1]<<16)|list[2]; 638 } else { 639 return list[1]; 640 } 641 } 642 } else { 643 // trail character is 3400..10FFFF 644 // result entry has 3 units 645 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 646 ((trail>>COMP_1_TRAIL_SHIFT))& 647 ~COMP_1_TRIPLE); 648 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 649 uint16_t secondUnit; 650 for(;;) { 651 if(key1>(firstUnit=*list)) { 652 list+=2+(firstUnit&COMP_1_TRIPLE); 653 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 654 if(key2>(secondUnit=list[1])) { 655 if(firstUnit&COMP_1_LAST_TUPLE) { 656 break; 657 } else { 658 list+=3; 659 } 660 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 661 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 662 } else { 663 break; 664 } 665 } else { 666 break; 667 } 668 } 669 } 670 return -1; 671 } 672 673 /* 674 * Recomposes the buffer text starting at recomposeStartIndex 675 * (which is in NFD - decomposed and canonically ordered), 676 * and truncates the buffer contents. 677 * 678 * Note that recomposition never lengthens the text: 679 * Any character consists of either one or two code units; 680 * a composition may contain at most one more code unit than the original starter, 681 * while the combining mark that is removed has at least one code unit. 682 */ 683 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 684 UBool onlyContiguous) const { 685 UChar *p=buffer.getStart()+recomposeStartIndex; 686 UChar *limit=buffer.getLimit(); 687 if(p==limit) { 688 return; 689 } 690 691 UChar *starter, *pRemove, *q, *r; 692 const uint16_t *compositionsList; 693 UChar32 c, compositeAndFwd; 694 uint16_t norm16; 695 uint8_t cc, prevCC; 696 UBool starterIsSupplementary; 697 698 // Some of the following variables are not used until we have a forward-combining starter 699 // and are only initialized now to avoid compiler warnings. 700 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter 701 starter=NULL; 702 starterIsSupplementary=FALSE; 703 prevCC=0; 704 705 for(;;) { 706 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); 707 cc=getCCFromYesOrMaybe(norm16); 708 if( // this character combines backward and 709 isMaybe(norm16) && 710 // we have seen a starter that combines forward and 711 compositionsList!=NULL && 712 // the backward-combining character is not blocked 713 (prevCC<cc || prevCC==0) 714 ) { 715 if(isJamoVT(norm16)) { 716 // c is a Jamo V/T, see if we can compose it with the previous character. 717 if(c<Hangul::JAMO_T_BASE) { 718 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 719 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE); 720 if(prev<Hangul::JAMO_L_COUNT) { 721 pRemove=p-1; 722 UChar syllable=(UChar) 723 (Hangul::HANGUL_BASE+ 724 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 725 Hangul::JAMO_T_COUNT); 726 UChar t; 727 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 728 ++p; 729 syllable+=t; // The next character was a Jamo T. 730 } 731 *starter=syllable; 732 // remove the Jamo V/T 733 q=pRemove; 734 r=p; 735 while(r<limit) { 736 *q++=*r++; 737 } 738 limit=q; 739 p=pRemove; 740 } 741 } 742 /* 743 * No "else" for Jamo T: 744 * Since the input is in NFD, there are no Hangul LV syllables that 745 * a Jamo T could combine with. 746 * All Jamo Ts are combined above when handling Jamo Vs. 747 */ 748 if(p==limit) { 749 break; 750 } 751 compositionsList=NULL; 752 continue; 753 } else if((compositeAndFwd=combine(compositionsList, c))>=0) { 754 // The starter and the combining mark (c) do combine. 755 UChar32 composite=compositeAndFwd>>1; 756 757 // Replace the starter with the composite, remove the combining mark. 758 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark 759 if(starterIsSupplementary) { 760 if(U_IS_SUPPLEMENTARY(composite)) { 761 // both are supplementary 762 starter[0]=U16_LEAD(composite); 763 starter[1]=U16_TRAIL(composite); 764 } else { 765 *starter=(UChar)composite; 766 // The composite is shorter than the starter, 767 // move the intermediate characters forward one. 768 starterIsSupplementary=FALSE; 769 q=starter+1; 770 r=q+1; 771 while(r<pRemove) { 772 *q++=*r++; 773 } 774 --pRemove; 775 } 776 } else if(U_IS_SUPPLEMENTARY(composite)) { 777 // The composite is longer than the starter, 778 // move the intermediate characters back one. 779 starterIsSupplementary=TRUE; 780 ++starter; // temporarily increment for the loop boundary 781 q=pRemove; 782 r=++pRemove; 783 while(starter<q) { 784 *--r=*--q; 785 } 786 *starter=U16_TRAIL(composite); 787 *--starter=U16_LEAD(composite); // undo the temporary increment 788 } else { 789 // both are on the BMP 790 *starter=(UChar)composite; 791 } 792 793 /* remove the combining mark by moving the following text over it */ 794 if(pRemove<p) { 795 q=pRemove; 796 r=p; 797 while(r<limit) { 798 *q++=*r++; 799 } 800 limit=q; 801 p=pRemove; 802 } 803 // Keep prevCC because we removed the combining mark. 804 805 if(p==limit) { 806 break; 807 } 808 // Is the composite a starter that combines forward? 809 if(compositeAndFwd&1) { 810 compositionsList= 811 getCompositionsListForComposite(getNorm16(composite)); 812 } else { 813 compositionsList=NULL; 814 } 815 816 // We combined; continue with looking for compositions. 817 continue; 818 } 819 } 820 821 // no combination this time 822 prevCC=cc; 823 if(p==limit) { 824 break; 825 } 826 827 // If c did not combine, then check if it is a starter. 828 if(cc==0) { 829 // Found a new starter. 830 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { 831 // It may combine with something, prepare for it. 832 if(U_IS_BMP(c)) { 833 starterIsSupplementary=FALSE; 834 starter=p-1; 835 } else { 836 starterIsSupplementary=TRUE; 837 starter=p-2; 838 } 839 } 840 } else if(onlyContiguous) { 841 // FCC: no discontiguous compositions; any intervening character blocks. 842 compositionsList=NULL; 843 } 844 } 845 buffer.setReorderingLimit(limit); 846 } 847 848 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 849 // doCompose: normalize 850 // !doCompose: isNormalized (buffer must be empty and initialized) 851 UBool 852 Normalizer2Impl::compose(const UChar *src, const UChar *limit, 853 UBool onlyContiguous, 854 UBool doCompose, 855 ReorderingBuffer &buffer, 856 UErrorCode &errorCode) const { 857 UChar32 minNoMaybeCP=minCompNoMaybeCP; 858 if(limit==NULL) { 859 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, 860 doCompose ? &buffer : NULL, 861 errorCode); 862 if(U_FAILURE(errorCode)) { 863 return FALSE; 864 } 865 limit=u_strchr(src, 0); 866 } 867 868 /* 869 * prevBoundary points to the last character before the current one 870 * that has a composition boundary before it with ccc==0 and quick check "yes". 871 * Keeping track of prevBoundary saves us looking for a composition boundary 872 * when we find a "no" or "maybe". 873 * 874 * When we back out from prevSrc back to prevBoundary, 875 * then we also remove those same characters (which had been simply copied 876 * or canonically-order-inserted) from the ReorderingBuffer. 877 * Therefore, at all times, the [prevBoundary..prevSrc[ source units 878 * must correspond 1:1 to destination units at the end of the destination buffer. 879 */ 880 const UChar *prevBoundary=src; 881 const UChar *prevSrc; 882 UChar32 c=0; 883 uint16_t norm16=0; 884 885 // only for isNormalized 886 uint8_t prevCC=0; 887 888 for(;;) { 889 // count code units below the minimum or with irrelevant data for the quick check 890 for(prevSrc=src; src!=limit;) { 891 if( (c=*src)<minNoMaybeCP || 892 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 893 ) { 894 ++src; 895 } else if(!U16_IS_SURROGATE(c)) { 896 break; 897 } else { 898 UChar c2; 899 if(U16_IS_SURROGATE_LEAD(c)) { 900 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 901 c=U16_GET_SUPPLEMENTARY(c, c2); 902 } 903 } else /* trail surrogate */ { 904 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 905 --src; 906 c=U16_GET_SUPPLEMENTARY(c2, c); 907 } 908 } 909 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 910 src+=U16_LENGTH(c); 911 } else { 912 break; 913 } 914 } 915 } 916 // copy these code units all at once 917 if(src!=prevSrc) { 918 if(doCompose) { 919 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) { 920 break; 921 } 922 } else { 923 prevCC=0; 924 } 925 if(src==limit) { 926 break; 927 } 928 // Set prevBoundary to the last character in the quick check loop. 929 prevBoundary=src-1; 930 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 931 U16_IS_LEAD(*(prevBoundary-1)) 932 ) { 933 --prevBoundary; 934 } 935 // The start of the current character (c). 936 prevSrc=src; 937 } else if(src==limit) { 938 break; 939 } 940 941 src+=U16_LENGTH(c); 942 /* 943 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 944 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 945 * or has ccc!=0. 946 * Check for Jamo V/T, then for regular characters. 947 * c is not a Hangul syllable or Jamo L because those have "yes" properties. 948 */ 949 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 950 UChar prev=*(prevSrc-1); 951 UBool needToDecompose=FALSE; 952 if(c<Hangul::JAMO_T_BASE) { 953 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 954 prev=(UChar)(prev-Hangul::JAMO_L_BASE); 955 if(prev<Hangul::JAMO_L_COUNT) { 956 if(!doCompose) { 957 return FALSE; 958 } 959 UChar syllable=(UChar) 960 (Hangul::HANGUL_BASE+ 961 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 962 Hangul::JAMO_T_COUNT); 963 UChar t; 964 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 965 ++src; 966 syllable+=t; // The next character was a Jamo T. 967 prevBoundary=src; 968 buffer.setLastChar(syllable); 969 continue; 970 } 971 // If we see L+V+x where x!=T then we drop to the slow path, 972 // decompose and recompose. 973 // This is to deal with NFKC finding normal L and V but a 974 // compatibility variant of a T. We need to either fully compose that 975 // combination here (which would complicate the code and may not work 976 // with strange custom data) or use the slow path -- or else our replacing 977 // two input characters (L+V) with one output character (LV syllable) 978 // would violate the invariant that [prevBoundary..prevSrc[ has the same 979 // length as what we appended to the buffer since prevBoundary. 980 needToDecompose=TRUE; 981 } 982 } else if(Hangul::isHangulWithoutJamoT(prev)) { 983 // c is a Jamo Trailing consonant, 984 // compose with previous Hangul LV that does not contain a Jamo T. 985 if(!doCompose) { 986 return FALSE; 987 } 988 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE)); 989 prevBoundary=src; 990 continue; 991 } 992 if(!needToDecompose) { 993 // The Jamo V/T did not compose into a Hangul syllable. 994 if(doCompose) { 995 if(!buffer.appendBMP((UChar)c, 0, errorCode)) { 996 break; 997 } 998 } else { 999 prevCC=0; 1000 } 1001 continue; 1002 } 1003 } 1004 /* 1005 * Source buffer pointers: 1006 * 1007 * all done quick check current char not yet 1008 * "yes" but (c) processed 1009 * may combine 1010 * forward 1011 * [-------------[-------------[-------------[-------------[ 1012 * | | | | | 1013 * orig. src prevBoundary prevSrc src limit 1014 * 1015 * 1016 * Destination buffer pointers inside the ReorderingBuffer: 1017 * 1018 * all done might take not filled yet 1019 * characters for 1020 * reordering 1021 * [-------------[-------------[-------------[ 1022 * | | | | 1023 * start reorderStart limit | 1024 * +remainingCap.+ 1025 */ 1026 if(norm16>=MIN_YES_YES_WITH_CC) { 1027 uint8_t cc=(uint8_t)norm16; // cc!=0 1028 if( onlyContiguous && // FCC 1029 (doCompose ? buffer.getLastCC() : prevCC)==0 && 1030 prevBoundary<prevSrc && 1031 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 1032 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1033 // passed the quick check "yes && ccc==0" test. 1034 // Check whether the last character was a "yesYes" or a "yesNo". 1035 // If a "yesNo", then we get its trailing ccc from its 1036 // mapping and check for canonical order. 1037 // All other cases are ok. 1038 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1039 ) { 1040 // Fails FCD test, need to decompose and contiguously recompose. 1041 if(!doCompose) { 1042 return FALSE; 1043 } 1044 } else if(doCompose) { 1045 if(!buffer.append(c, cc, errorCode)) { 1046 break; 1047 } 1048 continue; 1049 } else if(prevCC<=cc) { 1050 prevCC=cc; 1051 continue; 1052 } else { 1053 return FALSE; 1054 } 1055 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 1056 return FALSE; 1057 } 1058 1059 /* 1060 * Find appropriate boundaries around this character, 1061 * decompose the source text from between the boundaries, 1062 * and recompose it. 1063 * 1064 * We may need to remove the last few characters from the ReorderingBuffer 1065 * to account for source text that was copied or appended 1066 * but needs to take part in the recomposition. 1067 */ 1068 1069 /* 1070 * Find the last composition boundary in [prevBoundary..src[. 1071 * It is either the decomposition of the current character (at prevSrc), 1072 * or prevBoundary. 1073 */ 1074 if(hasCompBoundaryBefore(c, norm16)) { 1075 prevBoundary=prevSrc; 1076 } else if(doCompose) { 1077 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); 1078 } 1079 1080 // Find the next composition boundary in [src..limit[ - 1081 // modifies src to point to the next starter. 1082 src=(UChar *)findNextCompBoundary(src, limit); 1083 1084 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 1085 int32_t recomposeStartIndex=buffer.length(); 1086 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { 1087 break; 1088 } 1089 recompose(buffer, recomposeStartIndex, onlyContiguous); 1090 if(!doCompose) { 1091 if(!buffer.equals(prevBoundary, src)) { 1092 return FALSE; 1093 } 1094 buffer.remove(); 1095 prevCC=0; 1096 } 1097 1098 // Move to the next starter. We never need to look back before this point again. 1099 prevBoundary=src; 1100 } 1101 return TRUE; 1102 } 1103 1104 // Very similar to compose(): Make the same changes in both places if relevant. 1105 // pQCResult==NULL: spanQuickCheckYes 1106 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) 1107 const UChar * 1108 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, 1109 UBool onlyContiguous, 1110 UNormalizationCheckResult *pQCResult) const { 1111 UChar32 minNoMaybeCP=minCompNoMaybeCP; 1112 if(limit==NULL) { 1113 UErrorCode errorCode=U_ZERO_ERROR; 1114 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); 1115 limit=u_strchr(src, 0); 1116 } 1117 1118 /* 1119 * prevBoundary points to the last character before the current one 1120 * that has a composition boundary before it with ccc==0 and quick check "yes". 1121 */ 1122 const UChar *prevBoundary=src; 1123 const UChar *prevSrc; 1124 UChar32 c=0; 1125 uint16_t norm16=0; 1126 uint8_t prevCC=0; 1127 1128 for(;;) { 1129 // count code units below the minimum or with irrelevant data for the quick check 1130 for(prevSrc=src;;) { 1131 if(src==limit) { 1132 return src; 1133 } 1134 if( (c=*src)<minNoMaybeCP || 1135 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 1136 ) { 1137 ++src; 1138 } else if(!U16_IS_SURROGATE(c)) { 1139 break; 1140 } else { 1141 UChar c2; 1142 if(U16_IS_SURROGATE_LEAD(c)) { 1143 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1144 c=U16_GET_SUPPLEMENTARY(c, c2); 1145 } 1146 } else /* trail surrogate */ { 1147 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1148 --src; 1149 c=U16_GET_SUPPLEMENTARY(c2, c); 1150 } 1151 } 1152 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1153 src+=U16_LENGTH(c); 1154 } else { 1155 break; 1156 } 1157 } 1158 } 1159 if(src!=prevSrc) { 1160 // Set prevBoundary to the last character in the quick check loop. 1161 prevBoundary=src-1; 1162 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 1163 U16_IS_LEAD(*(prevBoundary-1)) 1164 ) { 1165 --prevBoundary; 1166 } 1167 prevCC=0; 1168 // The start of the current character (c). 1169 prevSrc=src; 1170 } 1171 1172 src+=U16_LENGTH(c); 1173 /* 1174 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1175 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1176 * or has ccc!=0. 1177 */ 1178 if(isMaybeOrNonZeroCC(norm16)) { 1179 uint8_t cc=getCCFromYesOrMaybe(norm16); 1180 if( onlyContiguous && // FCC 1181 cc!=0 && 1182 prevCC==0 && 1183 prevBoundary<prevSrc && 1184 // prevCC==0 && prevBoundary<prevSrc tell us that 1185 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1186 // passed the quick check "yes && ccc==0" test. 1187 // Check whether the last character was a "yesYes" or a "yesNo". 1188 // If a "yesNo", then we get its trailing ccc from its 1189 // mapping and check for canonical order. 1190 // All other cases are ok. 1191 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1192 ) { 1193 // Fails FCD test. 1194 } else if(prevCC<=cc || cc==0) { 1195 prevCC=cc; 1196 if(norm16<MIN_YES_YES_WITH_CC) { 1197 if(pQCResult!=NULL) { 1198 *pQCResult=UNORM_MAYBE; 1199 } else { 1200 return prevBoundary; 1201 } 1202 } 1203 continue; 1204 } 1205 } 1206 if(pQCResult!=NULL) { 1207 *pQCResult=UNORM_NO; 1208 } 1209 return prevBoundary; 1210 } 1211 } 1212 1213 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, 1214 UBool doCompose, 1215 UBool onlyContiguous, 1216 ReorderingBuffer &buffer, 1217 UErrorCode &errorCode) const { 1218 if(!buffer.isEmpty()) { 1219 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit); 1220 if(src!=firstStarterInSrc) { 1221 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 1222 buffer.getLimit()); 1223 UnicodeString middle(lastStarterInDest, 1224 (int32_t)(buffer.getLimit()-lastStarterInDest)); 1225 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest)); 1226 middle.append(src, (int32_t)(firstStarterInSrc-src)); 1227 const UChar *middleStart=middle.getBuffer(); 1228 compose(middleStart, middleStart+middle.length(), onlyContiguous, 1229 TRUE, buffer, errorCode); 1230 if(U_FAILURE(errorCode)) { 1231 return; 1232 } 1233 src=firstStarterInSrc; 1234 } 1235 } 1236 if(doCompose) { 1237 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); 1238 } else { 1239 buffer.appendZeroCC(src, limit, errorCode); 1240 } 1241 } 1242 1243 /** 1244 * Does c have a composition boundary before it? 1245 * True if its decomposition begins with a character that has 1246 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1247 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1248 * (isCompYesAndZeroCC()) so we need not decompose. 1249 */ 1250 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 1251 for(;;) { 1252 if(isCompYesAndZeroCC(norm16)) { 1253 return TRUE; 1254 } else if(isMaybeOrNonZeroCC(norm16)) { 1255 return FALSE; 1256 } else if(isDecompNoAlgorithmic(norm16)) { 1257 c=mapAlgorithmic(c, norm16); 1258 norm16=getNorm16(c); 1259 } else { 1260 // c decomposes, get everything from the variable-length extra data 1261 const uint16_t *mapping=getMapping(norm16); 1262 uint16_t firstUnit=*mapping++; 1263 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1264 return FALSE; 1265 } 1266 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) { 1267 return FALSE; // non-zero leadCC 1268 } 1269 int32_t i=0; 1270 UChar32 c; 1271 U16_NEXT_UNSAFE(mapping, i, c); 1272 return isCompYesAndZeroCC(getNorm16(c)); 1273 } 1274 } 1275 } 1276 1277 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { 1278 for(;;) { 1279 uint16_t norm16=getNorm16(c); 1280 if(isInert(norm16)) { 1281 return TRUE; 1282 } else if(norm16<=minYesNo) { 1283 // Hangul LVT (==minYesNo) has a boundary after it. 1284 // Hangul LV and non-inert yesYes characters combine forward. 1285 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); 1286 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1287 return FALSE; 1288 } else if(isDecompNoAlgorithmic(norm16)) { 1289 c=mapAlgorithmic(c, norm16); 1290 } else { 1291 // c decomposes, get everything from the variable-length extra data. 1292 // If testInert, then c must be a yesNo character which has lccc=0, 1293 // otherwise it could be a noNo. 1294 const uint16_t *mapping=getMapping(norm16); 1295 uint16_t firstUnit=*mapping; 1296 // TRUE if 1297 // c is not deleted, and 1298 // it and its decomposition do not combine forward, and it has a starter, and 1299 // if FCC then trailCC<=1 1300 return 1301 (firstUnit&MAPPING_LENGTH_MASK)!=0 && 1302 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 && 1303 (!onlyContiguous || firstUnit<=0x1ff); 1304 } 1305 } 1306 } 1307 1308 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { 1309 BackwardUTrie2StringIterator iter(normTrie, start, p); 1310 uint16_t norm16; 1311 do { 1312 norm16=iter.previous16(); 1313 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1314 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 1315 // but that's probably not worth the extra cost. 1316 return iter.codePointStart; 1317 } 1318 1319 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { 1320 ForwardUTrie2StringIterator iter(normTrie, p, limit); 1321 uint16_t norm16; 1322 do { 1323 norm16=iter.next16(); 1324 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1325 return iter.codePointStart; 1326 } 1327 1328 class FCDTrieSingleton : public UTrie2Singleton { 1329 public: 1330 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : 1331 UTrie2Singleton(s), impl(ni), errorCode(ec) {} 1332 UTrie2 *getInstance(UErrorCode &errorCode) { 1333 return UTrie2Singleton::getInstance(createInstance, this, errorCode); 1334 } 1335 static void *createInstance(const void *context, UErrorCode &errorCode); 1336 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 1337 if(value!=0) { 1338 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode); 1339 } 1340 return U_SUCCESS(errorCode); 1341 } 1342 1343 Normalizer2Impl &impl; 1344 UTrie2 *newFCDTrie; 1345 UErrorCode &errorCode; 1346 }; 1347 1348 U_CDECL_BEGIN 1349 1350 // Set the FCD value for a range of same-norm16 characters. 1351 static UBool U_CALLCONV 1352 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1353 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value); 1354 } 1355 1356 // Collect (OR together) the FCD values for a range of supplementary characters, 1357 // for their lead surrogate code unit. 1358 static UBool U_CALLCONV 1359 enumRangeOrValue(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1360 *((uint32_t *)context)|=value; 1361 return TRUE; 1362 } 1363 1364 U_CDECL_END 1365 1366 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) { 1367 FCDTrieSingleton *me=(FCDTrieSingleton *)context; 1368 me->newFCDTrie=utrie2_open(0, 0, &errorCode); 1369 if(U_SUCCESS(errorCode)) { 1370 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me); 1371 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1372 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead); 1373 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue); 1374 if(oredValue!=0) { 1375 // Set a "bad" value for makeFCD() to break the quick check loop 1376 // and look up the value for the supplementary code point. 1377 // If there is any lccc, then set the worst-case lccc of 1. 1378 // The ORed-together value's tccc is already the worst case. 1379 if(oredValue>0xff) { 1380 oredValue=0x100|(oredValue&0xff); 1381 } 1382 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode); 1383 } 1384 } 1385 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode); 1386 if(U_SUCCESS(errorCode)) { 1387 return me->newFCDTrie; 1388 } 1389 } 1390 utrie2_close(me->newFCDTrie); 1391 return NULL; 1392 } 1393 1394 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1395 UTrie2 *newFCDTrie, UErrorCode &errorCode) const { 1396 // Only loops for 1:1 algorithmic mappings. 1397 for(;;) { 1398 if(norm16>=MIN_NORMAL_MAYBE_YES) { 1399 norm16&=0xff; 1400 norm16|=norm16<<8; 1401 } else if(norm16<=minYesNo || minMaybeYes<=norm16) { 1402 // no decomposition or Hangul syllable, all zeros 1403 break; 1404 } else if(limitNoNo<=norm16) { 1405 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1); 1406 if(start==end) { 1407 start+=delta; 1408 norm16=getNorm16(start); 1409 } else { 1410 // the same delta leads from different original characters to different mappings 1411 do { 1412 UChar32 c=start+delta; 1413 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode); 1414 } while(++start<=end); 1415 break; 1416 } 1417 } else { 1418 // c decomposes, get everything from the variable-length extra data 1419 const uint16_t *mapping=getMapping(norm16); 1420 uint16_t firstUnit=*mapping; 1421 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1422 // A character that is deleted (maps to an empty string) must 1423 // get the worst-case lccc and tccc values because arbitrary 1424 // characters on both sides will become adjacent. 1425 norm16=0x1ff; 1426 } else { 1427 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 1428 norm16=mapping[1]&0xff00; // lccc 1429 } else { 1430 norm16=0; 1431 } 1432 norm16|=firstUnit>>8; // tccc 1433 } 1434 } 1435 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode); 1436 break; 1437 } 1438 } 1439 1440 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const { 1441 // Logically const: Synchronized instantiation. 1442 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1443 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode); 1444 } 1445 1446 // Dual functionality: 1447 // buffer!=NULL: normalize 1448 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1449 const UChar * 1450 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, 1451 ReorderingBuffer *buffer, 1452 UErrorCode &errorCode) const { 1453 if(limit==NULL) { 1454 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); 1455 if(U_FAILURE(errorCode)) { 1456 return src; 1457 } 1458 limit=u_strchr(src, 0); 1459 } 1460 1461 // Note: In this function we use buffer->appendZeroCC() because we track 1462 // the lead and trail combining classes here, rather than leaving it to 1463 // the ReorderingBuffer. 1464 // The exception is the call to decomposeShort() which uses the buffer 1465 // in the normal way. 1466 1467 const UTrie2 *trie=fcdTrie(); 1468 1469 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1470 // Similar to the prevBoundary in the compose() implementation. 1471 const UChar *prevBoundary=src; 1472 const UChar *prevSrc; 1473 UChar32 c=0; 1474 int32_t prevFCD16=0; 1475 uint16_t fcd16=0; 1476 1477 for(;;) { 1478 // count code units with lccc==0 1479 for(prevSrc=src; src!=limit;) { 1480 if((c=*src)<MIN_CCC_LCCC_CP) { 1481 prevFCD16=~c; 1482 ++src; 1483 } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) { 1484 prevFCD16=fcd16; 1485 ++src; 1486 } else if(!U16_IS_SURROGATE(c)) { 1487 break; 1488 } else { 1489 UChar c2; 1490 if(U16_IS_SURROGATE_LEAD(c)) { 1491 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1492 c=U16_GET_SUPPLEMENTARY(c, c2); 1493 } 1494 } else /* trail surrogate */ { 1495 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1496 --src; 1497 c=U16_GET_SUPPLEMENTARY(c2, c); 1498 } 1499 } 1500 if((fcd16=getFCD16(c))<=0xff) { 1501 prevFCD16=fcd16; 1502 src+=U16_LENGTH(c); 1503 } else { 1504 break; 1505 } 1506 } 1507 } 1508 // copy these code units all at once 1509 if(src!=prevSrc) { 1510 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { 1511 break; 1512 } 1513 if(src==limit) { 1514 break; 1515 } 1516 prevBoundary=src; 1517 // We know that the previous character's lccc==0. 1518 if(prevFCD16<0) { 1519 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1520 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16); 1521 if(prevFCD16>1) { 1522 --prevBoundary; 1523 } 1524 } else { 1525 const UChar *p=src-1; 1526 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { 1527 --p; 1528 // Need to fetch the previous character's FCD value because 1529 // prevFCD16 was just for the trail surrogate code point. 1530 prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]); 1531 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1532 } 1533 if(prevFCD16>1) { 1534 prevBoundary=p; 1535 } 1536 } 1537 // The start of the current character (c). 1538 prevSrc=src; 1539 } else if(src==limit) { 1540 break; 1541 } 1542 1543 src+=U16_LENGTH(c); 1544 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1545 // Check for proper order, and decompose locally if necessary. 1546 if((prevFCD16&0xff)<=(fcd16>>8)) { 1547 // proper order: prev tccc <= current lccc 1548 if((fcd16&0xff)<=1) { 1549 prevBoundary=src; 1550 } 1551 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { 1552 break; 1553 } 1554 prevFCD16=fcd16; 1555 continue; 1556 } else if(buffer==NULL) { 1557 return prevBoundary; // quick check "no" 1558 } else { 1559 /* 1560 * Back out the part of the source that we copied or appended 1561 * already but is now going to be decomposed. 1562 * prevSrc is set to after what was copied/appended. 1563 */ 1564 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); 1565 /* 1566 * Find the part of the source that needs to be decomposed, 1567 * up to the next safe boundary. 1568 */ 1569 src=findNextFCDBoundary(src, limit); 1570 /* 1571 * The source text does not fulfill the conditions for FCD. 1572 * Decompose and reorder a limited piece of the text. 1573 */ 1574 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { 1575 break; 1576 } 1577 prevBoundary=src; 1578 prevFCD16=0; 1579 } 1580 } 1581 return src; 1582 } 1583 1584 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, 1585 UBool doMakeFCD, 1586 ReorderingBuffer &buffer, 1587 UErrorCode &errorCode) const { 1588 if(!buffer.isEmpty()) { 1589 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); 1590 if(src!=firstBoundaryInSrc) { 1591 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), 1592 buffer.getLimit()); 1593 UnicodeString middle(lastBoundaryInDest, 1594 (int32_t)(buffer.getLimit()-lastBoundaryInDest)); 1595 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest)); 1596 middle.append(src, (int32_t)(firstBoundaryInSrc-src)); 1597 const UChar *middleStart=middle.getBuffer(); 1598 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); 1599 if(U_FAILURE(errorCode)) { 1600 return; 1601 } 1602 src=firstBoundaryInSrc; 1603 } 1604 } 1605 if(doMakeFCD) { 1606 makeFCD(src, limit, &buffer, errorCode); 1607 } else { 1608 buffer.appendZeroCC(src, limit, errorCode); 1609 } 1610 } 1611 1612 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { 1613 BackwardUTrie2StringIterator iter(fcdTrie(), start, p); 1614 uint16_t fcd16; 1615 do { 1616 fcd16=iter.previous16(); 1617 } while(fcd16>0xff); 1618 return iter.codePointStart; 1619 } 1620 1621 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { 1622 ForwardUTrie2StringIterator iter(fcdTrie(), p, limit); 1623 uint16_t fcd16; 1624 do { 1625 fcd16=iter.next16(); 1626 } while(fcd16>0xff); 1627 return iter.codePointStart; 1628 } 1629 1630 U_NAMESPACE_END 1631 1632 // Normalizer2 data swapping ----------------------------------------------- *** 1633 1634 U_NAMESPACE_USE 1635 1636 U_CAPI int32_t U_EXPORT2 1637 unorm2_swap(const UDataSwapper *ds, 1638 const void *inData, int32_t length, void *outData, 1639 UErrorCode *pErrorCode) { 1640 const UDataInfo *pInfo; 1641 int32_t headerSize; 1642 1643 const uint8_t *inBytes; 1644 uint8_t *outBytes; 1645 1646 const int32_t *inIndexes; 1647 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 1648 1649 int32_t i, offset, nextOffset, size; 1650 1651 /* udata_swapDataHeader checks the arguments */ 1652 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1653 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1654 return 0; 1655 } 1656 1657 /* check data format and format version */ 1658 pInfo=(const UDataInfo *)((const char *)inData+4); 1659 if(!( 1660 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 1661 pInfo->dataFormat[1]==0x72 && 1662 pInfo->dataFormat[2]==0x6d && 1663 pInfo->dataFormat[3]==0x32 && 1664 pInfo->formatVersion[0]==1 1665 )) { 1666 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 1667 pInfo->dataFormat[0], pInfo->dataFormat[1], 1668 pInfo->dataFormat[2], pInfo->dataFormat[3], 1669 pInfo->formatVersion[0]); 1670 *pErrorCode=U_UNSUPPORTED_ERROR; 1671 return 0; 1672 } 1673 1674 inBytes=(const uint8_t *)inData+headerSize; 1675 outBytes=(uint8_t *)outData+headerSize; 1676 1677 inIndexes=(const int32_t *)inBytes; 1678 1679 if(length>=0) { 1680 length-=headerSize; 1681 if(length<sizeof(indexes)) { 1682 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 1683 length); 1684 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1685 return 0; 1686 } 1687 } 1688 1689 /* read the first few indexes */ 1690 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 1691 indexes[i]=udata_readInt32(ds, inIndexes[i]); 1692 } 1693 1694 /* get the total length of the data */ 1695 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 1696 1697 if(length>=0) { 1698 if(length<size) { 1699 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 1700 length); 1701 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1702 return 0; 1703 } 1704 1705 /* copy the data for inaccessible bytes */ 1706 if(inBytes!=outBytes) { 1707 uprv_memcpy(outBytes, inBytes, size); 1708 } 1709 1710 offset=0; 1711 1712 /* swap the int32_t indexes[] */ 1713 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 1714 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 1715 offset=nextOffset; 1716 1717 /* swap the UTrie2 */ 1718 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 1719 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 1720 offset=nextOffset; 1721 1722 /* swap the uint16_t extraData[] */ 1723 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1]; 1724 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 1725 offset=nextOffset; 1726 1727 U_ASSERT(offset==size); 1728 } 1729 1730 return headerSize+size; 1731 } 1732 1733 #endif // !UCONFIG_NO_NORMALIZATION 1734