1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2003-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucm.c 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003jun20 16 * created by: Markus W. Scherer 17 * 18 * This file reads a .ucm file, stores its mappings and sorts them. 19 * It implements handling of Unicode conversion mappings from .ucm files 20 * for makeconv, canonucm, rptp2ucm, etc. 21 * 22 * Unicode code point sequences with a length of more than 1, 23 * as well as byte sequences with more than 4 bytes or more than one complete 24 * character sequence are handled to support m:n mappings. 25 */ 26 27 #include "unicode/utypes.h" 28 #include "unicode/ustring.h" 29 #include "cstring.h" 30 #include "cmemory.h" 31 #include "filestrm.h" 32 #include "uarrsort.h" 33 #include "ucnvmbcs.h" 34 #include "ucnv_bld.h" 35 #include "ucnv_ext.h" 36 #include "uparse.h" 37 #include "ucm.h" 38 #include <stdio.h> 39 40 #if !UCONFIG_NO_CONVERSION 41 42 /* -------------------------------------------------------------------------- */ 43 44 static void 45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { 46 int32_t j; 47 48 for(j=0; j<m->uLen; ++j) { 49 fprintf(f, "<U%04lX>", (long)codePoints[j]); 50 } 51 52 fputc(' ', f); 53 54 for(j=0; j<m->bLen; ++j) { 55 fprintf(f, "\\x%02X", bytes[j]); 56 } 57 58 if(m->f>=0) { 59 fprintf(f, " |%u\n", m->f); 60 } else { 61 fputs("\n", f); 62 } 63 } 64 65 U_CAPI void U_EXPORT2 66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { 67 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); 68 } 69 70 U_CAPI void U_EXPORT2 71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { 72 UCMapping *m; 73 int32_t i, length; 74 75 m=table->mappings; 76 length=table->mappingsLength; 77 if(byUnicode) { 78 for(i=0; i<length; ++m, ++i) { 79 ucm_printMapping(table, m, f); 80 } 81 } else { 82 const int32_t *map=table->reverseMap; 83 for(i=0; i<length; ++i) { 84 ucm_printMapping(table, m+map[i], f); 85 } 86 } 87 } 88 89 /* mapping comparisons ------------------------------------------------------ */ 90 91 static int32_t 92 compareUnicode(UCMTable *lTable, const UCMapping *l, 93 UCMTable *rTable, const UCMapping *r) { 94 const UChar32 *lu, *ru; 95 int32_t result, i, length; 96 97 if(l->uLen==1 && r->uLen==1) { 98 /* compare two single code points */ 99 return l->u-r->u; 100 } 101 102 /* get pointers to the code point sequences */ 103 lu=UCM_GET_CODE_POINTS(lTable, l); 104 ru=UCM_GET_CODE_POINTS(rTable, r); 105 106 /* get the minimum length */ 107 if(l->uLen<=r->uLen) { 108 length=l->uLen; 109 } else { 110 length=r->uLen; 111 } 112 113 /* compare the code points */ 114 for(i=0; i<length; ++i) { 115 result=lu[i]-ru[i]; 116 if(result!=0) { 117 return result; 118 } 119 } 120 121 /* compare the lengths */ 122 return l->uLen-r->uLen; 123 } 124 125 static int32_t 126 compareBytes(UCMTable *lTable, const UCMapping *l, 127 UCMTable *rTable, const UCMapping *r, 128 UBool lexical) { 129 const uint8_t *lb, *rb; 130 int32_t result, i, length; 131 132 /* 133 * A lexical comparison is used for sorting in the builder, to allow 134 * an efficient search for a byte sequence that could be a prefix 135 * of a previously entered byte sequence. 136 * 137 * Comparing by lengths first is for compatibility with old .ucm tools 138 * like canonucm and rptp2ucm. 139 */ 140 if(lexical) { 141 /* get the minimum length and continue */ 142 if(l->bLen<=r->bLen) { 143 length=l->bLen; 144 } else { 145 length=r->bLen; 146 } 147 } else { 148 /* compare lengths first */ 149 result=l->bLen-r->bLen; 150 if(result!=0) { 151 return result; 152 } else { 153 length=l->bLen; 154 } 155 } 156 157 /* get pointers to the byte sequences */ 158 lb=UCM_GET_BYTES(lTable, l); 159 rb=UCM_GET_BYTES(rTable, r); 160 161 /* compare the bytes */ 162 for(i=0; i<length; ++i) { 163 result=lb[i]-rb[i]; 164 if(result!=0) { 165 return result; 166 } 167 } 168 169 /* compare the lengths */ 170 return l->bLen-r->bLen; 171 } 172 173 /* compare UCMappings for sorting */ 174 static int32_t 175 compareMappings(UCMTable *lTable, const UCMapping *l, 176 UCMTable *rTable, const UCMapping *r, 177 UBool uFirst) { 178 int32_t result; 179 180 /* choose which side to compare first */ 181 if(uFirst) { 182 /* Unicode then bytes */ 183 result=compareUnicode(lTable, l, rTable, r); 184 if(result==0) { 185 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */ 186 } 187 } else { 188 /* bytes then Unicode */ 189 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */ 190 if(result==0) { 191 result=compareUnicode(lTable, l, rTable, r); 192 } 193 } 194 195 if(result!=0) { 196 return result; 197 } 198 199 /* compare the flags */ 200 return l->f-r->f; 201 } 202 203 /* sorting by Unicode first sorts mappings directly */ 204 static int32_t 205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { 206 return compareMappings( 207 (UCMTable *)context, (const UCMapping *)left, 208 (UCMTable *)context, (const UCMapping *)right, TRUE); 209 } 210 211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */ 212 static int32_t 213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) { 214 UCMTable *table=(UCMTable *)context; 215 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; 216 return compareMappings( 217 table, table->mappings+l, 218 table, table->mappings+r, FALSE); 219 } 220 221 U_CAPI void U_EXPORT2 222 ucm_sortTable(UCMTable *t) { 223 UErrorCode errorCode; 224 int32_t i; 225 226 if(t->isSorted) { 227 return; 228 } 229 230 errorCode=U_ZERO_ERROR; 231 232 /* 1. sort by Unicode first */ 233 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), 234 compareMappingsUnicodeFirst, t, 235 FALSE, &errorCode); 236 237 /* build the reverseMap */ 238 if(t->reverseMap==NULL) { 239 /* 240 * allocate mappingsCapacity instead of mappingsLength so that 241 * if mappings are added, the reverseMap need not be 242 * reallocated each time 243 * (see ucm_moveMappings() and ucm_addMapping()) 244 */ 245 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); 246 if(t->reverseMap==NULL) { 247 fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); 248 exit(U_MEMORY_ALLOCATION_ERROR); 249 } 250 } 251 for(i=0; i<t->mappingsLength; ++i) { 252 t->reverseMap[i]=i; 253 } 254 255 /* 2. sort reverseMap by mappings bytes first */ 256 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), 257 compareMappingsBytesFirst, t, 258 FALSE, &errorCode); 259 260 if(U_FAILURE(errorCode)) { 261 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", 262 u_errorName(errorCode)); 263 exit(errorCode); 264 } 265 266 t->isSorted=TRUE; 267 } 268 269 /* 270 * remove mappings with their move flag set from the base table 271 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table 272 */ 273 U_CAPI void U_EXPORT2 274 ucm_moveMappings(UCMTable *base, UCMTable *ext) { 275 UCMapping *mb, *mbLimit; 276 int8_t flag; 277 278 mb=base->mappings; 279 mbLimit=mb+base->mappingsLength; 280 281 while(mb<mbLimit) { 282 flag=mb->moveFlag; 283 if(flag!=0) { 284 /* reset the move flag */ 285 mb->moveFlag=0; 286 287 if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) { 288 /* add the mapping to the extension table */ 289 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); 290 } 291 292 /* remove this mapping: move the last base mapping down and overwrite the current one */ 293 if(mb<(mbLimit-1)) { 294 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); 295 } 296 --mbLimit; 297 --base->mappingsLength; 298 base->isSorted=FALSE; 299 } else { 300 ++mb; 301 } 302 } 303 } 304 305 enum { 306 NEEDS_MOVE=1, 307 HAS_ERRORS=2 308 }; 309 310 static uint8_t 311 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 312 UBool moveToExt, UBool intersectBase) { 313 UCMapping *mb, *me, *mbLimit, *meLimit; 314 int32_t cmp; 315 uint8_t result; 316 317 mb=base->mappings; 318 mbLimit=mb+base->mappingsLength; 319 320 me=ext->mappings; 321 meLimit=me+ext->mappingsLength; 322 323 result=0; 324 325 for(;;) { 326 /* skip irrelevant mappings on both sides */ 327 for(;;) { 328 if(mb==mbLimit) { 329 return result; 330 } 331 332 if((0<=mb->f && mb->f<=2) || mb->f==4) { 333 break; 334 } 335 336 ++mb; 337 } 338 339 for(;;) { 340 if(me==meLimit) { 341 return result; 342 } 343 344 if((0<=me->f && me->f<=2) || me->f==4) { 345 break; 346 } 347 348 ++me; 349 } 350 351 /* compare the base and extension mappings */ 352 cmp=compareUnicode(base, mb, ext, me); 353 if(cmp<0) { 354 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { 355 /* 356 * mapping in base but not in ext, move it 357 * 358 * if ext is DBCS, move DBCS mappings here 359 * and check SBCS ones for Unicode prefix below 360 */ 361 mb->moveFlag|=UCM_MOVE_TO_EXT; 362 result|=NEEDS_MOVE; 363 364 /* does mb map from an input sequence that is a prefix of me's? */ 365 } else if( mb->uLen<me->uLen && 366 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) 367 ) { 368 if(moveToExt) { 369 /* mark this mapping to be moved to the extension table */ 370 mb->moveFlag|=UCM_MOVE_TO_EXT; 371 result|=NEEDS_MOVE; 372 } else { 373 fprintf(stderr, 374 "ucm error: the base table contains a mapping whose input sequence\n" 375 " is a prefix of the input sequence of an extension mapping\n"); 376 ucm_printMapping(base, mb, stderr); 377 ucm_printMapping(ext, me, stderr); 378 result|=HAS_ERRORS; 379 } 380 } 381 382 ++mb; 383 } else if(cmp==0) { 384 /* 385 * same output: remove the extension mapping, 386 * otherwise treat as an error 387 */ 388 if( mb->f==me->f && mb->bLen==me->bLen && 389 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) 390 ) { 391 me->moveFlag|=UCM_REMOVE_MAPPING; 392 result|=NEEDS_MOVE; 393 } else if(intersectBase) { 394 /* mapping in base but not in ext, move it */ 395 mb->moveFlag|=UCM_MOVE_TO_EXT; 396 result|=NEEDS_MOVE; 397 } else { 398 fprintf(stderr, 399 "ucm error: the base table contains a mapping whose input sequence\n" 400 " is the same as the input sequence of an extension mapping\n" 401 " but it maps differently\n"); 402 ucm_printMapping(base, mb, stderr); 403 ucm_printMapping(ext, me, stderr); 404 result|=HAS_ERRORS; 405 } 406 407 ++mb; 408 } else /* cmp>0 */ { 409 ++me; 410 } 411 } 412 } 413 414 static uint8_t 415 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 416 UBool moveToExt, UBool intersectBase) { 417 UCMapping *mb, *me; 418 int32_t *baseMap, *extMap; 419 int32_t b, e, bLimit, eLimit, cmp; 420 uint8_t result; 421 UBool isSISO; 422 423 baseMap=base->reverseMap; 424 extMap=ext->reverseMap; 425 426 b=e=0; 427 bLimit=base->mappingsLength; 428 eLimit=ext->mappingsLength; 429 430 result=0; 431 432 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); 433 434 for(;;) { 435 /* skip irrelevant mappings on both sides */ 436 for(;; ++b) { 437 if(b==bLimit) { 438 return result; 439 } 440 mb=base->mappings+baseMap[b]; 441 442 if(intersectBase==2 && mb->bLen==1) { 443 /* 444 * comparing a base against a DBCS extension: 445 * leave SBCS base mappings alone 446 */ 447 continue; 448 } 449 450 if(mb->f==0 || mb->f==3) { 451 break; 452 } 453 } 454 455 for(;;) { 456 if(e==eLimit) { 457 return result; 458 } 459 me=ext->mappings+extMap[e]; 460 461 if(me->f==0 || me->f==3) { 462 break; 463 } 464 465 ++e; 466 } 467 468 /* compare the base and extension mappings */ 469 cmp=compareBytes(base, mb, ext, me, TRUE); 470 if(cmp<0) { 471 if(intersectBase) { 472 /* mapping in base but not in ext, move it */ 473 mb->moveFlag|=UCM_MOVE_TO_EXT; 474 result|=NEEDS_MOVE; 475 476 /* 477 * does mb map from an input sequence that is a prefix of me's? 478 * for SI/SO tables, a single byte is never a prefix because it 479 * occurs in a separate single-byte state 480 */ 481 } else if( mb->bLen<me->bLen && 482 (!isSISO || mb->bLen>1) && 483 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) 484 ) { 485 if(moveToExt) { 486 /* mark this mapping to be moved to the extension table */ 487 mb->moveFlag|=UCM_MOVE_TO_EXT; 488 result|=NEEDS_MOVE; 489 } else { 490 fprintf(stderr, 491 "ucm error: the base table contains a mapping whose input sequence\n" 492 " is a prefix of the input sequence of an extension mapping\n"); 493 ucm_printMapping(base, mb, stderr); 494 ucm_printMapping(ext, me, stderr); 495 result|=HAS_ERRORS; 496 } 497 } 498 499 ++b; 500 } else if(cmp==0) { 501 /* 502 * same output: remove the extension mapping, 503 * otherwise treat as an error 504 */ 505 if( mb->f==me->f && mb->uLen==me->uLen && 506 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) 507 ) { 508 me->moveFlag|=UCM_REMOVE_MAPPING; 509 result|=NEEDS_MOVE; 510 } else if(intersectBase) { 511 /* mapping in base but not in ext, move it */ 512 mb->moveFlag|=UCM_MOVE_TO_EXT; 513 result|=NEEDS_MOVE; 514 } else { 515 fprintf(stderr, 516 "ucm error: the base table contains a mapping whose input sequence\n" 517 " is the same as the input sequence of an extension mapping\n" 518 " but it maps differently\n"); 519 ucm_printMapping(base, mb, stderr); 520 ucm_printMapping(ext, me, stderr); 521 result|=HAS_ERRORS; 522 } 523 524 ++b; 525 } else /* cmp>0 */ { 526 ++e; 527 } 528 } 529 } 530 531 U_CAPI UBool U_EXPORT2 532 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { 533 UCMapping *m, *mLimit; 534 int32_t count; 535 UBool isOK; 536 537 m=table->mappings; 538 mLimit=m+table->mappingsLength; 539 isOK=TRUE; 540 541 while(m<mLimit) { 542 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); 543 if(count<1) { 544 ucm_printMapping(table, m, stderr); 545 isOK=FALSE; 546 } 547 ++m; 548 } 549 550 return isOK; 551 } 552 553 U_CAPI UBool U_EXPORT2 554 ucm_checkBaseExt(UCMStates *baseStates, 555 UCMTable *base, UCMTable *ext, UCMTable *moveTarget, 556 UBool intersectBase) { 557 uint8_t result; 558 559 /* if we have an extension table, we must always use precision flags */ 560 if(base->flagsType&UCM_FLAGS_IMPLICIT) { 561 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); 562 return FALSE; 563 } 564 if(ext->flagsType&UCM_FLAGS_IMPLICIT) { 565 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); 566 return FALSE; 567 } 568 569 /* checking requires both tables to be sorted */ 570 ucm_sortTable(base); 571 ucm_sortTable(ext); 572 573 /* check */ 574 result= 575 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| 576 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); 577 578 if(result&HAS_ERRORS) { 579 return FALSE; 580 } 581 582 if(result&NEEDS_MOVE) { 583 ucm_moveMappings(ext, NULL); 584 ucm_moveMappings(base, moveTarget); 585 ucm_sortTable(base); 586 ucm_sortTable(ext); 587 if(moveTarget!=NULL) { 588 ucm_sortTable(moveTarget); 589 } 590 } 591 592 return TRUE; 593 } 594 595 /* merge tables for rptp2ucm ------------------------------------------------ */ 596 597 U_CAPI void U_EXPORT2 598 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 599 const uint8_t *subchar, int32_t subcharLength, 600 uint8_t subchar1) { 601 UCMapping *fromUMapping, *toUMapping; 602 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; 603 604 ucm_sortTable(fromUTable); 605 ucm_sortTable(toUTable); 606 607 fromUMapping=fromUTable->mappings; 608 toUMapping=toUTable->mappings; 609 610 fromUTop=fromUTable->mappingsLength; 611 toUTop=toUTable->mappingsLength; 612 613 fromUIndex=toUIndex=0; 614 615 while(fromUIndex<fromUTop && toUIndex<toUTop) { 616 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE); 617 if(cmp==0) { 618 /* equal: roundtrip, nothing to do (flags are initially 0) */ 619 ++fromUMapping; 620 ++toUMapping; 621 622 ++fromUIndex; 623 ++toUIndex; 624 } else if(cmp<0) { 625 /* 626 * the fromU mapping does not have a toU counterpart: 627 * fallback Unicode->codepage 628 */ 629 if( (fromUMapping->bLen==subcharLength && 630 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || 631 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) 632 ) { 633 fromUMapping->f=2; /* SUB mapping */ 634 } else { 635 fromUMapping->f=1; /* normal fallback */ 636 } 637 638 ++fromUMapping; 639 ++fromUIndex; 640 } else { 641 /* 642 * the toU mapping does not have a fromU counterpart: 643 * (reverse) fallback codepage->Unicode, copy it to the fromU table 644 */ 645 646 /* ignore reverse fallbacks to Unicode SUB */ 647 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { 648 toUMapping->f=3; /* reverse fallback */ 649 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); 650 651 /* the table may have been reallocated */ 652 fromUMapping=fromUTable->mappings+fromUIndex; 653 } 654 655 ++toUMapping; 656 ++toUIndex; 657 } 658 } 659 660 /* either one or both tables are exhausted */ 661 while(fromUIndex<fromUTop) { 662 /* leftover fromU mappings are fallbacks */ 663 if( (fromUMapping->bLen==subcharLength && 664 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || 665 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) 666 ) { 667 fromUMapping->f=2; /* SUB mapping */ 668 } else { 669 fromUMapping->f=1; /* normal fallback */ 670 } 671 672 ++fromUMapping; 673 ++fromUIndex; 674 } 675 676 while(toUIndex<toUTop) { 677 /* leftover toU mappings are reverse fallbacks */ 678 679 /* ignore reverse fallbacks to Unicode SUB */ 680 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { 681 toUMapping->f=3; /* reverse fallback */ 682 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); 683 } 684 685 ++toUMapping; 686 ++toUIndex; 687 } 688 689 fromUTable->isSorted=FALSE; 690 } 691 692 /* separate extension mappings out of base table for rptp2ucm --------------- */ 693 694 U_CAPI UBool U_EXPORT2 695 ucm_separateMappings(UCMFile *ucm, UBool isSISO) { 696 UCMTable *table; 697 UCMapping *m, *mLimit; 698 int32_t type; 699 UBool needsMove, isOK; 700 701 table=ucm->base; 702 m=table->mappings; 703 mLimit=m+table->mappingsLength; 704 705 needsMove=FALSE; 706 isOK=TRUE; 707 708 for(; m<mLimit; ++m) { 709 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { 710 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); 711 ucm_printMapping(table, m, stderr); 712 m->moveFlag|=UCM_REMOVE_MAPPING; 713 needsMove=TRUE; 714 continue; 715 } 716 717 type=ucm_mappingType( 718 &ucm->states, m, 719 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); 720 if(type<0) { 721 /* illegal byte sequence */ 722 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); 723 isOK=FALSE; 724 } else if(type>0) { 725 m->moveFlag|=UCM_MOVE_TO_EXT; 726 needsMove=TRUE; 727 } 728 } 729 730 if(!isOK) { 731 return FALSE; 732 } 733 if(needsMove) { 734 ucm_moveMappings(ucm->base, ucm->ext); 735 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE); 736 } else { 737 ucm_sortTable(ucm->base); 738 return TRUE; 739 } 740 } 741 742 /* ucm parser --------------------------------------------------------------- */ 743 744 U_CAPI int8_t U_EXPORT2 745 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { 746 const char *s=*ps; 747 char *end; 748 uint8_t byte; 749 int8_t bLen; 750 751 bLen=0; 752 for(;;) { 753 /* skip an optional plus sign */ 754 if(bLen>0 && *s=='+') { 755 ++s; 756 } 757 if(*s!='\\') { 758 break; 759 } 760 761 if( s[1]!='x' || 762 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 763 ) { 764 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); 765 return -1; 766 } 767 768 if(bLen==UCNV_EXT_MAX_BYTES) { 769 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); 770 return -1; 771 } 772 bytes[bLen++]=byte; 773 s=end; 774 } 775 776 *ps=s; 777 return bLen; 778 } 779 780 /* parse a mapping line; must not be empty */ 781 U_CAPI UBool U_EXPORT2 782 ucm_parseMappingLine(UCMapping *m, 783 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 784 uint8_t bytes[UCNV_EXT_MAX_BYTES], 785 const char *line) { 786 const char *s; 787 char *end; 788 UChar32 cp; 789 int32_t u16Length; 790 int8_t uLen, bLen, f; 791 792 s=line; 793 uLen=bLen=0; 794 795 /* parse code points */ 796 for(;;) { 797 /* skip an optional plus sign */ 798 if(uLen>0 && *s=='+') { 799 ++s; 800 } 801 if(*s!='<') { 802 break; 803 } 804 805 if( s[1]!='U' || 806 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || 807 *end!='>' 808 ) { 809 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); 810 return FALSE; 811 } 812 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { 813 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); 814 return FALSE; 815 } 816 817 if(uLen==UCNV_EXT_MAX_UCHARS) { 818 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); 819 return FALSE; 820 } 821 codePoints[uLen++]=cp; 822 s=end+1; 823 } 824 825 if(uLen==0) { 826 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); 827 return FALSE; 828 } else if(uLen==1) { 829 m->u=codePoints[0]; 830 } else { 831 UErrorCode errorCode=U_ZERO_ERROR; 832 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); 833 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || 834 u16Length>UCNV_EXT_MAX_UCHARS 835 ) { 836 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); 837 return FALSE; 838 } 839 } 840 841 s=u_skipWhitespace(s); 842 843 /* parse bytes */ 844 bLen=ucm_parseBytes(bytes, line, &s); 845 846 if(bLen<0) { 847 return FALSE; 848 } else if(bLen==0) { 849 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); 850 return FALSE; 851 } else if(bLen<=4) { 852 uprv_memcpy(m->b.bytes, bytes, bLen); 853 } 854 855 /* skip everything until the fallback indicator, even the start of a comment */ 856 for(;;) { 857 if(*s==0) { 858 f=-1; /* no fallback indicator */ 859 break; 860 } else if(*s=='|') { 861 f=(int8_t)(s[1]-'0'); 862 if((uint8_t)f>4) { 863 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); 864 return FALSE; 865 } 866 break; 867 } 868 ++s; 869 } 870 871 m->uLen=uLen; 872 m->bLen=bLen; 873 m->f=f; 874 return TRUE; 875 } 876 877 /* general APIs ------------------------------------------------------------- */ 878 879 U_CAPI UCMTable * U_EXPORT2 880 ucm_openTable() { 881 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); 882 if(table==NULL) { 883 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); 884 exit(U_MEMORY_ALLOCATION_ERROR); 885 } 886 887 memset(table, 0, sizeof(UCMTable)); 888 return table; 889 } 890 891 U_CAPI void U_EXPORT2 892 ucm_closeTable(UCMTable *table) { 893 if(table!=NULL) { 894 uprv_free(table->mappings); 895 uprv_free(table->codePoints); 896 uprv_free(table->bytes); 897 uprv_free(table->reverseMap); 898 uprv_free(table); 899 } 900 } 901 902 U_CAPI void U_EXPORT2 903 ucm_resetTable(UCMTable *table) { 904 if(table!=NULL) { 905 table->mappingsLength=0; 906 table->flagsType=0; 907 table->unicodeMask=0; 908 table->bytesLength=table->codePointsLength=0; 909 table->isSorted=FALSE; 910 } 911 } 912 913 U_CAPI void U_EXPORT2 914 ucm_addMapping(UCMTable *table, 915 UCMapping *m, 916 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 917 uint8_t bytes[UCNV_EXT_MAX_BYTES]) { 918 UCMapping *tm; 919 UChar32 c; 920 int32_t idx; 921 922 if(table->mappingsLength>=table->mappingsCapacity) { 923 /* make the mappings array larger */ 924 if(table->mappingsCapacity==0) { 925 table->mappingsCapacity=1000; 926 } else { 927 table->mappingsCapacity*=10; 928 } 929 table->mappings=(UCMapping *)uprv_realloc(table->mappings, 930 table->mappingsCapacity*sizeof(UCMapping)); 931 if(table->mappings==NULL) { 932 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", 933 (int)table->mappingsCapacity); 934 exit(U_MEMORY_ALLOCATION_ERROR); 935 } 936 937 if(table->reverseMap!=NULL) { 938 /* the reverseMap must be reallocated in a new sort */ 939 uprv_free(table->reverseMap); 940 table->reverseMap=NULL; 941 } 942 } 943 944 if(m->uLen>1 && table->codePointsCapacity==0) { 945 table->codePointsCapacity=10000; 946 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); 947 if(table->codePoints==NULL) { 948 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", 949 (int)table->codePointsCapacity); 950 exit(U_MEMORY_ALLOCATION_ERROR); 951 } 952 } 953 954 if(m->bLen>4 && table->bytesCapacity==0) { 955 table->bytesCapacity=10000; 956 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); 957 if(table->bytes==NULL) { 958 fprintf(stderr, "ucm error: unable to allocate %d bytes\n", 959 (int)table->bytesCapacity); 960 exit(U_MEMORY_ALLOCATION_ERROR); 961 } 962 } 963 964 if(m->uLen>1) { 965 idx=table->codePointsLength; 966 table->codePointsLength+=m->uLen; 967 if(table->codePointsLength>table->codePointsCapacity) { 968 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); 969 exit(U_MEMORY_ALLOCATION_ERROR); 970 } 971 972 uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4); 973 m->u=idx; 974 } 975 976 if(m->bLen>4) { 977 idx=table->bytesLength; 978 table->bytesLength+=m->bLen; 979 if(table->bytesLength>table->bytesCapacity) { 980 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); 981 exit(U_MEMORY_ALLOCATION_ERROR); 982 } 983 984 uprv_memcpy(table->bytes+idx, bytes, m->bLen); 985 m->b.idx=idx; 986 } 987 988 /* set unicodeMask */ 989 for(idx=0; idx<m->uLen; ++idx) { 990 c=codePoints[idx]; 991 if(c>=0x10000) { 992 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ 993 } else if(U_IS_SURROGATE(c)) { 994 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ 995 } 996 } 997 998 /* set flagsType */ 999 if(m->f<0) { 1000 table->flagsType|=UCM_FLAGS_IMPLICIT; 1001 } else { 1002 table->flagsType|=UCM_FLAGS_EXPLICIT; 1003 } 1004 1005 tm=table->mappings+table->mappingsLength++; 1006 uprv_memcpy(tm, m, sizeof(UCMapping)); 1007 1008 table->isSorted=FALSE; 1009 } 1010 1011 U_CAPI UCMFile * U_EXPORT2 1012 ucm_open() { 1013 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); 1014 if(ucm==NULL) { 1015 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); 1016 exit(U_MEMORY_ALLOCATION_ERROR); 1017 } 1018 1019 memset(ucm, 0, sizeof(UCMFile)); 1020 1021 ucm->base=ucm_openTable(); 1022 ucm->ext=ucm_openTable(); 1023 1024 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; 1025 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; 1026 ucm->states.outputType=-1; 1027 ucm->states.minCharLength=ucm->states.maxCharLength=1; 1028 1029 return ucm; 1030 } 1031 1032 U_CAPI void U_EXPORT2 1033 ucm_close(UCMFile *ucm) { 1034 if(ucm!=NULL) { 1035 ucm_closeTable(ucm->base); 1036 ucm_closeTable(ucm->ext); 1037 uprv_free(ucm); 1038 } 1039 } 1040 1041 U_CAPI int32_t U_EXPORT2 1042 ucm_mappingType(UCMStates *baseStates, 1043 UCMapping *m, 1044 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1045 uint8_t bytes[UCNV_EXT_MAX_BYTES]) { 1046 /* check validity of the bytes and count the characters in them */ 1047 int32_t count=ucm_countChars(baseStates, bytes, m->bLen); 1048 if(count<1) { 1049 /* illegal byte sequence */ 1050 return -1; 1051 } 1052 1053 /* 1054 * Suitable for an ICU conversion base table means: 1055 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) 1056 * - precision flag 0..3 1057 * - SBCS: any 1:1 mapping 1058 * (the table stores additional bits to distinguish mapping types) 1059 * - MBCS: not a |2 SUB mapping for <subchar1> 1060 * - MBCS: not a |1 fallback to 0x00 1061 * - MBCS: not a multi-byte mapping with leading 0x00 bytes 1062 * 1063 * Further restrictions for fromUnicode tables 1064 * are enforced in makeconv (MBCSOkForBaseFromUnicode()). 1065 * 1066 * All of the MBCS fromUnicode specific tests could be removed from here, 1067 * but the ones above are for unusual mappings, and removing the tests 1068 * from here would change canonucm output which seems gratuitous. 1069 * (Markus Scherer 2006-nov-28) 1070 * 1071 * Exception: All implicit mappings (f<0) that need to be moved 1072 * because of fromUnicode restrictions _must_ be moved here because 1073 * makeconv uses a hack for moving mappings only for the fromUnicode table 1074 * that only works with non-negative values of f. 1075 */ 1076 if( m->uLen==1 && count==1 && m->f<=3 && 1077 (baseStates->maxCharLength==1 || 1078 !((m->f==2 && m->bLen==1) || 1079 (m->f==1 && bytes[0]==0) || 1080 (m->f<=1 && m->bLen>1 && bytes[0]==0))) 1081 ) { 1082 return 0; /* suitable for a base table */ 1083 } else { 1084 return 1; /* needs to go into an extension table */ 1085 } 1086 } 1087 1088 U_CAPI UBool U_EXPORT2 1089 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 1090 UCMapping *m, 1091 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1092 uint8_t bytes[UCNV_EXT_MAX_BYTES]) { 1093 int32_t type; 1094 1095 if(m->f==2 && m->uLen>1) { 1096 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); 1097 printMapping(m, codePoints, bytes, stderr); 1098 return FALSE; 1099 } 1100 1101 if(baseStates!=NULL) { 1102 /* check validity of the bytes and count the characters in them */ 1103 type=ucm_mappingType(baseStates, m, codePoints, bytes); 1104 if(type<0) { 1105 /* illegal byte sequence */ 1106 printMapping(m, codePoints, bytes, stderr); 1107 return FALSE; 1108 } 1109 } else { 1110 /* not used - adding a mapping for an extension-only table before its base table is read */ 1111 type=1; 1112 } 1113 1114 /* 1115 * Add the mapping to the base table if this is requested and suitable. 1116 * Otherwise, add it to the extension table. 1117 */ 1118 if(forBase && type==0) { 1119 ucm_addMapping(ucm->base, m, codePoints, bytes); 1120 } else { 1121 ucm_addMapping(ucm->ext, m, codePoints, bytes); 1122 } 1123 1124 return TRUE; 1125 } 1126 1127 U_CAPI UBool U_EXPORT2 1128 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { 1129 UCMapping m={ 0, {0}, 0, 0, 0, 0 }; 1130 UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; 1131 uint8_t bytes[UCNV_EXT_MAX_BYTES]; 1132 1133 const char *s; 1134 1135 /* ignore empty and comment lines */ 1136 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { 1137 return TRUE; 1138 } 1139 1140 return 1141 ucm_parseMappingLine(&m, codePoints, bytes, line) && 1142 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); 1143 } 1144 1145 U_CAPI void U_EXPORT2 1146 ucm_readTable(UCMFile *ucm, FileStream* convFile, 1147 UBool forBase, UCMStates *baseStates, 1148 UErrorCode *pErrorCode) { 1149 char line[500]; 1150 char *end; 1151 UBool isOK; 1152 1153 if(U_FAILURE(*pErrorCode)) { 1154 return; 1155 } 1156 1157 isOK=TRUE; 1158 1159 for(;;) { 1160 /* read the next line */ 1161 if(!T_FileStream_readLine(convFile, line, sizeof(line))) { 1162 fprintf(stderr, "incomplete charmap section\n"); 1163 isOK=FALSE; 1164 break; 1165 } 1166 1167 /* remove CR LF */ 1168 end=uprv_strchr(line, 0); 1169 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { 1170 --end; 1171 } 1172 *end=0; 1173 1174 /* ignore empty and comment lines */ 1175 if(line[0]==0 || line[0]=='#') { 1176 continue; 1177 } 1178 1179 /* stop at the end of the mapping table */ 1180 if(0==uprv_strcmp(line, "END CHARMAP")) { 1181 break; 1182 } 1183 1184 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); 1185 } 1186 1187 if(!isOK) { 1188 *pErrorCode=U_INVALID_TABLE_FORMAT; 1189 } 1190 } 1191 #endif 1192