1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2003-2013, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucm.c 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003jun20 16 * created by: Markus W. Scherer 17 * 18 * This file reads a .ucm file, stores its mappings and sorts them. 19 * It implements handling of Unicode conversion mappings from .ucm files 20 * for makeconv, canonucm, rptp2ucm, etc. 21 * 22 * Unicode code point sequences with a length of more than 1, 23 * as well as byte sequences with more than 4 bytes or more than one complete 24 * character sequence are handled to support m:n mappings. 25 */ 26 27 #include "unicode/utypes.h" 28 #include "unicode/ustring.h" 29 #include "cstring.h" 30 #include "cmemory.h" 31 #include "filestrm.h" 32 #include "uarrsort.h" 33 #include "ucnvmbcs.h" 34 #include "ucnv_bld.h" 35 #include "ucnv_ext.h" 36 #include "uparse.h" 37 #include "ucm.h" 38 #include <stdio.h> 39 40 #if !UCONFIG_NO_CONVERSION 41 42 /* -------------------------------------------------------------------------- */ 43 44 static void 45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { 46 int32_t j; 47 48 for(j=0; j<m->uLen; ++j) { 49 fprintf(f, "<U%04lX>", (long)codePoints[j]); 50 } 51 52 fputc(' ', f); 53 54 for(j=0; j<m->bLen; ++j) { 55 fprintf(f, "\\x%02X", bytes[j]); 56 } 57 58 if(m->f>=0) { 59 fprintf(f, " |%u\n", m->f); 60 } else { 61 fputs("\n", f); 62 } 63 } 64 65 U_CAPI void U_EXPORT2 66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { 67 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); 68 } 69 70 U_CAPI void U_EXPORT2 71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { 72 UCMapping *m; 73 int32_t i, length; 74 75 m=table->mappings; 76 length=table->mappingsLength; 77 if(byUnicode) { 78 for(i=0; i<length; ++m, ++i) { 79 ucm_printMapping(table, m, f); 80 } 81 } else { 82 const int32_t *map=table->reverseMap; 83 for(i=0; i<length; ++i) { 84 ucm_printMapping(table, m+map[i], f); 85 } 86 } 87 } 88 89 /* mapping comparisons ------------------------------------------------------ */ 90 91 static int32_t 92 compareUnicode(UCMTable *lTable, const UCMapping *l, 93 UCMTable *rTable, const UCMapping *r) { 94 const UChar32 *lu, *ru; 95 int32_t result, i, length; 96 97 if(l->uLen==1 && r->uLen==1) { 98 /* compare two single code points */ 99 return l->u-r->u; 100 } 101 102 /* get pointers to the code point sequences */ 103 lu=UCM_GET_CODE_POINTS(lTable, l); 104 ru=UCM_GET_CODE_POINTS(rTable, r); 105 106 /* get the minimum length */ 107 if(l->uLen<=r->uLen) { 108 length=l->uLen; 109 } else { 110 length=r->uLen; 111 } 112 113 /* compare the code points */ 114 for(i=0; i<length; ++i) { 115 result=lu[i]-ru[i]; 116 if(result!=0) { 117 return result; 118 } 119 } 120 121 /* compare the lengths */ 122 return l->uLen-r->uLen; 123 } 124 125 static int32_t 126 compareBytes(UCMTable *lTable, const UCMapping *l, 127 UCMTable *rTable, const UCMapping *r, 128 UBool lexical) { 129 const uint8_t *lb, *rb; 130 int32_t result, i, length; 131 132 /* 133 * A lexical comparison is used for sorting in the builder, to allow 134 * an efficient search for a byte sequence that could be a prefix 135 * of a previously entered byte sequence. 136 * 137 * Comparing by lengths first is for compatibility with old .ucm tools 138 * like canonucm and rptp2ucm. 139 */ 140 if(lexical) { 141 /* get the minimum length and continue */ 142 if(l->bLen<=r->bLen) { 143 length=l->bLen; 144 } else { 145 length=r->bLen; 146 } 147 } else { 148 /* compare lengths first */ 149 result=l->bLen-r->bLen; 150 if(result!=0) { 151 return result; 152 } else { 153 length=l->bLen; 154 } 155 } 156 157 /* get pointers to the byte sequences */ 158 lb=UCM_GET_BYTES(lTable, l); 159 rb=UCM_GET_BYTES(rTable, r); 160 161 /* compare the bytes */ 162 for(i=0; i<length; ++i) { 163 result=lb[i]-rb[i]; 164 if(result!=0) { 165 return result; 166 } 167 } 168 169 /* compare the lengths */ 170 return l->bLen-r->bLen; 171 } 172 173 /* compare UCMappings for sorting */ 174 static int32_t 175 compareMappings(UCMTable *lTable, const UCMapping *l, 176 UCMTable *rTable, const UCMapping *r, 177 UBool uFirst) { 178 int32_t result; 179 180 /* choose which side to compare first */ 181 if(uFirst) { 182 /* Unicode then bytes */ 183 result=compareUnicode(lTable, l, rTable, r); 184 if(result==0) { 185 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */ 186 } 187 } else { 188 /* bytes then Unicode */ 189 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */ 190 if(result==0) { 191 result=compareUnicode(lTable, l, rTable, r); 192 } 193 } 194 195 if(result!=0) { 196 return result; 197 } 198 199 /* compare the flags */ 200 return l->f-r->f; 201 } 202 U_CDECL_BEGIN 203 /* sorting by Unicode first sorts mappings directly */ 204 static int32_t U_CALLCONV 205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { 206 return compareMappings( 207 (UCMTable *)context, (const UCMapping *)left, 208 (UCMTable *)context, (const UCMapping *)right, TRUE); 209 } 210 211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */ 212 static int32_t U_CALLCONV 213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) { 214 UCMTable *table=(UCMTable *)context; 215 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; 216 return compareMappings( 217 table, table->mappings+l, 218 table, table->mappings+r, FALSE); 219 } 220 U_CDECL_END 221 222 U_CAPI void U_EXPORT2 223 ucm_sortTable(UCMTable *t) { 224 UErrorCode errorCode; 225 int32_t i; 226 227 if(t->isSorted) { 228 return; 229 } 230 231 errorCode=U_ZERO_ERROR; 232 233 /* 1. sort by Unicode first */ 234 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), 235 compareMappingsUnicodeFirst, t, 236 FALSE, &errorCode); 237 238 /* build the reverseMap */ 239 if(t->reverseMap==NULL) { 240 /* 241 * allocate mappingsCapacity instead of mappingsLength so that 242 * if mappings are added, the reverseMap need not be 243 * reallocated each time 244 * (see ucm_moveMappings() and ucm_addMapping()) 245 */ 246 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); 247 if(t->reverseMap==NULL) { 248 fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); 249 exit(U_MEMORY_ALLOCATION_ERROR); 250 } 251 } 252 for(i=0; i<t->mappingsLength; ++i) { 253 t->reverseMap[i]=i; 254 } 255 256 /* 2. sort reverseMap by mappings bytes first */ 257 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), 258 compareMappingsBytesFirst, t, 259 FALSE, &errorCode); 260 261 if(U_FAILURE(errorCode)) { 262 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", 263 u_errorName(errorCode)); 264 exit(errorCode); 265 } 266 267 t->isSorted=TRUE; 268 } 269 270 /* 271 * remove mappings with their move flag set from the base table 272 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table 273 */ 274 U_CAPI void U_EXPORT2 275 ucm_moveMappings(UCMTable *base, UCMTable *ext) { 276 UCMapping *mb, *mbLimit; 277 int8_t flag; 278 279 mb=base->mappings; 280 mbLimit=mb+base->mappingsLength; 281 282 while(mb<mbLimit) { 283 flag=mb->moveFlag; 284 if(flag!=0) { 285 /* reset the move flag */ 286 mb->moveFlag=0; 287 288 if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) { 289 /* add the mapping to the extension table */ 290 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); 291 } 292 293 /* remove this mapping: move the last base mapping down and overwrite the current one */ 294 if(mb<(mbLimit-1)) { 295 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); 296 } 297 --mbLimit; 298 --base->mappingsLength; 299 base->isSorted=FALSE; 300 } else { 301 ++mb; 302 } 303 } 304 } 305 306 enum { 307 NEEDS_MOVE=1, 308 HAS_ERRORS=2 309 }; 310 311 static uint8_t 312 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 313 UBool moveToExt, UBool intersectBase) { 314 (void)baseStates; 315 316 UCMapping *mb, *me, *mbLimit, *meLimit; 317 int32_t cmp; 318 uint8_t result; 319 320 mb=base->mappings; 321 mbLimit=mb+base->mappingsLength; 322 323 me=ext->mappings; 324 meLimit=me+ext->mappingsLength; 325 326 result=0; 327 328 for(;;) { 329 /* skip irrelevant mappings on both sides */ 330 for(;;) { 331 if(mb==mbLimit) { 332 return result; 333 } 334 335 if((0<=mb->f && mb->f<=2) || mb->f==4) { 336 break; 337 } 338 339 ++mb; 340 } 341 342 for(;;) { 343 if(me==meLimit) { 344 return result; 345 } 346 347 if((0<=me->f && me->f<=2) || me->f==4) { 348 break; 349 } 350 351 ++me; 352 } 353 354 /* compare the base and extension mappings */ 355 cmp=compareUnicode(base, mb, ext, me); 356 if(cmp<0) { 357 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { 358 /* 359 * mapping in base but not in ext, move it 360 * 361 * if ext is DBCS, move DBCS mappings here 362 * and check SBCS ones for Unicode prefix below 363 */ 364 mb->moveFlag|=UCM_MOVE_TO_EXT; 365 result|=NEEDS_MOVE; 366 367 /* does mb map from an input sequence that is a prefix of me's? */ 368 } else if( mb->uLen<me->uLen && 369 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) 370 ) { 371 if(moveToExt) { 372 /* mark this mapping to be moved to the extension table */ 373 mb->moveFlag|=UCM_MOVE_TO_EXT; 374 result|=NEEDS_MOVE; 375 } else { 376 fprintf(stderr, 377 "ucm error: the base table contains a mapping whose input sequence\n" 378 " is a prefix of the input sequence of an extension mapping\n"); 379 ucm_printMapping(base, mb, stderr); 380 ucm_printMapping(ext, me, stderr); 381 result|=HAS_ERRORS; 382 } 383 } 384 385 ++mb; 386 } else if(cmp==0) { 387 /* 388 * same output: remove the extension mapping, 389 * otherwise treat as an error 390 */ 391 if( mb->f==me->f && mb->bLen==me->bLen && 392 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) 393 ) { 394 me->moveFlag|=UCM_REMOVE_MAPPING; 395 result|=NEEDS_MOVE; 396 } else if(intersectBase) { 397 /* mapping in base but not in ext, move it */ 398 mb->moveFlag|=UCM_MOVE_TO_EXT; 399 result|=NEEDS_MOVE; 400 } else { 401 fprintf(stderr, 402 "ucm error: the base table contains a mapping whose input sequence\n" 403 " is the same as the input sequence of an extension mapping\n" 404 " but it maps differently\n"); 405 ucm_printMapping(base, mb, stderr); 406 ucm_printMapping(ext, me, stderr); 407 result|=HAS_ERRORS; 408 } 409 410 ++mb; 411 } else /* cmp>0 */ { 412 ++me; 413 } 414 } 415 } 416 417 static uint8_t 418 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 419 UBool moveToExt, UBool intersectBase) { 420 UCMapping *mb, *me; 421 int32_t *baseMap, *extMap; 422 int32_t b, e, bLimit, eLimit, cmp; 423 uint8_t result; 424 UBool isSISO; 425 426 baseMap=base->reverseMap; 427 extMap=ext->reverseMap; 428 429 b=e=0; 430 bLimit=base->mappingsLength; 431 eLimit=ext->mappingsLength; 432 433 result=0; 434 435 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); 436 437 for(;;) { 438 /* skip irrelevant mappings on both sides */ 439 for(;; ++b) { 440 if(b==bLimit) { 441 return result; 442 } 443 mb=base->mappings+baseMap[b]; 444 445 if(intersectBase==2 && mb->bLen==1) { 446 /* 447 * comparing a base against a DBCS extension: 448 * leave SBCS base mappings alone 449 */ 450 continue; 451 } 452 453 if(mb->f==0 || mb->f==3) { 454 break; 455 } 456 } 457 458 for(;;) { 459 if(e==eLimit) { 460 return result; 461 } 462 me=ext->mappings+extMap[e]; 463 464 if(me->f==0 || me->f==3) { 465 break; 466 } 467 468 ++e; 469 } 470 471 /* compare the base and extension mappings */ 472 cmp=compareBytes(base, mb, ext, me, TRUE); 473 if(cmp<0) { 474 if(intersectBase) { 475 /* mapping in base but not in ext, move it */ 476 mb->moveFlag|=UCM_MOVE_TO_EXT; 477 result|=NEEDS_MOVE; 478 479 /* 480 * does mb map from an input sequence that is a prefix of me's? 481 * for SI/SO tables, a single byte is never a prefix because it 482 * occurs in a separate single-byte state 483 */ 484 } else if( mb->bLen<me->bLen && 485 (!isSISO || mb->bLen>1) && 486 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) 487 ) { 488 if(moveToExt) { 489 /* mark this mapping to be moved to the extension table */ 490 mb->moveFlag|=UCM_MOVE_TO_EXT; 491 result|=NEEDS_MOVE; 492 } else { 493 fprintf(stderr, 494 "ucm error: the base table contains a mapping whose input sequence\n" 495 " is a prefix of the input sequence of an extension mapping\n"); 496 ucm_printMapping(base, mb, stderr); 497 ucm_printMapping(ext, me, stderr); 498 result|=HAS_ERRORS; 499 } 500 } 501 502 ++b; 503 } else if(cmp==0) { 504 /* 505 * same output: remove the extension mapping, 506 * otherwise treat as an error 507 */ 508 if( mb->f==me->f && mb->uLen==me->uLen && 509 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) 510 ) { 511 me->moveFlag|=UCM_REMOVE_MAPPING; 512 result|=NEEDS_MOVE; 513 } else if(intersectBase) { 514 /* mapping in base but not in ext, move it */ 515 mb->moveFlag|=UCM_MOVE_TO_EXT; 516 result|=NEEDS_MOVE; 517 } else { 518 fprintf(stderr, 519 "ucm error: the base table contains a mapping whose input sequence\n" 520 " is the same as the input sequence of an extension mapping\n" 521 " but it maps differently\n"); 522 ucm_printMapping(base, mb, stderr); 523 ucm_printMapping(ext, me, stderr); 524 result|=HAS_ERRORS; 525 } 526 527 ++b; 528 } else /* cmp>0 */ { 529 ++e; 530 } 531 } 532 } 533 534 U_CAPI UBool U_EXPORT2 535 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { 536 UCMapping *m, *mLimit; 537 int32_t count; 538 UBool isOK; 539 540 m=table->mappings; 541 mLimit=m+table->mappingsLength; 542 isOK=TRUE; 543 544 while(m<mLimit) { 545 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); 546 if(count<1) { 547 ucm_printMapping(table, m, stderr); 548 isOK=FALSE; 549 } 550 ++m; 551 } 552 553 return isOK; 554 } 555 556 U_CAPI UBool U_EXPORT2 557 ucm_checkBaseExt(UCMStates *baseStates, 558 UCMTable *base, UCMTable *ext, UCMTable *moveTarget, 559 UBool intersectBase) { 560 uint8_t result; 561 562 /* if we have an extension table, we must always use precision flags */ 563 if(base->flagsType&UCM_FLAGS_IMPLICIT) { 564 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); 565 return FALSE; 566 } 567 if(ext->flagsType&UCM_FLAGS_IMPLICIT) { 568 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); 569 return FALSE; 570 } 571 572 /* checking requires both tables to be sorted */ 573 ucm_sortTable(base); 574 ucm_sortTable(ext); 575 576 /* check */ 577 result= 578 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| 579 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); 580 581 if(result&HAS_ERRORS) { 582 return FALSE; 583 } 584 585 if(result&NEEDS_MOVE) { 586 ucm_moveMappings(ext, NULL); 587 ucm_moveMappings(base, moveTarget); 588 ucm_sortTable(base); 589 ucm_sortTable(ext); 590 if(moveTarget!=NULL) { 591 ucm_sortTable(moveTarget); 592 } 593 } 594 595 return TRUE; 596 } 597 598 /* merge tables for rptp2ucm ------------------------------------------------ */ 599 600 U_CAPI void U_EXPORT2 601 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 602 const uint8_t *subchar, int32_t subcharLength, 603 uint8_t subchar1) { 604 UCMapping *fromUMapping, *toUMapping; 605 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; 606 607 ucm_sortTable(fromUTable); 608 ucm_sortTable(toUTable); 609 610 fromUMapping=fromUTable->mappings; 611 toUMapping=toUTable->mappings; 612 613 fromUTop=fromUTable->mappingsLength; 614 toUTop=toUTable->mappingsLength; 615 616 fromUIndex=toUIndex=0; 617 618 while(fromUIndex<fromUTop && toUIndex<toUTop) { 619 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE); 620 if(cmp==0) { 621 /* equal: roundtrip, nothing to do (flags are initially 0) */ 622 ++fromUMapping; 623 ++toUMapping; 624 625 ++fromUIndex; 626 ++toUIndex; 627 } else if(cmp<0) { 628 /* 629 * the fromU mapping does not have a toU counterpart: 630 * fallback Unicode->codepage 631 */ 632 if( (fromUMapping->bLen==subcharLength && 633 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || 634 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) 635 ) { 636 fromUMapping->f=2; /* SUB mapping */ 637 } else { 638 fromUMapping->f=1; /* normal fallback */ 639 } 640 641 ++fromUMapping; 642 ++fromUIndex; 643 } else { 644 /* 645 * the toU mapping does not have a fromU counterpart: 646 * (reverse) fallback codepage->Unicode, copy it to the fromU table 647 */ 648 649 /* ignore reverse fallbacks to Unicode SUB */ 650 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { 651 toUMapping->f=3; /* reverse fallback */ 652 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); 653 654 /* the table may have been reallocated */ 655 fromUMapping=fromUTable->mappings+fromUIndex; 656 } 657 658 ++toUMapping; 659 ++toUIndex; 660 } 661 } 662 663 /* either one or both tables are exhausted */ 664 while(fromUIndex<fromUTop) { 665 /* leftover fromU mappings are fallbacks */ 666 if( (fromUMapping->bLen==subcharLength && 667 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || 668 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) 669 ) { 670 fromUMapping->f=2; /* SUB mapping */ 671 } else { 672 fromUMapping->f=1; /* normal fallback */ 673 } 674 675 ++fromUMapping; 676 ++fromUIndex; 677 } 678 679 while(toUIndex<toUTop) { 680 /* leftover toU mappings are reverse fallbacks */ 681 682 /* ignore reverse fallbacks to Unicode SUB */ 683 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { 684 toUMapping->f=3; /* reverse fallback */ 685 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); 686 } 687 688 ++toUMapping; 689 ++toUIndex; 690 } 691 692 fromUTable->isSorted=FALSE; 693 } 694 695 /* separate extension mappings out of base table for rptp2ucm --------------- */ 696 697 U_CAPI UBool U_EXPORT2 698 ucm_separateMappings(UCMFile *ucm, UBool isSISO) { 699 UCMTable *table; 700 UCMapping *m, *mLimit; 701 int32_t type; 702 UBool needsMove, isOK; 703 704 table=ucm->base; 705 m=table->mappings; 706 mLimit=m+table->mappingsLength; 707 708 needsMove=FALSE; 709 isOK=TRUE; 710 711 for(; m<mLimit; ++m) { 712 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { 713 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); 714 ucm_printMapping(table, m, stderr); 715 m->moveFlag|=UCM_REMOVE_MAPPING; 716 needsMove=TRUE; 717 continue; 718 } 719 720 type=ucm_mappingType( 721 &ucm->states, m, 722 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); 723 if(type<0) { 724 /* illegal byte sequence */ 725 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); 726 isOK=FALSE; 727 } else if(type>0) { 728 m->moveFlag|=UCM_MOVE_TO_EXT; 729 needsMove=TRUE; 730 } 731 } 732 733 if(!isOK) { 734 return FALSE; 735 } 736 if(needsMove) { 737 ucm_moveMappings(ucm->base, ucm->ext); 738 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE); 739 } else { 740 ucm_sortTable(ucm->base); 741 return TRUE; 742 } 743 } 744 745 /* ucm parser --------------------------------------------------------------- */ 746 747 U_CAPI int8_t U_EXPORT2 748 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { 749 const char *s=*ps; 750 char *end; 751 uint8_t byte; 752 int8_t bLen; 753 754 bLen=0; 755 for(;;) { 756 /* skip an optional plus sign */ 757 if(bLen>0 && *s=='+') { 758 ++s; 759 } 760 if(*s!='\\') { 761 break; 762 } 763 764 if( s[1]!='x' || 765 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 766 ) { 767 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); 768 return -1; 769 } 770 771 if(bLen==UCNV_EXT_MAX_BYTES) { 772 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); 773 return -1; 774 } 775 bytes[bLen++]=byte; 776 s=end; 777 } 778 779 *ps=s; 780 return bLen; 781 } 782 783 /* parse a mapping line; must not be empty */ 784 U_CAPI UBool U_EXPORT2 785 ucm_parseMappingLine(UCMapping *m, 786 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 787 uint8_t bytes[UCNV_EXT_MAX_BYTES], 788 const char *line) { 789 const char *s; 790 char *end; 791 UChar32 cp; 792 int32_t u16Length; 793 int8_t uLen, bLen, f; 794 795 s=line; 796 uLen=bLen=0; 797 798 /* parse code points */ 799 for(;;) { 800 /* skip an optional plus sign */ 801 if(uLen>0 && *s=='+') { 802 ++s; 803 } 804 if(*s!='<') { 805 break; 806 } 807 808 if( s[1]!='U' || 809 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || 810 *end!='>' 811 ) { 812 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); 813 return FALSE; 814 } 815 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { 816 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); 817 return FALSE; 818 } 819 820 if(uLen==UCNV_EXT_MAX_UCHARS) { 821 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); 822 return FALSE; 823 } 824 codePoints[uLen++]=cp; 825 s=end+1; 826 } 827 828 if(uLen==0) { 829 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); 830 return FALSE; 831 } else if(uLen==1) { 832 m->u=codePoints[0]; 833 } else { 834 UErrorCode errorCode=U_ZERO_ERROR; 835 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); 836 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || 837 u16Length>UCNV_EXT_MAX_UCHARS 838 ) { 839 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); 840 return FALSE; 841 } 842 } 843 844 s=u_skipWhitespace(s); 845 846 /* parse bytes */ 847 bLen=ucm_parseBytes(bytes, line, &s); 848 849 if(bLen<0) { 850 return FALSE; 851 } else if(bLen==0) { 852 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); 853 return FALSE; 854 } else if(bLen<=4) { 855 uprv_memcpy(m->b.bytes, bytes, bLen); 856 } 857 858 /* skip everything until the fallback indicator, even the start of a comment */ 859 for(;;) { 860 if(*s==0) { 861 f=-1; /* no fallback indicator */ 862 break; 863 } else if(*s=='|') { 864 f=(int8_t)(s[1]-'0'); 865 if((uint8_t)f>4) { 866 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); 867 return FALSE; 868 } 869 break; 870 } 871 ++s; 872 } 873 874 m->uLen=uLen; 875 m->bLen=bLen; 876 m->f=f; 877 return TRUE; 878 } 879 880 /* general APIs ------------------------------------------------------------- */ 881 882 U_CAPI UCMTable * U_EXPORT2 883 ucm_openTable() { 884 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); 885 if(table==NULL) { 886 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); 887 exit(U_MEMORY_ALLOCATION_ERROR); 888 } 889 890 memset(table, 0, sizeof(UCMTable)); 891 return table; 892 } 893 894 U_CAPI void U_EXPORT2 895 ucm_closeTable(UCMTable *table) { 896 if(table!=NULL) { 897 uprv_free(table->mappings); 898 uprv_free(table->codePoints); 899 uprv_free(table->bytes); 900 uprv_free(table->reverseMap); 901 uprv_free(table); 902 } 903 } 904 905 U_CAPI void U_EXPORT2 906 ucm_resetTable(UCMTable *table) { 907 if(table!=NULL) { 908 table->mappingsLength=0; 909 table->flagsType=0; 910 table->unicodeMask=0; 911 table->bytesLength=table->codePointsLength=0; 912 table->isSorted=FALSE; 913 } 914 } 915 916 U_CAPI void U_EXPORT2 917 ucm_addMapping(UCMTable *table, 918 UCMapping *m, 919 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 920 uint8_t bytes[UCNV_EXT_MAX_BYTES]) { 921 UCMapping *tm; 922 UChar32 c; 923 int32_t idx; 924 925 if(table->mappingsLength>=table->mappingsCapacity) { 926 /* make the mappings array larger */ 927 if(table->mappingsCapacity==0) { 928 table->mappingsCapacity=1000; 929 } else { 930 table->mappingsCapacity*=10; 931 } 932 table->mappings=(UCMapping *)uprv_realloc(table->mappings, 933 table->mappingsCapacity*sizeof(UCMapping)); 934 if(table->mappings==NULL) { 935 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", 936 (int)table->mappingsCapacity); 937 exit(U_MEMORY_ALLOCATION_ERROR); 938 } 939 940 if(table->reverseMap!=NULL) { 941 /* the reverseMap must be reallocated in a new sort */ 942 uprv_free(table->reverseMap); 943 table->reverseMap=NULL; 944 } 945 } 946 947 if(m->uLen>1 && table->codePointsCapacity==0) { 948 table->codePointsCapacity=10000; 949 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); 950 if(table->codePoints==NULL) { 951 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", 952 (int)table->codePointsCapacity); 953 exit(U_MEMORY_ALLOCATION_ERROR); 954 } 955 } 956 957 if(m->bLen>4 && table->bytesCapacity==0) { 958 table->bytesCapacity=10000; 959 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); 960 if(table->bytes==NULL) { 961 fprintf(stderr, "ucm error: unable to allocate %d bytes\n", 962 (int)table->bytesCapacity); 963 exit(U_MEMORY_ALLOCATION_ERROR); 964 } 965 } 966 967 if(m->uLen>1) { 968 idx=table->codePointsLength; 969 table->codePointsLength+=m->uLen; 970 if(table->codePointsLength>table->codePointsCapacity) { 971 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); 972 exit(U_MEMORY_ALLOCATION_ERROR); 973 } 974 975 uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4); 976 m->u=idx; 977 } 978 979 if(m->bLen>4) { 980 idx=table->bytesLength; 981 table->bytesLength+=m->bLen; 982 if(table->bytesLength>table->bytesCapacity) { 983 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); 984 exit(U_MEMORY_ALLOCATION_ERROR); 985 } 986 987 uprv_memcpy(table->bytes+idx, bytes, m->bLen); 988 m->b.idx=idx; 989 } 990 991 /* set unicodeMask */ 992 for(idx=0; idx<m->uLen; ++idx) { 993 c=codePoints[idx]; 994 if(c>=0x10000) { 995 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ 996 } else if(U_IS_SURROGATE(c)) { 997 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ 998 } 999 } 1000 1001 /* set flagsType */ 1002 if(m->f<0) { 1003 table->flagsType|=UCM_FLAGS_IMPLICIT; 1004 } else { 1005 table->flagsType|=UCM_FLAGS_EXPLICIT; 1006 } 1007 1008 tm=table->mappings+table->mappingsLength++; 1009 uprv_memcpy(tm, m, sizeof(UCMapping)); 1010 1011 table->isSorted=FALSE; 1012 } 1013 1014 U_CAPI UCMFile * U_EXPORT2 1015 ucm_open() { 1016 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); 1017 if(ucm==NULL) { 1018 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); 1019 exit(U_MEMORY_ALLOCATION_ERROR); 1020 } 1021 1022 memset(ucm, 0, sizeof(UCMFile)); 1023 1024 ucm->base=ucm_openTable(); 1025 ucm->ext=ucm_openTable(); 1026 1027 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; 1028 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; 1029 ucm->states.outputType=-1; 1030 ucm->states.minCharLength=ucm->states.maxCharLength=1; 1031 1032 return ucm; 1033 } 1034 1035 U_CAPI void U_EXPORT2 1036 ucm_close(UCMFile *ucm) { 1037 if(ucm!=NULL) { 1038 ucm_closeTable(ucm->base); 1039 ucm_closeTable(ucm->ext); 1040 uprv_free(ucm); 1041 } 1042 } 1043 1044 U_CAPI int32_t U_EXPORT2 1045 ucm_mappingType(UCMStates *baseStates, 1046 UCMapping *m, 1047 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1048 uint8_t bytes[UCNV_EXT_MAX_BYTES]) { 1049 (void)codePoints; 1050 /* check validity of the bytes and count the characters in them */ 1051 int32_t count=ucm_countChars(baseStates, bytes, m->bLen); 1052 if(count<1) { 1053 /* illegal byte sequence */ 1054 return -1; 1055 } 1056 1057 /* 1058 * Suitable for an ICU conversion base table means: 1059 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) 1060 * - precision flag 0..3 1061 * - SBCS: any 1:1 mapping 1062 * (the table stores additional bits to distinguish mapping types) 1063 * - MBCS: not a |2 SUB mapping for <subchar1> 1064 * - MBCS: not a |1 fallback to 0x00 1065 * - MBCS: not a multi-byte mapping with leading 0x00 bytes 1066 * 1067 * Further restrictions for fromUnicode tables 1068 * are enforced in makeconv (MBCSOkForBaseFromUnicode()). 1069 * 1070 * All of the MBCS fromUnicode specific tests could be removed from here, 1071 * but the ones above are for unusual mappings, and removing the tests 1072 * from here would change canonucm output which seems gratuitous. 1073 * (Markus Scherer 2006-nov-28) 1074 * 1075 * Exception: All implicit mappings (f<0) that need to be moved 1076 * because of fromUnicode restrictions _must_ be moved here because 1077 * makeconv uses a hack for moving mappings only for the fromUnicode table 1078 * that only works with non-negative values of f. 1079 */ 1080 if( m->uLen==1 && count==1 && m->f<=3 && 1081 (baseStates->maxCharLength==1 || 1082 !((m->f==2 && m->bLen==1) || 1083 (m->f==1 && bytes[0]==0) || 1084 (m->f<=1 && m->bLen>1 && bytes[0]==0))) 1085 ) { 1086 return 0; /* suitable for a base table */ 1087 } else { 1088 return 1; /* needs to go into an extension table */ 1089 } 1090 } 1091 1092 U_CAPI UBool U_EXPORT2 1093 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 1094 UCMapping *m, 1095 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 1096 uint8_t bytes[UCNV_EXT_MAX_BYTES]) { 1097 int32_t type; 1098 1099 if(m->f==2 && m->uLen>1) { 1100 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); 1101 printMapping(m, codePoints, bytes, stderr); 1102 return FALSE; 1103 } 1104 1105 if(baseStates!=NULL) { 1106 /* check validity of the bytes and count the characters in them */ 1107 type=ucm_mappingType(baseStates, m, codePoints, bytes); 1108 if(type<0) { 1109 /* illegal byte sequence */ 1110 printMapping(m, codePoints, bytes, stderr); 1111 return FALSE; 1112 } 1113 } else { 1114 /* not used - adding a mapping for an extension-only table before its base table is read */ 1115 type=1; 1116 } 1117 1118 /* 1119 * Add the mapping to the base table if this is requested and suitable. 1120 * Otherwise, add it to the extension table. 1121 */ 1122 if(forBase && type==0) { 1123 ucm_addMapping(ucm->base, m, codePoints, bytes); 1124 } else { 1125 ucm_addMapping(ucm->ext, m, codePoints, bytes); 1126 } 1127 1128 return TRUE; 1129 } 1130 1131 U_CAPI UBool U_EXPORT2 1132 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { 1133 UCMapping m={ 0, {0}, 0, 0, 0, 0 }; 1134 UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; 1135 uint8_t bytes[UCNV_EXT_MAX_BYTES]; 1136 1137 const char *s; 1138 1139 /* ignore empty and comment lines */ 1140 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { 1141 return TRUE; 1142 } 1143 1144 return 1145 ucm_parseMappingLine(&m, codePoints, bytes, line) && 1146 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); 1147 } 1148 1149 U_CAPI void U_EXPORT2 1150 ucm_readTable(UCMFile *ucm, FileStream* convFile, 1151 UBool forBase, UCMStates *baseStates, 1152 UErrorCode *pErrorCode) { 1153 char line[500]; 1154 char *end; 1155 UBool isOK; 1156 1157 if(U_FAILURE(*pErrorCode)) { 1158 return; 1159 } 1160 1161 isOK=TRUE; 1162 1163 for(;;) { 1164 /* read the next line */ 1165 if(!T_FileStream_readLine(convFile, line, sizeof(line))) { 1166 fprintf(stderr, "incomplete charmap section\n"); 1167 isOK=FALSE; 1168 break; 1169 } 1170 1171 /* remove CR LF */ 1172 end=uprv_strchr(line, 0); 1173 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { 1174 --end; 1175 } 1176 *end=0; 1177 1178 /* ignore empty and comment lines */ 1179 if(line[0]==0 || line[0]=='#') { 1180 continue; 1181 } 1182 1183 /* stop at the end of the mapping table */ 1184 if(0==uprv_strcmp(line, "END CHARMAP")) { 1185 break; 1186 } 1187 1188 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); 1189 } 1190 1191 if(!isOK) { 1192 *pErrorCode=U_INVALID_TABLE_FORMAT; 1193 } 1194 } 1195 #endif 1196