1 /* 2 ******************************************************************************** 3 * 4 * Copyright (C) 1998-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************** 8 * 9 * 10 * makeconv.c: 11 * tool creating a binary (compressed) representation of the conversion mapping 12 * table (IBM NLTC ucmap format). 13 * 14 * 05/04/2000 helena Added fallback mapping into the picture... 15 * 06/29/2000 helena Major rewrite of the callback APIs. 16 */ 17 18 #include <stdio.h> 19 #include "unicode/putil.h" 20 #include "unicode/ucnv_err.h" 21 #include "ucnv_bld.h" 22 #include "ucnv_imp.h" 23 #include "ucnv_cnv.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "uinvchar.h" 27 #include "filestrm.h" 28 #include "toolutil.h" 29 #include "uoptions.h" 30 #include "unicode/udata.h" 31 #include "unewdata.h" 32 #include "uparse.h" 33 #include "ucm.h" 34 #include "makeconv.h" 35 #include "genmbcs.h" 36 37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 38 39 #define DEBUG 0 40 41 typedef struct ConvData { 42 UCMFile *ucm; 43 NewConverter *cnvData, *extData; 44 UConverterSharedData sharedData; 45 UConverterStaticData staticData; 46 } ConvData; 47 48 static void 49 initConvData(ConvData *data) { 50 uprv_memset(data, 0, sizeof(ConvData)); 51 data->sharedData.structSize=sizeof(UConverterSharedData); 52 data->staticData.structSize=sizeof(UConverterStaticData); 53 data->sharedData.staticData=&data->staticData; 54 } 55 56 static void 57 cleanupConvData(ConvData *data) { 58 if(data!=NULL) { 59 if(data->cnvData!=NULL) { 60 data->cnvData->close(data->cnvData); 61 data->cnvData=NULL; 62 } 63 if(data->extData!=NULL) { 64 data->extData->close(data->extData); 65 data->extData=NULL; 66 } 67 ucm_close(data->ucm); 68 data->ucm=NULL; 69 } 70 } 71 72 /* 73 * from ucnvstat.c - static prototypes of data-based converters 74 */ 75 extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]; 76 77 /* 78 * Global - verbosity 79 */ 80 UBool VERBOSE = FALSE; 81 UBool SMALL = FALSE; 82 UBool IGNORE_SISO_CHECK = FALSE; 83 84 static void 85 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); 86 87 /* 88 * Set up the UNewData and write the converter.. 89 */ 90 static void 91 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status); 92 93 UBool haveCopyright=TRUE; 94 95 static UDataInfo dataInfo={ 96 sizeof(UDataInfo), 97 0, 98 99 U_IS_BIG_ENDIAN, 100 U_CHARSET_FAMILY, 101 sizeof(UChar), 102 0, 103 104 {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */ 105 {6, 2, 0, 0}, /* formatVersion */ 106 {0, 0, 0, 0} /* dataVersion (calculated at runtime) */ 107 }; 108 109 static void 110 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status) 111 { 112 UNewDataMemory *mem = NULL; 113 uint32_t sz2; 114 uint32_t size = 0; 115 int32_t tableType; 116 117 if(U_FAILURE(*status)) 118 { 119 return; 120 } 121 122 tableType=TABLE_NONE; 123 if(data->cnvData!=NULL) { 124 tableType|=TABLE_BASE; 125 } 126 if(data->extData!=NULL) { 127 tableType|=TABLE_EXT; 128 } 129 130 mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status); 131 132 if(U_FAILURE(*status)) 133 { 134 fprintf(stderr, "Couldn't create the udata %s.%s: %s\n", 135 cnvName, 136 "cnv", 137 u_errorName(*status)); 138 return; 139 } 140 141 if(VERBOSE) 142 { 143 printf("- Opened udata %s.%s\n", cnvName, "cnv"); 144 } 145 146 147 /* all read only, clean, platform independent data. Mmmm. :) */ 148 udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData)); 149 size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */ 150 /* Now, write the table */ 151 if(tableType&TABLE_BASE) { 152 size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType); 153 } 154 if(tableType&TABLE_EXT) { 155 size += data->extData->write(data->extData, &data->staticData, mem, tableType); 156 } 157 158 sz2 = udata_finish(mem, status); 159 if(size != sz2) 160 { 161 fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size); 162 *status=U_INTERNAL_PROGRAM_ERROR; 163 } 164 if(VERBOSE) 165 { 166 printf("- Wrote %u bytes to the udata.\n", (int)sz2); 167 } 168 } 169 170 enum { 171 OPT_HELP_H, 172 OPT_HELP_QUESTION_MARK, 173 OPT_COPYRIGHT, 174 OPT_VERSION, 175 OPT_DESTDIR, 176 OPT_VERBOSE, 177 OPT_SMALL, 178 OPT_IGNORE_SISO_CHECK, 179 OPT_COUNT 180 }; 181 182 static UOption options[]={ 183 UOPTION_HELP_H, 184 UOPTION_HELP_QUESTION_MARK, 185 UOPTION_COPYRIGHT, 186 UOPTION_VERSION, 187 UOPTION_DESTDIR, 188 UOPTION_VERBOSE, 189 { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }, 190 { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } 191 }; 192 193 int main(int argc, char* argv[]) 194 { 195 ConvData data; 196 UErrorCode err = U_ZERO_ERROR, localError; 197 char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; 198 const char* destdir, *arg; 199 size_t destdirlen; 200 char* dot = NULL, *outBasename; 201 char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; 202 char cnvNameWithPkg[UCNV_MAX_FULL_FILE_NAME_LENGTH]; 203 UVersionInfo icuVersion; 204 UBool printFilename; 205 206 err = U_ZERO_ERROR; 207 208 U_MAIN_INIT_ARGS(argc, argv); 209 210 /* Set up the ICU version number */ 211 u_getVersion(icuVersion); 212 uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo)); 213 214 /* preset then read command line options */ 215 options[OPT_DESTDIR].value=u_getDataDirectory(); 216 argc=u_parseArgs(argc, argv, LENGTHOF(options), options); 217 218 /* error handling, printing usage message */ 219 if(argc<0) { 220 fprintf(stderr, 221 "error in command line argument \"%s\"\n", 222 argv[-argc]); 223 } else if(argc<2) { 224 argc=-1; 225 } 226 if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) { 227 FILE *stdfile=argc<0 ? stderr : stdout; 228 fprintf(stdfile, 229 "usage: %s [-options] files...\n" 230 "\tread .ucm codepage mapping files and write .cnv files\n" 231 "options:\n" 232 "\t-h or -? or --help this usage text\n" 233 "\t-V or --version show a version message\n" 234 "\t-c or --copyright include a copyright notice\n" 235 "\t-d or --destdir destination directory, followed by the path\n" 236 "\t-v or --verbose Turn on verbose output\n", 237 argv[0]); 238 fprintf(stdfile, 239 "\t --small Generate smaller .cnv files. They will be\n" 240 "\t significantly smaller but may not be compatible with\n" 241 "\t older versions of ICU and will require heap memory\n" 242 "\t allocation when loaded.\n" 243 "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n"); 244 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 245 } 246 247 if(options[OPT_VERSION].doesOccur) { 248 printf("makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n", 249 dataInfo.formatVersion[0], dataInfo.formatVersion[1]); 250 printf("%s\n", U_COPYRIGHT_STRING); 251 exit(0); 252 } 253 254 /* get the options values */ 255 haveCopyright = options[OPT_COPYRIGHT].doesOccur; 256 destdir = options[OPT_DESTDIR].value; 257 VERBOSE = options[OPT_VERBOSE].doesOccur; 258 SMALL = options[OPT_SMALL].doesOccur; 259 260 if (options[OPT_IGNORE_SISO_CHECK].doesOccur) { 261 IGNORE_SISO_CHECK = TRUE; 262 } 263 264 if (destdir != NULL && *destdir != 0) { 265 uprv_strcpy(outFileName, destdir); 266 destdirlen = uprv_strlen(destdir); 267 outBasename = outFileName + destdirlen; 268 if (*(outBasename - 1) != U_FILE_SEP_CHAR) { 269 *outBasename++ = U_FILE_SEP_CHAR; 270 ++destdirlen; 271 } 272 } else { 273 destdirlen = 0; 274 outBasename = outFileName; 275 } 276 277 #if DEBUG 278 { 279 int i; 280 printf("makeconv: processing %d files...\n", argc - 1); 281 for(i=1; i<argc; ++i) { 282 printf("%s ", argv[i]); 283 } 284 printf("\n"); 285 fflush(stdout); 286 } 287 #endif 288 289 err = U_ZERO_ERROR; 290 printFilename = (UBool) (argc > 2 || VERBOSE); 291 for (++argv; --argc; ++argv) 292 { 293 arg = getLongPathname(*argv); 294 295 /* Check for potential buffer overflow */ 296 if(strlen(arg) > UCNV_MAX_FULL_FILE_NAME_LENGTH) 297 { 298 fprintf(stderr, "%s\n", u_errorName(U_BUFFER_OVERFLOW_ERROR)); 299 return U_BUFFER_OVERFLOW_ERROR; 300 } 301 302 /*produces the right destination path for display*/ 303 if (destdirlen != 0) 304 { 305 const char *basename; 306 307 /* find the last file sepator */ 308 basename = findBasename(arg); 309 uprv_strcpy(outBasename, basename); 310 } 311 else 312 { 313 uprv_strcpy(outFileName, arg); 314 } 315 316 /*removes the extension if any is found*/ 317 dot = uprv_strrchr(outBasename, '.'); 318 if (dot) 319 { 320 *dot = '\0'; 321 } 322 323 /* the basename without extension is the converter name */ 324 uprv_strcpy(cnvName, outBasename); 325 326 /*Adds the target extension*/ 327 uprv_strcat(outBasename, CONVERTER_FILE_EXTENSION); 328 329 #if DEBUG 330 printf("makeconv: processing %s ...\n", arg); 331 fflush(stdout); 332 #endif 333 localError = U_ZERO_ERROR; 334 initConvData(&data); 335 createConverter(&data, arg, &localError); 336 337 if (U_FAILURE(localError)) 338 { 339 /* if an error is found, print out an error msg and keep going */ 340 fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg, 341 u_errorName(localError)); 342 if(U_SUCCESS(err)) { 343 err = localError; 344 } 345 } 346 else 347 { 348 /* Insure the static data name matches the file name */ 349 /* Changed to ignore directory and only compare base name 350 LDH 1/2/08*/ 351 char *p; 352 p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */ 353 354 if(p == NULL) /* OK, try alternate */ 355 { 356 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR); 357 if(p == NULL) 358 { 359 p=cnvName; /* If no separators, no problem */ 360 } 361 } 362 else 363 { 364 p++; /* If found separtor, don't include it in compare */ 365 } 366 if(uprv_stricmp(p,data.staticData.name)) 367 { 368 fprintf(stderr, "Warning: %s%s claims to be '%s'\n", 369 cnvName, CONVERTER_FILE_EXTENSION, 370 data.staticData.name); 371 } 372 373 uprv_strcpy((char*)data.staticData.name, cnvName); 374 375 if(!uprv_isInvariantString((char*)data.staticData.name, -1)) { 376 fprintf(stderr, 377 "Error: A converter name must contain only invariant characters.\n" 378 "%s is not a valid converter name.\n", 379 data.staticData.name); 380 if(U_SUCCESS(err)) { 381 err = U_INVALID_TABLE_FORMAT; 382 } 383 } 384 385 uprv_strcpy(cnvNameWithPkg, cnvName); 386 387 localError = U_ZERO_ERROR; 388 writeConverterData(&data, cnvNameWithPkg, destdir, &localError); 389 390 if(U_FAILURE(localError)) 391 { 392 /* if an error is found, print out an error msg and keep going*/ 393 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName, arg, 394 u_errorName(localError)); 395 if(U_SUCCESS(err)) { 396 err = localError; 397 } 398 } 399 else if (printFilename) 400 { 401 puts(outBasename); 402 } 403 } 404 fflush(stdout); 405 fflush(stderr); 406 407 cleanupConvData(&data); 408 } 409 410 return err; 411 } 412 413 static void 414 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) { 415 if( (name[0]=='i' || name[0]=='I') && 416 (name[1]=='b' || name[1]=='B') && 417 (name[2]=='m' || name[2]=='M') 418 ) { 419 name+=3; 420 if(*name=='-') { 421 ++name; 422 } 423 *pPlatform=UCNV_IBM; 424 *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10); 425 } else { 426 *pPlatform=UCNV_UNKNOWN; 427 *pCCSID=0; 428 } 429 } 430 431 static void 432 readHeader(ConvData *data, 433 FileStream* convFile, 434 const char* converterName, 435 UErrorCode *pErrorCode) { 436 char line[200]; 437 char *s, *key, *value; 438 const UConverterStaticData *prototype; 439 UConverterStaticData *staticData; 440 441 if(U_FAILURE(*pErrorCode)) { 442 return; 443 } 444 445 staticData=&data->staticData; 446 staticData->platform=UCNV_IBM; 447 staticData->subCharLen=0; 448 449 while(T_FileStream_readLine(convFile, line, sizeof(line))) { 450 /* basic parsing and handling of state-related items */ 451 if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) { 452 continue; 453 } 454 455 /* stop at the beginning of the mapping section */ 456 if(uprv_strcmp(line, "CHARMAP")==0) { 457 break; 458 } 459 460 /* collect the information from the header field, ignore unknown keys */ 461 if(uprv_strcmp(key, "code_set_name")==0) { 462 if(*value!=0) { 463 uprv_strcpy((char *)staticData->name, value); 464 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage); 465 } 466 } else if(uprv_strcmp(key, "subchar")==0) { 467 uint8_t bytes[UCNV_EXT_MAX_BYTES]; 468 int8_t length; 469 470 s=value; 471 length=ucm_parseBytes(bytes, line, (const char **)&s); 472 if(1<=length && length<=4 && *s==0) { 473 staticData->subCharLen=length; 474 uprv_memcpy(staticData->subChar, bytes, length); 475 } else { 476 fprintf(stderr, "error: illegal <subchar> %s\n", value); 477 *pErrorCode=U_INVALID_TABLE_FORMAT; 478 return; 479 } 480 } else if(uprv_strcmp(key, "subchar1")==0) { 481 uint8_t bytes[UCNV_EXT_MAX_BYTES]; 482 483 s=value; 484 if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) { 485 staticData->subChar1=bytes[0]; 486 } else { 487 fprintf(stderr, "error: illegal <subchar1> %s\n", value); 488 *pErrorCode=U_INVALID_TABLE_FORMAT; 489 return; 490 } 491 } 492 } 493 494 /* copy values from the UCMFile to the static data */ 495 staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength; 496 staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength; 497 staticData->conversionType=data->ucm->states.conversionType; 498 499 if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) { 500 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); 501 *pErrorCode=U_INVALID_TABLE_FORMAT; 502 return; 503 } 504 505 /* 506 * Now that we know the type, copy any 'default' values from the table. 507 * We need not check the type any further because the parser only 508 * recognizes what we have prototypes for. 509 * 510 * For delta (extension-only) tables, copy values from the base file 511 * instead, see createConverter(). 512 */ 513 if(data->ucm->baseName[0]==0) { 514 prototype=ucnv_converterStaticData[staticData->conversionType]; 515 if(prototype!=NULL) { 516 if(staticData->name[0]==0) { 517 uprv_strcpy((char *)staticData->name, prototype->name); 518 } 519 520 if(staticData->codepage==0) { 521 staticData->codepage=prototype->codepage; 522 } 523 524 if(staticData->platform==0) { 525 staticData->platform=prototype->platform; 526 } 527 528 if(staticData->minBytesPerChar==0) { 529 staticData->minBytesPerChar=prototype->minBytesPerChar; 530 } 531 532 if(staticData->maxBytesPerChar==0) { 533 staticData->maxBytesPerChar=prototype->maxBytesPerChar; 534 } 535 536 if(staticData->subCharLen==0) { 537 staticData->subCharLen=prototype->subCharLen; 538 if(prototype->subCharLen>0) { 539 uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); 540 } 541 } 542 } 543 } 544 545 if(data->ucm->states.outputType<0) { 546 data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1; 547 } 548 549 if( staticData->subChar1!=0 && 550 (staticData->minBytesPerChar>1 || 551 (staticData->conversionType!=UCNV_MBCS && 552 staticData->conversionType!=UCNV_EBCDIC_STATEFUL)) 553 ) { 554 fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n"); 555 *pErrorCode=U_INVALID_TABLE_FORMAT; 556 } 557 } 558 559 /* return TRUE if a base table was read, FALSE for an extension table */ 560 static UBool 561 readFile(ConvData *data, const char* converterName, 562 UErrorCode *pErrorCode) { 563 char line[200]; 564 char *end; 565 FileStream *convFile; 566 567 UCMStates *baseStates; 568 UBool dataIsBase; 569 570 if(U_FAILURE(*pErrorCode)) { 571 return FALSE; 572 } 573 574 data->ucm=ucm_open(); 575 576 convFile=T_FileStream_open(converterName, "r"); 577 if(convFile==NULL) { 578 *pErrorCode=U_FILE_ACCESS_ERROR; 579 return FALSE; 580 } 581 582 readHeader(data, convFile, converterName, pErrorCode); 583 if(U_FAILURE(*pErrorCode)) { 584 return FALSE; 585 } 586 587 if(data->ucm->baseName[0]==0) { 588 dataIsBase=TRUE; 589 baseStates=&data->ucm->states; 590 ucm_processStates(baseStates, IGNORE_SISO_CHECK); 591 } else { 592 dataIsBase=FALSE; 593 baseStates=NULL; 594 } 595 596 /* read the base table */ 597 ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode); 598 if(U_FAILURE(*pErrorCode)) { 599 return FALSE; 600 } 601 602 /* read an extension table if there is one */ 603 while(T_FileStream_readLine(convFile, line, sizeof(line))) { 604 end=uprv_strchr(line, 0); 605 while(line<end && 606 (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) { 607 --end; 608 } 609 *end=0; 610 611 if(line[0]=='#' || u_skipWhitespace(line)==end) { 612 continue; /* ignore empty and comment lines */ 613 } 614 615 if(0==uprv_strcmp(line, "CHARMAP")) { 616 /* read the extension table */ 617 ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode); 618 } else { 619 fprintf(stderr, "unexpected text after the base mapping table\n"); 620 } 621 break; 622 } 623 624 T_FileStream_close(convFile); 625 626 if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) { 627 fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); 628 *pErrorCode=U_INVALID_TABLE_FORMAT; 629 } 630 631 return dataIsBase; 632 } 633 634 static void 635 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) { 636 ConvData baseData; 637 UBool dataIsBase; 638 639 UConverterStaticData *staticData; 640 UCMStates *states, *baseStates; 641 642 if(U_FAILURE(*pErrorCode)) { 643 return; 644 } 645 646 initConvData(data); 647 648 dataIsBase=readFile(data, converterName, pErrorCode); 649 if(U_FAILURE(*pErrorCode)) { 650 return; 651 } 652 653 staticData=&data->staticData; 654 states=&data->ucm->states; 655 656 if(dataIsBase) { 657 /* 658 * Build a normal .cnv file with a base table 659 * and an optional extension table. 660 */ 661 data->cnvData=MBCSOpen(data->ucm); 662 if(data->cnvData==NULL) { 663 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 664 665 } else if(!data->cnvData->isValid(data->cnvData, 666 staticData->subChar, staticData->subCharLen) 667 ) { 668 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); 669 *pErrorCode=U_INVALID_TABLE_FORMAT; 670 671 } else if(staticData->subChar1!=0 && 672 !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1) 673 ) { 674 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); 675 *pErrorCode=U_INVALID_TABLE_FORMAT; 676 677 } else if( 678 data->ucm->ext->mappingsLength>0 && 679 !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) 680 ) { 681 *pErrorCode=U_INVALID_TABLE_FORMAT; 682 } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) { 683 /* sort the table so that it can be turned into UTF-8-friendly data */ 684 ucm_sortTable(data->ucm->base); 685 } 686 687 if(U_SUCCESS(*pErrorCode)) { 688 if( 689 /* add the base table after ucm_checkBaseExt()! */ 690 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData) 691 ) { 692 *pErrorCode=U_INVALID_TABLE_FORMAT; 693 } else { 694 /* 695 * addTable() may have requested moving more mappings to the extension table 696 * if they fit into the base toUnicode table but not into the 697 * base fromUnicode table. 698 * (Especially for UTF-8-friendly fromUnicode tables.) 699 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them 700 * to be excluded from the extension toUnicode data. 701 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into 702 * the base fromUnicode table. 703 */ 704 ucm_moveMappings(data->ucm->base, data->ucm->ext); 705 ucm_sortTable(data->ucm->ext); 706 if(data->ucm->ext->mappingsLength>0) { 707 /* prepare the extension table, if there is one */ 708 data->extData=CnvExtOpen(data->ucm); 709 if(data->extData==NULL) { 710 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 711 } else if( 712 !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) 713 ) { 714 *pErrorCode=U_INVALID_TABLE_FORMAT; 715 } 716 } 717 } 718 } 719 } else { 720 /* Build an extension-only .cnv file. */ 721 char baseFilename[500]; 722 char *basename; 723 724 initConvData(&baseData); 725 726 /* assemble a path/filename for data->ucm->baseName */ 727 uprv_strcpy(baseFilename, converterName); 728 basename=(char *)findBasename(baseFilename); 729 uprv_strcpy(basename, data->ucm->baseName); 730 uprv_strcat(basename, ".ucm"); 731 732 /* read the base table */ 733 dataIsBase=readFile(&baseData, baseFilename, pErrorCode); 734 if(U_FAILURE(*pErrorCode)) { 735 return; 736 } else if(!dataIsBase) { 737 fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename); 738 *pErrorCode=U_INVALID_TABLE_FORMAT; 739 } else { 740 /* prepare the extension table */ 741 data->extData=CnvExtOpen(data->ucm); 742 if(data->extData==NULL) { 743 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 744 } else { 745 /* fill in gaps in extension file header fields */ 746 UCMapping *m, *mLimit; 747 uint8_t fallbackFlags; 748 749 baseStates=&baseData.ucm->states; 750 if(states->conversionType==UCNV_DBCS) { 751 staticData->minBytesPerChar=(int8_t)(states->minCharLength=2); 752 } else if(states->minCharLength==0) { 753 staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength); 754 } 755 if(states->maxCharLength<states->minCharLength) { 756 staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength); 757 } 758 759 if(staticData->subCharLen==0) { 760 uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4); 761 staticData->subCharLen=baseData.staticData.subCharLen; 762 } 763 /* 764 * do not copy subChar1 - 765 * only use what is explicitly specified 766 * because it cannot be unset in the extension file header 767 */ 768 769 /* get the fallback flags */ 770 fallbackFlags=0; 771 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; 772 m<mLimit && fallbackFlags!=3; 773 ++m 774 ) { 775 if(m->f==1) { 776 fallbackFlags|=1; 777 } else if(m->f==3) { 778 fallbackFlags|=2; 779 } 780 } 781 782 if(fallbackFlags&1) { 783 staticData->hasFromUnicodeFallback=TRUE; 784 } 785 if(fallbackFlags&2) { 786 staticData->hasToUnicodeFallback=TRUE; 787 } 788 789 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) { 790 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); 791 *pErrorCode=U_INVALID_TABLE_FORMAT; 792 793 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { 794 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); 795 *pErrorCode=U_INVALID_TABLE_FORMAT; 796 797 } else if( 798 !ucm_checkValidity(data->ucm->ext, baseStates) || 799 !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) 800 ) { 801 *pErrorCode=U_INVALID_TABLE_FORMAT; 802 } else { 803 if(states->maxCharLength>1) { 804 /* 805 * When building a normal .cnv file with a base table 806 * for an MBCS (not SBCS) table with explicit precision flags, 807 * the MBCSAddTable() function marks some mappings for moving 808 * to the extension table. 809 * They fit into the base toUnicode table but not into the 810 * base fromUnicode table. 811 * (Note: We do have explicit precision flags because they are 812 * required for extension table generation, and 813 * ucm_checkBaseExt() verified it.) 814 * 815 * We do not call MBCSAddTable() here (we probably could) 816 * so we need to do the analysis before building the extension table. 817 * We assume that MBCSAddTable() will build a UTF-8-friendly table. 818 * Redundant mappings in the extension table are ok except they cost some size. 819 * 820 * Do this after ucm_checkBaseExt(). 821 */ 822 const MBCSData *mbcsData=MBCSGetDummy(); 823 int32_t needsMove=0; 824 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; 825 m<mLimit; 826 ++m 827 ) { 828 if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) { 829 m->f|=MBCS_FROM_U_EXT_FLAG; 830 m->moveFlag=UCM_MOVE_TO_EXT; 831 ++needsMove; 832 } 833 } 834 835 if(needsMove!=0) { 836 ucm_moveMappings(baseData.ucm->base, data->ucm->ext); 837 ucm_sortTable(data->ucm->ext); 838 } 839 } 840 if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) { 841 *pErrorCode=U_INVALID_TABLE_FORMAT; 842 } 843 } 844 } 845 } 846 847 cleanupConvData(&baseData); 848 } 849 } 850 851 /* 852 * Hey, Emacs, please set the following: 853 * 854 * Local Variables: 855 * indent-tabs-mode: nil 856 * End: 857 * 858 */ 859