1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2003-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: icuswap.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003aug08 16 * created by: Markus W. Scherer 17 * 18 * This tool takes an ICU data file and "swaps" it, that is, changes its 19 * platform properties between big-/little-endianness and ASCII/EBCDIC charset 20 * families. 21 * The modified data file is written to a new file. 22 * Useful as an install-time tool for shipping only one flavor of ICU data 23 * and preparing data files for the target platform. 24 * Will not work with data DLLs (shared libraries). 25 */ 26 27 #include "unicode/utypes.h" 28 #include "unicode/putil.h" 29 #include "unicode/udata.h" 30 #include "cmemory.h" 31 #include "cstring.h" 32 #include "uinvchar.h" 33 #include "uarrsort.h" 34 #include "ucmndata.h" 35 #include "udataswp.h" 36 #include "swapimpl.h" 37 #include "toolutil.h" 38 #include "uoptions.h" 39 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 44 /* definitions */ 45 46 #define DEFAULT_PADDING_LENGTH 15 47 48 static UOption options[]={ 49 UOPTION_HELP_H, 50 UOPTION_HELP_QUESTION_MARK, 51 UOPTION_DEF("type", 't', UOPT_REQUIRES_ARG) 52 }; 53 54 enum { 55 OPT_HELP_H, 56 OPT_HELP_QUESTION_MARK, 57 OPT_OUT_TYPE 58 }; 59 60 static int32_t 61 fileSize(FILE *f) { 62 int32_t size; 63 64 fseek(f, 0, SEEK_END); 65 size=(int32_t)ftell(f); 66 fseek(f, 0, SEEK_SET); 67 return size; 68 } 69 70 /** 71 * Swap an ICU .dat package, including swapping of enclosed items. 72 */ 73 U_CFUNC int32_t U_CALLCONV 74 udata_swapPackage(const char *inFilename, const char *outFilename, 75 const UDataSwapper *ds, 76 const void *inData, int32_t length, void *outData, 77 UErrorCode *pErrorCode); 78 79 U_CDECL_BEGIN 80 static void U_CALLCONV 81 printError(void *context, const char *fmt, va_list args) { 82 vfprintf((FILE *)context, fmt, args); 83 } 84 U_CDECL_END 85 86 static int 87 printUsage(const char *pname, UBool ishelp) { 88 fprintf(stderr, 89 "%csage: %s [ -h, -?, --help ] -tl|-tb|-te|--type=b|... infilename outfilename\n", 90 ishelp ? 'U' : 'u', pname); 91 if(ishelp) { 92 fprintf(stderr, 93 "\nOptions: -h, -?, --help print this message and exit\n" 94 " Read the input file, swap its platform properties according\n" 95 " to the -t or --type option, and write the result to the output file.\n" 96 " -tl change to little-endian/ASCII charset family\n" 97 " -tb change to big-endian/ASCII charset family\n" 98 " -te change to big-endian/EBCDIC charset family\n"); 99 } 100 101 return !ishelp; 102 } 103 104 extern int 105 main(int argc, char *argv[]) { 106 FILE *in, *out; 107 const char *pname; 108 char *data; 109 int32_t length; 110 UBool ishelp; 111 int rc; 112 113 UDataSwapper *ds; 114 const UDataInfo *pInfo; 115 UErrorCode errorCode; 116 uint8_t outCharset; 117 UBool outIsBigEndian; 118 119 U_MAIN_INIT_ARGS(argc, argv); 120 121 fprintf(stderr, "Warning: icuswap is an obsolete tool and it will be removed in the next ICU release.\nPlease use the icupkg tool instead.\n"); 122 123 /* get the program basename */ 124 pname=strrchr(argv[0], U_FILE_SEP_CHAR); 125 if(pname==NULL) { 126 pname=strrchr(argv[0], '/'); 127 } 128 if(pname!=NULL) { 129 ++pname; 130 } else { 131 pname=argv[0]; 132 } 133 134 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); 135 ishelp=options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur; 136 if(ishelp || argc!=3) { 137 return printUsage(pname, ishelp); 138 } 139 140 /* parse the output type option */ 141 data=(char *)options[OPT_OUT_TYPE].value; 142 if(data[0]==0 || data[1]!=0) { 143 /* the type must be exactly one letter */ 144 return printUsage(pname, FALSE); 145 } 146 switch(data[0]) { 147 case 'l': 148 outIsBigEndian=FALSE; 149 outCharset=U_ASCII_FAMILY; 150 break; 151 case 'b': 152 outIsBigEndian=TRUE; 153 outCharset=U_ASCII_FAMILY; 154 break; 155 case 'e': 156 outIsBigEndian=TRUE; 157 outCharset=U_EBCDIC_FAMILY; 158 break; 159 default: 160 return printUsage(pname, FALSE); 161 } 162 163 in=out=NULL; 164 data=NULL; 165 166 /* open the input file, get its length, allocate memory for it, read the file */ 167 in=fopen(argv[1], "rb"); 168 if(in==NULL) { 169 fprintf(stderr, "%s: unable to open input file \"%s\"\n", pname, argv[1]); 170 rc=2; 171 goto done; 172 } 173 174 length=fileSize(in); 175 if(length<DEFAULT_PADDING_LENGTH) { 176 fprintf(stderr, "%s: empty input file \"%s\"\n", pname, argv[1]); 177 rc=2; 178 goto done; 179 } 180 181 /* 182 * +15: udata_swapPackage() may need to add a few padding bytes to the 183 * last item if charset swapping is done, 184 * because the last item may be resorted into the middle and then needs 185 * additional padding bytes 186 */ 187 data=(char *)malloc(length+DEFAULT_PADDING_LENGTH); 188 if(data==NULL) { 189 fprintf(stderr, "%s: error allocating memory for \"%s\"\n", pname, argv[1]); 190 rc=2; 191 goto done; 192 } 193 194 /* set the last 15 bytes to the usual padding byte, see udata_swapPackage() */ 195 uprv_memset(data+length-DEFAULT_PADDING_LENGTH, 0xaa, DEFAULT_PADDING_LENGTH); 196 197 if(length!=(int32_t)fread(data, 1, length, in)) { 198 fprintf(stderr, "%s: error reading \"%s\"\n", pname, argv[1]); 199 rc=3; 200 goto done; 201 } 202 203 fclose(in); 204 in=NULL; 205 206 /* swap the data in-place */ 207 errorCode=U_ZERO_ERROR; 208 ds=udata_openSwapperForInputData(data, length, outIsBigEndian, outCharset, &errorCode); 209 if(U_FAILURE(errorCode)) { 210 fprintf(stderr, "%s: udata_openSwapperForInputData(\"%s\") failed - %s\n", 211 pname, argv[1], u_errorName(errorCode)); 212 rc=4; 213 goto done; 214 } 215 216 ds->printError=printError; 217 ds->printErrorContext=stderr; 218 219 /* speculative cast, protected by the following length check */ 220 pInfo=(const UDataInfo *)((const char *)data+4); 221 222 if( length>=20 && 223 pInfo->dataFormat[0]==0x43 && /* dataFormat="CmnD" */ 224 pInfo->dataFormat[1]==0x6d && 225 pInfo->dataFormat[2]==0x6e && 226 pInfo->dataFormat[3]==0x44 227 ) { 228 /* 229 * swap the .dat package 230 * udata_swapPackage() needs to rename ToC name entries from the old package 231 * name to the new one. 232 * We pass it the filenames, and udata_swapPackage() will extract the 233 * package names. 234 */ 235 length=udata_swapPackage(argv[1], argv[2], ds, data, length, data, &errorCode); 236 udata_closeSwapper(ds); 237 if(U_FAILURE(errorCode)) { 238 fprintf(stderr, "%s: udata_swapPackage(\"%s\") failed - %s\n", 239 pname, argv[1], u_errorName(errorCode)); 240 rc=4; 241 goto done; 242 } 243 } else { 244 /* swap the data, which is not a .dat package */ 245 length=udata_swap(ds, data, length, data, &errorCode); 246 udata_closeSwapper(ds); 247 if(U_FAILURE(errorCode)) { 248 fprintf(stderr, "%s: udata_swap(\"%s\") failed - %s\n", 249 pname, argv[1], u_errorName(errorCode)); 250 rc=4; 251 goto done; 252 } 253 } 254 255 out=fopen(argv[2], "wb"); 256 if(out==NULL) { 257 fprintf(stderr, "%s: unable to open output file \"%s\"\n", pname, argv[2]); 258 rc=5; 259 goto done; 260 } 261 262 if(length!=(int32_t)fwrite(data, 1, length, out)) { 263 fprintf(stderr, "%s: error writing \"%s\"\n", pname, argv[2]); 264 rc=6; 265 goto done; 266 } 267 268 fclose(out); 269 out=NULL; 270 271 /* all done */ 272 rc=0; 273 274 done: 275 if(in!=NULL) { 276 fclose(in); 277 } 278 if(out!=NULL) { 279 fclose(out); 280 } 281 if(data!=NULL) { 282 free(data); 283 } 284 return rc; 285 } 286 287 /* swap .dat package files -------------------------------------------------- */ 288 289 static int32_t 290 extractPackageName(const UDataSwapper *ds, const char *filename, 291 char pkg[], int32_t capacity, 292 UErrorCode *pErrorCode) { 293 const char *basename; 294 int32_t len; 295 296 if(U_FAILURE(*pErrorCode)) { 297 return 0; 298 } 299 300 basename=findBasename(filename); 301 len=(int32_t)uprv_strlen(basename)-4; /* -4: subtract the length of ".dat" */ 302 303 if(len<=0 || 0!=uprv_strcmp(basename+len, ".dat")) { 304 udata_printError(ds, "udata_swapPackage(): \"%s\" is not recognized as a package filename (must end with .dat)\n", 305 basename); 306 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 307 return 0; 308 } 309 310 if(len>=capacity) { 311 udata_printError(ds, "udata_swapPackage(): the package name \"%s\" is too long (>=%ld)\n", 312 (long)capacity); 313 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 314 return 0; 315 } 316 317 uprv_memcpy(pkg, basename, len); 318 pkg[len]=0; 319 return len; 320 } 321 322 struct ToCEntry { 323 uint32_t nameOffset, inOffset, outOffset, length; 324 }; 325 326 U_CDECL_BEGIN 327 static int32_t U_CALLCONV 328 compareToCEntries(const void *context, const void *left, const void *right) { 329 const char *chars=(const char *)context; 330 return (int32_t)uprv_strcmp(chars+((const ToCEntry *)left)->nameOffset, 331 chars+((const ToCEntry *)right)->nameOffset); 332 } 333 U_CDECL_END 334 335 U_CFUNC int32_t U_CALLCONV 336 udata_swapPackage(const char *inFilename, const char *outFilename, 337 const UDataSwapper *ds, 338 const void *inData, int32_t length, void *outData, 339 UErrorCode *pErrorCode) { 340 const UDataInfo *pInfo; 341 int32_t headerSize; 342 343 const uint8_t *inBytes; 344 uint8_t *outBytes; 345 346 uint32_t itemCount, offset, i; 347 int32_t itemLength; 348 349 const UDataOffsetTOCEntry *inEntries; 350 UDataOffsetTOCEntry *outEntries; 351 352 ToCEntry *table; 353 354 char inPkgName[32], outPkgName[32]; 355 int32_t inPkgNameLength, outPkgNameLength; 356 357 /* udata_swapDataHeader checks the arguments */ 358 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 359 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 360 return 0; 361 } 362 363 /* check data format and format version */ 364 pInfo=(const UDataInfo *)((const char *)inData+4); 365 if(!( 366 pInfo->dataFormat[0]==0x43 && /* dataFormat="CmnD" */ 367 pInfo->dataFormat[1]==0x6d && 368 pInfo->dataFormat[2]==0x6e && 369 pInfo->dataFormat[3]==0x44 && 370 pInfo->formatVersion[0]==1 371 )) { 372 udata_printError(ds, "udata_swapPackage(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n", 373 pInfo->dataFormat[0], pInfo->dataFormat[1], 374 pInfo->dataFormat[2], pInfo->dataFormat[3], 375 pInfo->formatVersion[0]); 376 *pErrorCode=U_UNSUPPORTED_ERROR; 377 return 0; 378 } 379 380 /* 381 * We need to change the ToC name entries so that they have the correct 382 * package name prefix. 383 * Extract the package names from the in/out filenames. 384 */ 385 inPkgNameLength=extractPackageName( 386 ds, inFilename, 387 inPkgName, (int32_t)sizeof(inPkgName), 388 pErrorCode); 389 outPkgNameLength=extractPackageName( 390 ds, outFilename, 391 outPkgName, (int32_t)sizeof(outPkgName), 392 pErrorCode); 393 if(U_FAILURE(*pErrorCode)) { 394 return 0; 395 } 396 397 /* 398 * It is possible to work with inPkgNameLength!=outPkgNameLength, 399 * but then the length of the data file would change more significantly, 400 * which we are not currently prepared for. 401 */ 402 if(inPkgNameLength!=outPkgNameLength) { 403 udata_printError(ds, "udata_swapPackage(): the package names \"%s\" and \"%s\" must have the same length\n", 404 inPkgName, outPkgName); 405 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 406 return 0; 407 } 408 409 inBytes=(const uint8_t *)inData+headerSize; 410 inEntries=(const UDataOffsetTOCEntry *)(inBytes+4); 411 412 if(length<0) { 413 /* preflighting */ 414 itemCount=ds->readUInt32(*(const uint32_t *)inBytes); 415 if(itemCount==0) { 416 /* no items: count only the item count and return */ 417 return headerSize+4; 418 } 419 420 /* read the last item's offset and preflight it */ 421 offset=ds->readUInt32(inEntries[itemCount-1].dataOffset); 422 itemLength=udata_swap(ds, inBytes+offset, -1, NULL, pErrorCode); 423 424 if(U_SUCCESS(*pErrorCode)) { 425 return headerSize+offset+(uint32_t)itemLength; 426 } else { 427 return 0; 428 } 429 } else { 430 /* check that the itemCount fits, then the ToC table, then at least the header of the last item */ 431 length-=headerSize; 432 if(length<4) { 433 /* itemCount does not fit */ 434 offset=0xffffffff; 435 itemCount=0; /* make compilers happy */ 436 } else { 437 itemCount=ds->readUInt32(*(const uint32_t *)inBytes); 438 if(itemCount==0) { 439 offset=4; 440 } else if((uint32_t)length<(4+8*itemCount)) { 441 /* ToC table does not fit */ 442 offset=0xffffffff; 443 } else { 444 /* offset of the last item plus at least 20 bytes for its header */ 445 offset=20+ds->readUInt32(inEntries[itemCount-1].dataOffset); 446 } 447 } 448 if((uint32_t)length<offset) { 449 udata_printError(ds, "udata_swapPackage(): too few bytes (%d after header) for a .dat package\n", 450 length); 451 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 452 return 0; 453 } 454 455 outBytes=(uint8_t *)outData+headerSize; 456 457 /* swap the item count */ 458 ds->swapArray32(ds, inBytes, 4, outBytes, pErrorCode); 459 460 if(itemCount==0) { 461 /* no items: just return now */ 462 return headerSize+4; 463 } 464 465 /* swap the item name strings */ 466 offset=4+8*itemCount; 467 itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset)-offset); 468 udata_swapInvStringBlock(ds, inBytes+offset, itemLength, outBytes+offset, pErrorCode); 469 if(U_FAILURE(*pErrorCode)) { 470 udata_printError(ds, "udata_swapPackage() failed to swap the data item name strings\n"); 471 return 0; 472 } 473 /* keep offset and itemLength in case we allocate and copy the strings below */ 474 475 /* swap the package names into the output charset */ 476 if(ds->outCharset!=U_CHARSET_FAMILY) { 477 UDataSwapper *ds2; 478 ds2=udata_openSwapper(TRUE, U_CHARSET_FAMILY, TRUE, ds->outCharset, pErrorCode); 479 ds2->swapInvChars(ds2, inPkgName, inPkgNameLength, inPkgName, pErrorCode); 480 ds2->swapInvChars(ds2, outPkgName, outPkgNameLength, outPkgName, pErrorCode); 481 udata_closeSwapper(ds2); 482 if(U_FAILURE(*pErrorCode)) { 483 udata_printError(ds, "udata_swapPackage() failed to swap the input/output package names\n"); 484 } 485 } 486 487 /* change the prefix of each ToC entry name from the old to the new package name */ 488 { 489 char *entryName; 490 491 for(i=0; i<itemCount; ++i) { 492 entryName=(char *)inBytes+ds->readUInt32(inEntries[i].nameOffset); 493 494 if(0==uprv_memcmp(entryName, inPkgName, inPkgNameLength)) { 495 uprv_memcpy(entryName, outPkgName, inPkgNameLength); 496 } else { 497 udata_printError(ds, "udata_swapPackage() failed: ToC item %ld does not have the input package name as a prefix\n", 498 (long)i); 499 *pErrorCode=U_INVALID_FORMAT_ERROR; 500 return 0; 501 } 502 } 503 } 504 505 /* 506 * Allocate the ToC table and, if necessary, a temporary buffer for 507 * pseudo-in-place swapping. 508 * 509 * We cannot swap in-place because: 510 * 511 * 1. If the swapping of an item fails mid-way, then in-place swapping 512 * has destroyed its data. 513 * Out-of-place swapping allows us to then copy its original data. 514 * 515 * 2. If swapping changes the charset family, then we must resort 516 * not only the ToC table but also the data items themselves. 517 * This requires a permutation and is best done with separate in/out 518 * buffers. 519 * 520 * We swapped the strings above to avoid the malloc below if string swapping fails. 521 */ 522 if(inData==outData) { 523 /* +15: prepare for extra padding of a newly-last item */ 524 table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH); 525 if(table!=NULL) { 526 outBytes=(uint8_t *)(table+itemCount); 527 528 /* copy the item count and the swapped strings */ 529 uprv_memcpy(outBytes, inBytes, 4); 530 uprv_memcpy(outBytes+offset, inBytes+offset, itemLength); 531 } 532 } else { 533 table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)); 534 } 535 if(table==NULL) { 536 udata_printError(ds, "udata_swapPackage(): out of memory allocating %d bytes\n", 537 inData==outData ? 538 itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH : 539 itemCount*sizeof(ToCEntry)); 540 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 541 return 0; 542 } 543 outEntries=(UDataOffsetTOCEntry *)(outBytes+4); 544 545 /* read the ToC table */ 546 for(i=0; i<itemCount; ++i) { 547 table[i].nameOffset=ds->readUInt32(inEntries[i].nameOffset); 548 table[i].inOffset=ds->readUInt32(inEntries[i].dataOffset); 549 if(i>0) { 550 table[i-1].length=table[i].inOffset-table[i-1].inOffset; 551 } 552 } 553 table[itemCount-1].length=(uint32_t)length-table[itemCount-1].inOffset; 554 555 if(ds->inCharset==ds->outCharset) { 556 /* no charset swapping, no resorting: keep item offsets the same */ 557 for(i=0; i<itemCount; ++i) { 558 table[i].outOffset=table[i].inOffset; 559 } 560 } else { 561 /* charset swapping: resort items by their swapped names */ 562 563 /* 564 * Before the actual sorting, we need to make sure that each item 565 * has a length that is a multiple of 16 bytes so that all items 566 * are 16-aligned. 567 * Only the old last item may be missing up to 15 padding bytes. 568 * Add padding bytes for it. 569 * Since the icuswap main() function has already allocated enough 570 * input buffer space and set the last 15 bytes there to 0xaa, 571 * we only need to increase the total data length and the length 572 * of the last item here. 573 */ 574 if((length&0xf)!=0) { 575 int32_t delta=16-(length&0xf); 576 length+=delta; 577 table[itemCount-1].length+=(uint32_t)delta; 578 } 579 580 /* Save the offset before we sort the TOC. */ 581 offset=table[0].inOffset; 582 /* sort the TOC entries */ 583 uprv_sortArray(table, (int32_t)itemCount, (int32_t)sizeof(ToCEntry), 584 compareToCEntries, outBytes, FALSE, pErrorCode); 585 586 /* 587 * Note: Before sorting, the inOffset values were in order. 588 * Now the outOffset values are in order. 589 */ 590 591 /* assign outOffset values */ 592 for(i=0; i<itemCount; ++i) { 593 table[i].outOffset=offset; 594 offset+=table[i].length; 595 } 596 } 597 598 /* write the output ToC table */ 599 for(i=0; i<itemCount; ++i) { 600 ds->writeUInt32(&outEntries[i].nameOffset, table[i].nameOffset); 601 ds->writeUInt32(&outEntries[i].dataOffset, table[i].outOffset); 602 } 603 604 /* swap each data item */ 605 for(i=0; i<itemCount; ++i) { 606 /* first copy the item bytes to make sure that unreachable bytes are copied */ 607 uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length); 608 609 /* swap the item */ 610 udata_swap(ds, inBytes+table[i].inOffset, (int32_t)table[i].length, 611 outBytes+table[i].outOffset, pErrorCode); 612 613 if(U_FAILURE(*pErrorCode)) { 614 if(ds->outCharset==U_CHARSET_FAMILY) { 615 udata_printError(ds, "warning: udata_swapPackage() failed to swap item \"%s\"\n" 616 " at inOffset 0x%x length 0x%x - %s\n" 617 " the data item will be copied, not swapped\n\n", 618 (char *)outBytes+table[i].nameOffset, 619 table[i].inOffset, table[i].length, u_errorName(*pErrorCode)); 620 } else { 621 udata_printError(ds, "warning: udata_swapPackage() failed to swap an item\n" 622 " at inOffset 0x%x length 0x%x - %s\n" 623 " the data item will be copied, not swapped\n\n", 624 table[i].inOffset, table[i].length, u_errorName(*pErrorCode)); 625 } 626 /* reset the error code, copy the data item, and continue */ 627 *pErrorCode=U_ZERO_ERROR; 628 uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length); 629 } 630 } 631 632 if(inData==outData) { 633 /* copy the data from the temporary buffer to the in-place buffer */ 634 uprv_memcpy((uint8_t *)outData+headerSize, outBytes, length); 635 } 636 uprv_free(table); 637 638 return headerSize+length; 639 } 640 } 641 642 /* 643 * Hey, Emacs, please set the following: 644 * 645 * Local Variables: 646 * indent-tabs-mode: nil 647 * End: 648 * 649 */ 650