1 /***************************************************************************** 2 * 3 * Copyright (C) 1999-2013, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 * 6 ******************************************************************************/ 7 8 /* 9 * uconv(1): an iconv(1)-like converter using ICU. 10 * 11 * Original code by Jonas Utterström <jonas.utterstrom (at) vittran.norrnod.se> 12 * contributed in 1999. 13 * 14 * Conversion to the C conversion API and many improvements by 15 * Yves Arrouye <yves (at) realnames.com>, current maintainer. 16 * 17 * Markus Scherer maintainer from 2003. 18 * See source code repository history for changes. 19 */ 20 21 #include <unicode/utypes.h> 22 #include <unicode/putil.h> 23 #include <unicode/ucnv.h> 24 #include <unicode/uenum.h> 25 #include <unicode/unistr.h> 26 #include <unicode/translit.h> 27 #include <unicode/uset.h> 28 #include <unicode/uclean.h> 29 #include <unicode/utf16.h> 30 31 #include <stdio.h> 32 #include <errno.h> 33 #include <string.h> 34 #include <stdlib.h> 35 36 #include "cmemory.h" 37 #include "cstring.h" 38 #include "ustrfmt.h" 39 40 #include "unicode/uwmsg.h" 41 42 U_NAMESPACE_USE 43 44 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__) 45 #include <io.h> 46 #include <fcntl.h> 47 #if U_PLATFORM_USES_ONLY_WIN32_API 48 #define USE_FILENO_BINARY_MODE 1 49 /* Windows likes to rename Unix-like functions */ 50 #ifndef fileno 51 #define fileno _fileno 52 #endif 53 #ifndef setmode 54 #define setmode _setmode 55 #endif 56 #ifndef O_BINARY 57 #define O_BINARY _O_BINARY 58 #endif 59 #endif 60 #endif 61 62 #ifdef UCONVMSG_LINK 63 /* below from the README */ 64 #include "unicode/utypes.h" 65 #include "unicode/udata.h" 66 U_CFUNC char uconvmsg_dat[]; 67 #endif 68 69 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 70 71 #define DEFAULT_BUFSZ 4096 72 #define UCONVMSG "uconvmsg" 73 74 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */ 75 76 /* 77 * Initialize the message bundle so that message strings can be fetched 78 * by u_wmsg(). 79 * 80 */ 81 82 static void initMsg(const char *pname) { 83 static int ps = 0; 84 85 if (!ps) { 86 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */ 87 UErrorCode err = U_ZERO_ERROR; 88 89 ps = 1; 90 91 /* Set up our static data - if any */ 92 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */ 93 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err); 94 if (U_FAILURE(err)) { 95 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n", 96 pname, u_errorName(err)); 97 err = U_ZERO_ERROR; /* It may still fail */ 98 } 99 #endif 100 101 /* Get messages. */ 102 gBundle = u_wmsg_setPath(UCONVMSG, &err); 103 if (U_FAILURE(err)) { 104 fprintf(stderr, 105 "%s: warning: couldn't open bundle %s: %s\n", 106 pname, UCONVMSG, u_errorName(err)); 107 #ifdef UCONVMSG_LINK 108 fprintf(stderr, 109 "%s: setAppData was called, internal data %s failed to load\n", 110 pname, UCONVMSG); 111 #endif 112 113 err = U_ZERO_ERROR; 114 /* that was try #1, try again with a path */ 115 uprv_strcpy(dataPath, u_getDataDirectory()); 116 uprv_strcat(dataPath, U_FILE_SEP_STRING); 117 uprv_strcat(dataPath, UCONVMSG); 118 119 gBundle = u_wmsg_setPath(dataPath, &err); 120 if (U_FAILURE(err)) { 121 fprintf(stderr, 122 "%s: warning: still couldn't open bundle %s: %s\n", 123 pname, dataPath, u_errorName(err)); 124 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname); 125 } 126 } 127 } 128 } 129 130 /* Mapping of callback names to the callbacks passed to the converter 131 API. */ 132 133 static struct callback_ent { 134 const char *name; 135 UConverterFromUCallback fromu; 136 const void *fromuctxt; 137 UConverterToUCallback tou; 138 const void *touctxt; 139 } transcode_callbacks[] = { 140 { "substitute", 141 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 142 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 }, 143 { "skip", 144 UCNV_FROM_U_CALLBACK_SKIP, 0, 145 UCNV_TO_U_CALLBACK_SKIP, 0 }, 146 { "stop", 147 UCNV_FROM_U_CALLBACK_STOP, 0, 148 UCNV_TO_U_CALLBACK_STOP, 0 }, 149 { "escape", 150 UCNV_FROM_U_CALLBACK_ESCAPE, 0, 151 UCNV_TO_U_CALLBACK_ESCAPE, 0}, 152 { "escape-icu", 153 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, 154 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU }, 155 { "escape-java", 156 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, 157 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA }, 158 { "escape-c", 159 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, 160 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C }, 161 { "escape-xml", 162 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 163 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 164 { "escape-xml-hex", 165 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 166 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 167 { "escape-xml-dec", 168 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 169 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, 170 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, 171 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE } 172 }; 173 174 /* Return a pointer to a callback record given its name. */ 175 176 static const struct callback_ent *findCallback(const char *name) { 177 int i, count = 178 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 179 180 /* We'll do a linear search, there aren't many of them and bsearch() 181 may not be that portable. */ 182 183 for (i = 0; i < count; ++i) { 184 if (!uprv_stricmp(name, transcode_callbacks[i].name)) { 185 return &transcode_callbacks[i]; 186 } 187 } 188 189 return 0; 190 } 191 192 /* Print converter information. If lookfor is set, only that converter will 193 be printed, otherwise all converters will be printed. If canon is non 194 zero, tags and aliases for each converter are printed too, in the format 195 expected for convrters.txt(5). */ 196 197 static int printConverters(const char *pname, const char *lookfor, 198 UBool canon) 199 { 200 UErrorCode err = U_ZERO_ERROR; 201 int32_t num; 202 uint16_t num_stds; 203 const char **stds; 204 205 /* If there is a specified name, just handle that now. */ 206 207 if (lookfor) { 208 if (!canon) { 209 printf("%s\n", lookfor); 210 return 0; 211 } else { 212 /* Because we are printing a canonical name, we need the 213 true converter name. We've done that already except for 214 the default name (because we want to print the exact 215 name one would get when calling ucnv_getDefaultName() 216 in non-canon mode). But since we do not know at this 217 point if we have the default name or something else, we 218 need to normalize again to the canonical converter 219 name. */ 220 221 const char *truename = ucnv_getAlias(lookfor, 0, &err); 222 if (U_SUCCESS(err)) { 223 lookfor = truename; 224 } else { 225 err = U_ZERO_ERROR; 226 } 227 } 228 } 229 230 /* Print converter names. We come here for one of two reasons: we 231 are printing all the names (lookfor was null), or we have a 232 single converter to print but in canon mode, hence we need to 233 get to it in order to print everything. */ 234 235 num = ucnv_countAvailable(); 236 if (num <= 0) { 237 initMsg(pname); 238 u_wmsg(stderr, "cantGetNames"); 239 return -1; 240 } 241 if (lookfor) { 242 num = 1; /* We know where we want to be. */ 243 } 244 245 num_stds = ucnv_countStandards(); 246 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds)); 247 if (!stds) { 248 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR)); 249 return -1; 250 } else { 251 uint16_t s; 252 253 if (canon) { 254 printf("{ "); 255 } 256 for (s = 0; s < num_stds; ++s) { 257 stds[s] = ucnv_getStandard(s, &err); 258 if (canon) { 259 printf("%s ", stds[s]); 260 } 261 if (U_FAILURE(err)) { 262 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err)); 263 goto error_cleanup; 264 } 265 } 266 if (canon) { 267 puts("}"); 268 } 269 } 270 271 for (int32_t i = 0; i < num; i++) { 272 const char *name; 273 uint16_t num_aliases; 274 275 /* Set the name either to what we are looking for, or 276 to the current converter name. */ 277 278 if (lookfor) { 279 name = lookfor; 280 } else { 281 name = ucnv_getAvailableName(i); 282 } 283 284 /* Get all the aliases associated to the name. */ 285 286 err = U_ZERO_ERROR; 287 num_aliases = ucnv_countAliases(name, &err); 288 if (U_FAILURE(err)) { 289 printf("%s", name); 290 291 UnicodeString str(name, ""); 292 putchar('\t'); 293 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 294 u_wmsg_errorName(err)); 295 goto error_cleanup; 296 } else { 297 uint16_t a, s, t; 298 299 /* Write all the aliases and their tags. */ 300 301 for (a = 0; a < num_aliases; ++a) { 302 const char *alias = ucnv_getAlias(name, a, &err); 303 304 if (U_FAILURE(err)) { 305 UnicodeString str(name, ""); 306 putchar('\t'); 307 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 308 u_wmsg_errorName(err)); 309 goto error_cleanup; 310 } 311 312 /* Print the current alias so that it looks right. */ 313 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") , 314 alias, 315 (canon ? "" : " ")); 316 317 /* Look (slowly, linear searching) for a tag. */ 318 319 if (canon) { 320 /* -1 to skip the last standard */ 321 for (s = t = 0; s < num_stds-1; ++s) { 322 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err); 323 if (U_SUCCESS(err)) { 324 /* List the standard tags */ 325 const char *standardName; 326 UBool isFirst = TRUE; 327 UErrorCode enumError = U_ZERO_ERROR; 328 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) { 329 /* See if this alias is supported by this standard. */ 330 if (!strcmp(standardName, alias)) { 331 if (!t) { 332 printf(" {"); 333 t = 1; 334 } 335 /* Print a * after the default standard name */ 336 printf(" %s%s", stds[s], (isFirst ? "*" : "")); 337 } 338 isFirst = FALSE; 339 } 340 } 341 } 342 if (t) { 343 printf(" }"); 344 } 345 } 346 /* Terminate this entry. */ 347 if (canon) { 348 puts(""); 349 } 350 351 /* Move on. */ 352 } 353 /* Terminate this entry. */ 354 if (!canon) { 355 puts(""); 356 } 357 } 358 } 359 360 /* Free temporary data. */ 361 362 uprv_free(stds); 363 364 /* Success. */ 365 366 return 0; 367 error_cleanup: 368 uprv_free(stds); 369 return -1; 370 } 371 372 /* Print all available transliterators. If canon is non zero, print 373 one transliterator per line. */ 374 375 static int printTransliterators(UBool canon) 376 { 377 #if UCONFIG_NO_TRANSLITERATION 378 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n"); 379 return 1; 380 #else 381 UErrorCode status = U_ZERO_ERROR; 382 UEnumeration *ids = utrans_openIDs(&status); 383 int32_t i, numtrans = uenum_count(ids, &status); 384 385 char sepchar = canon ? '\n' : ' '; 386 387 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) { 388 int32_t len; 389 const char *nextTrans = uenum_next(ids, &len, &status); 390 391 printf("%s", nextTrans); 392 if (i < numtrans - 1) { 393 putchar(sepchar); 394 } 395 } 396 397 uenum_close(ids); 398 399 /* Add a terminating newline if needed. */ 400 401 if (sepchar != '\n') { 402 putchar('\n'); 403 } 404 405 /* Success. */ 406 407 return 0; 408 #endif 409 } 410 411 enum { 412 uSP = 0x20, // space 413 uCR = 0xd, // carriage return 414 uLF = 0xa, // line feed 415 uNL = 0x85, // newline 416 uLS = 0x2028, // line separator 417 uPS = 0x2029, // paragraph separator 418 uSig = 0xfeff // signature/BOM character 419 }; 420 421 static inline int32_t 422 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) { 423 // find one of 424 // CR, LF, CRLF, NL, LS, PS 425 // for paragraph ends (see UAX #13/Unicode 4) 426 // and include it in the chunk 427 // all of these characters are on the BMP 428 // do not include FF or VT in case they are part of a paragraph 429 // (important for bidi contexts) 430 static const UChar paraEnds[] = { 431 0xd, 0xa, 0x85, 0x2028, 0x2029 432 }; 433 enum { 434 iCR, iLF, iNL, iLS, iPS, iCount 435 }; 436 437 // first, see if there is a CRLF split between prev and s 438 if (prev.endsWith(paraEnds + iCR, 1)) { 439 if (s.startsWith(paraEnds + iLF, 1)) { 440 return 1; // split CRLF, include the LF 441 } else if (!s.isEmpty()) { 442 return 0; // complete the last chunk 443 } else { 444 return -1; // wait for actual further contents to arrive 445 } 446 } 447 448 const UChar *u = s.getBuffer(), *limit = u + s.length(); 449 UChar c; 450 451 while (u < limit) { 452 c = *u++; 453 if ( 454 ((c < uSP) && (c == uCR || c == uLF)) || 455 (c == uNL) || 456 ((c & uLS) == uLS) 457 ) { 458 if (c == uCR) { 459 // check for CRLF 460 if (u == limit) { 461 return -1; // LF may be in the next chunk 462 } else if (*u == uLF) { 463 ++u; // include the LF in this chunk 464 } 465 } 466 return (int32_t)(u - s.getBuffer()); 467 } 468 } 469 470 return -1; // continue collecting the chunk 471 } 472 473 enum { 474 CNV_NO_FEFF, // cannot convert the U+FEFF Unicode signature character (BOM) 475 CNV_WITH_FEFF, // can convert the U+FEFF signature character 476 CNV_ADDS_FEFF // automatically adds/detects the U+FEFF signature character 477 }; 478 479 static inline UChar 480 nibbleToHex(uint8_t n) { 481 n &= 0xf; 482 return 483 n <= 9 ? 484 (UChar)(0x30 + n) : 485 (UChar)((0x61 - 10) + n); 486 } 487 488 // check the converter's Unicode signature properties; 489 // the fromUnicode side of the converter must be in its initial state 490 // and will be reset again if it was used 491 static int32_t 492 cnvSigType(UConverter *cnv) { 493 UErrorCode err; 494 int32_t result; 495 496 // test if the output charset can convert U+FEFF 497 USet *set = uset_open(1, 0); 498 err = U_ZERO_ERROR; 499 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err); 500 if (U_SUCCESS(err) && uset_contains(set, uSig)) { 501 result = CNV_WITH_FEFF; 502 } else { 503 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted 504 } 505 uset_close(set); 506 507 if (result == CNV_WITH_FEFF) { 508 // test if the output charset emits a signature anyway 509 const UChar a[1] = { 0x61 }; // "a" 510 const UChar *in; 511 512 char buffer[20]; 513 char *out; 514 515 in = a; 516 out = buffer; 517 err = U_ZERO_ERROR; 518 ucnv_fromUnicode(cnv, 519 &out, buffer + sizeof(buffer), 520 &in, a + 1, 521 NULL, TRUE, &err); 522 ucnv_resetFromUnicode(cnv); 523 524 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) && 525 U_SUCCESS(err) 526 ) { 527 result = CNV_ADDS_FEFF; 528 } 529 } 530 531 return result; 532 } 533 534 class ConvertFile { 535 public: 536 ConvertFile() : 537 buf(NULL), outbuf(NULL), fromoffsets(NULL), 538 bufsz(0), signature(0) {} 539 540 void 541 setBufferSize(size_t bufferSize) { 542 bufsz = bufferSize; 543 544 buf = new char[2 * bufsz]; 545 outbuf = buf + bufsz; 546 547 // +1 for an added U+FEFF in the intermediate Unicode buffer 548 fromoffsets = new int32_t[bufsz + 1]; 549 } 550 551 ~ConvertFile() { 552 delete [] buf; 553 delete [] fromoffsets; 554 } 555 556 UBool convertFile(const char *pname, 557 const char *fromcpage, 558 UConverterToUCallback toucallback, 559 const void *touctxt, 560 const char *tocpage, 561 UConverterFromUCallback fromucallback, 562 const void *fromuctxt, 563 UBool fallback, 564 const char *translit, 565 const char *infilestr, 566 FILE * outfile, int verbose); 567 private: 568 friend int main(int argc, char **argv); 569 570 char *buf, *outbuf; 571 int32_t *fromoffsets; 572 573 size_t bufsz; 574 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character 575 }; 576 577 // Convert a file from one encoding to another 578 UBool 579 ConvertFile::convertFile(const char *pname, 580 const char *fromcpage, 581 UConverterToUCallback toucallback, 582 const void *touctxt, 583 const char *tocpage, 584 UConverterFromUCallback fromucallback, 585 const void *fromuctxt, 586 UBool fallback, 587 const char *translit, 588 const char *infilestr, 589 FILE * outfile, int verbose) 590 { 591 FILE *infile; 592 UBool ret = TRUE; 593 UConverter *convfrom = 0; 594 UConverter *convto = 0; 595 UErrorCode err = U_ZERO_ERROR; 596 UBool flush; 597 UBool closeFile = FALSE; 598 const char *cbufp, *prevbufp; 599 char *bufp; 600 601 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ 602 603 const UChar *unibuf, *unibufbp; 604 UChar *unibufp; 605 606 size_t rd, wr; 607 608 #if !UCONFIG_NO_TRANSLITERATION 609 Transliterator *t = 0; // Transliterator acting on Unicode data. 610 UnicodeString chunk; // One chunk of the text being collected for transformation. 611 #endif 612 UnicodeString u; // String to do the transliteration. 613 int32_t ulen; 614 615 // use conversion offsets for error messages 616 // unless a transliterator is used - 617 // a text transformation will reorder characters in unpredictable ways 618 UBool useOffsets = TRUE; 619 620 // Open the correct input file or connect to stdin for reading input 621 622 if (infilestr != 0 && strcmp(infilestr, "-")) { 623 infile = fopen(infilestr, "rb"); 624 if (infile == 0) { 625 UnicodeString str1(infilestr, ""); 626 str1.append((UChar32) 0); 627 UnicodeString str2(strerror(errno), ""); 628 str2.append((UChar32) 0); 629 initMsg(pname); 630 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); 631 return FALSE; 632 } 633 closeFile = TRUE; 634 } else { 635 infilestr = "-"; 636 infile = stdin; 637 #ifdef USE_FILENO_BINARY_MODE 638 if (setmode(fileno(stdin), O_BINARY) == -1) { 639 initMsg(pname); 640 u_wmsg(stderr, "cantSetInBinMode"); 641 return FALSE; 642 } 643 #endif 644 } 645 646 if (verbose) { 647 fprintf(stderr, "%s:\n", infilestr); 648 } 649 650 #if !UCONFIG_NO_TRANSLITERATION 651 // Create transliterator as needed. 652 653 if (translit != NULL && *translit) { 654 UParseError parse; 655 UnicodeString str(translit), pestr; 656 657 /* Create from rules or by ID as needed. */ 658 659 parse.line = -1; 660 661 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { 662 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); 663 } else { 664 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); 665 } 666 667 if (U_FAILURE(err)) { 668 str.append((UChar32) 0); 669 initMsg(pname); 670 671 if (parse.line >= 0) { 672 UChar linebuf[20], offsetbuf[20]; 673 uprv_itou(linebuf, 20, parse.line, 10, 0); 674 uprv_itou(offsetbuf, 20, parse.offset, 10, 0); 675 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(), 676 u_wmsg_errorName(err), linebuf, offsetbuf); 677 } else { 678 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(), 679 u_wmsg_errorName(err)); 680 } 681 682 if (t) { 683 delete t; 684 t = 0; 685 } 686 goto error_exit; 687 } 688 689 useOffsets = FALSE; 690 } 691 #endif 692 693 // Create codepage converter. If the codepage or its aliases weren't 694 // available, it returns NULL and a failure code. We also set the 695 // callbacks, and return errors in the same way. 696 697 convfrom = ucnv_open(fromcpage, &err); 698 if (U_FAILURE(err)) { 699 UnicodeString str(fromcpage, ""); 700 initMsg(pname); 701 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(), 702 u_wmsg_errorName(err)); 703 goto error_exit; 704 } 705 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); 706 if (U_FAILURE(err)) { 707 initMsg(pname); 708 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 709 goto error_exit; 710 } 711 712 convto = ucnv_open(tocpage, &err); 713 if (U_FAILURE(err)) { 714 UnicodeString str(tocpage, ""); 715 initMsg(pname); 716 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(), 717 u_wmsg_errorName(err)); 718 goto error_exit; 719 } 720 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); 721 if (U_FAILURE(err)) { 722 initMsg(pname); 723 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 724 goto error_exit; 725 } 726 ucnv_setFallback(convto, fallback); 727 728 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode; 729 int8_t sig; 730 731 // OK, we can convert now. 732 sig = signature; 733 rd = 0; 734 735 do { 736 willexit = FALSE; 737 738 // input file offset at the beginning of the next buffer 739 infoffset += rd; 740 741 rd = fread(buf, 1, bufsz, infile); 742 if (ferror(infile) != 0) { 743 UnicodeString str(strerror(errno)); 744 initMsg(pname); 745 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer()); 746 goto error_exit; 747 } 748 749 // Convert the read buffer into the new encoding via Unicode. 750 // After the call 'unibufp' will be placed behind the last 751 // character that was converted in the 'unibuf'. 752 // Also the 'cbufp' is positioned behind the last converted 753 // character. 754 // At the last conversion in the file, flush should be set to 755 // true so that we get all characters converted. 756 // 757 // The converter must be flushed at the end of conversion so 758 // that characters on hold also will be written. 759 760 cbufp = buf; 761 flush = (UBool)(rd != bufsz); 762 763 // convert until the input is consumed 764 do { 765 // remember the start of the current byte-to-Unicode conversion 766 prevbufp = cbufp; 767 768 unibuf = unibufp = u.getBuffer((int32_t)bufsz); 769 770 // Use bufsz instead of u.getCapacity() for the targetLimit 771 // so that we don't overflow fromoffsets[]. 772 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp, 773 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err); 774 775 ulen = (int32_t)(unibufp - unibuf); 776 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0); 777 778 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done 779 // converting all of the input bytes. 780 // It works like this because ucnv_toUnicode() returns only under the 781 // following conditions: 782 // - an error occurred during conversion (an error code is set) 783 // - the target buffer is filled (the error code indicates an overflow) 784 // - the source is consumed 785 // That is, if the error code does not indicate a failure, 786 // not even an overflow, then the source must be consumed entirely. 787 fromSawEndOfBytes = (UBool)U_SUCCESS(err); 788 789 if (err == U_BUFFER_OVERFLOW_ERROR) { 790 err = U_ZERO_ERROR; 791 } else if (U_FAILURE(err)) { 792 char pos[32], errorBytes[32]; 793 int8_t i, length, errorLength; 794 795 UErrorCode localError = U_ZERO_ERROR; 796 errorLength = (int8_t)sizeof(errorBytes); 797 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError); 798 if (U_FAILURE(localError) || errorLength == 0) { 799 errorLength = 1; 800 } 801 802 // print the input file offset of the start of the error bytes: 803 // input file offset of the current byte buffer + 804 // length of the just consumed bytes - 805 // length of the error bytes 806 length = 807 (int8_t)sprintf(pos, "%d", 808 (int)(infoffset + (cbufp - buf) - errorLength)); 809 810 // output the bytes that caused the error 811 UnicodeString str; 812 for (i = 0; i < errorLength; ++i) { 813 if (i > 0) { 814 str.append((UChar)uSP); 815 } 816 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4)); 817 str.append(nibbleToHex((uint8_t)errorBytes[i])); 818 } 819 820 initMsg(pname); 821 u_wmsg(stderr, "problemCvtToU", 822 UnicodeString(pos, length, "").getTerminatedBuffer(), 823 str.getTerminatedBuffer(), 824 u_wmsg_errorName(err)); 825 826 willexit = TRUE; 827 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 828 } 829 830 // Replaced a check for whether the input was consumed by 831 // looping until it is; message key "premEndInput" now obsolete. 832 833 if (ulen == 0) { 834 continue; 835 } 836 837 // remove a U+FEFF Unicode signature character if requested 838 if (sig < 0) { 839 if (u.charAt(0) == uSig) { 840 u.remove(0, 1); 841 842 // account for the removed UChar and offset 843 --ulen; 844 845 if (useOffsets) { 846 // remove an offset from fromoffsets[] as well 847 // to keep the array parallel with the UChars 848 memmove(fromoffsets, fromoffsets + 1, ulen * 4); 849 } 850 851 } 852 sig = 0; 853 } 854 855 #if !UCONFIG_NO_TRANSLITERATION 856 // Transliterate/transform if needed. 857 858 // For transformation, we use chunking code - 859 // collect Unicode input until, for example, an end-of-line, 860 // then transform and output-convert that and continue collecting. 861 // This makes the transformation result independent of the buffer size 862 // while avoiding the slower keyboard mode. 863 // The end-of-chunk characters are completely included in the 864 // transformed string in case they are to be transformed themselves. 865 if (t != NULL) { 866 UnicodeString out; 867 int32_t chunkLimit; 868 869 do { 870 chunkLimit = getChunkLimit(chunk, u); 871 if (chunkLimit < 0 && flush && fromSawEndOfBytes) { 872 // use all of the rest at the end of the text 873 chunkLimit = u.length(); 874 } 875 if (chunkLimit >= 0) { 876 // complete the chunk and transform it 877 chunk.append(u, 0, chunkLimit); 878 u.remove(0, chunkLimit); 879 t->transliterate(chunk); 880 881 // append the transformation result to the result and empty the chunk 882 out.append(chunk); 883 chunk.remove(); 884 } else { 885 // continue collecting the chunk 886 chunk.append(u); 887 break; 888 } 889 } while (!u.isEmpty()); 890 891 u = out; 892 ulen = u.length(); 893 } 894 #endif 895 896 // add a U+FEFF Unicode signature character if requested 897 // and possible/necessary 898 if (sig > 0) { 899 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) { 900 u.insert(0, (UChar)uSig); 901 902 if (useOffsets) { 903 // insert a pseudo-offset into fromoffsets[] as well 904 // to keep the array parallel with the UChars 905 memmove(fromoffsets + 1, fromoffsets, ulen * 4); 906 fromoffsets[0] = -1; 907 } 908 909 // account for the additional UChar and offset 910 ++ulen; 911 } 912 sig = 0; 913 } 914 915 // Convert the Unicode buffer into the destination codepage 916 // Again 'bufp' will be placed behind the last converted character 917 // And 'unibufp' will be placed behind the last converted unicode character 918 // At the last conversion flush should be set to true to ensure that 919 // all characters left get converted 920 921 unibuf = unibufbp = u.getBuffer(); 922 923 do { 924 bufp = outbuf; 925 926 // Use fromSawEndOfBytes in addition to the flush flag - 927 // it indicates whether the intermediate Unicode string 928 // contains the very last UChars for the very last input bytes. 929 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz, 930 &unibufbp, 931 unibuf + ulen, 932 NULL, (UBool)(flush && fromSawEndOfBytes), &err); 933 934 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done 935 // converting all of the intermediate UChars. 936 // See comment for fromSawEndOfBytes. 937 toSawEndOfUnicode = (UBool)U_SUCCESS(err); 938 939 if (err == U_BUFFER_OVERFLOW_ERROR) { 940 err = U_ZERO_ERROR; 941 } else if (U_FAILURE(err)) { 942 UChar errorUChars[4]; 943 const char *errtag; 944 char pos[32]; 945 UChar32 c; 946 int8_t i, length, errorLength; 947 948 UErrorCode localError = U_ZERO_ERROR; 949 errorLength = (int8_t)LENGTHOF(errorUChars); 950 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError); 951 if (U_FAILURE(localError) || errorLength == 0) { 952 // need at least 1 so that we don't access beyond the length of fromoffsets[] 953 errorLength = 1; 954 } 955 956 int32_t ferroffset; 957 958 if (useOffsets) { 959 // Unicode buffer offset of the start of the error UChars 960 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength); 961 if (ferroffset < 0) { 962 // approximation - the character started in the previous Unicode buffer 963 ferroffset = 0; 964 } 965 966 // get the corresponding byte offset out of fromoffsets[] 967 // go back if the offset is not known for some of the UChars 968 int32_t fromoffset; 969 do { 970 fromoffset = fromoffsets[ferroffset]; 971 } while (fromoffset < 0 && --ferroffset >= 0); 972 973 // total input file offset = 974 // input file offset of the current byte buffer + 975 // byte buffer offset of where the current Unicode buffer is converted from + 976 // fromoffsets[Unicode offset] 977 ferroffset = infoffset + (prevbufp - buf) + fromoffset; 978 errtag = "problemCvtFromU"; 979 } else { 980 // Do not use fromoffsets if (t != NULL) because the Unicode text may 981 // be different from what the offsets refer to. 982 983 // output file offset 984 ferroffset = (int32_t)(outfoffset + (bufp - outbuf)); 985 errtag = "problemCvtFromUOut"; 986 } 987 988 length = (int8_t)sprintf(pos, "%u", (int)ferroffset); 989 990 // output the code points that caused the error 991 UnicodeString str; 992 for (i = 0; i < errorLength;) { 993 if (i > 0) { 994 str.append((UChar)uSP); 995 } 996 U16_NEXT(errorUChars, i, errorLength, c); 997 if (c >= 0x100000) { 998 str.append(nibbleToHex((uint8_t)(c >> 20))); 999 } 1000 if (c >= 0x10000) { 1001 str.append(nibbleToHex((uint8_t)(c >> 16))); 1002 } 1003 str.append(nibbleToHex((uint8_t)(c >> 12))); 1004 str.append(nibbleToHex((uint8_t)(c >> 8))); 1005 str.append(nibbleToHex((uint8_t)(c >> 4))); 1006 str.append(nibbleToHex((uint8_t)c)); 1007 } 1008 1009 initMsg(pname); 1010 u_wmsg(stderr, errtag, 1011 UnicodeString(pos, length, "").getTerminatedBuffer(), 1012 str.getTerminatedBuffer(), 1013 u_wmsg_errorName(err)); 1014 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer()); 1015 1016 willexit = TRUE; 1017 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 1018 } 1019 1020 // Replaced a check for whether the intermediate Unicode characters were all consumed by 1021 // looping until they are; message key "premEnd" now obsolete. 1022 1023 // Finally, write the converted buffer to the output file 1024 size_t outlen = (size_t) (bufp - outbuf); 1025 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile)); 1026 if (wr != outlen) { 1027 UnicodeString str(strerror(errno)); 1028 initMsg(pname); 1029 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer()); 1030 willexit = TRUE; 1031 } 1032 1033 if (willexit) { 1034 goto error_exit; 1035 } 1036 } while (!toSawEndOfUnicode); 1037 } while (!fromSawEndOfBytes); 1038 } while (!flush); // Stop when we have flushed the 1039 // converters (this means that it's 1040 // the end of output) 1041 1042 goto normal_exit; 1043 1044 error_exit: 1045 ret = FALSE; 1046 1047 normal_exit: 1048 // Cleanup. 1049 1050 ucnv_close(convfrom); 1051 ucnv_close(convto); 1052 1053 #if !UCONFIG_NO_TRANSLITERATION 1054 delete t; 1055 #endif 1056 1057 if (closeFile) { 1058 fclose(infile); 1059 } 1060 1061 return ret; 1062 } 1063 1064 static void usage(const char *pname, int ecode) { 1065 const UChar *msg; 1066 int32_t msgLen; 1067 UErrorCode err = U_ZERO_ERROR; 1068 FILE *fp = ecode ? stderr : stdout; 1069 int res; 1070 1071 initMsg(pname); 1072 msg = 1073 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", 1074 &msgLen, &err); 1075 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1)); 1076 UnicodeString mname(msg, msgLen + 1); 1077 1078 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer()); 1079 if (!ecode) { 1080 if (!res) { 1081 fputc('\n', fp); 1082 } 1083 if (!u_wmsg(fp, "help")) { 1084 /* Now dump callbacks and finish. */ 1085 1086 int i, count = 1087 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 1088 for (i = 0; i < count; ++i) { 1089 fprintf(fp, " %s", transcode_callbacks[i].name); 1090 } 1091 fputc('\n', fp); 1092 } 1093 } 1094 1095 exit(ecode); 1096 } 1097 1098 extern int 1099 main(int argc, char **argv) 1100 { 1101 FILE *outfile; 1102 int ret = 0; 1103 1104 size_t bufsz = DEFAULT_BUFSZ; 1105 1106 const char *fromcpage = 0; 1107 const char *tocpage = 0; 1108 const char *translit = 0; 1109 const char *outfilestr = 0; 1110 UBool fallback = FALSE; 1111 1112 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP; 1113 const void *fromuctxt = 0; 1114 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP; 1115 const void *touctxt = 0; 1116 1117 char **iter, **remainArgv, **remainArgvLimit; 1118 char **end = argv + argc; 1119 1120 const char *pname; 1121 1122 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE; 1123 const char *printName = 0; 1124 1125 UBool verbose = FALSE; 1126 UErrorCode status = U_ZERO_ERROR; 1127 1128 ConvertFile cf; 1129 1130 /* Initialize ICU */ 1131 u_init(&status); 1132 if (U_FAILURE(status)) { 1133 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 1134 argv[0], u_errorName(status)); 1135 exit(1); 1136 } 1137 1138 // Get and prettify pname. 1139 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); 1140 #if U_PLATFORM_USES_ONLY_WIN32_API 1141 if (!pname) { 1142 pname = uprv_strrchr(*argv, '/'); 1143 } 1144 #endif 1145 if (!pname) { 1146 pname = *argv; 1147 } else { 1148 ++pname; 1149 } 1150 1151 // First, get the arguments from command-line 1152 // to know the codepages to convert between 1153 1154 remainArgv = remainArgvLimit = argv + 1; 1155 for (iter = argv + 1; iter != end; iter++) { 1156 // Check for from charset 1157 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) { 1158 iter++; 1159 if (iter != end) 1160 fromcpage = *iter; 1161 else 1162 usage(pname, 1); 1163 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) { 1164 iter++; 1165 if (iter != end) 1166 tocpage = *iter; 1167 else 1168 usage(pname, 1); 1169 } else if (strcmp("-x", *iter) == 0) { 1170 iter++; 1171 if (iter != end) 1172 translit = *iter; 1173 else 1174 usage(pname, 1); 1175 } else if (!strcmp("--fallback", *iter)) { 1176 fallback = TRUE; 1177 } else if (!strcmp("--no-fallback", *iter)) { 1178 fallback = FALSE; 1179 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) { 1180 iter++; 1181 if (iter != end) { 1182 bufsz = atoi(*iter); 1183 if ((int) bufsz <= 0) { 1184 initMsg(pname); 1185 UnicodeString str(*iter); 1186 initMsg(pname); 1187 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer()); 1188 return 3; 1189 } 1190 } else { 1191 usage(pname, 1); 1192 } 1193 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) { 1194 if (printTranslits) { 1195 usage(pname, 1); 1196 } 1197 printConvs = TRUE; 1198 } else if (strcmp("--default-code", *iter) == 0) { 1199 if (printTranslits) { 1200 usage(pname, 1); 1201 } 1202 printName = ucnv_getDefaultName(); 1203 } else if (strcmp("--list-code", *iter) == 0) { 1204 if (printTranslits) { 1205 usage(pname, 1); 1206 } 1207 1208 iter++; 1209 if (iter != end) { 1210 UErrorCode e = U_ZERO_ERROR; 1211 printName = ucnv_getAlias(*iter, 0, &e); 1212 if (U_FAILURE(e) || !printName) { 1213 UnicodeString str(*iter); 1214 initMsg(pname); 1215 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer()); 1216 return 2; 1217 } 1218 } else 1219 usage(pname, 1); 1220 } else if (strcmp("--canon", *iter) == 0) { 1221 printCanon = TRUE; 1222 } else if (strcmp("-L", *iter) == 0 1223 || !strcmp("--list-transliterators", *iter)) { 1224 if (printConvs) { 1225 usage(pname, 1); 1226 } 1227 printTranslits = TRUE; 1228 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter) 1229 || !strcmp("--help", *iter)) { 1230 usage(pname, 0); 1231 } else if (!strcmp("-c", *iter)) { 1232 fromucallback = UCNV_FROM_U_CALLBACK_SKIP; 1233 } else if (!strcmp("--to-callback", *iter)) { 1234 iter++; 1235 if (iter != end) { 1236 const struct callback_ent *cbe = findCallback(*iter); 1237 if (cbe) { 1238 fromucallback = cbe->fromu; 1239 fromuctxt = cbe->fromuctxt; 1240 } else { 1241 UnicodeString str(*iter); 1242 initMsg(pname); 1243 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1244 return 4; 1245 } 1246 } else { 1247 usage(pname, 1); 1248 } 1249 } else if (!strcmp("--from-callback", *iter)) { 1250 iter++; 1251 if (iter != end) { 1252 const struct callback_ent *cbe = findCallback(*iter); 1253 if (cbe) { 1254 toucallback = cbe->tou; 1255 touctxt = cbe->touctxt; 1256 } else { 1257 UnicodeString str(*iter); 1258 initMsg(pname); 1259 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1260 return 4; 1261 } 1262 } else { 1263 usage(pname, 1); 1264 } 1265 } else if (!strcmp("-i", *iter)) { 1266 toucallback = UCNV_TO_U_CALLBACK_SKIP; 1267 } else if (!strcmp("--callback", *iter)) { 1268 iter++; 1269 if (iter != end) { 1270 const struct callback_ent *cbe = findCallback(*iter); 1271 if (cbe) { 1272 fromucallback = cbe->fromu; 1273 fromuctxt = cbe->fromuctxt; 1274 toucallback = cbe->tou; 1275 touctxt = cbe->touctxt; 1276 } else { 1277 UnicodeString str(*iter); 1278 initMsg(pname); 1279 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1280 return 4; 1281 } 1282 } else { 1283 usage(pname, 1); 1284 } 1285 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) { 1286 verbose = FALSE; 1287 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) { 1288 verbose = TRUE; 1289 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) { 1290 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname); 1291 return 0; 1292 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) { 1293 ++iter; 1294 if (iter != end && !outfilestr) { 1295 outfilestr = *iter; 1296 } else { 1297 usage(pname, 1); 1298 } 1299 } else if (0 == strcmp("--add-signature", *iter)) { 1300 cf.signature = 1; 1301 } else if (0 == strcmp("--remove-signature", *iter)) { 1302 cf.signature = -1; 1303 } else if (**iter == '-' && (*iter)[1]) { 1304 usage(pname, 1); 1305 } else { 1306 // move a non-option up in argv[] 1307 *remainArgvLimit++ = *iter; 1308 } 1309 } 1310 1311 if (printConvs || printName) { 1312 return printConverters(pname, printName, printCanon) ? 2 : 0; 1313 } else if (printTranslits) { 1314 return printTransliterators(printCanon) ? 3 : 0; 1315 } 1316 1317 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) { 1318 fromcpage = ucnv_getDefaultName(); 1319 } 1320 if (!tocpage || !uprv_strcmp(tocpage, "-")) { 1321 tocpage = ucnv_getDefaultName(); 1322 } 1323 1324 // Open the correct output file or connect to stdout for reading input 1325 if (outfilestr != 0 && strcmp(outfilestr, "-")) { 1326 outfile = fopen(outfilestr, "wb"); 1327 if (outfile == 0) { 1328 UnicodeString str1(outfilestr, ""); 1329 UnicodeString str2(strerror(errno), ""); 1330 initMsg(pname); 1331 u_wmsg(stderr, "cantCreateOutputF", 1332 str1.getBuffer(), str2.getBuffer()); 1333 return 1; 1334 } 1335 } else { 1336 outfilestr = "-"; 1337 outfile = stdout; 1338 #ifdef USE_FILENO_BINARY_MODE 1339 if (setmode(fileno(outfile), O_BINARY) == -1) { 1340 u_wmsg(stderr, "cantSetOutBinMode"); 1341 exit(-1); 1342 } 1343 #endif 1344 } 1345 1346 /* Loop again on the arguments to find all the input files, and 1347 convert them. */ 1348 1349 cf.setBufferSize(bufsz); 1350 1351 if(remainArgv < remainArgvLimit) { 1352 for (iter = remainArgv; iter != remainArgvLimit; iter++) { 1353 if (!cf.convertFile( 1354 pname, fromcpage, toucallback, touctxt, tocpage, 1355 fromucallback, fromuctxt, fallback, translit, *iter, 1356 outfile, verbose) 1357 ) { 1358 goto error_exit; 1359 } 1360 } 1361 } else { 1362 if (!cf.convertFile( 1363 pname, fromcpage, toucallback, touctxt, tocpage, 1364 fromucallback, fromuctxt, fallback, translit, 0, 1365 outfile, verbose) 1366 ) { 1367 goto error_exit; 1368 } 1369 } 1370 1371 goto normal_exit; 1372 error_exit: 1373 #if !UCONFIG_NO_LEGACY_CONVERSION 1374 ret = 1; 1375 #else 1376 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n"); 1377 #endif 1378 normal_exit: 1379 1380 if (outfile != stdout) { 1381 fclose(outfile); 1382 } 1383 1384 u_cleanup(); 1385 1386 return ret; 1387 } 1388 1389 1390 /* 1391 * Hey, Emacs, please set the following: 1392 * 1393 * Local Variables: 1394 * indent-tabs-mode: nil 1395 * End: 1396 * 1397 */ 1398