1 /***************************************************************************** 2 * 3 * Copyright (C) 1999-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 * 6 ******************************************************************************/ 7 8 /* 9 * uconv(1): an iconv(1)-like converter using ICU. 10 * 11 * Original code by Jonas Utterström <jonas.utterstrom (at) vittran.norrnod.se> 12 * contributed in 1999. 13 * 14 * Conversion to the C conversion API and many improvements by 15 * Yves Arrouye <yves (at) realnames.com>, current maintainer. 16 * 17 * Markus Scherer maintainer from 2003. 18 * See source code repository history for changes. 19 */ 20 21 #include <unicode/utypes.h> 22 #include <unicode/putil.h> 23 #include <unicode/ucnv.h> 24 #include <unicode/uenum.h> 25 #include <unicode/unistr.h> 26 #include <unicode/translit.h> 27 #include <unicode/uset.h> 28 #include <unicode/uclean.h> 29 30 #include <stdio.h> 31 #include <errno.h> 32 #include <string.h> 33 #include <stdlib.h> 34 35 #include "cmemory.h" 36 #include "cstring.h" 37 #include "ustrfmt.h" 38 39 #include "unicode/uwmsg.h" 40 41 U_NAMESPACE_USE 42 43 #if (defined(U_WINDOWS) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__) 44 #include <io.h> 45 #include <fcntl.h> 46 #if defined(U_WINDOWS) 47 #define USE_FILENO_BINARY_MODE 1 48 /* Windows likes to rename Unix-like functions */ 49 #ifndef fileno 50 #define fileno _fileno 51 #endif 52 #ifndef setmode 53 #define setmode _setmode 54 #endif 55 #ifndef O_BINARY 56 #define O_BINARY _O_BINARY 57 #endif 58 #endif 59 #endif 60 61 #ifdef UCONVMSG_LINK 62 /* below from the README */ 63 #include "unicode/utypes.h" 64 #include "unicode/udata.h" 65 U_CFUNC char uconvmsg_dat[]; 66 #endif 67 68 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 69 70 #define DEFAULT_BUFSZ 4096 71 #define UCONVMSG "uconvmsg" 72 73 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */ 74 75 /* 76 * Initialize the message bundle so that message strings can be fetched 77 * by u_wmsg(). 78 * 79 */ 80 81 static void initMsg(const char *pname) { 82 static int ps = 0; 83 84 if (!ps) { 85 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */ 86 UErrorCode err = U_ZERO_ERROR; 87 88 ps = 1; 89 90 /* Set up our static data - if any */ 91 #ifdef UCONVMSG_LINK 92 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err); 93 if (U_FAILURE(err)) { 94 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n", 95 pname, u_errorName(err)); 96 err = U_ZERO_ERROR; /* It may still fail */ 97 } 98 #endif 99 100 /* Get messages. */ 101 gBundle = u_wmsg_setPath(UCONVMSG, &err); 102 if (U_FAILURE(err)) { 103 fprintf(stderr, 104 "%s: warning: couldn't open bundle %s: %s\n", 105 pname, UCONVMSG, u_errorName(err)); 106 #ifdef UCONVMSG_LINK 107 fprintf(stderr, 108 "%s: setAppData was called, internal data %s failed to load\n", 109 pname, UCONVMSG); 110 #endif 111 112 err = U_ZERO_ERROR; 113 /* that was try #1, try again with a path */ 114 uprv_strcpy(dataPath, u_getDataDirectory()); 115 uprv_strcat(dataPath, U_FILE_SEP_STRING); 116 uprv_strcat(dataPath, UCONVMSG); 117 118 gBundle = u_wmsg_setPath(dataPath, &err); 119 if (U_FAILURE(err)) { 120 fprintf(stderr, 121 "%s: warning: still couldn't open bundle %s: %s\n", 122 pname, dataPath, u_errorName(err)); 123 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname); 124 } 125 } 126 } 127 } 128 129 /* Mapping of callback names to the callbacks passed to the converter 130 API. */ 131 132 static struct callback_ent { 133 const char *name; 134 UConverterFromUCallback fromu; 135 const void *fromuctxt; 136 UConverterToUCallback tou; 137 const void *touctxt; 138 } transcode_callbacks[] = { 139 { "substitute", 140 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 141 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 }, 142 { "skip", 143 UCNV_FROM_U_CALLBACK_SKIP, 0, 144 UCNV_TO_U_CALLBACK_SKIP, 0 }, 145 { "stop", 146 UCNV_FROM_U_CALLBACK_STOP, 0, 147 UCNV_TO_U_CALLBACK_STOP, 0 }, 148 { "escape", 149 UCNV_FROM_U_CALLBACK_ESCAPE, 0, 150 UCNV_TO_U_CALLBACK_ESCAPE, 0}, 151 { "escape-icu", 152 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, 153 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU }, 154 { "escape-java", 155 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, 156 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA }, 157 { "escape-c", 158 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, 159 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C }, 160 { "escape-xml", 161 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 162 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 163 { "escape-xml-hex", 164 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 165 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 166 { "escape-xml-dec", 167 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 168 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, 169 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, 170 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE } 171 }; 172 173 /* Return a pointer to a callback record given its name. */ 174 175 static const struct callback_ent *findCallback(const char *name) { 176 int i, count = 177 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 178 179 /* We'll do a linear search, there aren't many of them and bsearch() 180 may not be that portable. */ 181 182 for (i = 0; i < count; ++i) { 183 if (!uprv_stricmp(name, transcode_callbacks[i].name)) { 184 return &transcode_callbacks[i]; 185 } 186 } 187 188 return 0; 189 } 190 191 /* Print converter information. If lookfor is set, only that converter will 192 be printed, otherwise all converters will be printed. If canon is non 193 zero, tags and aliases for each converter are printed too, in the format 194 expected for convrters.txt(5). */ 195 196 static int printConverters(const char *pname, const char *lookfor, 197 UBool canon) 198 { 199 UErrorCode err = U_ZERO_ERROR; 200 int32_t num; 201 uint16_t num_stds; 202 const char **stds; 203 204 /* If there is a specified name, just handle that now. */ 205 206 if (lookfor) { 207 if (!canon) { 208 printf("%s\n", lookfor); 209 return 0; 210 } else { 211 /* Because we are printing a canonical name, we need the 212 true converter name. We've done that already except for 213 the default name (because we want to print the exact 214 name one would get when calling ucnv_getDefaultName() 215 in non-canon mode). But since we do not know at this 216 point if we have the default name or something else, we 217 need to normalize again to the canonical converter 218 name. */ 219 220 const char *truename = ucnv_getAlias(lookfor, 0, &err); 221 if (U_SUCCESS(err)) { 222 lookfor = truename; 223 } else { 224 err = U_ZERO_ERROR; 225 } 226 } 227 } 228 229 /* Print converter names. We come here for one of two reasons: we 230 are printing all the names (lookfor was null), or we have a 231 single converter to print but in canon mode, hence we need to 232 get to it in order to print everything. */ 233 234 num = ucnv_countAvailable(); 235 if (num <= 0) { 236 initMsg(pname); 237 u_wmsg(stderr, "cantGetNames"); 238 return -1; 239 } 240 if (lookfor) { 241 num = 1; /* We know where we want to be. */ 242 } 243 244 num_stds = ucnv_countStandards(); 245 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds)); 246 if (!stds) { 247 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR)); 248 return -1; 249 } else { 250 uint16_t s; 251 252 if (canon) { 253 printf("{ "); 254 } 255 for (s = 0; s < num_stds; ++s) { 256 stds[s] = ucnv_getStandard(s, &err); 257 if (canon) { 258 printf("%s ", stds[s]); 259 } 260 if (U_FAILURE(err)) { 261 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err)); 262 goto error_cleanup; 263 } 264 } 265 if (canon) { 266 puts("}"); 267 } 268 } 269 270 for (int32_t i = 0; i < num; i++) { 271 const char *name; 272 uint16_t num_aliases; 273 274 /* Set the name either to what we are looking for, or 275 to the current converter name. */ 276 277 if (lookfor) { 278 name = lookfor; 279 } else { 280 name = ucnv_getAvailableName(i); 281 } 282 283 /* Get all the aliases associated to the name. */ 284 285 err = U_ZERO_ERROR; 286 num_aliases = ucnv_countAliases(name, &err); 287 if (U_FAILURE(err)) { 288 printf("%s", name); 289 290 UnicodeString str(name, ""); 291 putchar('\t'); 292 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 293 u_wmsg_errorName(err)); 294 goto error_cleanup; 295 } else { 296 uint16_t a, s, t; 297 298 /* Write all the aliases and their tags. */ 299 300 for (a = 0; a < num_aliases; ++a) { 301 const char *alias = ucnv_getAlias(name, a, &err); 302 303 if (U_FAILURE(err)) { 304 UnicodeString str(name, ""); 305 putchar('\t'); 306 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 307 u_wmsg_errorName(err)); 308 goto error_cleanup; 309 } 310 311 /* Print the current alias so that it looks right. */ 312 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") , 313 alias, 314 (canon ? "" : " ")); 315 316 /* Look (slowly, linear searching) for a tag. */ 317 318 if (canon) { 319 /* -1 to skip the last standard */ 320 for (s = t = 0; s < num_stds-1; ++s) { 321 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err); 322 if (U_SUCCESS(err)) { 323 /* List the standard tags */ 324 const char *standardName; 325 UBool isFirst = TRUE; 326 UErrorCode enumError = U_ZERO_ERROR; 327 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) { 328 /* See if this alias is supported by this standard. */ 329 if (!strcmp(standardName, alias)) { 330 if (!t) { 331 printf(" {"); 332 t = 1; 333 } 334 /* Print a * after the default standard name */ 335 printf(" %s%s", stds[s], (isFirst ? "*" : "")); 336 } 337 isFirst = FALSE; 338 } 339 } 340 } 341 if (t) { 342 printf(" }"); 343 } 344 } 345 /* Terminate this entry. */ 346 if (canon) { 347 puts(""); 348 } 349 350 /* Move on. */ 351 } 352 /* Terminate this entry. */ 353 if (!canon) { 354 puts(""); 355 } 356 } 357 } 358 359 /* Free temporary data. */ 360 361 uprv_free(stds); 362 363 /* Success. */ 364 365 return 0; 366 error_cleanup: 367 uprv_free(stds); 368 return -1; 369 } 370 371 /* Print all available transliterators. If canon is non zero, print 372 one transliterator per line. */ 373 374 static int printTransliterators(UBool canon) 375 { 376 #if UCONFIG_NO_TRANSLITERATION 377 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n"); 378 return 1; 379 #else 380 UErrorCode status = U_ZERO_ERROR; 381 UEnumeration *ids = utrans_openIDs(&status); 382 int32_t i, numtrans = uenum_count(ids, &status); 383 384 char sepchar = canon ? '\n' : ' '; 385 386 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) { 387 int32_t len; 388 const char *nextTrans = uenum_next(ids, &len, &status); 389 390 printf("%s", nextTrans); 391 if (i < numtrans - 1) { 392 putchar(sepchar); 393 } 394 } 395 396 uenum_close(ids); 397 398 /* Add a terminating newline if needed. */ 399 400 if (sepchar != '\n') { 401 putchar('\n'); 402 } 403 404 /* Success. */ 405 406 return 0; 407 #endif 408 } 409 410 enum { 411 uSP = 0x20, // space 412 uCR = 0xd, // carriage return 413 uLF = 0xa, // line feed 414 uNL = 0x85, // newline 415 uLS = 0x2028, // line separator 416 uPS = 0x2029, // paragraph separator 417 uSig = 0xfeff // signature/BOM character 418 }; 419 420 static inline int32_t 421 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) { 422 // find one of 423 // CR, LF, CRLF, NL, LS, PS 424 // for paragraph ends (see UAX #13/Unicode 4) 425 // and include it in the chunk 426 // all of these characters are on the BMP 427 // do not include FF or VT in case they are part of a paragraph 428 // (important for bidi contexts) 429 static const UChar paraEnds[] = { 430 0xd, 0xa, 0x85, 0x2028, 0x2029 431 }; 432 enum { 433 iCR, iLF, iNL, iLS, iPS, iCount 434 }; 435 436 // first, see if there is a CRLF split between prev and s 437 if (prev.endsWith(paraEnds + iCR, 1)) { 438 if (s.startsWith(paraEnds + iLF, 1)) { 439 return 1; // split CRLF, include the LF 440 } else if (!s.isEmpty()) { 441 return 0; // complete the last chunk 442 } else { 443 return -1; // wait for actual further contents to arrive 444 } 445 } 446 447 const UChar *u = s.getBuffer(), *limit = u + s.length(); 448 UChar c; 449 450 while (u < limit) { 451 c = *u++; 452 if ( 453 ((c < uSP) && (c == uCR || c == uLF)) || 454 (c == uNL) || 455 ((c & uLS) == uLS) 456 ) { 457 if (c == uCR) { 458 // check for CRLF 459 if (u == limit) { 460 return -1; // LF may be in the next chunk 461 } else if (*u == uLF) { 462 ++u; // include the LF in this chunk 463 } 464 } 465 return (int32_t)(u - s.getBuffer()); 466 } 467 } 468 469 return -1; // continue collecting the chunk 470 } 471 472 enum { 473 CNV_NO_FEFF, // cannot convert the U+FEFF Unicode signature character (BOM) 474 CNV_WITH_FEFF, // can convert the U+FEFF signature character 475 CNV_ADDS_FEFF // automatically adds/detects the U+FEFF signature character 476 }; 477 478 static inline UChar 479 nibbleToHex(uint8_t n) { 480 n &= 0xf; 481 return 482 n <= 9 ? 483 (UChar)(0x30 + n) : 484 (UChar)((0x61 - 10) + n); 485 } 486 487 // check the converter's Unicode signature properties; 488 // the fromUnicode side of the converter must be in its initial state 489 // and will be reset again if it was used 490 static int32_t 491 cnvSigType(UConverter *cnv) { 492 UErrorCode err; 493 int32_t result; 494 495 // test if the output charset can convert U+FEFF 496 USet *set = uset_open(1, 0); 497 err = U_ZERO_ERROR; 498 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err); 499 if (U_SUCCESS(err) && uset_contains(set, uSig)) { 500 result = CNV_WITH_FEFF; 501 } else { 502 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted 503 } 504 uset_close(set); 505 506 if (result == CNV_WITH_FEFF) { 507 // test if the output charset emits a signature anyway 508 const UChar a[1] = { 0x61 }; // "a" 509 const UChar *in; 510 511 char buffer[20]; 512 char *out; 513 514 in = a; 515 out = buffer; 516 err = U_ZERO_ERROR; 517 ucnv_fromUnicode(cnv, 518 &out, buffer + sizeof(buffer), 519 &in, a + 1, 520 NULL, TRUE, &err); 521 ucnv_resetFromUnicode(cnv); 522 523 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) && 524 U_SUCCESS(err) 525 ) { 526 result = CNV_ADDS_FEFF; 527 } 528 } 529 530 return result; 531 } 532 533 class ConvertFile { 534 public: 535 ConvertFile() : 536 buf(NULL), outbuf(NULL), fromoffsets(NULL), 537 bufsz(0), signature(0) {} 538 539 void 540 setBufferSize(size_t bufferSize) { 541 bufsz = bufferSize; 542 543 buf = new char[2 * bufsz]; 544 outbuf = buf + bufsz; 545 546 // +1 for an added U+FEFF in the intermediate Unicode buffer 547 fromoffsets = new int32_t[bufsz + 1]; 548 } 549 550 ~ConvertFile() { 551 delete [] buf; 552 delete [] fromoffsets; 553 } 554 555 UBool convertFile(const char *pname, 556 const char *fromcpage, 557 UConverterToUCallback toucallback, 558 const void *touctxt, 559 const char *tocpage, 560 UConverterFromUCallback fromucallback, 561 const void *fromuctxt, 562 UBool fallback, 563 const char *translit, 564 const char *infilestr, 565 FILE * outfile, int verbose); 566 private: 567 friend int main(int argc, char **argv); 568 569 char *buf, *outbuf; 570 int32_t *fromoffsets; 571 572 size_t bufsz; 573 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character 574 }; 575 576 // Convert a file from one encoding to another 577 UBool 578 ConvertFile::convertFile(const char *pname, 579 const char *fromcpage, 580 UConverterToUCallback toucallback, 581 const void *touctxt, 582 const char *tocpage, 583 UConverterFromUCallback fromucallback, 584 const void *fromuctxt, 585 UBool fallback, 586 const char *translit, 587 const char *infilestr, 588 FILE * outfile, int verbose) 589 { 590 FILE *infile; 591 UBool ret = TRUE; 592 UConverter *convfrom = 0; 593 UConverter *convto = 0; 594 UErrorCode err = U_ZERO_ERROR; 595 UBool flush; 596 const char *cbufp, *prevbufp; 597 char *bufp; 598 599 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ 600 601 const UChar *unibuf, *unibufbp; 602 UChar *unibufp; 603 604 size_t rd, wr; 605 606 #if !UCONFIG_NO_TRANSLITERATION 607 Transliterator *t = 0; // Transliterator acting on Unicode data. 608 UnicodeString chunk; // One chunk of the text being collected for transformation. 609 #endif 610 UnicodeString u; // String to do the transliteration. 611 int32_t ulen; 612 613 // use conversion offsets for error messages 614 // unless a transliterator is used - 615 // a text transformation will reorder characters in unpredictable ways 616 UBool useOffsets = TRUE; 617 618 // Open the correct input file or connect to stdin for reading input 619 620 if (infilestr != 0 && strcmp(infilestr, "-")) { 621 infile = fopen(infilestr, "rb"); 622 if (infile == 0) { 623 UnicodeString str1(infilestr, ""); 624 str1.append((UChar32) 0); 625 UnicodeString str2(strerror(errno), ""); 626 str2.append((UChar32) 0); 627 initMsg(pname); 628 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); 629 return FALSE; 630 } 631 } else { 632 infilestr = "-"; 633 infile = stdin; 634 #ifdef USE_FILENO_BINARY_MODE 635 if (setmode(fileno(stdin), O_BINARY) == -1) { 636 initMsg(pname); 637 u_wmsg(stderr, "cantSetInBinMode"); 638 return FALSE; 639 } 640 #endif 641 } 642 643 if (verbose) { 644 fprintf(stderr, "%s:\n", infilestr); 645 } 646 647 #if !UCONFIG_NO_TRANSLITERATION 648 // Create transliterator as needed. 649 650 if (translit != NULL && *translit) { 651 UParseError parse; 652 UnicodeString str(translit), pestr; 653 654 /* Create from rules or by ID as needed. */ 655 656 parse.line = -1; 657 658 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { 659 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); 660 } else { 661 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); 662 } 663 664 if (U_FAILURE(err)) { 665 str.append((UChar32) 0); 666 initMsg(pname); 667 668 if (parse.line >= 0) { 669 UChar linebuf[20], offsetbuf[20]; 670 uprv_itou(linebuf, 20, parse.line, 10, 0); 671 uprv_itou(offsetbuf, 20, parse.offset, 10, 0); 672 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(), 673 u_wmsg_errorName(err), linebuf, offsetbuf); 674 } else { 675 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(), 676 u_wmsg_errorName(err)); 677 } 678 679 if (t) { 680 delete t; 681 t = 0; 682 } 683 goto error_exit; 684 } 685 686 useOffsets = FALSE; 687 } 688 #endif 689 690 // Create codepage converter. If the codepage or its aliases weren't 691 // available, it returns NULL and a failure code. We also set the 692 // callbacks, and return errors in the same way. 693 694 convfrom = ucnv_open(fromcpage, &err); 695 if (U_FAILURE(err)) { 696 UnicodeString str(fromcpage, ""); 697 initMsg(pname); 698 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(), 699 u_wmsg_errorName(err)); 700 goto error_exit; 701 } 702 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); 703 if (U_FAILURE(err)) { 704 initMsg(pname); 705 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 706 goto error_exit; 707 } 708 709 convto = ucnv_open(tocpage, &err); 710 if (U_FAILURE(err)) { 711 UnicodeString str(tocpage, ""); 712 initMsg(pname); 713 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(), 714 u_wmsg_errorName(err)); 715 goto error_exit; 716 } 717 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); 718 if (U_FAILURE(err)) { 719 initMsg(pname); 720 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 721 goto error_exit; 722 } 723 ucnv_setFallback(convto, fallback); 724 725 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode; 726 int8_t sig; 727 728 // OK, we can convert now. 729 sig = signature; 730 rd = 0; 731 732 do { 733 willexit = FALSE; 734 735 // input file offset at the beginning of the next buffer 736 infoffset += rd; 737 738 rd = fread(buf, 1, bufsz, infile); 739 if (ferror(infile) != 0) { 740 UnicodeString str(strerror(errno)); 741 initMsg(pname); 742 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer()); 743 goto error_exit; 744 } 745 746 // Convert the read buffer into the new encoding via Unicode. 747 // After the call 'unibufp' will be placed behind the last 748 // character that was converted in the 'unibuf'. 749 // Also the 'cbufp' is positioned behind the last converted 750 // character. 751 // At the last conversion in the file, flush should be set to 752 // true so that we get all characters converted. 753 // 754 // The converter must be flushed at the end of conversion so 755 // that characters on hold also will be written. 756 757 cbufp = buf; 758 flush = (UBool)(rd != bufsz); 759 760 // convert until the input is consumed 761 do { 762 // remember the start of the current byte-to-Unicode conversion 763 prevbufp = cbufp; 764 765 unibuf = unibufp = u.getBuffer((int32_t)bufsz); 766 767 // Use bufsz instead of u.getCapacity() for the targetLimit 768 // so that we don't overflow fromoffsets[]. 769 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp, 770 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err); 771 772 ulen = (int32_t)(unibufp - unibuf); 773 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0); 774 775 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done 776 // converting all of the input bytes. 777 // It works like this because ucnv_toUnicode() returns only under the 778 // following conditions: 779 // - an error occurred during conversion (an error code is set) 780 // - the target buffer is filled (the error code indicates an overflow) 781 // - the source is consumed 782 // That is, if the error code does not indicate a failure, 783 // not even an overflow, then the source must be consumed entirely. 784 fromSawEndOfBytes = (UBool)U_SUCCESS(err); 785 786 if (err == U_BUFFER_OVERFLOW_ERROR) { 787 err = U_ZERO_ERROR; 788 } else if (U_FAILURE(err)) { 789 char pos[32], errorBytes[32]; 790 int8_t i, length, errorLength; 791 792 UErrorCode localError = U_ZERO_ERROR; 793 errorLength = (int8_t)sizeof(errorBytes); 794 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError); 795 if (U_FAILURE(localError) || errorLength == 0) { 796 errorLength = 1; 797 } 798 799 // print the input file offset of the start of the error bytes: 800 // input file offset of the current byte buffer + 801 // length of the just consumed bytes - 802 // length of the error bytes 803 length = 804 (int8_t)sprintf(pos, "%d", 805 (int)(infoffset + (cbufp - buf) - errorLength)); 806 807 // output the bytes that caused the error 808 UnicodeString str; 809 for (i = 0; i < errorLength; ++i) { 810 if (i > 0) { 811 str.append((UChar)uSP); 812 } 813 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4)); 814 str.append(nibbleToHex((uint8_t)errorBytes[i])); 815 } 816 817 initMsg(pname); 818 u_wmsg(stderr, "problemCvtToU", 819 UnicodeString(pos, length, "").getTerminatedBuffer(), 820 str.getTerminatedBuffer(), 821 u_wmsg_errorName(err)); 822 823 willexit = TRUE; 824 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 825 } 826 827 // Replaced a check for whether the input was consumed by 828 // looping until it is; message key "premEndInput" now obsolete. 829 830 if (ulen == 0) { 831 continue; 832 } 833 834 // remove a U+FEFF Unicode signature character if requested 835 if (sig < 0) { 836 if (u.charAt(0) == uSig) { 837 u.remove(0, 1); 838 839 // account for the removed UChar and offset 840 --ulen; 841 842 if (useOffsets) { 843 // remove an offset from fromoffsets[] as well 844 // to keep the array parallel with the UChars 845 memmove(fromoffsets, fromoffsets + 1, ulen * 4); 846 } 847 848 } 849 sig = 0; 850 } 851 852 #if !UCONFIG_NO_TRANSLITERATION 853 // Transliterate/transform if needed. 854 855 // For transformation, we use chunking code - 856 // collect Unicode input until, for example, an end-of-line, 857 // then transform and output-convert that and continue collecting. 858 // This makes the transformation result independent of the buffer size 859 // while avoiding the slower keyboard mode. 860 // The end-of-chunk characters are completely included in the 861 // transformed string in case they are to be transformed themselves. 862 if (t != NULL) { 863 UnicodeString out; 864 int32_t chunkLimit; 865 866 do { 867 chunkLimit = getChunkLimit(chunk, u); 868 if (chunkLimit < 0 && flush && fromSawEndOfBytes) { 869 // use all of the rest at the end of the text 870 chunkLimit = u.length(); 871 } 872 if (chunkLimit >= 0) { 873 // complete the chunk and transform it 874 chunk.append(u, 0, chunkLimit); 875 u.remove(0, chunkLimit); 876 t->transliterate(chunk); 877 878 // append the transformation result to the result and empty the chunk 879 out.append(chunk); 880 chunk.remove(); 881 } else { 882 // continue collecting the chunk 883 chunk.append(u); 884 break; 885 } 886 } while (!u.isEmpty()); 887 888 u = out; 889 ulen = u.length(); 890 } 891 #endif 892 893 // add a U+FEFF Unicode signature character if requested 894 // and possible/necessary 895 if (sig > 0) { 896 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) { 897 u.insert(0, (UChar)uSig); 898 899 if (useOffsets) { 900 // insert a pseudo-offset into fromoffsets[] as well 901 // to keep the array parallel with the UChars 902 memmove(fromoffsets + 1, fromoffsets, ulen * 4); 903 fromoffsets[0] = -1; 904 } 905 906 // account for the additional UChar and offset 907 ++ulen; 908 } 909 sig = 0; 910 } 911 912 // Convert the Unicode buffer into the destination codepage 913 // Again 'bufp' will be placed behind the last converted character 914 // And 'unibufp' will be placed behind the last converted unicode character 915 // At the last conversion flush should be set to true to ensure that 916 // all characters left get converted 917 918 unibuf = unibufbp = u.getBuffer(); 919 920 do { 921 bufp = outbuf; 922 923 // Use fromSawEndOfBytes in addition to the flush flag - 924 // it indicates whether the intermediate Unicode string 925 // contains the very last UChars for the very last input bytes. 926 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz, 927 &unibufbp, 928 unibuf + ulen, 929 NULL, (UBool)(flush && fromSawEndOfBytes), &err); 930 931 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done 932 // converting all of the intermediate UChars. 933 // See comment for fromSawEndOfBytes. 934 toSawEndOfUnicode = (UBool)U_SUCCESS(err); 935 936 if (err == U_BUFFER_OVERFLOW_ERROR) { 937 err = U_ZERO_ERROR; 938 } else if (U_FAILURE(err)) { 939 UChar errorUChars[4]; 940 const char *errtag; 941 char pos[32]; 942 UChar32 c; 943 int8_t i, length, errorLength; 944 945 UErrorCode localError = U_ZERO_ERROR; 946 errorLength = (int8_t)LENGTHOF(errorUChars); 947 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError); 948 if (U_FAILURE(localError) || errorLength == 0) { 949 // need at least 1 so that we don't access beyond the length of fromoffsets[] 950 errorLength = 1; 951 } 952 953 int32_t ferroffset; 954 955 if (useOffsets) { 956 // Unicode buffer offset of the start of the error UChars 957 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength); 958 if (ferroffset < 0) { 959 // approximation - the character started in the previous Unicode buffer 960 ferroffset = 0; 961 } 962 963 // get the corresponding byte offset out of fromoffsets[] 964 // go back if the offset is not known for some of the UChars 965 int32_t fromoffset; 966 do { 967 fromoffset = fromoffsets[ferroffset]; 968 } while (fromoffset < 0 && --ferroffset >= 0); 969 970 // total input file offset = 971 // input file offset of the current byte buffer + 972 // byte buffer offset of where the current Unicode buffer is converted from + 973 // fromoffsets[Unicode offset] 974 ferroffset = infoffset + (prevbufp - buf) + fromoffset; 975 errtag = "problemCvtFromU"; 976 } else { 977 // Do not use fromoffsets if (t != NULL) because the Unicode text may 978 // be different from what the offsets refer to. 979 980 // output file offset 981 ferroffset = (int32_t)(outfoffset + (bufp - outbuf)); 982 errtag = "problemCvtFromUOut"; 983 } 984 985 length = (int8_t)sprintf(pos, "%u", (int)ferroffset); 986 987 // output the code points that caused the error 988 UnicodeString str; 989 for (i = 0; i < errorLength;) { 990 if (i > 0) { 991 str.append((UChar)uSP); 992 } 993 U16_NEXT(errorUChars, i, errorLength, c); 994 if (c >= 0x100000) { 995 str.append(nibbleToHex((uint8_t)(c >> 20))); 996 } 997 if (c >= 0x10000) { 998 str.append(nibbleToHex((uint8_t)(c >> 16))); 999 } 1000 str.append(nibbleToHex((uint8_t)(c >> 12))); 1001 str.append(nibbleToHex((uint8_t)(c >> 8))); 1002 str.append(nibbleToHex((uint8_t)(c >> 4))); 1003 str.append(nibbleToHex((uint8_t)c)); 1004 } 1005 1006 initMsg(pname); 1007 u_wmsg(stderr, errtag, 1008 UnicodeString(pos, length, "").getTerminatedBuffer(), 1009 str.getTerminatedBuffer(), 1010 u_wmsg_errorName(err)); 1011 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer()); 1012 1013 willexit = TRUE; 1014 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 1015 } 1016 1017 // Replaced a check for whether the intermediate Unicode characters were all consumed by 1018 // looping until they are; message key "premEnd" now obsolete. 1019 1020 // Finally, write the converted buffer to the output file 1021 size_t outlen = (size_t) (bufp - outbuf); 1022 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile)); 1023 if (wr != outlen) { 1024 UnicodeString str(strerror(errno)); 1025 initMsg(pname); 1026 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer()); 1027 willexit = TRUE; 1028 } 1029 1030 if (willexit) { 1031 goto error_exit; 1032 } 1033 } while (!toSawEndOfUnicode); 1034 } while (!fromSawEndOfBytes); 1035 } while (!flush); // Stop when we have flushed the 1036 // converters (this means that it's 1037 // the end of output) 1038 1039 goto normal_exit; 1040 1041 error_exit: 1042 ret = FALSE; 1043 1044 normal_exit: 1045 // Cleanup. 1046 1047 ucnv_close(convfrom); 1048 ucnv_close(convto); 1049 1050 #if !UCONFIG_NO_TRANSLITERATION 1051 delete t; 1052 #endif 1053 1054 if (infile != stdin) { 1055 fclose(infile); 1056 } 1057 1058 return ret; 1059 } 1060 1061 static void usage(const char *pname, int ecode) { 1062 const UChar *msg; 1063 int32_t msgLen; 1064 UErrorCode err = U_ZERO_ERROR; 1065 FILE *fp = ecode ? stderr : stdout; 1066 int res; 1067 1068 initMsg(pname); 1069 msg = 1070 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", 1071 &msgLen, &err); 1072 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1)); 1073 UnicodeString mname(msg, msgLen + 1); 1074 1075 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer()); 1076 if (!ecode) { 1077 if (!res) { 1078 fputc('\n', fp); 1079 } 1080 if (!u_wmsg(fp, "help")) { 1081 /* Now dump callbacks and finish. */ 1082 1083 int i, count = 1084 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 1085 for (i = 0; i < count; ++i) { 1086 fprintf(fp, " %s", transcode_callbacks[i].name); 1087 } 1088 fputc('\n', fp); 1089 } 1090 } 1091 1092 exit(ecode); 1093 } 1094 1095 extern int 1096 main(int argc, char **argv) 1097 { 1098 FILE *outfile; 1099 int ret = 0; 1100 1101 size_t bufsz = DEFAULT_BUFSZ; 1102 1103 const char *fromcpage = 0; 1104 const char *tocpage = 0; 1105 const char *translit = 0; 1106 const char *outfilestr = 0; 1107 UBool fallback = FALSE; 1108 1109 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP; 1110 const void *fromuctxt = 0; 1111 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP; 1112 const void *touctxt = 0; 1113 1114 char **iter, **remainArgv, **remainArgvLimit; 1115 char **end = argv + argc; 1116 1117 const char *pname; 1118 1119 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE; 1120 const char *printName = 0; 1121 1122 UBool verbose = FALSE; 1123 UErrorCode status = U_ZERO_ERROR; 1124 1125 ConvertFile cf; 1126 1127 /* Initialize ICU */ 1128 u_init(&status); 1129 if (U_FAILURE(status)) { 1130 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 1131 argv[0], u_errorName(status)); 1132 exit(1); 1133 } 1134 1135 // Get and prettify pname. 1136 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); 1137 #ifdef U_WINDOWS 1138 if (!pname) { 1139 pname = uprv_strrchr(*argv, '/'); 1140 } 1141 #endif 1142 if (!pname) { 1143 pname = *argv; 1144 } else { 1145 ++pname; 1146 } 1147 1148 // First, get the arguments from command-line 1149 // to know the codepages to convert between 1150 1151 remainArgv = remainArgvLimit = argv + 1; 1152 for (iter = argv + 1; iter != end; iter++) { 1153 // Check for from charset 1154 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) { 1155 iter++; 1156 if (iter != end) 1157 fromcpage = *iter; 1158 else 1159 usage(pname, 1); 1160 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) { 1161 iter++; 1162 if (iter != end) 1163 tocpage = *iter; 1164 else 1165 usage(pname, 1); 1166 } else if (strcmp("-x", *iter) == 0) { 1167 iter++; 1168 if (iter != end) 1169 translit = *iter; 1170 else 1171 usage(pname, 1); 1172 } else if (!strcmp("--fallback", *iter)) { 1173 fallback = TRUE; 1174 } else if (!strcmp("--no-fallback", *iter)) { 1175 fallback = FALSE; 1176 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) { 1177 iter++; 1178 if (iter != end) { 1179 bufsz = atoi(*iter); 1180 if ((int) bufsz <= 0) { 1181 initMsg(pname); 1182 UnicodeString str(*iter); 1183 initMsg(pname); 1184 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer()); 1185 return 3; 1186 } 1187 } else { 1188 usage(pname, 1); 1189 } 1190 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) { 1191 if (printTranslits) { 1192 usage(pname, 1); 1193 } 1194 printConvs = TRUE; 1195 } else if (strcmp("--default-code", *iter) == 0) { 1196 if (printTranslits) { 1197 usage(pname, 1); 1198 } 1199 printName = ucnv_getDefaultName(); 1200 } else if (strcmp("--list-code", *iter) == 0) { 1201 if (printTranslits) { 1202 usage(pname, 1); 1203 } 1204 1205 iter++; 1206 if (iter != end) { 1207 UErrorCode e = U_ZERO_ERROR; 1208 printName = ucnv_getAlias(*iter, 0, &e); 1209 if (U_FAILURE(e) || !printName) { 1210 UnicodeString str(*iter); 1211 initMsg(pname); 1212 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer()); 1213 return 2; 1214 } 1215 } else 1216 usage(pname, 1); 1217 } else if (strcmp("--canon", *iter) == 0) { 1218 printCanon = TRUE; 1219 } else if (strcmp("-L", *iter) == 0 1220 || !strcmp("--list-transliterators", *iter)) { 1221 if (printConvs) { 1222 usage(pname, 1); 1223 } 1224 printTranslits = TRUE; 1225 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter) 1226 || !strcmp("--help", *iter)) { 1227 usage(pname, 0); 1228 } else if (!strcmp("-c", *iter)) { 1229 fromucallback = UCNV_FROM_U_CALLBACK_SKIP; 1230 } else if (!strcmp("--to-callback", *iter)) { 1231 iter++; 1232 if (iter != end) { 1233 const struct callback_ent *cbe = findCallback(*iter); 1234 if (cbe) { 1235 fromucallback = cbe->fromu; 1236 fromuctxt = cbe->fromuctxt; 1237 } else { 1238 UnicodeString str(*iter); 1239 initMsg(pname); 1240 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1241 return 4; 1242 } 1243 } else { 1244 usage(pname, 1); 1245 } 1246 } else if (!strcmp("--from-callback", *iter)) { 1247 iter++; 1248 if (iter != end) { 1249 const struct callback_ent *cbe = findCallback(*iter); 1250 if (cbe) { 1251 toucallback = cbe->tou; 1252 touctxt = cbe->touctxt; 1253 } else { 1254 UnicodeString str(*iter); 1255 initMsg(pname); 1256 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1257 return 4; 1258 } 1259 } else { 1260 usage(pname, 1); 1261 } 1262 } else if (!strcmp("-i", *iter)) { 1263 toucallback = UCNV_TO_U_CALLBACK_SKIP; 1264 } else if (!strcmp("--callback", *iter)) { 1265 iter++; 1266 if (iter != end) { 1267 const struct callback_ent *cbe = findCallback(*iter); 1268 if (cbe) { 1269 fromucallback = cbe->fromu; 1270 fromuctxt = cbe->fromuctxt; 1271 toucallback = cbe->tou; 1272 touctxt = cbe->touctxt; 1273 } else { 1274 UnicodeString str(*iter); 1275 initMsg(pname); 1276 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1277 return 4; 1278 } 1279 } else { 1280 usage(pname, 1); 1281 } 1282 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) { 1283 verbose = FALSE; 1284 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) { 1285 verbose = TRUE; 1286 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) { 1287 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname); 1288 return 0; 1289 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) { 1290 ++iter; 1291 if (iter != end && !outfilestr) { 1292 outfilestr = *iter; 1293 } else { 1294 usage(pname, 1); 1295 } 1296 } else if (0 == strcmp("--add-signature", *iter)) { 1297 cf.signature = 1; 1298 } else if (0 == strcmp("--remove-signature", *iter)) { 1299 cf.signature = -1; 1300 } else if (**iter == '-' && (*iter)[1]) { 1301 usage(pname, 1); 1302 } else { 1303 // move a non-option up in argv[] 1304 *remainArgvLimit++ = *iter; 1305 } 1306 } 1307 1308 if (printConvs || printName) { 1309 return printConverters(pname, printName, printCanon) ? 2 : 0; 1310 } else if (printTranslits) { 1311 return printTransliterators(printCanon) ? 3 : 0; 1312 } 1313 1314 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) { 1315 fromcpage = ucnv_getDefaultName(); 1316 } 1317 if (!tocpage || !uprv_strcmp(tocpage, "-")) { 1318 tocpage = ucnv_getDefaultName(); 1319 } 1320 1321 // Open the correct output file or connect to stdout for reading input 1322 if (outfilestr != 0 && strcmp(outfilestr, "-")) { 1323 outfile = fopen(outfilestr, "wb"); 1324 if (outfile == 0) { 1325 UnicodeString str1(outfilestr, ""); 1326 UnicodeString str2(strerror(errno), ""); 1327 initMsg(pname); 1328 u_wmsg(stderr, "cantCreateOutputF", 1329 str1.getBuffer(), str2.getBuffer()); 1330 return 1; 1331 } 1332 } else { 1333 outfilestr = "-"; 1334 outfile = stdout; 1335 #ifdef USE_FILENO_BINARY_MODE 1336 if (setmode(fileno(outfile), O_BINARY) == -1) { 1337 u_wmsg(stderr, "cantSetOutBinMode"); 1338 exit(-1); 1339 } 1340 #endif 1341 } 1342 1343 /* Loop again on the arguments to find all the input files, and 1344 convert them. */ 1345 1346 cf.setBufferSize(bufsz); 1347 1348 if(remainArgv < remainArgvLimit) { 1349 for (iter = remainArgv; iter != remainArgvLimit; iter++) { 1350 if (!cf.convertFile( 1351 pname, fromcpage, toucallback, touctxt, tocpage, 1352 fromucallback, fromuctxt, fallback, translit, *iter, 1353 outfile, verbose) 1354 ) { 1355 goto error_exit; 1356 } 1357 } 1358 } else { 1359 if (!cf.convertFile( 1360 pname, fromcpage, toucallback, touctxt, tocpage, 1361 fromucallback, fromuctxt, fallback, translit, 0, 1362 outfile, verbose) 1363 ) { 1364 goto error_exit; 1365 } 1366 } 1367 1368 goto normal_exit; 1369 error_exit: 1370 #if !UCONFIG_NO_LEGACY_CONVERSION 1371 ret = 1; 1372 #else 1373 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n"); 1374 #endif 1375 normal_exit: 1376 1377 if (outfile != stdout) { 1378 fclose(outfile); 1379 } 1380 1381 return ret; 1382 } 1383 1384 1385 /* 1386 * Hey, Emacs, please set the following: 1387 * 1388 * Local Variables: 1389 * indent-tabs-mode: nil 1390 * End: 1391 * 1392 */ 1393