1 /************************************************************************** 2 * 3 * Copyright (C) 2000-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 * 6 *************************************************************************** 7 * file name: convsamp.c 8 * encoding: ASCII (7-bit) 9 * 10 * created on: 2000may30 11 * created by: Steven R. Loomis 12 * 13 * Sample code for the ICU conversion routines. 14 * 15 * Note: Nothing special is needed to build this sample. Link with 16 * the icu UC and icu I18N libraries. 17 * 18 * I use 'assert' for error checking, you probably will want 19 * something more flexible. '***BEGIN SAMPLE***' and 20 * '***END SAMPLE***' mark pieces suitable for stand alone 21 * code snippets. 22 * 23 * 24 * Each test can define it's own BUFFERSIZE 25 * 26 */ 27 28 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ 29 30 #include <stdio.h> 31 #include <ctype.h> /* for isspace, etc. */ 32 #include <assert.h> 33 #include <string.h> 34 #include <stdlib.h> /* malloc */ 35 36 #include "unicode/utypes.h" /* Basic ICU data types */ 37 #include "unicode/ucnv.h" /* C Converter API */ 38 #include "unicode/ustring.h" /* some more string fcns*/ 39 #include "unicode/uchar.h" /* char names */ 40 #include "unicode/uloc.h" 41 #include "unicode/unistr.h" 42 43 #include "flagcb.h" 44 45 /* Some utility functions */ 46 47 static const UChar kNone[] = { 0x0000 }; 48 49 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} 50 51 /* Print a UChar if possible, in seven characters. */ 52 void prettyPrintUChar(UChar c) 53 { 54 if( (c <= 0x007F) && 55 (isgraph(c)) ) { 56 printf(" '%c' ", (char)(0x00FF&c)); 57 } else if ( c > 0x007F ) { 58 char buf[1000]; 59 UErrorCode status = U_ZERO_ERROR; 60 int32_t o; 61 62 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); 63 if(U_SUCCESS(status) && (o>0) ) { 64 buf[6] = 0; 65 printf("%7s", buf); 66 } else { 67 printf(" ??????"); 68 } 69 } else { 70 switch((char)(c & 0x007F)) { 71 case ' ': 72 printf(" ' ' "); 73 break; 74 case '\t': 75 printf(" \\t "); 76 break; 77 case '\n': 78 printf(" \\n "); 79 break; 80 default: 81 printf(" _ "); 82 break; 83 } 84 } 85 } 86 87 88 void printUChars(const char *name = "?", 89 const UChar *uch = kNone, 90 int32_t len = -1 ) 91 { 92 int32_t i; 93 94 if( (len == -1) && (uch) ) { 95 len = u_strlen(uch); 96 } 97 98 printf("%5s: ", name); 99 for( i = 0; i <len; i++) { 100 printf("%-6d ", i); 101 } 102 printf("\n"); 103 104 printf("%5s: ", "uni"); 105 for( i = 0; i <len; i++) { 106 printf("\\u%04X ", (int)uch[i]); 107 } 108 printf("\n"); 109 110 printf("%5s:", "ch"); 111 for( i = 0; i <len; i++) { 112 prettyPrintUChar(uch[i]); 113 } 114 printf("\n"); 115 } 116 117 void printBytes(const char *name = "?", 118 const char *uch = "", 119 int32_t len = -1 ) 120 { 121 int32_t i; 122 123 if( (len == -1) && (uch) ) { 124 len = strlen(uch); 125 } 126 127 printf("%5s: ", name); 128 for( i = 0; i <len; i++) { 129 printf("%-4d ", i); 130 } 131 printf("\n"); 132 133 printf("%5s: ", "uni"); 134 for( i = 0; i <len; i++) { 135 printf("\\x%02X ", 0x00FF & (int)uch[i]); 136 } 137 printf("\n"); 138 139 printf("%5s:", "ch"); 140 for( i = 0; i <len; i++) { 141 if(isgraph(0x00FF & (int)uch[i])) { 142 printf(" '%c' ", (char)uch[i]); 143 } else { 144 printf(" "); 145 } 146 } 147 printf("\n"); 148 } 149 150 void printUChar(UChar32 ch32) 151 { 152 if(ch32 > 0xFFFF) { 153 printf("ch: U+%06X\n", ch32); 154 } 155 else { 156 UChar ch = (UChar)ch32; 157 printUChars("C", &ch, 1); 158 } 159 } 160 161 /******************************************************************* 162 Very simple C sample to convert the word 'Moscow' in Russian in Unicode, 163 followed by an exclamation mark (!) into the KOI8-R Russian code page. 164 165 This example first creates a UChar String out of the Unicode chars. 166 167 targetSize must be set to the amount of space available in the target 168 buffer. After fromUChars is called, 169 len will contain the number of bytes in target[] which were 170 used in the resulting codepage. In this case, there is a 1:1 mapping 171 between the input and output characters. The exclamation mark has the 172 same value in both KOI8-R and Unicode. 173 174 src: 0 1 2 3 4 5 6 175 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 176 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' 177 178 targ: 0 1 2 3 4 5 6 179 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 180 ch: '!' 181 182 183 Converting FROM unicode 184 to koi8-r. 185 You must call ucnv_close to clean up the memory used by the 186 converter. 187 188 'len' returns the number of OUTPUT bytes resulting from the 189 conversion. 190 */ 191 192 UErrorCode convsample_02() 193 { 194 printf("\n\n==============================================\n" 195 "Sample 02: C: simple Unicode -> koi8-r conversion\n"); 196 197 198 // **************************** START SAMPLE ******************* 199 // "cat<cat>OK" 200 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, 201 0x0430, 0x0021, 0x0000 }; 202 char target[100]; 203 UErrorCode status = U_ZERO_ERROR; 204 UConverter *conv; 205 int32_t len; 206 207 // set up the converter 208 //! [ucnv_open] 209 conv = ucnv_open("koi8-r", &status); 210 //! [ucnv_open] 211 assert(U_SUCCESS(status)); 212 213 // convert to koi8-r 214 len = ucnv_fromUChars(conv, target, 100, source, -1, &status); 215 assert(U_SUCCESS(status)); 216 217 // close the converter 218 ucnv_close(conv); 219 220 // ***************************** END SAMPLE ******************** 221 222 // Print it out 223 printUChars("src", source); 224 printf("\n"); 225 printBytes("targ", target, len); 226 227 return U_ZERO_ERROR; 228 } 229 230 231 UErrorCode convsample_03() 232 { 233 printf("\n\n==============================================\n" 234 "Sample 03: C: print out all converters\n"); 235 236 int32_t count; 237 int32_t i; 238 239 // **************************** START SAMPLE ******************* 240 count = ucnv_countAvailable(); 241 printf("Available converters: %d\n", count); 242 243 for(i=0;i<count;i++) 244 { 245 printf("%s ", ucnv_getAvailableName(i)); 246 } 247 248 // ***************************** END SAMPLE ******************** 249 250 printf("\n"); 251 252 return U_ZERO_ERROR; 253 } 254 255 256 257 #define BUFFERSIZE 17 /* make it interesting :) */ 258 259 /* 260 Converting from a codepage to Unicode in bulk.. 261 What is the best way to determine the buffer size? 262 263 The 'buffersize' is in bytes of input. 264 For a given converter, divinding this by the minimum char size 265 give you the maximum number of Unicode characters that could be 266 expected for a given number of input bytes. 267 see: ucnv_getMinCharSize() 268 269 For example, a single byte codepage like 'Latin-3' has a 270 minimum char size of 1. (It takes at least 1 byte to represent 271 each Unicode char.) So the unicode buffer has the same number of 272 UChars as the input buffer has bytes. 273 274 In a strictly double byte codepage such as cp1362 (Windows 275 Korean), the minimum char size is 2. So, only half as many Unicode 276 chars as bytes are needed. 277 278 This work to calculate the buffer size is an optimization. Any 279 size of input and output buffer can be used, as long as the 280 program handles the following cases: If the input buffer is empty, 281 the source pointer will be equal to sourceLimit. If the output 282 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. 283 */ 284 285 UErrorCode convsample_05() 286 { 287 printf("\n\n==============================================\n" 288 "Sample 05: C: count the number of letters in a UTF-8 document\n"); 289 290 FILE *f; 291 int32_t count; 292 char inBuf[BUFFERSIZE]; 293 const char *source; 294 const char *sourceLimit; 295 UChar *uBuf; 296 UChar *target; 297 UChar *targetLimit; 298 UChar *p; 299 int32_t uBufSize = 0; 300 UConverter *conv; 301 UErrorCode status = U_ZERO_ERROR; 302 uint32_t letters=0, total=0; 303 304 f = fopen("data01.txt", "r"); 305 if(!f) 306 { 307 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); 308 return U_FILE_ACCESS_ERROR; 309 } 310 311 // **************************** START SAMPLE ******************* 312 conv = ucnv_open("utf-8", &status); 313 assert(U_SUCCESS(status)); 314 315 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 316 printf("input bytes %d / min chars %d = %d UChars\n", 317 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 318 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 319 assert(uBuf!=NULL); 320 321 // grab another buffer's worth 322 while((!feof(f)) && 323 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 324 { 325 // Convert bytes to unicode 326 source = inBuf; 327 sourceLimit = inBuf + count; 328 329 do 330 { 331 target = uBuf; 332 targetLimit = uBuf + uBufSize; 333 334 ucnv_toUnicode(conv, &target, targetLimit, 335 &source, sourceLimit, NULL, 336 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 337 /* is true (when no more data will come) */ 338 &status); 339 340 if(status == U_BUFFER_OVERFLOW_ERROR) 341 { 342 // simply ran out of space - we'll reset the target ptr the next 343 // time through the loop. 344 status = U_ZERO_ERROR; 345 } 346 else 347 { 348 // Check other errors here. 349 assert(U_SUCCESS(status)); 350 // Break out of the loop (by force) 351 } 352 353 // Process the Unicode 354 // Todo: handle UTF-16/surrogates 355 356 for(p = uBuf; p<target; p++) 357 { 358 if(u_isalpha(*p)) 359 letters++; 360 total++; 361 } 362 } while (source < sourceLimit); // while simply out of space 363 } 364 365 printf("%d letters out of %d total UChars.\n", letters, total); 366 367 // ***************************** END SAMPLE ******************** 368 ucnv_close(conv); 369 370 printf("\n"); 371 372 fclose(f); 373 374 return U_ZERO_ERROR; 375 } 376 #undef BUFFERSIZE 377 378 #define BUFFERSIZE 1024 379 typedef struct 380 { 381 UChar32 codepoint; 382 uint32_t frequency; 383 } CharFreqInfo; 384 385 UErrorCode convsample_06() 386 { 387 printf("\n\n==============================================\n" 388 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); 389 390 FILE *f; 391 int32_t count; 392 char inBuf[BUFFERSIZE]; 393 const char *source; 394 const char *sourceLimit; 395 int32_t uBufSize = 0; 396 UConverter *conv; 397 UErrorCode status = U_ZERO_ERROR; 398 uint32_t letters=0, total=0; 399 400 CharFreqInfo *info; 401 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ 402 UChar32 p; 403 404 uint32_t ie = 0; 405 uint32_t gh = 0; 406 UChar32 l = 0; 407 408 f = fopen("data06.txt", "r"); 409 if(!f) 410 { 411 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); 412 return U_FILE_ACCESS_ERROR; 413 } 414 415 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); 416 if(!info) 417 { 418 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); 419 } 420 421 /* reset frequencies */ 422 for(p=0;p<charCount;p++) 423 { 424 info[p].codepoint = p; 425 info[p].frequency = 0; 426 } 427 428 // **************************** START SAMPLE ******************* 429 conv = ucnv_open("utf-8", &status); 430 assert(U_SUCCESS(status)); 431 432 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 433 printf("input bytes %d / min chars %d = %d UChars\n", 434 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 435 436 // grab another buffer's worth 437 while((!feof(f)) && 438 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 439 { 440 // Convert bytes to unicode 441 source = inBuf; 442 sourceLimit = inBuf + count; 443 444 while(source < sourceLimit) 445 { 446 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 447 if(U_FAILURE(status)) 448 { 449 fprintf(stderr, "%s @ %d\n", u_errorName(status), total); 450 status = U_ZERO_ERROR; 451 continue; 452 } 453 U_ASSERT(status); 454 total++; 455 456 if(u_isalpha(p)) 457 letters++; 458 459 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) 460 ie++; 461 462 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) 463 gh++; 464 465 if(p>charCount) 466 { 467 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); 468 free(info); 469 fclose(f); 470 ucnv_close(conv); 471 return U_UNSUPPORTED_ERROR; 472 } 473 info[p].frequency++; 474 l = p; 475 } 476 } 477 478 fclose(f); 479 ucnv_close(conv); 480 481 printf("%d letters out of %d total UChars.\n", letters, total); 482 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); 483 484 // now, we could sort it.. 485 486 // qsort(info, charCount, sizeof(info[0]), charfreq_compare); 487 488 for(p=0;p<charCount;p++) 489 { 490 if(info[p].frequency) 491 { 492 printf("% 5d U+%06X ", info[p].frequency, p); 493 if(p <= 0xFFFF) 494 { 495 prettyPrintUChar((UChar)p); 496 } 497 printf("\n"); 498 } 499 } 500 free(info); 501 // ***************************** END SAMPLE ******************** 502 503 printf("\n"); 504 505 return U_ZERO_ERROR; 506 } 507 #undef BUFFERSIZE 508 509 510 /****************************************************** 511 You must call ucnv_close to clean up the memory used by the 512 converter. 513 514 'len' returns the number of OUTPUT bytes resulting from the 515 conversion. 516 */ 517 518 UErrorCode convsample_12() 519 { 520 printf("\n\n==============================================\n" 521 "Sample 12: C: simple sjis -> unicode conversion\n"); 522 523 524 // **************************** START SAMPLE ******************* 525 526 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; 527 UChar target[100]; 528 UErrorCode status = U_ZERO_ERROR; 529 UConverter *conv; 530 int32_t len; 531 532 // set up the converter 533 conv = ucnv_open("shift_jis", &status); 534 assert(U_SUCCESS(status)); 535 536 // convert to Unicode 537 // Note: we can use strlen, we know it's an 8 bit null terminated codepage 538 target[6] = 0xFDCA; 539 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); 540 U_ASSERT(status); 541 // close the converter 542 ucnv_close(conv); 543 544 // ***************************** END SAMPLE ******************** 545 546 // Print it out 547 printBytes("src", source, strlen(source) ); 548 printf("\n"); 549 printUChars("targ", target, len); 550 551 return U_ZERO_ERROR; 552 } 553 554 /****************************************************************** 555 C: Convert from codepage to Unicode one at a time. 556 */ 557 558 UErrorCode convsample_13() 559 { 560 printf("\n\n==============================================\n" 561 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); 562 563 564 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; 565 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; 566 const char *source, *sourceLimit; 567 UChar32 target; 568 UErrorCode status = U_ZERO_ERROR; 569 UConverter *conv = NULL; 570 int32_t srcCount=0; 571 int32_t dstCount=0; 572 573 srcCount = sizeof(sourceChars); 574 575 conv = ucnv_open("Big5", &status); 576 U_ASSERT(status); 577 578 source = sourceChars; 579 sourceLimit = sourceChars + sizeof(sourceChars); 580 581 // **************************** START SAMPLE ******************* 582 583 584 printBytes("src",source,sourceLimit-source); 585 586 while(source < sourceLimit) 587 { 588 puts(""); 589 target = ucnv_getNextUChar (conv, 590 &source, 591 sourceLimit, 592 &status); 593 594 // printBytes("src",source,sourceLimit-source); 595 U_ASSERT(status); 596 printUChar(target); 597 dstCount++; 598 } 599 600 601 // ************************** END SAMPLE ************************* 602 603 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); 604 ucnv_close(conv); 605 606 return U_ZERO_ERROR; 607 } 608 609 610 611 612 UBool convsample_20_didSubstitute(const char *source) 613 { 614 UChar uchars[100]; 615 char bytes[100]; 616 UConverter *conv = NULL; 617 UErrorCode status = U_ZERO_ERROR; 618 uint32_t len, len2; 619 UBool flagVal; 620 621 FromUFLAGContext * context = NULL; 622 623 printf("\n\n==============================================\n" 624 "Sample 20: C: Test for substitution using callbacks\n"); 625 626 /* print out the original source */ 627 printBytes("src", source); 628 printf("\n"); 629 630 /* First, convert from UTF8 to unicode */ 631 conv = ucnv_open("utf-8", &status); 632 U_ASSERT(status); 633 634 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 635 U_ASSERT(status); 636 637 printUChars("uch", uchars, len); 638 printf("\n"); 639 640 /* Now, close the converter */ 641 ucnv_close(conv); 642 643 /* Now, convert to windows-1252 */ 644 conv = ucnv_open("windows-1252", &status); 645 U_ASSERT(status); 646 647 /* Converter starts out with the SUBSTITUTE callback set. */ 648 649 /* initialize our callback */ 650 context = flagCB_fromU_openContext(); 651 652 /* Set our special callback */ 653 ucnv_setFromUCallBack(conv, 654 flagCB_fromU, 655 context, 656 &(context->subCallback), 657 &(context->subContext), 658 &status); 659 660 U_ASSERT(status); 661 662 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); 663 U_ASSERT(status); 664 665 flagVal = context->flag; /* it's about to go away when we close the cnv */ 666 667 ucnv_close(conv); 668 669 /* print out the original source */ 670 printBytes("bytes", bytes, len2); 671 672 return flagVal; /* true if callback was called */ 673 } 674 675 UErrorCode convsample_20() 676 { 677 const char *sample1 = "abc\xdf\xbf"; 678 const char *sample2 = "abc_def"; 679 680 681 if(convsample_20_didSubstitute(sample1)) 682 { 683 printf("DID substitute.\n******\n"); 684 } 685 else 686 { 687 printf("Did NOT substitute.\n*****\n"); 688 } 689 690 if(convsample_20_didSubstitute(sample2)) 691 { 692 printf("DID substitute.\n******\n"); 693 } 694 else 695 { 696 printf("Did NOT substitute.\n*****\n"); 697 } 698 699 return U_ZERO_ERROR; 700 } 701 702 // 21 - C, callback, with clone and debug 703 704 705 706 UBool convsample_21_didSubstitute(const char *source) 707 { 708 UChar uchars[100]; 709 char bytes[100]; 710 UConverter *conv = NULL, *cloneCnv = NULL; 711 UErrorCode status = U_ZERO_ERROR; 712 uint32_t len, len2; 713 int32_t cloneLen; 714 UBool flagVal = FALSE; 715 UConverterFromUCallback junkCB; 716 717 FromUFLAGContext *flagCtx = NULL, 718 *cloneFlagCtx = NULL; 719 720 debugCBContext *debugCtx1 = NULL, 721 *debugCtx2 = NULL, 722 *cloneDebugCtx = NULL; 723 724 printf("\n\n==============================================\n" 725 "Sample 21: C: Test for substitution w/ callbacks & clones \n"); 726 727 /* print out the original source */ 728 printBytes("src", source); 729 printf("\n"); 730 731 /* First, convert from UTF8 to unicode */ 732 conv = ucnv_open("utf-8", &status); 733 U_ASSERT(status); 734 735 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 736 U_ASSERT(status); 737 738 printUChars("uch", uchars, len); 739 printf("\n"); 740 741 /* Now, close the converter */ 742 ucnv_close(conv); 743 744 /* Now, convert to windows-1252 */ 745 conv = ucnv_open("windows-1252", &status); 746 U_ASSERT(status); 747 748 /* Converter starts out with the SUBSTITUTE callback set. */ 749 750 /* initialize our callback */ 751 /* from the 'bottom' innermost, out 752 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ 753 754 #if DEBUG_TMI 755 printf("flagCB_fromU = %p\n", &flagCB_fromU); 756 printf("debugCB_fromU = %p\n", &debugCB_fromU); 757 #endif 758 759 debugCtx1 = debugCB_openContext(); 760 flagCtx = flagCB_fromU_openContext(); 761 debugCtx2 = debugCB_openContext(); 762 763 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ 764 debugCtx1->subContext = flagCtx; 765 766 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ 767 flagCtx->subContext = debugCtx2; 768 769 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; 770 debugCtx2->subContext = NULL; 771 772 /* Set our special callback */ 773 774 ucnv_setFromUCallBack(conv, 775 debugCB_fromU, 776 debugCtx1, 777 &(debugCtx2->subCallback), 778 &(debugCtx2->subContext), 779 &status); 780 781 U_ASSERT(status); 782 783 #if DEBUG_TMI 784 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", 785 conv, debugCtx1, debugCtx1->subCallback, 786 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); 787 #endif 788 789 cloneLen = 1; /* but passing in null so it will clone */ 790 cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status); 791 792 U_ASSERT(status); 793 794 #if DEBUG_TMI 795 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); 796 #endif 797 798 ucnv_close(conv); 799 800 #if DEBUG_TMI 801 printf("%p closed.\n", conv); 802 #endif 803 804 U_ASSERT(status); 805 /* Now, we have to extract the context */ 806 cloneDebugCtx = NULL; 807 cloneFlagCtx = NULL; 808 809 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); 810 if(cloneDebugCtx != NULL) { 811 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; 812 } 813 814 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", 815 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); 816 817 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); 818 U_ASSERT(status); 819 820 if(cloneFlagCtx != NULL) { 821 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ 822 } else { 823 printf("** Warning, couldn't get the subcallback \n"); 824 } 825 826 ucnv_close(cloneCnv); 827 828 /* print out the original source */ 829 printBytes("bytes", bytes, len2); 830 831 return flagVal; /* true if callback was called */ 832 } 833 834 UErrorCode convsample_21() 835 { 836 const char *sample1 = "abc\xdf\xbf"; 837 const char *sample2 = "abc_def"; 838 839 if(convsample_21_didSubstitute(sample1)) 840 { 841 printf("DID substitute.\n******\n"); 842 } 843 else 844 { 845 printf("Did NOT substitute.\n*****\n"); 846 } 847 848 if(convsample_21_didSubstitute(sample2)) 849 { 850 printf("DID substitute.\n******\n"); 851 } 852 else 853 { 854 printf("Did NOT substitute.\n*****\n"); 855 } 856 857 return U_ZERO_ERROR; 858 } 859 860 861 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] 862 863 #define BUFFERSIZE 17 /* make it interesting :) */ 864 865 UErrorCode convsample_40() 866 { 867 printf("\n\n==============================================\n" 868 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); 869 870 FILE *f; 871 FILE *out; 872 int32_t count; 873 char inBuf[BUFFERSIZE]; 874 const char *source; 875 const char *sourceLimit; 876 UChar *uBuf; 877 UChar *target; 878 UChar *targetLimit; 879 int32_t uBufSize = 0; 880 UConverter *conv = NULL; 881 UErrorCode status = U_ZERO_ERROR; 882 uint32_t inbytes=0, total=0; 883 884 f = fopen("data02.bin", "rb"); 885 if(!f) 886 { 887 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); 888 return U_FILE_ACCESS_ERROR; 889 } 890 891 out = fopen("data40.utf16", "wb"); 892 if(!out) 893 { 894 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); 895 fclose(f); 896 return U_FILE_ACCESS_ERROR; 897 } 898 899 // **************************** START SAMPLE ******************* 900 conv = ucnv_openCCSID(37, UCNV_IBM, &status); 901 assert(U_SUCCESS(status)); 902 903 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 904 printf("input bytes %d / min chars %d = %d UChars\n", 905 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 906 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 907 assert(uBuf!=NULL); 908 909 // grab another buffer's worth 910 while((!feof(f)) && 911 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 912 { 913 inbytes += count; 914 915 // Convert bytes to unicode 916 source = inBuf; 917 sourceLimit = inBuf + count; 918 919 do 920 { 921 target = uBuf; 922 targetLimit = uBuf + uBufSize; 923 924 ucnv_toUnicode( conv, &target, targetLimit, 925 &source, sourceLimit, NULL, 926 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 927 /* is true (when no more data will come) */ 928 &status); 929 930 if(status == U_BUFFER_OVERFLOW_ERROR) 931 { 932 // simply ran out of space - we'll reset the target ptr the next 933 // time through the loop. 934 status = U_ZERO_ERROR; 935 } 936 else 937 { 938 // Check other errors here. 939 assert(U_SUCCESS(status)); 940 // Break out of the loop (by force) 941 } 942 943 // Process the Unicode 944 // Todo: handle UTF-16/surrogates 945 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == 946 (size_t)(target-uBuf)); 947 total += (target-uBuf); 948 } while (source < sourceLimit); // while simply out of space 949 } 950 951 printf("%d bytes in, %d UChars out.\n", inbytes, total); 952 953 // ***************************** END SAMPLE ******************** 954 ucnv_close(conv); 955 956 fclose(f); 957 fclose(out); 958 printf("\n"); 959 960 return U_ZERO_ERROR; 961 } 962 #undef BUFFERSIZE 963 964 965 966 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] 967 968 #define BUFFERSIZE 24 /* make it interesting :) */ 969 970 UErrorCode convsample_46() 971 { 972 printf("\n\n==============================================\n" 973 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); 974 975 FILE *f; 976 FILE *out; 977 int32_t count; 978 UChar inBuf[BUFFERSIZE]; 979 const UChar *source; 980 const UChar *sourceLimit; 981 char *buf; 982 char *target; 983 char *targetLimit; 984 985 int32_t bufSize = 0; 986 UConverter *conv = NULL; 987 UErrorCode status = U_ZERO_ERROR; 988 uint32_t inchars=0, total=0; 989 990 f = fopen("data40.utf16", "rb"); 991 if(!f) 992 { 993 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); 994 return U_FILE_ACCESS_ERROR; 995 } 996 997 out = fopen("data46.out", "wb"); 998 if(!out) 999 { 1000 fprintf(stderr, "Couldn't create file 'data46.out'.\n"); 1001 fclose(f); 1002 return U_FILE_ACCESS_ERROR; 1003 } 1004 1005 // **************************** START SAMPLE ******************* 1006 conv = ucnv_open( "iso-8859-2", &status); 1007 assert(U_SUCCESS(status)); 1008 1009 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); 1010 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", 1011 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); 1012 buf = (char*)malloc(bufSize * sizeof(char)); 1013 assert(buf!=NULL); 1014 1015 // grab another buffer's worth 1016 while((!feof(f)) && 1017 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) 1018 { 1019 inchars += count; 1020 1021 // Convert bytes to unicode 1022 source = inBuf; 1023 sourceLimit = inBuf + count; 1024 1025 do 1026 { 1027 target = buf; 1028 targetLimit = buf + bufSize; 1029 1030 ucnv_fromUnicode( conv, &target, targetLimit, 1031 &source, sourceLimit, NULL, 1032 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 1033 /* is true (when no more data will come) */ 1034 &status); 1035 1036 if(status == U_BUFFER_OVERFLOW_ERROR) 1037 { 1038 // simply ran out of space - we'll reset the target ptr the next 1039 // time through the loop. 1040 status = U_ZERO_ERROR; 1041 } 1042 else 1043 { 1044 // Check other errors here. 1045 assert(U_SUCCESS(status)); 1046 // Break out of the loop (by force) 1047 } 1048 1049 // Process the Unicode 1050 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == 1051 (size_t)(target-buf)); 1052 total += (target-buf); 1053 } while (source < sourceLimit); // while simply out of space 1054 } 1055 1056 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); 1057 1058 // ***************************** END SAMPLE ******************** 1059 ucnv_close(conv); 1060 1061 fclose(f); 1062 fclose(out); 1063 printf("\n"); 1064 1065 return U_ZERO_ERROR; 1066 } 1067 #undef BUFFERSIZE 1068 1069 #define BUFFERSIZE 219 1070 1071 void convsample_50() { 1072 printf("\n\n==============================================\n" 1073 "Sample 50: C: ucnv_detectUnicodeSignature\n"); 1074 1075 //! [ucnv_detectUnicodeSignature] 1076 UErrorCode err = U_ZERO_ERROR; 1077 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ 1078 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; 1079 int32_t signatureLength = 0; 1080 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); 1081 UConverter *conv = NULL; 1082 UChar output[100]; 1083 UChar *target = output, *out; 1084 const char *source = input; 1085 if(encoding!=NULL && U_SUCCESS(err)){ 1086 // should signature be discarded ? 1087 conv = ucnv_open(encoding, &err); 1088 // do the conversion 1089 ucnv_toUnicode(conv, 1090 &target, output + sizeof(output)/U_SIZEOF_UCHAR, 1091 &source, input + sizeof(input), 1092 NULL, TRUE, &err); 1093 out = output; 1094 if (discardSignature){ 1095 ++out; // ignore initial U+FEFF 1096 } 1097 while(out != target) { 1098 printf("%04x ", *out++); 1099 } 1100 puts(""); 1101 } 1102 //! [ucnv_detectUnicodeSignature] 1103 puts(""); 1104 } 1105 1106 1107 1108 /* main */ 1109 1110 int main() 1111 { 1112 1113 printf("Default Converter=%s\n", ucnv_getDefaultName() ); 1114 1115 convsample_02(); // C , u->koi8r, conv 1116 convsample_03(); // C, iterate 1117 1118 convsample_05(); // C, utf8->u, getNextUChar 1119 convsample_06(); // C freq counter thingy 1120 1121 convsample_12(); // C, sjis->u, conv 1122 convsample_13(); // C, big5->u, getNextU 1123 1124 convsample_20(); // C, callback 1125 convsample_21(); // C, callback debug 1126 1127 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] 1128 1129 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] 1130 1131 convsample_50(); // C, detect unicode signature 1132 1133 printf("End of converter samples.\n"); 1134 1135 fflush(stdout); 1136 fflush(stderr); 1137 1138 return 0; 1139 } 1140