1 /************************************************************************* 2 * 3 * 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html#License 5 * 6 ************************************************************************** 7 ************************************************************************** 8 * 9 * Copyright (C) 2000-2016, International Business Machines 10 * Corporation and others. All Rights Reserved. 11 * 12 *************************************************************************** 13 * file name: convsamp.c 14 * encoding: ASCII (7-bit) 15 * 16 * created on: 2000may30 17 * created by: Steven R. Loomis 18 * 19 * Sample code for the ICU conversion routines. 20 * 21 * Note: Nothing special is needed to build this sample. Link with 22 * the icu UC and icu I18N libraries. 23 * 24 * I use 'assert' for error checking, you probably will want 25 * something more flexible. '***START SAMPLE***' and 26 * '***END SAMPLE***' mark pieces suitable for stand alone 27 * code snippets. 28 * 29 * 30 * Each test can define its own BUFFERSIZE 31 * 32 */ 33 34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ 35 36 #include <stdio.h> 37 #include <ctype.h> /* for isspace, etc. 
*/ 38 #include <assert.h> 39 #include <string.h> 40 #include <stdlib.h> /* malloc */ 41 42 #include "unicode/utypes.h" /* Basic ICU data types */ 43 #include "unicode/ucnv.h" /* C Converter API */ 44 #include "unicode/ustring.h" /* some more string fcns*/ 45 #include "unicode/uchar.h" /* char names */ 46 #include "unicode/uloc.h" 47 #include "unicode/unistr.h" 48 49 #include "flagcb.h" 50 51 /* Some utility functions */ 52 #ifndef UPRV_LENGTHOF 53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 54 #endif 55 56 static const UChar kNone[] = { 0x0000 }; 57 58 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} 59 60 /* Print a UChar if possible, in seven characters. */ 61 void prettyPrintUChar(UChar c) 62 { 63 if( (c <= 0x007F) && 64 (isgraph(c)) ) { 65 printf(" '%c' ", (char)(0x00FF&c)); 66 } else if ( c > 0x007F ) { 67 char buf[1000]; 68 UErrorCode status = U_ZERO_ERROR; 69 int32_t o; 70 71 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); 72 if(U_SUCCESS(status) && (o>0) ) { 73 buf[6] = 0; 74 printf("%7s", buf); 75 } else { 76 printf(" ??????"); 77 } 78 } else { 79 switch((char)(c & 0x007F)) { 80 case ' ': 81 printf(" ' ' "); 82 break; 83 case '\t': 84 printf(" \\t "); 85 break; 86 case '\n': 87 printf(" \\n "); 88 break; 89 default: 90 printf(" _ "); 91 break; 92 } 93 } 94 } 95 96 97 void printUChars(const char *name = "?", 98 const UChar *uch = kNone, 99 int32_t len = -1 ) 100 { 101 int32_t i; 102 103 if( (len == -1) && (uch) ) { 104 len = u_strlen(uch); 105 } 106 107 printf("%5s: ", name); 108 for( i = 0; i <len; i++) { 109 printf("%-6d ", i); 110 } 111 printf("\n"); 112 113 printf("%5s: ", "uni"); 114 for( i = 0; i <len; i++) { 115 printf("\\u%04X ", (int)uch[i]); 116 } 117 printf("\n"); 118 119 printf("%5s:", "ch"); 120 for( i = 0; i <len; i++) { 121 prettyPrintUChar(uch[i]); 122 } 123 printf("\n"); 124 } 125 126 void printBytes(const 
char *name = "?", 127 const char *uch = "", 128 int32_t len = -1 ) 129 { 130 int32_t i; 131 132 if( (len == -1) && (uch) ) { 133 len = strlen(uch); 134 } 135 136 printf("%5s: ", name); 137 for( i = 0; i <len; i++) { 138 printf("%-4d ", i); 139 } 140 printf("\n"); 141 142 printf("%5s: ", "uni"); 143 for( i = 0; i <len; i++) { 144 printf("\\x%02X ", 0x00FF & (int)uch[i]); 145 } 146 printf("\n"); 147 148 printf("%5s:", "ch"); 149 for( i = 0; i <len; i++) { 150 if(isgraph(0x00FF & (int)uch[i])) { 151 printf(" '%c' ", (char)uch[i]); 152 } else { 153 printf(" "); 154 } 155 } 156 printf("\n"); 157 } 158 159 void printUChar(UChar32 ch32) 160 { 161 if(ch32 > 0xFFFF) { 162 printf("ch: U+%06X\n", ch32); 163 } 164 else { 165 UChar ch = (UChar)ch32; 166 printUChars("C", &ch, 1); 167 } 168 } 169 170 /******************************************************************* 171 Very simple C sample to convert the word 'Moscow' in Russian in Unicode, 172 followed by an exclamation mark (!) into the KOI8-R Russian code page. 173 174 This example first creates a UChar String out of the Unicode chars. 175 176 targetSize must be set to the amount of space available in the target 177 buffer. After fromUChars is called, 178 len will contain the number of bytes in target[] which were 179 used in the resulting codepage. In this case, there is a 1:1 mapping 180 between the input and output characters. The exclamation mark has the 181 same value in both KOI8-R and Unicode. 182 183 src: 0 1 2 3 4 5 6 184 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 185 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' 186 187 targ: 0 1 2 3 4 5 6 188 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 189 ch: '!' 190 191 192 Converting FROM unicode 193 to koi8-r. 194 You must call ucnv_close to clean up the memory used by the 195 converter. 196 197 'len' returns the number of OUTPUT bytes resulting from the 198 conversion. 
199 */ 200 201 UErrorCode convsample_02() 202 { 203 printf("\n\n==============================================\n" 204 "Sample 02: C: simple Unicode -> koi8-r conversion\n"); 205 206 207 // **************************** START SAMPLE ******************* 208 // "cat<cat>OK" 209 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, 210 0x0430, 0x0021, 0x0000 }; 211 char target[100]; 212 UErrorCode status = U_ZERO_ERROR; 213 UConverter *conv; 214 int32_t len; 215 216 // set up the converter 217 //! [ucnv_open] 218 conv = ucnv_open("koi8-r", &status); 219 //! [ucnv_open] 220 assert(U_SUCCESS(status)); 221 222 // convert to koi8-r 223 len = ucnv_fromUChars(conv, target, 100, source, -1, &status); 224 assert(U_SUCCESS(status)); 225 226 // close the converter 227 ucnv_close(conv); 228 229 // ***************************** END SAMPLE ******************** 230 231 // Print it out 232 printUChars("src", source); 233 printf("\n"); 234 printBytes("targ", target, len); 235 236 return U_ZERO_ERROR; 237 } 238 239 240 UErrorCode convsample_03() 241 { 242 printf("\n\n==============================================\n" 243 "Sample 03: C: print out all converters\n"); 244 245 int32_t count; 246 int32_t i; 247 248 // **************************** START SAMPLE ******************* 249 count = ucnv_countAvailable(); 250 printf("Available converters: %d\n", count); 251 252 for(i=0;i<count;i++) 253 { 254 printf("%s ", ucnv_getAvailableName(i)); 255 } 256 257 // ***************************** END SAMPLE ******************** 258 259 printf("\n"); 260 261 return U_ZERO_ERROR; 262 } 263 264 265 266 #define BUFFERSIZE 17 /* make it interesting :) */ 267 268 /* 269 Converting from a codepage to Unicode in bulk.. 270 What is the best way to determine the buffer size? 271 272 The 'buffersize' is in bytes of input. 273 For a given converter, divinding this by the minimum char size 274 give you the maximum number of Unicode characters that could be 275 expected for a given number of input bytes. 
276 see: ucnv_getMinCharSize() 277 278 For example, a single byte codepage like 'Latin-3' has a 279 minimum char size of 1. (It takes at least 1 byte to represent 280 each Unicode char.) So the unicode buffer has the same number of 281 UChars as the input buffer has bytes. 282 283 In a strictly double byte codepage such as cp1362 (Windows 284 Korean), the minimum char size is 2. So, only half as many Unicode 285 chars as bytes are needed. 286 287 This work to calculate the buffer size is an optimization. Any 288 size of input and output buffer can be used, as long as the 289 program handles the following cases: If the input buffer is empty, 290 the source pointer will be equal to sourceLimit. If the output 291 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. 292 */ 293 294 UErrorCode convsample_05() 295 { 296 printf("\n\n==============================================\n" 297 "Sample 05: C: count the number of letters in a UTF-8 document\n"); 298 299 FILE *f; 300 int32_t count; 301 char inBuf[BUFFERSIZE]; 302 const char *source; 303 const char *sourceLimit; 304 UChar *uBuf; 305 UChar *target; 306 UChar *targetLimit; 307 UChar *p; 308 int32_t uBufSize = 0; 309 UConverter *conv; 310 UErrorCode status = U_ZERO_ERROR; 311 uint32_t letters=0, total=0; 312 313 f = fopen("data01.txt", "r"); 314 if(!f) 315 { 316 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); 317 return U_FILE_ACCESS_ERROR; 318 } 319 320 // **************************** START SAMPLE ******************* 321 conv = ucnv_open("utf-8", &status); 322 assert(U_SUCCESS(status)); 323 324 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 325 printf("input bytes %d / min chars %d = %d UChars\n", 326 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 327 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 328 assert(uBuf!=NULL); 329 330 // grab another buffer's worth 331 while((!feof(f)) && 332 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 333 { 334 // Convert bytes to unicode 335 
source = inBuf; 336 sourceLimit = inBuf + count; 337 338 do 339 { 340 target = uBuf; 341 targetLimit = uBuf + uBufSize; 342 343 ucnv_toUnicode(conv, &target, targetLimit, 344 &source, sourceLimit, NULL, 345 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 346 /* is true (when no more data will come) */ 347 &status); 348 349 if(status == U_BUFFER_OVERFLOW_ERROR) 350 { 351 // simply ran out of space - we'll reset the target ptr the next 352 // time through the loop. 353 status = U_ZERO_ERROR; 354 } 355 else 356 { 357 // Check other errors here. 358 assert(U_SUCCESS(status)); 359 // Break out of the loop (by force) 360 } 361 362 // Process the Unicode 363 // Todo: handle UTF-16/surrogates 364 365 for(p = uBuf; p<target; p++) 366 { 367 if(u_isalpha(*p)) 368 letters++; 369 total++; 370 } 371 } while (source < sourceLimit); // while simply out of space 372 } 373 374 printf("%d letters out of %d total UChars.\n", letters, total); 375 376 // ***************************** END SAMPLE ******************** 377 ucnv_close(conv); 378 379 printf("\n"); 380 381 fclose(f); 382 383 return U_ZERO_ERROR; 384 } 385 #undef BUFFERSIZE 386 387 #define BUFFERSIZE 1024 388 typedef struct 389 { 390 UChar32 codepoint; 391 uint32_t frequency; 392 } CharFreqInfo; 393 394 UErrorCode convsample_06() 395 { 396 printf("\n\n==============================================\n" 397 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); 398 399 FILE *f; 400 int32_t count; 401 char inBuf[BUFFERSIZE]; 402 const char *source; 403 const char *sourceLimit; 404 int32_t uBufSize = 0; 405 UConverter *conv; 406 UErrorCode status = U_ZERO_ERROR; 407 uint32_t letters=0, total=0; 408 409 CharFreqInfo *info; 410 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. 
*/ 411 UChar32 p; 412 413 uint32_t ie = 0; 414 uint32_t gh = 0; 415 UChar32 l = 0; 416 417 f = fopen("data06.txt", "r"); 418 if(!f) 419 { 420 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); 421 return U_FILE_ACCESS_ERROR; 422 } 423 424 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); 425 if(!info) 426 { 427 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); 428 } 429 430 /* reset frequencies */ 431 for(p=0;p<charCount;p++) 432 { 433 info[p].codepoint = p; 434 info[p].frequency = 0; 435 } 436 437 // **************************** START SAMPLE ******************* 438 conv = ucnv_open("utf-8", &status); 439 assert(U_SUCCESS(status)); 440 441 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 442 printf("input bytes %d / min chars %d = %d UChars\n", 443 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 444 445 // grab another buffer's worth 446 while((!feof(f)) && 447 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 448 { 449 // Convert bytes to unicode 450 source = inBuf; 451 sourceLimit = inBuf + count; 452 453 while(source < sourceLimit) 454 { 455 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 456 if(U_FAILURE(status)) 457 { 458 fprintf(stderr, "%s @ %d\n", u_errorName(status), total); 459 status = U_ZERO_ERROR; 460 continue; 461 } 462 U_ASSERT(status); 463 total++; 464 465 if(u_isalpha(p)) 466 letters++; 467 468 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) 469 ie++; 470 471 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) 472 gh++; 473 474 if(p>charCount) 475 { 476 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. 
redesign!\n", p); 477 free(info); 478 fclose(f); 479 ucnv_close(conv); 480 return U_UNSUPPORTED_ERROR; 481 } 482 info[p].frequency++; 483 l = p; 484 } 485 } 486 487 fclose(f); 488 ucnv_close(conv); 489 490 printf("%d letters out of %d total UChars.\n", letters, total); 491 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); 492 493 // now, we could sort it.. 494 495 // qsort(info, charCount, sizeof(info[0]), charfreq_compare); 496 497 for(p=0;p<charCount;p++) 498 { 499 if(info[p].frequency) 500 { 501 printf("% 5d U+%06X ", info[p].frequency, p); 502 if(p <= 0xFFFF) 503 { 504 prettyPrintUChar((UChar)p); 505 } 506 printf("\n"); 507 } 508 } 509 free(info); 510 // ***************************** END SAMPLE ******************** 511 512 printf("\n"); 513 514 return U_ZERO_ERROR; 515 } 516 #undef BUFFERSIZE 517 518 519 /****************************************************** 520 You must call ucnv_close to clean up the memory used by the 521 converter. 522 523 'len' returns the number of OUTPUT bytes resulting from the 524 conversion. 
525 */ 526 527 UErrorCode convsample_12() 528 { 529 printf("\n\n==============================================\n" 530 "Sample 12: C: simple sjis -> unicode conversion\n"); 531 532 533 // **************************** START SAMPLE ******************* 534 535 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; 536 UChar target[100]; 537 UErrorCode status = U_ZERO_ERROR; 538 UConverter *conv; 539 int32_t len; 540 541 // set up the converter 542 conv = ucnv_open("shift_jis", &status); 543 assert(U_SUCCESS(status)); 544 545 // convert to Unicode 546 // Note: we can use strlen, we know it's an 8 bit null terminated codepage 547 target[6] = 0xFDCA; 548 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); 549 U_ASSERT(status); 550 // close the converter 551 ucnv_close(conv); 552 553 // ***************************** END SAMPLE ******************** 554 555 // Print it out 556 printBytes("src", source, strlen(source) ); 557 printf("\n"); 558 printUChars("targ", target, len); 559 560 return U_ZERO_ERROR; 561 } 562 563 /****************************************************************** 564 C: Convert from codepage to Unicode one at a time. 
565 */ 566 567 UErrorCode convsample_13() 568 { 569 printf("\n\n==============================================\n" 570 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); 571 572 573 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; 574 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; 575 const char *source, *sourceLimit; 576 UChar32 target; 577 UErrorCode status = U_ZERO_ERROR; 578 UConverter *conv = NULL; 579 int32_t srcCount=0; 580 int32_t dstCount=0; 581 582 srcCount = sizeof(sourceChars); 583 584 conv = ucnv_open("Big5", &status); 585 U_ASSERT(status); 586 587 source = sourceChars; 588 sourceLimit = sourceChars + sizeof(sourceChars); 589 590 // **************************** START SAMPLE ******************* 591 592 593 printBytes("src",source,sourceLimit-source); 594 595 while(source < sourceLimit) 596 { 597 puts(""); 598 target = ucnv_getNextUChar (conv, 599 &source, 600 sourceLimit, 601 &status); 602 603 // printBytes("src",source,sourceLimit-source); 604 U_ASSERT(status); 605 printUChar(target); 606 dstCount++; 607 } 608 609 610 // ************************** END SAMPLE ************************* 611 612 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); 613 ucnv_close(conv); 614 615 return U_ZERO_ERROR; 616 } 617 618 619 620 621 UBool convsample_20_didSubstitute(const char *source) 622 { 623 UChar uchars[100]; 624 char bytes[100]; 625 UConverter *conv = NULL; 626 UErrorCode status = U_ZERO_ERROR; 627 uint32_t len, len2; 628 UBool flagVal; 629 630 FromUFLAGContext * context = NULL; 631 632 printf("\n\n==============================================\n" 633 "Sample 20: C: Test for substitution using callbacks\n"); 634 635 /* print out the original source */ 636 printBytes("src", source); 637 printf("\n"); 638 639 /* First, convert from UTF8 to unicode */ 640 conv = ucnv_open("utf-8", &status); 641 U_ASSERT(status); 642 643 len = 
ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 644 U_ASSERT(status); 645 646 printUChars("uch", uchars, len); 647 printf("\n"); 648 649 /* Now, close the converter */ 650 ucnv_close(conv); 651 652 /* Now, convert to windows-1252 */ 653 conv = ucnv_open("windows-1252", &status); 654 U_ASSERT(status); 655 656 /* Converter starts out with the SUBSTITUTE callback set. */ 657 658 /* initialize our callback */ 659 context = flagCB_fromU_openContext(); 660 661 /* Set our special callback */ 662 ucnv_setFromUCallBack(conv, 663 flagCB_fromU, 664 context, 665 &(context->subCallback), 666 &(context->subContext), 667 &status); 668 669 U_ASSERT(status); 670 671 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); 672 U_ASSERT(status); 673 674 flagVal = context->flag; /* it's about to go away when we close the cnv */ 675 676 ucnv_close(conv); 677 678 /* print out the original source */ 679 printBytes("bytes", bytes, len2); 680 681 return flagVal; /* true if callback was called */ 682 } 683 684 UErrorCode convsample_20() 685 { 686 const char *sample1 = "abc\xdf\xbf"; 687 const char *sample2 = "abc_def"; 688 689 690 if(convsample_20_didSubstitute(sample1)) 691 { 692 printf("DID substitute.\n******\n"); 693 } 694 else 695 { 696 printf("Did NOT substitute.\n*****\n"); 697 } 698 699 if(convsample_20_didSubstitute(sample2)) 700 { 701 printf("DID substitute.\n******\n"); 702 } 703 else 704 { 705 printf("Did NOT substitute.\n*****\n"); 706 } 707 708 return U_ZERO_ERROR; 709 } 710 711 // 21 - C, callback, with clone and debug 712 713 714 715 UBool convsample_21_didSubstitute(const char *source) 716 { 717 UChar uchars[100]; 718 char bytes[100]; 719 UConverter *conv = NULL, *cloneCnv = NULL; 720 UErrorCode status = U_ZERO_ERROR; 721 uint32_t len, len2; 722 int32_t cloneLen; 723 UBool flagVal = FALSE; 724 UConverterFromUCallback junkCB; 725 726 FromUFLAGContext *flagCtx = NULL, 727 *cloneFlagCtx = NULL; 728 729 debugCBContext *debugCtx1 = NULL, 730 *debugCtx2 
= NULL, 731 *cloneDebugCtx = NULL; 732 733 printf("\n\n==============================================\n" 734 "Sample 21: C: Test for substitution w/ callbacks & clones \n"); 735 736 /* print out the original source */ 737 printBytes("src", source); 738 printf("\n"); 739 740 /* First, convert from UTF8 to unicode */ 741 conv = ucnv_open("utf-8", &status); 742 U_ASSERT(status); 743 744 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 745 U_ASSERT(status); 746 747 printUChars("uch", uchars, len); 748 printf("\n"); 749 750 /* Now, close the converter */ 751 ucnv_close(conv); 752 753 /* Now, convert to windows-1252 */ 754 conv = ucnv_open("windows-1252", &status); 755 U_ASSERT(status); 756 757 /* Converter starts out with the SUBSTITUTE callback set. */ 758 759 /* initialize our callback */ 760 /* from the 'bottom' innermost, out 761 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ 762 763 #if DEBUG_TMI 764 printf("flagCB_fromU = %p\n", &flagCB_fromU); 765 printf("debugCB_fromU = %p\n", &debugCB_fromU); 766 #endif 767 768 debugCtx1 = debugCB_openContext(); 769 flagCtx = flagCB_fromU_openContext(); 770 debugCtx2 = debugCB_openContext(); 771 772 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ 773 debugCtx1->subContext = flagCtx; 774 775 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ 776 flagCtx->subContext = debugCtx2; 777 778 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; 779 debugCtx2->subContext = NULL; 780 781 /* Set our special callback */ 782 783 ucnv_setFromUCallBack(conv, 784 debugCB_fromU, 785 debugCtx1, 786 &(debugCtx2->subCallback), 787 &(debugCtx2->subContext), 788 &status); 789 790 U_ASSERT(status); 791 792 #if DEBUG_TMI 793 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", 794 conv, debugCtx1, debugCtx1->subCallback, 795 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); 796 #endif 797 798 cloneCnv = 
ucnv_safeClone(conv, NULL, NULL, &status); 799 800 U_ASSERT(status); 801 802 #if DEBUG_TMI 803 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); 804 #endif 805 806 ucnv_close(conv); 807 808 #if DEBUG_TMI 809 printf("%p closed.\n", conv); 810 #endif 811 812 U_ASSERT(status); 813 /* Now, we have to extract the context */ 814 cloneDebugCtx = NULL; 815 cloneFlagCtx = NULL; 816 817 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); 818 if(cloneDebugCtx != NULL) { 819 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; 820 } 821 822 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", 823 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); 824 825 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); 826 U_ASSERT(status); 827 828 if(cloneFlagCtx != NULL) { 829 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ 830 } else { 831 printf("** Warning, couldn't get the subcallback \n"); 832 } 833 834 ucnv_close(cloneCnv); 835 836 /* print out the original source */ 837 printBytes("bytes", bytes, len2); 838 839 return flagVal; /* true if callback was called */ 840 } 841 842 UErrorCode convsample_21() 843 { 844 const char *sample1 = "abc\xdf\xbf"; 845 const char *sample2 = "abc_def"; 846 847 if(convsample_21_didSubstitute(sample1)) 848 { 849 printf("DID substitute.\n******\n"); 850 } 851 else 852 { 853 printf("Did NOT substitute.\n*****\n"); 854 } 855 856 if(convsample_21_didSubstitute(sample2)) 857 { 858 printf("DID substitute.\n******\n"); 859 } 860 else 861 { 862 printf("Did NOT substitute.\n*****\n"); 863 } 864 865 return U_ZERO_ERROR; 866 } 867 868 869 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] 870 871 #define BUFFERSIZE 17 /* make it interesting :) */ 872 873 UErrorCode convsample_40() 874 { 875 printf("\n\n==============================================\n" 876 "Sample 40: C: convert 
data02.bin from cp37 to UTF16 [data40.utf16]\n"); 877 878 FILE *f; 879 FILE *out; 880 int32_t count; 881 char inBuf[BUFFERSIZE]; 882 const char *source; 883 const char *sourceLimit; 884 UChar *uBuf; 885 UChar *target; 886 UChar *targetLimit; 887 int32_t uBufSize = 0; 888 UConverter *conv = NULL; 889 UErrorCode status = U_ZERO_ERROR; 890 uint32_t inbytes=0, total=0; 891 892 f = fopen("data02.bin", "rb"); 893 if(!f) 894 { 895 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); 896 return U_FILE_ACCESS_ERROR; 897 } 898 899 out = fopen("data40.utf16", "wb"); 900 if(!out) 901 { 902 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); 903 fclose(f); 904 return U_FILE_ACCESS_ERROR; 905 } 906 907 // **************************** START SAMPLE ******************* 908 conv = ucnv_openCCSID(37, UCNV_IBM, &status); 909 assert(U_SUCCESS(status)); 910 911 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 912 printf("input bytes %d / min chars %d = %d UChars\n", 913 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 914 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 915 assert(uBuf!=NULL); 916 917 // grab another buffer's worth 918 while((!feof(f)) && 919 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 920 { 921 inbytes += count; 922 923 // Convert bytes to unicode 924 source = inBuf; 925 sourceLimit = inBuf + count; 926 927 do 928 { 929 target = uBuf; 930 targetLimit = uBuf + uBufSize; 931 932 ucnv_toUnicode( conv, &target, targetLimit, 933 &source, sourceLimit, NULL, 934 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 935 /* is true (when no more data will come) */ 936 &status); 937 938 if(status == U_BUFFER_OVERFLOW_ERROR) 939 { 940 // simply ran out of space - we'll reset the target ptr the next 941 // time through the loop. 942 status = U_ZERO_ERROR; 943 } 944 else 945 { 946 // Check other errors here. 
947 assert(U_SUCCESS(status)); 948 // Break out of the loop (by force) 949 } 950 951 // Process the Unicode 952 // Todo: handle UTF-16/surrogates 953 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == 954 (size_t)(target-uBuf)); 955 total += (target-uBuf); 956 } while (source < sourceLimit); // while simply out of space 957 } 958 959 printf("%d bytes in, %d UChars out.\n", inbytes, total); 960 961 // ***************************** END SAMPLE ******************** 962 ucnv_close(conv); 963 964 fclose(f); 965 fclose(out); 966 printf("\n"); 967 968 return U_ZERO_ERROR; 969 } 970 #undef BUFFERSIZE 971 972 973 974 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] 975 976 #define BUFFERSIZE 24 /* make it interesting :) */ 977 978 UErrorCode convsample_46() 979 { 980 printf("\n\n==============================================\n" 981 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); 982 983 FILE *f; 984 FILE *out; 985 int32_t count; 986 UChar inBuf[BUFFERSIZE]; 987 const UChar *source; 988 const UChar *sourceLimit; 989 char *buf; 990 char *target; 991 char *targetLimit; 992 993 int32_t bufSize = 0; 994 UConverter *conv = NULL; 995 UErrorCode status = U_ZERO_ERROR; 996 uint32_t inchars=0, total=0; 997 998 f = fopen("data40.utf16", "rb"); 999 if(!f) 1000 { 1001 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); 1002 return U_FILE_ACCESS_ERROR; 1003 } 1004 1005 out = fopen("data46.out", "wb"); 1006 if(!out) 1007 { 1008 fprintf(stderr, "Couldn't create file 'data46.out'.\n"); 1009 fclose(f); 1010 return U_FILE_ACCESS_ERROR; 1011 } 1012 1013 // **************************** START SAMPLE ******************* 1014 conv = ucnv_open( "iso-8859-2", &status); 1015 assert(U_SUCCESS(status)); 1016 1017 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); 1018 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", 1019 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); 1020 buf = (char*)malloc(bufSize 
* sizeof(char)); 1021 assert(buf!=NULL); 1022 1023 // grab another buffer's worth 1024 while((!feof(f)) && 1025 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) 1026 { 1027 inchars += count; 1028 1029 // Convert bytes to unicode 1030 source = inBuf; 1031 sourceLimit = inBuf + count; 1032 1033 do 1034 { 1035 target = buf; 1036 targetLimit = buf + bufSize; 1037 1038 ucnv_fromUnicode( conv, &target, targetLimit, 1039 &source, sourceLimit, NULL, 1040 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 1041 /* is true (when no more data will come) */ 1042 &status); 1043 1044 if(status == U_BUFFER_OVERFLOW_ERROR) 1045 { 1046 // simply ran out of space - we'll reset the target ptr the next 1047 // time through the loop. 1048 status = U_ZERO_ERROR; 1049 } 1050 else 1051 { 1052 // Check other errors here. 1053 assert(U_SUCCESS(status)); 1054 // Break out of the loop (by force) 1055 } 1056 1057 // Process the Unicode 1058 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == 1059 (size_t)(target-buf)); 1060 total += (target-buf); 1061 } while (source < sourceLimit); // while simply out of space 1062 } 1063 1064 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); 1065 1066 // ***************************** END SAMPLE ******************** 1067 ucnv_close(conv); 1068 1069 fclose(f); 1070 fclose(out); 1071 printf("\n"); 1072 1073 return U_ZERO_ERROR; 1074 } 1075 #undef BUFFERSIZE 1076 1077 #define BUFFERSIZE 219 1078 1079 void convsample_50() { 1080 printf("\n\n==============================================\n" 1081 "Sample 50: C: ucnv_detectUnicodeSignature\n"); 1082 1083 //! 
[ucnv_detectUnicodeSignature] 1084 UErrorCode err = U_ZERO_ERROR; 1085 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ 1086 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; 1087 int32_t signatureLength = 0; 1088 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); 1089 UConverter *conv = NULL; 1090 UChar output[100]; 1091 UChar *target = output, *out; 1092 const char *source = input; 1093 if(encoding!=NULL && U_SUCCESS(err)){ 1094 // should signature be discarded ? 1095 conv = ucnv_open(encoding, &err); 1096 // do the conversion 1097 ucnv_toUnicode(conv, 1098 &target, output + UPRV_LENGTHOF(output), 1099 &source, input + sizeof(input), 1100 NULL, TRUE, &err); 1101 out = output; 1102 if (discardSignature){ 1103 ++out; // ignore initial U+FEFF 1104 } 1105 while(out != target) { 1106 printf("%04x ", *out++); 1107 } 1108 puts(""); 1109 } 1110 //! [ucnv_detectUnicodeSignature] 1111 puts(""); 1112 } 1113 1114 1115 1116 /* main */ 1117 1118 int main() 1119 { 1120 1121 printf("Default Converter=%s\n", ucnv_getDefaultName() ); 1122 1123 convsample_02(); // C , u->koi8r, conv 1124 convsample_03(); // C, iterate 1125 1126 convsample_05(); // C, utf8->u, getNextUChar 1127 convsample_06(); // C freq counter thingy 1128 1129 convsample_12(); // C, sjis->u, conv 1130 convsample_13(); // C, big5->u, getNextU 1131 1132 convsample_20(); // C, callback 1133 convsample_21(); // C, callback debug 1134 1135 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] 1136 1137 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] 1138 1139 convsample_50(); // C, detect unicode signature 1140 1141 printf("End of converter samples.\n"); 1142 1143 fflush(stdout); 1144 fflush(stderr); 1145 1146 return 0; 1147 } 1148