1 /************************************************************************* 2 * 3 * Copyright (C) 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html#License 5 * 6 ************************************************************************** 7 ************************************************************************** 8 * 9 * Copyright (C) 2000-2016, International Business Machines 10 * Corporation and others. All Rights Reserved. 11 * 12 *************************************************************************** 13 * file name: convsamp.c 14 * encoding: ASCII (7-bit) 15 * 16 * created on: 2000may30 17 * created by: Steven R. Loomis 18 * 19 * Sample code for the ICU conversion routines. 20 * 21 * Note: Nothing special is needed to build this sample. Link with 22 * the icu UC and icu I18N libraries. 23 * 24 * I use 'assert' for error checking, you probably will want 25 * something more flexible. '***BEGIN SAMPLE***' and 26 * '***END SAMPLE***' mark pieces suitable for stand alone 27 * code snippets. 28 * 29 * 30 * Each test can define it's own BUFFERSIZE 31 * 32 */ 33 34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ 35 36 #include <stdio.h> 37 #include <ctype.h> /* for isspace, etc. */ 38 #include <assert.h> 39 #include <string.h> 40 #include <stdlib.h> /* malloc */ 41 42 #include "cmemory.h" 43 #include "unicode/utypes.h" /* Basic ICU data types */ 44 #include "unicode/ucnv.h" /* C Converter API */ 45 #include "unicode/ustring.h" /* some more string fcns*/ 46 #include "unicode/uchar.h" /* char names */ 47 #include "unicode/uloc.h" 48 #include "unicode/unistr.h" 49 50 #include "flagcb.h" 51 52 /* Some utility functions */ 53 54 static const UChar kNone[] = { 0x0000 }; 55 56 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} 57 58 /* Print a UChar if possible, in seven characters. */ 59 void prettyPrintUChar(UChar c) 60 { 61 if( (c <= 0x007F) && 62 (isgraph(c)) ) { 63 printf(" '%c' ", (char)(0x00FF&c)); 64 } else if ( c > 0x007F ) { 65 char buf[1000]; 66 UErrorCode status = U_ZERO_ERROR; 67 int32_t o; 68 69 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); 70 if(U_SUCCESS(status) && (o>0) ) { 71 buf[6] = 0; 72 printf("%7s", buf); 73 } else { 74 printf(" ??????"); 75 } 76 } else { 77 switch((char)(c & 0x007F)) { 78 case ' ': 79 printf(" ' ' "); 80 break; 81 case '\t': 82 printf(" \\t "); 83 break; 84 case '\n': 85 printf(" \\n "); 86 break; 87 default: 88 printf(" _ "); 89 break; 90 } 91 } 92 } 93 94 95 void printUChars(const char *name = "?", 96 const UChar *uch = kNone, 97 int32_t len = -1 ) 98 { 99 int32_t i; 100 101 if( (len == -1) && (uch) ) { 102 len = u_strlen(uch); 103 } 104 105 printf("%5s: ", name); 106 for( i = 0; i <len; i++) { 107 printf("%-6d ", i); 108 } 109 printf("\n"); 110 111 printf("%5s: ", "uni"); 112 for( i = 0; i <len; i++) { 113 printf("\\u%04X ", (int)uch[i]); 114 } 115 printf("\n"); 116 117 printf("%5s:", "ch"); 118 for( i = 0; i <len; i++) { 119 prettyPrintUChar(uch[i]); 120 } 121 printf("\n"); 122 } 123 124 void printBytes(const char *name = "?", 125 const char *uch = "", 126 int32_t len = -1 ) 127 { 128 int32_t i; 129 130 if( (len == -1) && (uch) ) { 131 len = strlen(uch); 132 } 133 134 printf("%5s: ", name); 135 for( i = 0; i <len; i++) { 136 printf("%-4d ", i); 137 } 138 printf("\n"); 139 140 printf("%5s: ", "uni"); 141 for( i = 0; i <len; i++) { 142 printf("\\x%02X ", 0x00FF & (int)uch[i]); 143 } 144 printf("\n"); 145 146 printf("%5s:", "ch"); 147 for( i = 0; i <len; i++) { 148 if(isgraph(0x00FF & (int)uch[i])) { 149 printf(" '%c' ", (char)uch[i]); 150 } else { 151 printf(" "); 152 } 153 } 154 printf("\n"); 155 } 156 157 void printUChar(UChar32 ch32) 158 { 159 if(ch32 > 0xFFFF) { 160 printf("ch: U+%06X\n", ch32); 161 } 162 else { 163 UChar ch = (UChar)ch32; 164 printUChars("C", &ch, 1); 165 } 166 } 167 168 /******************************************************************* 169 Very simple C sample to convert the word 'Moscow' in Russian in Unicode, 170 followed by an exclamation mark (!) into the KOI8-R Russian code page. 171 172 This example first creates a UChar String out of the Unicode chars. 173 174 targetSize must be set to the amount of space available in the target 175 buffer. After fromUChars is called, 176 len will contain the number of bytes in target[] which were 177 used in the resulting codepage. In this case, there is a 1:1 mapping 178 between the input and output characters. The exclamation mark has the 179 same value in both KOI8-R and Unicode. 180 181 src: 0 1 2 3 4 5 6 182 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 183 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' 184 185 targ: 0 1 2 3 4 5 6 186 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 187 ch: '!' 188 189 190 Converting FROM unicode 191 to koi8-r. 192 You must call ucnv_close to clean up the memory used by the 193 converter. 194 195 'len' returns the number of OUTPUT bytes resulting from the 196 conversion. 197 */ 198 199 UErrorCode convsample_02() 200 { 201 printf("\n\n==============================================\n" 202 "Sample 02: C: simple Unicode -> koi8-r conversion\n"); 203 204 205 // **************************** START SAMPLE ******************* 206 // "cat<cat>OK" 207 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, 208 0x0430, 0x0021, 0x0000 }; 209 char target[100]; 210 UErrorCode status = U_ZERO_ERROR; 211 UConverter *conv; 212 int32_t len; 213 214 // set up the converter 215 //! [ucnv_open] 216 conv = ucnv_open("koi8-r", &status); 217 //! [ucnv_open] 218 assert(U_SUCCESS(status)); 219 220 // convert to koi8-r 221 len = ucnv_fromUChars(conv, target, 100, source, -1, &status); 222 assert(U_SUCCESS(status)); 223 224 // close the converter 225 ucnv_close(conv); 226 227 // ***************************** END SAMPLE ******************** 228 229 // Print it out 230 printUChars("src", source); 231 printf("\n"); 232 printBytes("targ", target, len); 233 234 return U_ZERO_ERROR; 235 } 236 237 238 UErrorCode convsample_03() 239 { 240 printf("\n\n==============================================\n" 241 "Sample 03: C: print out all converters\n"); 242 243 int32_t count; 244 int32_t i; 245 246 // **************************** START SAMPLE ******************* 247 count = ucnv_countAvailable(); 248 printf("Available converters: %d\n", count); 249 250 for(i=0;i<count;i++) 251 { 252 printf("%s ", ucnv_getAvailableName(i)); 253 } 254 255 // ***************************** END SAMPLE ******************** 256 257 printf("\n"); 258 259 return U_ZERO_ERROR; 260 } 261 262 263 264 #define BUFFERSIZE 17 /* make it interesting :) */ 265 266 /* 267 Converting from a codepage to Unicode in bulk.. 268 What is the best way to determine the buffer size? 269 270 The 'buffersize' is in bytes of input. 271 For a given converter, divinding this by the minimum char size 272 give you the maximum number of Unicode characters that could be 273 expected for a given number of input bytes. 274 see: ucnv_getMinCharSize() 275 276 For example, a single byte codepage like 'Latin-3' has a 277 minimum char size of 1. (It takes at least 1 byte to represent 278 each Unicode char.) So the unicode buffer has the same number of 279 UChars as the input buffer has bytes. 280 281 In a strictly double byte codepage such as cp1362 (Windows 282 Korean), the minimum char size is 2. So, only half as many Unicode 283 chars as bytes are needed. 284 285 This work to calculate the buffer size is an optimization. Any 286 size of input and output buffer can be used, as long as the 287 program handles the following cases: If the input buffer is empty, 288 the source pointer will be equal to sourceLimit. If the output 289 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. 290 */ 291 292 UErrorCode convsample_05() 293 { 294 printf("\n\n==============================================\n" 295 "Sample 05: C: count the number of letters in a UTF-8 document\n"); 296 297 FILE *f; 298 int32_t count; 299 char inBuf[BUFFERSIZE]; 300 const char *source; 301 const char *sourceLimit; 302 UChar *uBuf; 303 UChar *target; 304 UChar *targetLimit; 305 UChar *p; 306 int32_t uBufSize = 0; 307 UConverter *conv; 308 UErrorCode status = U_ZERO_ERROR; 309 uint32_t letters=0, total=0; 310 311 f = fopen("data01.txt", "r"); 312 if(!f) 313 { 314 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); 315 return U_FILE_ACCESS_ERROR; 316 } 317 318 // **************************** START SAMPLE ******************* 319 conv = ucnv_open("utf-8", &status); 320 assert(U_SUCCESS(status)); 321 322 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 323 printf("input bytes %d / min chars %d = %d UChars\n", 324 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 325 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 326 assert(uBuf!=NULL); 327 328 // grab another buffer's worth 329 while((!feof(f)) && 330 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 331 { 332 // Convert bytes to unicode 333 source = inBuf; 334 sourceLimit = inBuf + count; 335 336 do 337 { 338 target = uBuf; 339 targetLimit = uBuf + uBufSize; 340 341 ucnv_toUnicode(conv, &target, targetLimit, 342 &source, sourceLimit, NULL, 343 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 344 /* is true (when no more data will come) */ 345 &status); 346 347 if(status == U_BUFFER_OVERFLOW_ERROR) 348 { 349 // simply ran out of space - we'll reset the target ptr the next 350 // time through the loop. 351 status = U_ZERO_ERROR; 352 } 353 else 354 { 355 // Check other errors here. 356 assert(U_SUCCESS(status)); 357 // Break out of the loop (by force) 358 } 359 360 // Process the Unicode 361 // Todo: handle UTF-16/surrogates 362 363 for(p = uBuf; p<target; p++) 364 { 365 if(u_isalpha(*p)) 366 letters++; 367 total++; 368 } 369 } while (source < sourceLimit); // while simply out of space 370 } 371 372 printf("%d letters out of %d total UChars.\n", letters, total); 373 374 // ***************************** END SAMPLE ******************** 375 ucnv_close(conv); 376 377 printf("\n"); 378 379 fclose(f); 380 381 return U_ZERO_ERROR; 382 } 383 #undef BUFFERSIZE 384 385 #define BUFFERSIZE 1024 386 typedef struct 387 { 388 UChar32 codepoint; 389 uint32_t frequency; 390 } CharFreqInfo; 391 392 UErrorCode convsample_06() 393 { 394 printf("\n\n==============================================\n" 395 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); 396 397 FILE *f; 398 int32_t count; 399 char inBuf[BUFFERSIZE]; 400 const char *source; 401 const char *sourceLimit; 402 int32_t uBufSize = 0; 403 UConverter *conv; 404 UErrorCode status = U_ZERO_ERROR; 405 uint32_t letters=0, total=0; 406 407 CharFreqInfo *info; 408 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ 409 UChar32 p; 410 411 uint32_t ie = 0; 412 uint32_t gh = 0; 413 UChar32 l = 0; 414 415 f = fopen("data06.txt", "r"); 416 if(!f) 417 { 418 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); 419 return U_FILE_ACCESS_ERROR; 420 } 421 422 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); 423 if(!info) 424 { 425 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); 426 } 427 428 /* reset frequencies */ 429 for(p=0;p<charCount;p++) 430 { 431 info[p].codepoint = p; 432 info[p].frequency = 0; 433 } 434 435 // **************************** START SAMPLE ******************* 436 conv = ucnv_open("utf-8", &status); 437 assert(U_SUCCESS(status)); 438 439 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 440 printf("input bytes %d / min chars %d = %d UChars\n", 441 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 442 443 // grab another buffer's worth 444 while((!feof(f)) && 445 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 446 { 447 // Convert bytes to unicode 448 source = inBuf; 449 sourceLimit = inBuf + count; 450 451 while(source < sourceLimit) 452 { 453 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 454 if(U_FAILURE(status)) 455 { 456 fprintf(stderr, "%s @ %d\n", u_errorName(status), total); 457 status = U_ZERO_ERROR; 458 continue; 459 } 460 U_ASSERT(status); 461 total++; 462 463 if(u_isalpha(p)) 464 letters++; 465 466 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) 467 ie++; 468 469 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) 470 gh++; 471 472 if(p>charCount) 473 { 474 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); 475 free(info); 476 fclose(f); 477 ucnv_close(conv); 478 return U_UNSUPPORTED_ERROR; 479 } 480 info[p].frequency++; 481 l = p; 482 } 483 } 484 485 fclose(f); 486 ucnv_close(conv); 487 488 printf("%d letters out of %d total UChars.\n", letters, total); 489 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); 490 491 // now, we could sort it.. 492 493 // qsort(info, charCount, sizeof(info[0]), charfreq_compare); 494 495 for(p=0;p<charCount;p++) 496 { 497 if(info[p].frequency) 498 { 499 printf("% 5d U+%06X ", info[p].frequency, p); 500 if(p <= 0xFFFF) 501 { 502 prettyPrintUChar((UChar)p); 503 } 504 printf("\n"); 505 } 506 } 507 free(info); 508 // ***************************** END SAMPLE ******************** 509 510 printf("\n"); 511 512 return U_ZERO_ERROR; 513 } 514 #undef BUFFERSIZE 515 516 517 /****************************************************** 518 You must call ucnv_close to clean up the memory used by the 519 converter. 520 521 'len' returns the number of OUTPUT bytes resulting from the 522 conversion. 523 */ 524 525 UErrorCode convsample_12() 526 { 527 printf("\n\n==============================================\n" 528 "Sample 12: C: simple sjis -> unicode conversion\n"); 529 530 531 // **************************** START SAMPLE ******************* 532 533 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; 534 UChar target[100]; 535 UErrorCode status = U_ZERO_ERROR; 536 UConverter *conv; 537 int32_t len; 538 539 // set up the converter 540 conv = ucnv_open("shift_jis", &status); 541 assert(U_SUCCESS(status)); 542 543 // convert to Unicode 544 // Note: we can use strlen, we know it's an 8 bit null terminated codepage 545 target[6] = 0xFDCA; 546 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); 547 U_ASSERT(status); 548 // close the converter 549 ucnv_close(conv); 550 551 // ***************************** END SAMPLE ******************** 552 553 // Print it out 554 printBytes("src", source, strlen(source) ); 555 printf("\n"); 556 printUChars("targ", target, len); 557 558 return U_ZERO_ERROR; 559 } 560 561 /****************************************************************** 562 C: Convert from codepage to Unicode one at a time. 563 */ 564 565 UErrorCode convsample_13() 566 { 567 printf("\n\n==============================================\n" 568 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); 569 570 571 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; 572 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; 573 const char *source, *sourceLimit; 574 UChar32 target; 575 UErrorCode status = U_ZERO_ERROR; 576 UConverter *conv = NULL; 577 int32_t srcCount=0; 578 int32_t dstCount=0; 579 580 srcCount = sizeof(sourceChars); 581 582 conv = ucnv_open("Big5", &status); 583 U_ASSERT(status); 584 585 source = sourceChars; 586 sourceLimit = sourceChars + sizeof(sourceChars); 587 588 // **************************** START SAMPLE ******************* 589 590 591 printBytes("src",source,sourceLimit-source); 592 593 while(source < sourceLimit) 594 { 595 puts(""); 596 target = ucnv_getNextUChar (conv, 597 &source, 598 sourceLimit, 599 &status); 600 601 // printBytes("src",source,sourceLimit-source); 602 U_ASSERT(status); 603 printUChar(target); 604 dstCount++; 605 } 606 607 608 // ************************** END SAMPLE ************************* 609 610 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); 611 ucnv_close(conv); 612 613 return U_ZERO_ERROR; 614 } 615 616 617 618 619 UBool convsample_20_didSubstitute(const char *source) 620 { 621 UChar uchars[100]; 622 char bytes[100]; 623 UConverter *conv = NULL; 624 UErrorCode status = U_ZERO_ERROR; 625 uint32_t len, len2; 626 UBool flagVal; 627 628 FromUFLAGContext * context = NULL; 629 630 printf("\n\n==============================================\n" 631 "Sample 20: C: Test for substitution using callbacks\n"); 632 633 /* print out the original source */ 634 printBytes("src", source); 635 printf("\n"); 636 637 /* First, convert from UTF8 to unicode */ 638 conv = ucnv_open("utf-8", &status); 639 U_ASSERT(status); 640 641 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 642 U_ASSERT(status); 643 644 printUChars("uch", uchars, len); 645 printf("\n"); 646 647 /* Now, close the converter */ 648 ucnv_close(conv); 649 650 /* Now, convert to windows-1252 */ 651 conv = ucnv_open("windows-1252", &status); 652 U_ASSERT(status); 653 654 /* Converter starts out with the SUBSTITUTE callback set. */ 655 656 /* initialize our callback */ 657 context = flagCB_fromU_openContext(); 658 659 /* Set our special callback */ 660 ucnv_setFromUCallBack(conv, 661 flagCB_fromU, 662 context, 663 &(context->subCallback), 664 &(context->subContext), 665 &status); 666 667 U_ASSERT(status); 668 669 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); 670 U_ASSERT(status); 671 672 flagVal = context->flag; /* it's about to go away when we close the cnv */ 673 674 ucnv_close(conv); 675 676 /* print out the original source */ 677 printBytes("bytes", bytes, len2); 678 679 return flagVal; /* true if callback was called */ 680 } 681 682 UErrorCode convsample_20() 683 { 684 const char *sample1 = "abc\xdf\xbf"; 685 const char *sample2 = "abc_def"; 686 687 688 if(convsample_20_didSubstitute(sample1)) 689 { 690 printf("DID substitute.\n******\n"); 691 } 692 else 693 { 694 printf("Did NOT substitute.\n*****\n"); 695 } 696 697 if(convsample_20_didSubstitute(sample2)) 698 { 699 printf("DID substitute.\n******\n"); 700 } 701 else 702 { 703 printf("Did NOT substitute.\n*****\n"); 704 } 705 706 return U_ZERO_ERROR; 707 } 708 709 // 21 - C, callback, with clone and debug 710 711 712 713 UBool convsample_21_didSubstitute(const char *source) 714 { 715 UChar uchars[100]; 716 char bytes[100]; 717 UConverter *conv = NULL, *cloneCnv = NULL; 718 UErrorCode status = U_ZERO_ERROR; 719 uint32_t len, len2; 720 int32_t cloneLen; 721 UBool flagVal = FALSE; 722 UConverterFromUCallback junkCB; 723 724 FromUFLAGContext *flagCtx = NULL, 725 *cloneFlagCtx = NULL; 726 727 debugCBContext *debugCtx1 = NULL, 728 *debugCtx2 = NULL, 729 *cloneDebugCtx = NULL; 730 731 printf("\n\n==============================================\n" 732 "Sample 21: C: Test for substitution w/ callbacks & clones \n"); 733 734 /* print out the original source */ 735 printBytes("src", source); 736 printf("\n"); 737 738 /* First, convert from UTF8 to unicode */ 739 conv = ucnv_open("utf-8", &status); 740 U_ASSERT(status); 741 742 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 743 U_ASSERT(status); 744 745 printUChars("uch", uchars, len); 746 printf("\n"); 747 748 /* Now, close the converter */ 749 ucnv_close(conv); 750 751 /* Now, convert to windows-1252 */ 752 conv = ucnv_open("windows-1252", &status); 753 U_ASSERT(status); 754 755 /* Converter starts out with the SUBSTITUTE callback set. */ 756 757 /* initialize our callback */ 758 /* from the 'bottom' innermost, out 759 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ 760 761 #if DEBUG_TMI 762 printf("flagCB_fromU = %p\n", &flagCB_fromU); 763 printf("debugCB_fromU = %p\n", &debugCB_fromU); 764 #endif 765 766 debugCtx1 = debugCB_openContext(); 767 flagCtx = flagCB_fromU_openContext(); 768 debugCtx2 = debugCB_openContext(); 769 770 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ 771 debugCtx1->subContext = flagCtx; 772 773 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ 774 flagCtx->subContext = debugCtx2; 775 776 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; 777 debugCtx2->subContext = NULL; 778 779 /* Set our special callback */ 780 781 ucnv_setFromUCallBack(conv, 782 debugCB_fromU, 783 debugCtx1, 784 &(debugCtx2->subCallback), 785 &(debugCtx2->subContext), 786 &status); 787 788 U_ASSERT(status); 789 790 #if DEBUG_TMI 791 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", 792 conv, debugCtx1, debugCtx1->subCallback, 793 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); 794 #endif 795 796 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status); 797 798 U_ASSERT(status); 799 800 #if DEBUG_TMI 801 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); 802 #endif 803 804 ucnv_close(conv); 805 806 #if DEBUG_TMI 807 printf("%p closed.\n", conv); 808 #endif 809 810 U_ASSERT(status); 811 /* Now, we have to extract the context */ 812 cloneDebugCtx = NULL; 813 cloneFlagCtx = NULL; 814 815 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); 816 if(cloneDebugCtx != NULL) { 817 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; 818 } 819 820 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", 821 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); 822 823 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); 824 U_ASSERT(status); 825 826 if(cloneFlagCtx != NULL) { 827 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ 828 } else { 829 printf("** Warning, couldn't get the subcallback \n"); 830 } 831 832 ucnv_close(cloneCnv); 833 834 /* print out the original source */ 835 printBytes("bytes", bytes, len2); 836 837 return flagVal; /* true if callback was called */ 838 } 839 840 UErrorCode convsample_21() 841 { 842 const char *sample1 = "abc\xdf\xbf"; 843 const char *sample2 = "abc_def"; 844 845 if(convsample_21_didSubstitute(sample1)) 846 { 847 printf("DID substitute.\n******\n"); 848 } 849 else 850 { 851 printf("Did NOT substitute.\n*****\n"); 852 } 853 854 if(convsample_21_didSubstitute(sample2)) 855 { 856 printf("DID substitute.\n******\n"); 857 } 858 else 859 { 860 printf("Did NOT substitute.\n*****\n"); 861 } 862 863 return U_ZERO_ERROR; 864 } 865 866 867 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] 868 869 #define BUFFERSIZE 17 /* make it interesting :) */ 870 871 UErrorCode convsample_40() 872 { 873 printf("\n\n==============================================\n" 874 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); 875 876 FILE *f; 877 FILE *out; 878 int32_t count; 879 char inBuf[BUFFERSIZE]; 880 const char *source; 881 const char *sourceLimit; 882 UChar *uBuf; 883 UChar *target; 884 UChar *targetLimit; 885 int32_t uBufSize = 0; 886 UConverter *conv = NULL; 887 UErrorCode status = U_ZERO_ERROR; 888 uint32_t inbytes=0, total=0; 889 890 f = fopen("data02.bin", "rb"); 891 if(!f) 892 { 893 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); 894 return U_FILE_ACCESS_ERROR; 895 } 896 897 out = fopen("data40.utf16", "wb"); 898 if(!out) 899 { 900 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); 901 fclose(f); 902 return U_FILE_ACCESS_ERROR; 903 } 904 905 // **************************** START SAMPLE ******************* 906 conv = ucnv_openCCSID(37, UCNV_IBM, &status); 907 assert(U_SUCCESS(status)); 908 909 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 910 printf("input bytes %d / min chars %d = %d UChars\n", 911 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 912 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 913 assert(uBuf!=NULL); 914 915 // grab another buffer's worth 916 while((!feof(f)) && 917 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 918 { 919 inbytes += count; 920 921 // Convert bytes to unicode 922 source = inBuf; 923 sourceLimit = inBuf + count; 924 925 do 926 { 927 target = uBuf; 928 targetLimit = uBuf + uBufSize; 929 930 ucnv_toUnicode( conv, &target, targetLimit, 931 &source, sourceLimit, NULL, 932 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 933 /* is true (when no more data will come) */ 934 &status); 935 936 if(status == U_BUFFER_OVERFLOW_ERROR) 937 { 938 // simply ran out of space - we'll reset the target ptr the next 939 // time through the loop. 940 status = U_ZERO_ERROR; 941 } 942 else 943 { 944 // Check other errors here. 945 assert(U_SUCCESS(status)); 946 // Break out of the loop (by force) 947 } 948 949 // Process the Unicode 950 // Todo: handle UTF-16/surrogates 951 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == 952 (size_t)(target-uBuf)); 953 total += (target-uBuf); 954 } while (source < sourceLimit); // while simply out of space 955 } 956 957 printf("%d bytes in, %d UChars out.\n", inbytes, total); 958 959 // ***************************** END SAMPLE ******************** 960 ucnv_close(conv); 961 962 fclose(f); 963 fclose(out); 964 printf("\n"); 965 966 return U_ZERO_ERROR; 967 } 968 #undef BUFFERSIZE 969 970 971 972 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] 973 974 #define BUFFERSIZE 24 /* make it interesting :) */ 975 976 UErrorCode convsample_46() 977 { 978 printf("\n\n==============================================\n" 979 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); 980 981 FILE *f; 982 FILE *out; 983 int32_t count; 984 UChar inBuf[BUFFERSIZE]; 985 const UChar *source; 986 const UChar *sourceLimit; 987 char *buf; 988 char *target; 989 char *targetLimit; 990 991 int32_t bufSize = 0; 992 UConverter *conv = NULL; 993 UErrorCode status = U_ZERO_ERROR; 994 uint32_t inchars=0, total=0; 995 996 f = fopen("data40.utf16", "rb"); 997 if(!f) 998 { 999 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); 1000 return U_FILE_ACCESS_ERROR; 1001 } 1002 1003 out = fopen("data46.out", "wb"); 1004 if(!out) 1005 { 1006 fprintf(stderr, "Couldn't create file 'data46.out'.\n"); 1007 fclose(f); 1008 return U_FILE_ACCESS_ERROR; 1009 } 1010 1011 // **************************** START SAMPLE ******************* 1012 conv = ucnv_open( "iso-8859-2", &status); 1013 assert(U_SUCCESS(status)); 1014 1015 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); 1016 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", 1017 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); 1018 buf = (char*)malloc(bufSize * sizeof(char)); 1019 assert(buf!=NULL); 1020 1021 // grab another buffer's worth 1022 while((!feof(f)) && 1023 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) 1024 { 1025 inchars += count; 1026 1027 // Convert bytes to unicode 1028 source = inBuf; 1029 sourceLimit = inBuf + count; 1030 1031 do 1032 { 1033 target = buf; 1034 targetLimit = buf + bufSize; 1035 1036 ucnv_fromUnicode( conv, &target, targetLimit, 1037 &source, sourceLimit, NULL, 1038 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 1039 /* is true (when no more data will come) */ 1040 &status); 1041 1042 if(status == U_BUFFER_OVERFLOW_ERROR) 1043 { 1044 // simply ran out of space - we'll reset the target ptr the next 1045 // time through the loop. 1046 status = U_ZERO_ERROR; 1047 } 1048 else 1049 { 1050 // Check other errors here. 1051 assert(U_SUCCESS(status)); 1052 // Break out of the loop (by force) 1053 } 1054 1055 // Process the Unicode 1056 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == 1057 (size_t)(target-buf)); 1058 total += (target-buf); 1059 } while (source < sourceLimit); // while simply out of space 1060 } 1061 1062 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); 1063 1064 // ***************************** END SAMPLE ******************** 1065 ucnv_close(conv); 1066 1067 fclose(f); 1068 fclose(out); 1069 printf("\n"); 1070 1071 return U_ZERO_ERROR; 1072 } 1073 #undef BUFFERSIZE 1074 1075 #define BUFFERSIZE 219 1076 1077 void convsample_50() { 1078 printf("\n\n==============================================\n" 1079 "Sample 50: C: ucnv_detectUnicodeSignature\n"); 1080 1081 //! [ucnv_detectUnicodeSignature] 1082 UErrorCode err = U_ZERO_ERROR; 1083 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ 1084 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; 1085 int32_t signatureLength = 0; 1086 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); 1087 UConverter *conv = NULL; 1088 UChar output[100]; 1089 UChar *target = output, *out; 1090 const char *source = input; 1091 if(encoding!=NULL && U_SUCCESS(err)){ 1092 // should signature be discarded ? 1093 conv = ucnv_open(encoding, &err); 1094 // do the conversion 1095 ucnv_toUnicode(conv, 1096 &target, output + UPRV_LENGTHOF(output), 1097 &source, input + sizeof(input), 1098 NULL, TRUE, &err); 1099 out = output; 1100 if (discardSignature){ 1101 ++out; // ignore initial U+FEFF 1102 } 1103 while(out != target) { 1104 printf("%04x ", *out++); 1105 } 1106 puts(""); 1107 } 1108 //! [ucnv_detectUnicodeSignature] 1109 puts(""); 1110 } 1111 1112 1113 1114 /* main */ 1115 1116 int main() 1117 { 1118 1119 printf("Default Converter=%s\n", ucnv_getDefaultName() ); 1120 1121 convsample_02(); // C , u->koi8r, conv 1122 convsample_03(); // C, iterate 1123 1124 convsample_05(); // C, utf8->u, getNextUChar 1125 convsample_06(); // C freq counter thingy 1126 1127 convsample_12(); // C, sjis->u, conv 1128 convsample_13(); // C, big5->u, getNextU 1129 1130 convsample_20(); // C, callback 1131 convsample_21(); // C, callback debug 1132 1133 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] 1134 1135 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] 1136 1137 convsample_50(); // C, detect unicode signature 1138 1139 printf("End of converter samples.\n"); 1140 1141 fflush(stdout); 1142 fflush(stderr); 1143 1144 return 0; 1145 } 1146