1 /************************************************************************* 2 * 3 * 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html#License 5 * 6 ************************************************************************** 7 ************************************************************************** 8 * 9 * Copyright (C) 2000-2016, International Business Machines 10 * Corporation and others. All Rights Reserved. 11 * 12 *************************************************************************** 13 * file name: convsamp.c 14 * encoding: ASCII (7-bit) 15 * 16 * created on: 2000may30 17 * created by: Steven R. Loomis 18 * 19 * Sample code for the ICU conversion routines. 20 * 21 * Note: Nothing special is needed to build this sample. Link with 22 * the icu UC and icu I18N libraries. 23 * 24 * I use 'assert' for error checking, you probably will want 25 * something more flexible. '***BEGIN SAMPLE***' and 26 * '***END SAMPLE***' mark pieces suitable for stand alone 27 * code snippets. 28 * 29 * 30 * Each test can define it's own BUFFERSIZE 31 * 32 */ 33 34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ 35 36 #include <stdio.h> 37 #include <ctype.h> /* for isspace, etc. */ 38 #include <assert.h> 39 #include <string.h> 40 #include <stdlib.h> /* malloc */ 41 42 #include "unicode/utypes.h" /* Basic ICU data types */ 43 #include "unicode/ucnv.h" /* C Converter API */ 44 #include "unicode/ustring.h" /* some more string fcns*/ 45 #include "unicode/uchar.h" /* char names */ 46 #include "unicode/uloc.h" 47 #include "unicode/unistr.h" 48 49 #include "flagcb.h" 50 51 /* Some utility functions */ 52 #ifndef UPRV_LENGTHOF 53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 54 #endif 55 56 static const UChar kNone[] = { 0x0000 }; 57 58 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} 59 60 /* Print a UChar if possible, in seven characters. */ 61 void prettyPrintUChar(UChar c) 62 { 63 if( (c <= 0x007F) && 64 (isgraph(c)) ) { 65 printf(" '%c' ", (char)(0x00FF&c)); 66 } else if ( c > 0x007F ) { 67 char buf[1000]; 68 UErrorCode status = U_ZERO_ERROR; 69 int32_t o; 70 71 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); 72 if(U_SUCCESS(status) && (o>0) ) { 73 buf[6] = 0; 74 printf("%7s", buf); 75 } else { 76 printf(" ??????"); 77 } 78 } else { 79 switch((char)(c & 0x007F)) { 80 case ' ': 81 printf(" ' ' "); 82 break; 83 case '\t': 84 printf(" \\t "); 85 break; 86 case '\n': 87 printf(" \\n "); 88 break; 89 default: 90 printf(" _ "); 91 break; 92 } 93 } 94 } 95 96 97 void printUChars(const char *name = "?", 98 const UChar *uch = kNone, 99 int32_t len = -1 ) 100 { 101 int32_t i; 102 103 if( (len == -1) && (uch) ) { 104 len = u_strlen(uch); 105 } 106 107 printf("%5s: ", name); 108 for( i = 0; i <len; i++) { 109 printf("%-6d ", i); 110 } 111 printf("\n"); 112 113 printf("%5s: ", "uni"); 114 for( i = 0; i <len; i++) { 115 printf("\\u%04X ", (int)uch[i]); 116 } 117 printf("\n"); 118 119 printf("%5s:", "ch"); 120 for( i = 0; i <len; i++) { 121 prettyPrintUChar(uch[i]); 122 } 123 printf("\n"); 124 } 125 126 void printBytes(const char *name = "?", 127 const char *uch = "", 128 int32_t len = -1 ) 129 { 130 int32_t i; 131 132 if( (len == -1) && (uch) ) { 133 len = static_cast<int32_t>(strlen(uch)); 134 } 135 136 printf("%5s: ", name); 137 for( i = 0; i <len; i++) { 138 printf("%-4d ", i); 139 } 140 printf("\n"); 141 142 printf("%5s: ", "uni"); 143 for( i = 0; i <len; i++) { 144 printf("\\x%02X ", 0x00FF & (int)uch[i]); 145 } 146 printf("\n"); 147 148 printf("%5s:", "ch"); 149 for( i = 0; i <len; i++) { 150 if(isgraph(0x00FF & (int)uch[i])) { 151 printf(" '%c' ", (char)uch[i]); 152 } else { 153 printf(" "); 154 } 155 } 156 printf("\n"); 157 } 158 159 void printUChar(UChar32 ch32) 160 { 161 if(ch32 > 0xFFFF) { 162 printf("ch: U+%06X\n", ch32); 163 } 164 else { 165 UChar ch = (UChar)ch32; 166 printUChars("C", &ch, 1); 167 } 168 } 169 170 /******************************************************************* 171 Very simple C sample to convert the word 'Moscow' in Russian in Unicode, 172 followed by an exclamation mark (!) into the KOI8-R Russian code page. 173 174 This example first creates a UChar String out of the Unicode chars. 175 176 targetSize must be set to the amount of space available in the target 177 buffer. After fromUChars is called, 178 len will contain the number of bytes in target[] which were 179 used in the resulting codepage. In this case, there is a 1:1 mapping 180 between the input and output characters. The exclamation mark has the 181 same value in both KOI8-R and Unicode. 182 183 src: 0 1 2 3 4 5 6 184 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 185 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' 186 187 targ: 0 1 2 3 4 5 6 188 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 189 ch: '!' 190 191 192 Converting FROM unicode 193 to koi8-r. 194 You must call ucnv_close to clean up the memory used by the 195 converter. 196 197 'len' returns the number of OUTPUT bytes resulting from the 198 conversion. 199 */ 200 201 UErrorCode convsample_02() 202 { 203 printf("\n\n==============================================\n" 204 "Sample 02: C: simple Unicode -> koi8-r conversion\n"); 205 206 207 // **************************** START SAMPLE ******************* 208 // "cat<cat>OK" 209 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, 210 0x0430, 0x0021, 0x0000 }; 211 char target[100]; 212 UErrorCode status = U_ZERO_ERROR; 213 UConverter *conv; 214 int32_t len; 215 216 // set up the converter 217 //! [ucnv_open] 218 conv = ucnv_open("koi8-r", &status); 219 //! [ucnv_open] 220 assert(U_SUCCESS(status)); 221 222 // convert to koi8-r 223 len = ucnv_fromUChars(conv, target, 100, source, -1, &status); 224 assert(U_SUCCESS(status)); 225 226 // close the converter 227 ucnv_close(conv); 228 229 // ***************************** END SAMPLE ******************** 230 231 // Print it out 232 printUChars("src", source); 233 printf("\n"); 234 printBytes("targ", target, len); 235 236 return U_ZERO_ERROR; 237 } 238 239 240 UErrorCode convsample_03() 241 { 242 printf("\n\n==============================================\n" 243 "Sample 03: C: print out all converters\n"); 244 245 int32_t count; 246 int32_t i; 247 248 // **************************** START SAMPLE ******************* 249 count = ucnv_countAvailable(); 250 printf("Available converters: %d\n", count); 251 252 for(i=0;i<count;i++) 253 { 254 printf("%s ", ucnv_getAvailableName(i)); 255 } 256 257 // ***************************** END SAMPLE ******************** 258 259 printf("\n"); 260 261 return U_ZERO_ERROR; 262 } 263 264 265 266 #define BUFFERSIZE 17 /* make it interesting :) */ 267 268 /* 269 Converting from a codepage to Unicode in bulk.. 270 What is the best way to determine the buffer size? 271 272 The 'buffersize' is in bytes of input. 273 For a given converter, divinding this by the minimum char size 274 give you the maximum number of Unicode characters that could be 275 expected for a given number of input bytes. 276 see: ucnv_getMinCharSize() 277 278 For example, a single byte codepage like 'Latin-3' has a 279 minimum char size of 1. (It takes at least 1 byte to represent 280 each Unicode char.) So the unicode buffer has the same number of 281 UChars as the input buffer has bytes. 282 283 In a strictly double byte codepage such as cp1362 (Windows 284 Korean), the minimum char size is 2. So, only half as many Unicode 285 chars as bytes are needed. 286 287 This work to calculate the buffer size is an optimization. Any 288 size of input and output buffer can be used, as long as the 289 program handles the following cases: If the input buffer is empty, 290 the source pointer will be equal to sourceLimit. If the output 291 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. 292 */ 293 294 UErrorCode convsample_05() 295 { 296 printf("\n\n==============================================\n" 297 "Sample 05: C: count the number of letters in a UTF-8 document\n"); 298 299 FILE *f; 300 int32_t count; 301 char inBuf[BUFFERSIZE]; 302 const char *source; 303 const char *sourceLimit; 304 UChar *uBuf; 305 UChar *target; 306 UChar *targetLimit; 307 UChar *p; 308 int32_t uBufSize = 0; 309 UConverter *conv; 310 UErrorCode status = U_ZERO_ERROR; 311 uint32_t letters=0, total=0; 312 313 f = fopen("data01.txt", "r"); 314 if(!f) 315 { 316 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); 317 return U_FILE_ACCESS_ERROR; 318 } 319 320 // **************************** START SAMPLE ******************* 321 conv = ucnv_open("utf-8", &status); 322 assert(U_SUCCESS(status)); 323 324 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 325 printf("input bytes %d / min chars %d = %d UChars\n", 326 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 327 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 328 assert(uBuf!=NULL); 329 330 // grab another buffer's worth 331 while((!feof(f)) && 332 ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) ) 333 { 334 // Convert bytes to unicode 335 source = inBuf; 336 sourceLimit = inBuf + count; 337 338 do 339 { 340 target = uBuf; 341 targetLimit = uBuf + uBufSize; 342 343 ucnv_toUnicode(conv, &target, targetLimit, 344 &source, sourceLimit, NULL, 345 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 346 /* is true (when no more data will come) */ 347 &status); 348 349 if(status == U_BUFFER_OVERFLOW_ERROR) 350 { 351 // simply ran out of space - we'll reset the target ptr the next 352 // time through the loop. 353 status = U_ZERO_ERROR; 354 } 355 else 356 { 357 // Check other errors here. 358 assert(U_SUCCESS(status)); 359 // Break out of the loop (by force) 360 } 361 362 // Process the Unicode 363 // Todo: handle UTF-16/surrogates 364 365 for(p = uBuf; p<target; p++) 366 { 367 if(u_isalpha(*p)) 368 letters++; 369 total++; 370 } 371 } while (source < sourceLimit); // while simply out of space 372 } 373 374 printf("%d letters out of %d total UChars.\n", letters, total); 375 376 // ***************************** END SAMPLE ******************** 377 ucnv_close(conv); 378 379 printf("\n"); 380 381 fclose(f); 382 383 return U_ZERO_ERROR; 384 } 385 #undef BUFFERSIZE 386 387 #define BUFFERSIZE 1024 388 typedef struct 389 { 390 UChar32 codepoint; 391 uint32_t frequency; 392 } CharFreqInfo; 393 394 UErrorCode convsample_06() 395 { 396 printf("\n\n==============================================\n" 397 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); 398 399 FILE *f; 400 int32_t count; 401 char inBuf[BUFFERSIZE]; 402 const char *source; 403 const char *sourceLimit; 404 int32_t uBufSize = 0; 405 UConverter *conv; 406 UErrorCode status = U_ZERO_ERROR; 407 uint32_t letters=0, total=0; 408 409 CharFreqInfo *info; 410 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ 411 UChar32 p; 412 413 uint32_t ie = 0; 414 uint32_t gh = 0; 415 UChar32 l = 0; 416 417 f = fopen("data06.txt", "r"); 418 if(!f) 419 { 420 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); 421 return U_FILE_ACCESS_ERROR; 422 } 423 424 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); 425 if(!info) 426 { 427 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", static_cast<int>(sizeof(CharFreqInfo)*charCount)); 428 } 429 430 /* reset frequencies */ 431 for(p=0;p<charCount;p++) 432 { 433 info[p].codepoint = p; 434 info[p].frequency = 0; 435 } 436 437 // **************************** START SAMPLE ******************* 438 conv = ucnv_open("utf-8", &status); 439 assert(U_SUCCESS(status)); 440 441 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 442 printf("input bytes %d / min chars %d = %d UChars\n", 443 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 444 445 // grab another buffer's worth 446 while((!feof(f)) && 447 ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) ) 448 { 449 // Convert bytes to unicode 450 source = inBuf; 451 sourceLimit = inBuf + count; 452 453 while(source < sourceLimit) 454 { 455 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 456 if(U_FAILURE(status)) 457 { 458 fprintf(stderr, "%s @ %d\n", u_errorName(status), total); 459 status = U_ZERO_ERROR; 460 continue; 461 } 462 U_ASSERT(status); 463 total++; 464 465 if(u_isalpha(p)) 466 letters++; 467 468 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) 469 ie++; 470 471 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) 472 gh++; 473 474 if(p>charCount) 475 { 476 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); 477 free(info); 478 fclose(f); 479 ucnv_close(conv); 480 return U_UNSUPPORTED_ERROR; 481 } 482 info[p].frequency++; 483 l = p; 484 } 485 } 486 487 fclose(f); 488 ucnv_close(conv); 489 490 printf("%d letters out of %d total UChars.\n", letters, total); 491 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); 492 493 // now, we could sort it.. 494 495 // qsort(info, charCount, sizeof(info[0]), charfreq_compare); 496 497 for(p=0;p<charCount;p++) 498 { 499 if(info[p].frequency) 500 { 501 printf("% 5d U+%06X ", info[p].frequency, p); 502 if(p <= 0xFFFF) 503 { 504 prettyPrintUChar((UChar)p); 505 } 506 printf("\n"); 507 } 508 } 509 free(info); 510 // ***************************** END SAMPLE ******************** 511 512 printf("\n"); 513 514 return U_ZERO_ERROR; 515 } 516 #undef BUFFERSIZE 517 518 519 /****************************************************** 520 You must call ucnv_close to clean up the memory used by the 521 converter. 522 523 'len' returns the number of OUTPUT bytes resulting from the 524 conversion. 525 */ 526 527 UErrorCode convsample_12() 528 { 529 printf("\n\n==============================================\n" 530 "Sample 12: C: simple sjis -> unicode conversion\n"); 531 532 533 // **************************** START SAMPLE ******************* 534 535 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; 536 UChar target[100]; 537 UErrorCode status = U_ZERO_ERROR; 538 UConverter *conv; 539 int32_t len; 540 541 // set up the converter 542 conv = ucnv_open("shift_jis", &status); 543 assert(U_SUCCESS(status)); 544 545 // convert to Unicode 546 // Note: we can use strlen, we know it's an 8 bit null terminated codepage 547 target[6] = 0xFDCA; 548 len = ucnv_toUChars(conv, target, 100, source, static_cast<int32_t>(strlen(source)), &status); 549 U_ASSERT(status); 550 // close the converter 551 ucnv_close(conv); 552 553 // ***************************** END SAMPLE ******************** 554 555 // Print it out 556 printBytes("src", source, static_cast<int32_t>(strlen(source)) ); 557 printf("\n"); 558 printUChars("targ", target, len); 559 560 return U_ZERO_ERROR; 561 } 562 563 /****************************************************************** 564 C: Convert from codepage to Unicode one at a time. 565 */ 566 567 UErrorCode convsample_13() 568 { 569 printf("\n\n==============================================\n" 570 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); 571 572 573 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; 574 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; 575 const char *source, *sourceLimit; 576 UChar32 target; 577 UErrorCode status = U_ZERO_ERROR; 578 UConverter *conv = NULL; 579 int32_t srcCount=0; 580 int32_t dstCount=0; 581 582 srcCount = sizeof(sourceChars); 583 584 conv = ucnv_open("Big5", &status); 585 U_ASSERT(status); 586 587 source = sourceChars; 588 sourceLimit = sourceChars + sizeof(sourceChars); 589 590 // **************************** START SAMPLE ******************* 591 592 593 printBytes("src", source, static_cast<int32_t>(sourceLimit - source)); 594 595 while(source < sourceLimit) 596 { 597 puts(""); 598 target = ucnv_getNextUChar (conv, 599 &source, 600 sourceLimit, 601 &status); 602 603 // printBytes("src",source,sourceLimit-source); 604 U_ASSERT(status); 605 printUChar(target); 606 dstCount++; 607 } 608 609 610 // ************************** END SAMPLE ************************* 611 612 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); 613 ucnv_close(conv); 614 615 return U_ZERO_ERROR; 616 } 617 618 619 620 621 UBool convsample_20_didSubstitute(const char *source) 622 { 623 UChar uchars[100]; 624 char bytes[100]; 625 UConverter *conv = NULL; 626 UErrorCode status = U_ZERO_ERROR; 627 uint32_t len, len2; 628 UBool flagVal; 629 630 FromUFLAGContext * context = NULL; 631 632 printf("\n\n==============================================\n" 633 "Sample 20: C: Test for substitution using callbacks\n"); 634 635 /* print out the original source */ 636 printBytes("src", source); 637 printf("\n"); 638 639 /* First, convert from UTF8 to unicode */ 640 conv = ucnv_open("utf-8", &status); 641 U_ASSERT(status); 642 643 len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status); 644 U_ASSERT(status); 645 646 printUChars("uch", uchars, len); 647 printf("\n"); 648 649 /* Now, close the converter */ 650 ucnv_close(conv); 651 652 /* Now, convert to windows-1252 */ 653 conv = ucnv_open("windows-1252", &status); 654 U_ASSERT(status); 655 656 /* Converter starts out with the SUBSTITUTE callback set. */ 657 658 /* initialize our callback */ 659 context = flagCB_fromU_openContext(); 660 661 /* Set our special callback */ 662 ucnv_setFromUCallBack(conv, 663 flagCB_fromU, 664 context, 665 &(context->subCallback), 666 &(context->subContext), 667 &status); 668 669 U_ASSERT(status); 670 671 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); 672 U_ASSERT(status); 673 674 flagVal = context->flag; /* it's about to go away when we close the cnv */ 675 676 ucnv_close(conv); 677 678 /* print out the original source */ 679 printBytes("bytes", bytes, len2); 680 681 return flagVal; /* true if callback was called */ 682 } 683 684 UErrorCode convsample_20() 685 { 686 const char *sample1 = "abc\xdf\xbf"; 687 const char *sample2 = "abc_def"; 688 689 690 if(convsample_20_didSubstitute(sample1)) 691 { 692 printf("DID substitute.\n******\n"); 693 } 694 else 695 { 696 printf("Did NOT substitute.\n*****\n"); 697 } 698 699 if(convsample_20_didSubstitute(sample2)) 700 { 701 printf("DID substitute.\n******\n"); 702 } 703 else 704 { 705 printf("Did NOT substitute.\n*****\n"); 706 } 707 708 return U_ZERO_ERROR; 709 } 710 711 // 21 - C, callback, with clone and debug 712 713 714 715 UBool convsample_21_didSubstitute(const char *source) 716 { 717 UChar uchars[100]; 718 char bytes[100]; 719 UConverter *conv = NULL, *cloneCnv = NULL; 720 UErrorCode status = U_ZERO_ERROR; 721 uint32_t len, len2; 722 UBool flagVal = FALSE; 723 UConverterFromUCallback junkCB; 724 725 FromUFLAGContext *flagCtx = NULL, 726 *cloneFlagCtx = NULL; 727 728 debugCBContext *debugCtx1 = NULL, 729 *debugCtx2 = NULL, 730 *cloneDebugCtx = NULL; 731 732 printf("\n\n==============================================\n" 733 "Sample 21: C: Test for substitution w/ callbacks & clones \n"); 734 735 /* print out the original source */ 736 printBytes("src", source); 737 printf("\n"); 738 739 /* First, convert from UTF8 to unicode */ 740 conv = ucnv_open("utf-8", &status); 741 U_ASSERT(status); 742 743 len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status); 744 U_ASSERT(status); 745 746 printUChars("uch", uchars, len); 747 printf("\n"); 748 749 /* Now, close the converter */ 750 ucnv_close(conv); 751 752 /* Now, convert to windows-1252 */ 753 conv = ucnv_open("windows-1252", &status); 754 U_ASSERT(status); 755 756 /* Converter starts out with the SUBSTITUTE callback set. */ 757 758 /* initialize our callback */ 759 /* from the 'bottom' innermost, out 760 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ 761 762 #if DEBUG_TMI 763 printf("flagCB_fromU = %p\n", &flagCB_fromU); 764 printf("debugCB_fromU = %p\n", &debugCB_fromU); 765 #endif 766 767 debugCtx1 = debugCB_openContext(); 768 flagCtx = flagCB_fromU_openContext(); 769 debugCtx2 = debugCB_openContext(); 770 771 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ 772 debugCtx1->subContext = flagCtx; 773 774 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ 775 flagCtx->subContext = debugCtx2; 776 777 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; 778 debugCtx2->subContext = NULL; 779 780 /* Set our special callback */ 781 782 ucnv_setFromUCallBack(conv, 783 debugCB_fromU, 784 debugCtx1, 785 &(debugCtx2->subCallback), 786 &(debugCtx2->subContext), 787 &status); 788 789 U_ASSERT(status); 790 791 #if DEBUG_TMI 792 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", 793 conv, debugCtx1, debugCtx1->subCallback, 794 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); 795 #endif 796 797 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status); 798 799 U_ASSERT(status); 800 801 #if DEBUG_TMI 802 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); 803 #endif 804 805 ucnv_close(conv); 806 807 #if DEBUG_TMI 808 printf("%p closed.\n", conv); 809 #endif 810 811 U_ASSERT(status); 812 /* Now, we have to extract the context */ 813 cloneDebugCtx = NULL; 814 cloneFlagCtx = NULL; 815 816 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); 817 if(cloneDebugCtx != NULL) { 818 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; 819 } 820 821 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", 822 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); 823 824 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); 825 U_ASSERT(status); 826 827 if(cloneFlagCtx != NULL) { 828 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ 829 } else { 830 printf("** Warning, couldn't get the subcallback \n"); 831 } 832 833 ucnv_close(cloneCnv); 834 835 /* print out the original source */ 836 printBytes("bytes", bytes, len2); 837 838 return flagVal; /* true if callback was called */ 839 } 840 841 UErrorCode convsample_21() 842 { 843 const char *sample1 = "abc\xdf\xbf"; 844 const char *sample2 = "abc_def"; 845 846 if(convsample_21_didSubstitute(sample1)) 847 { 848 printf("DID substitute.\n******\n"); 849 } 850 else 851 { 852 printf("Did NOT substitute.\n*****\n"); 853 } 854 855 if(convsample_21_didSubstitute(sample2)) 856 { 857 printf("DID substitute.\n******\n"); 858 } 859 else 860 { 861 printf("Did NOT substitute.\n*****\n"); 862 } 863 864 return U_ZERO_ERROR; 865 } 866 867 868 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] 869 870 #define BUFFERSIZE 17 /* make it interesting :) */ 871 872 UErrorCode convsample_40() 873 { 874 printf("\n\n==============================================\n" 875 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); 876 877 FILE *f; 878 FILE *out; 879 int32_t count; 880 char inBuf[BUFFERSIZE]; 881 const char *source; 882 const char *sourceLimit; 883 UChar *uBuf; 884 UChar *target; 885 UChar *targetLimit; 886 int32_t uBufSize = 0; 887 UConverter *conv = NULL; 888 UErrorCode status = U_ZERO_ERROR; 889 uint32_t inbytes=0, total=0; 890 891 f = fopen("data02.bin", "rb"); 892 if(!f) 893 { 894 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); 895 return U_FILE_ACCESS_ERROR; 896 } 897 898 out = fopen("data40.utf16", "wb"); 899 if(!out) 900 { 901 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); 902 fclose(f); 903 return U_FILE_ACCESS_ERROR; 904 } 905 906 // **************************** START SAMPLE ******************* 907 conv = ucnv_openCCSID(37, UCNV_IBM, &status); 908 assert(U_SUCCESS(status)); 909 910 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 911 printf("input bytes %d / min chars %d = %d UChars\n", 912 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 913 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 914 assert(uBuf!=NULL); 915 916 // grab another buffer's worth 917 while((!feof(f)) && 918 ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) ) 919 { 920 inbytes += count; 921 922 // Convert bytes to unicode 923 source = inBuf; 924 sourceLimit = inBuf + count; 925 926 do 927 { 928 target = uBuf; 929 targetLimit = uBuf + uBufSize; 930 931 ucnv_toUnicode( conv, &target, targetLimit, 932 &source, sourceLimit, NULL, 933 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 934 /* is true (when no more data will come) */ 935 &status); 936 937 if(status == U_BUFFER_OVERFLOW_ERROR) 938 { 939 // simply ran out of space - we'll reset the target ptr the next 940 // time through the loop. 941 status = U_ZERO_ERROR; 942 } 943 else 944 { 945 // Check other errors here. 946 assert(U_SUCCESS(status)); 947 // Break out of the loop (by force) 948 } 949 950 // Process the Unicode 951 // Todo: handle UTF-16/surrogates 952 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == (size_t)(target-uBuf)); 953 total += static_cast<uint32_t>((target-uBuf)); 954 } while (source < sourceLimit); // while simply out of space 955 } 956 957 printf("%d bytes in, %d UChars out.\n", inbytes, total); 958 959 // ***************************** END SAMPLE ******************** 960 ucnv_close(conv); 961 962 fclose(f); 963 fclose(out); 964 printf("\n"); 965 966 return U_ZERO_ERROR; 967 } 968 #undef BUFFERSIZE 969 970 971 972 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] 973 974 #define BUFFERSIZE 24 /* make it interesting :) */ 975 976 UErrorCode convsample_46() 977 { 978 printf("\n\n==============================================\n" 979 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); 980 981 FILE *f; 982 FILE *out; 983 int32_t count; 984 UChar inBuf[BUFFERSIZE]; 985 const UChar *source; 986 const UChar *sourceLimit; 987 char *buf; 988 char *target; 989 char *targetLimit; 990 991 int32_t bufSize = 0; 992 UConverter *conv = NULL; 993 UErrorCode status = U_ZERO_ERROR; 994 uint32_t inchars=0, total=0; 995 996 f = fopen("data40.utf16", "rb"); 997 if(!f) 998 { 999 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); 1000 return U_FILE_ACCESS_ERROR; 1001 } 1002 1003 out = fopen("data46.out", "wb"); 1004 if(!out) 1005 { 1006 fprintf(stderr, "Couldn't create file 'data46.out'.\n"); 1007 fclose(f); 1008 return U_FILE_ACCESS_ERROR; 1009 } 1010 1011 // **************************** START SAMPLE ******************* 1012 conv = ucnv_open( "iso-8859-2", &status); 1013 assert(U_SUCCESS(status)); 1014 1015 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); 1016 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", 1017 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); 1018 buf = (char*)malloc(bufSize * sizeof(char)); 1019 assert(buf!=NULL); 1020 1021 // grab another buffer's worth 1022 while((!feof(f)) && 1023 ((count=static_cast<int32_t>(fread(inBuf, sizeof(UChar), BUFFERSIZE , f))) > 0) ) 1024 { 1025 inchars += count; 1026 1027 // Convert bytes to unicode 1028 source = inBuf; 1029 sourceLimit = inBuf + count; 1030 1031 do 1032 { 1033 target = buf; 1034 targetLimit = buf + bufSize; 1035 1036 ucnv_fromUnicode( conv, &target, targetLimit, 1037 &source, sourceLimit, NULL, 1038 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 1039 /* is true (when no more data will come) */ 1040 &status); 1041 1042 if(status == U_BUFFER_OVERFLOW_ERROR) 1043 { 1044 // simply ran out of space - we'll reset the target ptr the next 1045 // time through the loop. 1046 status = U_ZERO_ERROR; 1047 } 1048 else 1049 { 1050 // Check other errors here. 1051 assert(U_SUCCESS(status)); 1052 // Break out of the loop (by force) 1053 } 1054 1055 // Process the Unicode 1056 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == (size_t)(target-buf)); 1057 total += static_cast<uint32_t>((target-buf)); 1058 } while (source < sourceLimit); // while simply out of space 1059 } 1060 1061 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, static_cast<int>(inchars * sizeof(UChar)), total); 1062 1063 // ***************************** END SAMPLE ******************** 1064 ucnv_close(conv); 1065 1066 fclose(f); 1067 fclose(out); 1068 printf("\n"); 1069 1070 return U_ZERO_ERROR; 1071 } 1072 #undef BUFFERSIZE 1073 1074 #define BUFFERSIZE 219 1075 1076 void convsample_50() { 1077 printf("\n\n==============================================\n" 1078 "Sample 50: C: ucnv_detectUnicodeSignature\n"); 1079 1080 //! [ucnv_detectUnicodeSignature] 1081 UErrorCode err = U_ZERO_ERROR; 1082 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ 1083 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; 1084 int32_t signatureLength = 0; 1085 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); 1086 UConverter *conv = NULL; 1087 UChar output[100]; 1088 UChar *target = output, *out; 1089 const char *source = input; 1090 if(encoding!=NULL && U_SUCCESS(err)){ 1091 // should signature be discarded ? 1092 conv = ucnv_open(encoding, &err); 1093 // do the conversion 1094 ucnv_toUnicode(conv, 1095 &target, output + UPRV_LENGTHOF(output), 1096 &source, input + sizeof(input), 1097 NULL, TRUE, &err); 1098 out = output; 1099 if (discardSignature){ 1100 ++out; // ignore initial U+FEFF 1101 } 1102 while(out != target) { 1103 printf("%04x ", *out++); 1104 } 1105 puts(""); 1106 } 1107 //! [ucnv_detectUnicodeSignature] 1108 puts(""); 1109 } 1110 1111 1112 1113 /* main */ 1114 1115 int main() 1116 { 1117 1118 printf("Default Converter=%s\n", ucnv_getDefaultName() ); 1119 1120 convsample_02(); // C , u->koi8r, conv 1121 convsample_03(); // C, iterate 1122 1123 convsample_05(); // C, utf8->u, getNextUChar 1124 convsample_06(); // C freq counter thingy 1125 1126 convsample_12(); // C, sjis->u, conv 1127 convsample_13(); // C, big5->u, getNextU 1128 1129 convsample_20(); // C, callback 1130 convsample_21(); // C, callback debug 1131 1132 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] 1133 1134 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] 1135 1136 convsample_50(); // C, detect unicode signature 1137 1138 printf("End of converter samples.\n"); 1139 1140 fflush(stdout); 1141 fflush(stderr); 1142 1143 return 0; 1144 } 1145