Home | History | Annotate | Download | only in ucnv
      1 /*************************************************************************
      2 *
      3 *   Copyright (C) 2016 and later: Unicode, Inc. and others.
      4 *   License & terms of use: http://www.unicode.org/copyright.html#License
      5 *
      6 **************************************************************************
      7 **************************************************************************
      8 *
      9 *   Copyright (C) 2000-2016, International Business Machines
     10 *   Corporation and others.  All Rights Reserved.
     11 *
     12 ***************************************************************************
     13 *   file name:  convsamp.c
     14 *   encoding:   ASCII (7-bit)
     15 *
     16 *   created on: 2000may30
     17 *   created by: Steven R. Loomis
     18 *
     19 *   Sample code for the ICU conversion routines.
     20 *
     21 * Note: Nothing special is needed to build this sample. Link with
     22 *       the icu UC and icu I18N libraries.
     23 *
     24 *       I use 'assert' for error checking, you probably will want
     25 *       something more flexible.  '***BEGIN SAMPLE***' and
     26 *       '***END SAMPLE***' mark pieces suitable for stand alone
     27 *       code snippets.
     28 *
     29 *
*  Each test can define its own BUFFERSIZE
     31 *
     32 */
     33 
     34 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
     35 
     36 #include <stdio.h>
     37 #include <ctype.h>            /* for isspace, etc.    */
     38 #include <assert.h>
     39 #include <string.h>
     40 #include <stdlib.h>  /* malloc */
     41 
     42 #include "cmemory.h"
     43 #include "unicode/utypes.h"   /* Basic ICU data types */
     44 #include "unicode/ucnv.h"     /* C   Converter API    */
     45 #include "unicode/ustring.h"  /* some more string fcns*/
     46 #include "unicode/uchar.h"    /* char names           */
     47 #include "unicode/uloc.h"
     48 #include "unicode/unistr.h"
     49 
     50 #include "flagcb.h"
     51 
     52 /* Some utility functions */
     53 
/* Empty, zero-terminated UChar string; used as the default `uch` argument of printUChars(). */
static const UChar kNone[] = { 0x0000 };

/*
 * Check an ICU UErrorCode: when U_FAILURE(x) holds, flush stdout and stderr,
 * print the expression text together with u_errorName(x) to stderr, and then
 * assert U_SUCCESS(x).
 * NOTE(review): the expansion is a bare { } block rather than do { } while (0),
 * so "U_ASSERT(s);" placed directly before an `else` will not parse as intended.
 */
#define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
     57 
     58 /* Print a UChar if possible, in seven characters. */
     59 void prettyPrintUChar(UChar c)
     60 {
     61   if(  (c <= 0x007F) &&
     62        (isgraph(c))  ) {
     63     printf(" '%c'   ", (char)(0x00FF&c));
     64   } else if ( c > 0x007F ) {
     65     char buf[1000];
     66     UErrorCode status = U_ZERO_ERROR;
     67     int32_t o;
     68 
     69     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
     70     if(U_SUCCESS(status) && (o>0) ) {
     71       buf[6] = 0;
     72       printf("%7s", buf);
     73     } else {
     74       printf(" ??????");
     75     }
     76   } else {
     77     switch((char)(c & 0x007F)) {
     78     case ' ':
     79       printf(" ' '   ");
     80       break;
     81     case '\t':
     82       printf(" \\t    ");
     83       break;
     84     case '\n':
     85       printf(" \\n    ");
     86       break;
     87     default:
     88       printf("  _    ");
     89       break;
     90     }
     91   }
     92 }
     93 
     94 
     95 void printUChars(const char  *name = "?",
     96                  const UChar *uch  = kNone,
     97                  int32_t     len   = -1 )
     98 {
     99   int32_t i;
    100 
    101   if( (len == -1) && (uch) ) {
    102     len = u_strlen(uch);
    103   }
    104 
    105   printf("%5s: ", name);
    106   for( i = 0; i <len; i++) {
    107     printf("%-6d ", i);
    108   }
    109   printf("\n");
    110 
    111   printf("%5s: ", "uni");
    112   for( i = 0; i <len; i++) {
    113     printf("\\u%04X ", (int)uch[i]);
    114   }
    115   printf("\n");
    116 
    117   printf("%5s:", "ch");
    118   for( i = 0; i <len; i++) {
    119     prettyPrintUChar(uch[i]);
    120   }
    121   printf("\n");
    122 }
    123 
    124 void printBytes(const char  *name = "?",
    125                  const char *uch  = "",
    126                  int32_t     len   = -1 )
    127 {
    128   int32_t i;
    129 
    130   if( (len == -1) && (uch) ) {
    131     len = strlen(uch);
    132   }
    133 
    134   printf("%5s: ", name);
    135   for( i = 0; i <len; i++) {
    136     printf("%-4d ", i);
    137   }
    138   printf("\n");
    139 
    140   printf("%5s: ", "uni");
    141   for( i = 0; i <len; i++) {
    142     printf("\\x%02X ", 0x00FF & (int)uch[i]);
    143   }
    144   printf("\n");
    145 
    146   printf("%5s:", "ch");
    147   for( i = 0; i <len; i++) {
    148     if(isgraph(0x00FF & (int)uch[i])) {
    149       printf(" '%c' ", (char)uch[i]);
    150     } else {
    151       printf("     ");
    152     }
    153   }
    154   printf("\n");
    155 }
    156 
    157 void printUChar(UChar32 ch32)
    158 {
    159     if(ch32 > 0xFFFF) {
    160       printf("ch: U+%06X\n", ch32);
    161     }
    162     else {
    163       UChar ch = (UChar)ch32;
    164       printUChars("C", &ch, 1);
    165     }
    166 }
    167 
    168 /*******************************************************************
    169   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
    170   followed by an exclamation mark (!) into the KOI8-R Russian code page.
    171 
    172   This example first creates a UChar String out of the Unicode chars.
    173 
    174   targetSize must be set to the amount of space available in the target
    175   buffer. After fromUChars is called,
    176   len will contain the number of bytes in target[] which were
    177   used in the resulting codepage.  In this case, there is a 1:1 mapping
    178   between the input and output characters. The exclamation mark has the
    179   same value in both KOI8-R and Unicode.
    180 
    181   src: 0      1      2      3      4      5      6
    182   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
    183    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
    184 
    185  targ:  0    1    2    3    4    5    6
    186   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
    187    ch:                                '!'
    188 
    189 
    190 Converting FROM unicode
    191   to koi8-r.
    192   You must call ucnv_close to clean up the memory used by the
    193   converter.
    194 
    195   'len' returns the number of OUTPUT bytes resulting from the
    196   conversion.
    197  */
    198 
    199 UErrorCode convsample_02()
    200 {
    201   printf("\n\n==============================================\n"
    202          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
    203 
    204 
    205   // **************************** START SAMPLE *******************
    206   // "cat<cat>OK"
    207   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
    208                      0x0430, 0x0021, 0x0000 };
    209   char target[100];
    210   UErrorCode status = U_ZERO_ERROR;
    211   UConverter *conv;
    212   int32_t     len;
    213 
    214   // set up the converter
    215   //! [ucnv_open]
    216   conv = ucnv_open("koi8-r", &status);
    217   //! [ucnv_open]
    218   assert(U_SUCCESS(status));
    219 
    220   // convert to koi8-r
    221   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
    222   assert(U_SUCCESS(status));
    223 
    224   // close the converter
    225   ucnv_close(conv);
    226 
    227   // ***************************** END SAMPLE ********************
    228 
    229   // Print it out
    230   printUChars("src", source);
    231   printf("\n");
    232   printBytes("targ", target, len);
    233 
    234   return U_ZERO_ERROR;
    235 }
    236 
    237 
    238 UErrorCode convsample_03()
    239 {
    240   printf("\n\n==============================================\n"
    241          "Sample 03: C: print out all converters\n");
    242 
    243   int32_t count;
    244   int32_t i;
    245 
    246   // **************************** START SAMPLE *******************
    247   count = ucnv_countAvailable();
    248   printf("Available converters: %d\n", count);
    249 
    250   for(i=0;i<count;i++)
    251   {
    252     printf("%s ", ucnv_getAvailableName(i));
    253   }
    254 
    255   // ***************************** END SAMPLE ********************
    256 
    257   printf("\n");
    258 
    259   return U_ZERO_ERROR;
    260 }
    261 
    262 
    263 
    264 #define BUFFERSIZE 17 /* make it interesting :) */
    265 
    266 /*
    267   Converting from a codepage to Unicode in bulk..
    268   What is the best way to determine the buffer size?
    269 
    270      The 'buffersize' is in bytes of input.
    271     For a given converter, divinding this by the minimum char size
    272     give you the maximum number of Unicode characters that could be
    273     expected for a given number of input bytes.
    274      see: ucnv_getMinCharSize()
    275 
    276      For example, a single byte codepage like 'Latin-3' has a
    277     minimum char size of 1. (It takes at least 1 byte to represent
    278     each Unicode char.) So the unicode buffer has the same number of
    279     UChars as the input buffer has bytes.
    280 
    281      In a strictly double byte codepage such as cp1362 (Windows
    282     Korean), the minimum char size is 2. So, only half as many Unicode
    283     chars as bytes are needed.
    284 
    285      This work to calculate the buffer size is an optimization. Any
    286     size of input and output buffer can be used, as long as the
    287     program handles the following cases: If the input buffer is empty,
    288     the source pointer will be equal to sourceLimit.  If the output
    289     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
    290  */
    291 
    292 UErrorCode convsample_05()
    293 {
    294   printf("\n\n==============================================\n"
    295          "Sample 05: C: count the number of letters in a UTF-8 document\n");
    296 
    297   FILE *f;
    298   int32_t count;
    299   char inBuf[BUFFERSIZE];
    300   const char *source;
    301   const char *sourceLimit;
    302   UChar *uBuf;
    303   UChar *target;
    304   UChar *targetLimit;
    305   UChar *p;
    306   int32_t uBufSize = 0;
    307   UConverter *conv;
    308   UErrorCode status = U_ZERO_ERROR;
    309   uint32_t letters=0, total=0;
    310 
    311   f = fopen("data01.txt", "r");
    312   if(!f)
    313   {
    314     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    315     return U_FILE_ACCESS_ERROR;
    316   }
    317 
    318   // **************************** START SAMPLE *******************
    319   conv = ucnv_open("utf-8", &status);
    320   assert(U_SUCCESS(status));
    321 
    322   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    323   printf("input bytes %d / min chars %d = %d UChars\n",
    324          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    325   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    326   assert(uBuf!=NULL);
    327 
    328   // grab another buffer's worth
    329   while((!feof(f)) &&
    330         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    331   {
    332     // Convert bytes to unicode
    333     source = inBuf;
    334     sourceLimit = inBuf + count;
    335 
    336     do
    337     {
    338         target = uBuf;
    339         targetLimit = uBuf + uBufSize;
    340 
    341         ucnv_toUnicode(conv, &target, targetLimit,
    342                        &source, sourceLimit, NULL,
    343                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    344                                    /* is true (when no more data will come) */
    345                        &status);
    346 
    347         if(status == U_BUFFER_OVERFLOW_ERROR)
    348         {
    349           // simply ran out of space - we'll reset the target ptr the next
    350           // time through the loop.
    351           status = U_ZERO_ERROR;
    352         }
    353         else
    354         {
    355           //  Check other errors here.
    356           assert(U_SUCCESS(status));
    357           // Break out of the loop (by force)
    358         }
    359 
    360         // Process the Unicode
    361         // Todo: handle UTF-16/surrogates
    362 
    363         for(p = uBuf; p<target; p++)
    364         {
    365           if(u_isalpha(*p))
    366             letters++;
    367           total++;
    368         }
    369     } while (source < sourceLimit); // while simply out of space
    370   }
    371 
    372   printf("%d letters out of %d total UChars.\n", letters, total);
    373 
    374   // ***************************** END SAMPLE ********************
    375   ucnv_close(conv);
    376 
    377   printf("\n");
    378 
    379   fclose(f);
    380 
    381   return U_ZERO_ERROR;
    382 }
    383 #undef BUFFERSIZE
    384 
/* Input chunk size, in bytes, used by the sample that follows. */
#define BUFFERSIZE 1024
/* One entry of the frequency table built by convsample_06(). */
typedef struct
{
  UChar32  codepoint;   /* the code point this entry counts */
  uint32_t frequency;   /* number of occurrences seen so far */
} CharFreqInfo;
    391 
    392 UErrorCode convsample_06()
    393 {
    394   printf("\n\n==============================================\n"
    395          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
    396 
    397   FILE *f;
    398   int32_t count;
    399   char inBuf[BUFFERSIZE];
    400   const char *source;
    401   const char *sourceLimit;
    402   int32_t uBufSize = 0;
    403   UConverter *conv;
    404   UErrorCode status = U_ZERO_ERROR;
    405   uint32_t letters=0, total=0;
    406 
    407   CharFreqInfo   *info;
    408   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
    409   UChar32   p;
    410 
    411   uint32_t ie = 0;
    412   uint32_t gh = 0;
    413   UChar32 l = 0;
    414 
    415   f = fopen("data06.txt", "r");
    416   if(!f)
    417   {
    418     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    419     return U_FILE_ACCESS_ERROR;
    420   }
    421 
    422   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
    423   if(!info)
    424   {
    425     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
    426   }
    427 
    428   /* reset frequencies */
    429   for(p=0;p<charCount;p++)
    430   {
    431     info[p].codepoint = p;
    432     info[p].frequency = 0;
    433   }
    434 
    435   // **************************** START SAMPLE *******************
    436   conv = ucnv_open("utf-8", &status);
    437   assert(U_SUCCESS(status));
    438 
    439   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    440   printf("input bytes %d / min chars %d = %d UChars\n",
    441          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    442 
    443   // grab another buffer's worth
    444   while((!feof(f)) &&
    445         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    446   {
    447     // Convert bytes to unicode
    448     source = inBuf;
    449     sourceLimit = inBuf + count;
    450 
    451     while(source < sourceLimit)
    452     {
    453       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
    454       if(U_FAILURE(status))
    455       {
    456         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
    457         status = U_ZERO_ERROR;
    458         continue;
    459       }
    460       U_ASSERT(status);
    461       total++;
    462 
    463       if(u_isalpha(p))
    464         letters++;
    465 
    466       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
    467         ie++;
    468 
    469       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
    470         gh++;
    471 
    472       if(p>charCount)
    473       {
    474         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
    475         free(info);
    476         fclose(f);
    477         ucnv_close(conv);
    478         return U_UNSUPPORTED_ERROR;
    479       }
    480       info[p].frequency++;
    481       l = p;
    482     }
    483   }
    484 
    485   fclose(f);
    486   ucnv_close(conv);
    487 
    488   printf("%d letters out of %d total UChars.\n", letters, total);
    489   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
    490 
    491   // now, we could sort it..
    492 
    493   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
    494 
    495   for(p=0;p<charCount;p++)
    496   {
    497     if(info[p].frequency)
    498     {
    499       printf("% 5d U+%06X ", info[p].frequency, p);
    500       if(p <= 0xFFFF)
    501       {
    502         prettyPrintUChar((UChar)p);
    503       }
    504       printf("\n");
    505     }
    506   }
    507   free(info);
    508   // ***************************** END SAMPLE ********************
    509 
    510   printf("\n");
    511 
    512   return U_ZERO_ERROR;
    513 }
    514 #undef BUFFERSIZE
    515 
    516 
    517 /******************************************************
    518   You must call ucnv_close to clean up the memory used by the
    519   converter.
    520 
    521   'len' returns the number of OUTPUT bytes resulting from the
    522   conversion.
    523  */
    524 
    525 UErrorCode convsample_12()
    526 {
    527   printf("\n\n==============================================\n"
    528          "Sample 12: C: simple sjis -> unicode conversion\n");
    529 
    530 
    531   // **************************** START SAMPLE *******************
    532 
    533   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
    534   UChar target[100];
    535   UErrorCode status = U_ZERO_ERROR;
    536   UConverter *conv;
    537   int32_t     len;
    538 
    539   // set up the converter
    540   conv = ucnv_open("shift_jis", &status);
    541   assert(U_SUCCESS(status));
    542 
    543   // convert to Unicode
    544   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
    545   target[6] = 0xFDCA;
    546   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
    547   U_ASSERT(status);
    548   // close the converter
    549   ucnv_close(conv);
    550 
    551   // ***************************** END SAMPLE ********************
    552 
    553   // Print it out
    554   printBytes("src", source, strlen(source) );
    555   printf("\n");
    556   printUChars("targ", target, len);
    557 
    558   return U_ZERO_ERROR;
    559 }
    560 
    561 /******************************************************************
    562    C: Convert from codepage to Unicode one at a time.
    563 */
    564 
    565 UErrorCode convsample_13()
    566 {
    567   printf("\n\n==============================================\n"
    568          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
    569 
    570 
    571   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
    572   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
    573   const char *source, *sourceLimit;
    574   UChar32 target;
    575   UErrorCode status = U_ZERO_ERROR;
    576   UConverter *conv = NULL;
    577   int32_t srcCount=0;
    578   int32_t dstCount=0;
    579 
    580   srcCount = sizeof(sourceChars);
    581 
    582   conv = ucnv_open("Big5", &status);
    583   U_ASSERT(status);
    584 
    585   source = sourceChars;
    586   sourceLimit = sourceChars + sizeof(sourceChars);
    587 
    588   // **************************** START SAMPLE *******************
    589 
    590 
    591   printBytes("src",source,sourceLimit-source);
    592 
    593   while(source < sourceLimit)
    594   {
    595     puts("");
    596     target = ucnv_getNextUChar (conv,
    597                                 &source,
    598                                 sourceLimit,
    599                                 &status);
    600 
    601     //    printBytes("src",source,sourceLimit-source);
    602     U_ASSERT(status);
    603     printUChar(target);
    604     dstCount++;
    605   }
    606 
    607 
    608   // ************************** END SAMPLE *************************
    609 
    610   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
    611   ucnv_close(conv);
    612 
    613   return U_ZERO_ERROR;
    614 }
    615 
    616 
    617 
    618 
    619 UBool convsample_20_didSubstitute(const char *source)
    620 {
    621   UChar uchars[100];
    622   char bytes[100];
    623   UConverter *conv = NULL;
    624   UErrorCode status = U_ZERO_ERROR;
    625   uint32_t len, len2;
    626   UBool  flagVal;
    627 
    628   FromUFLAGContext * context = NULL;
    629 
    630   printf("\n\n==============================================\n"
    631          "Sample 20: C: Test for substitution using callbacks\n");
    632 
    633   /* print out the original source */
    634   printBytes("src", source);
    635   printf("\n");
    636 
    637   /* First, convert from UTF8 to unicode */
    638   conv = ucnv_open("utf-8", &status);
    639   U_ASSERT(status);
    640 
    641   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
    642   U_ASSERT(status);
    643 
    644   printUChars("uch", uchars, len);
    645   printf("\n");
    646 
    647   /* Now, close the converter */
    648   ucnv_close(conv);
    649 
    650   /* Now, convert to windows-1252 */
    651   conv = ucnv_open("windows-1252", &status);
    652   U_ASSERT(status);
    653 
    654   /* Converter starts out with the SUBSTITUTE callback set. */
    655 
    656   /* initialize our callback */
    657   context = flagCB_fromU_openContext();
    658 
    659   /* Set our special callback */
    660   ucnv_setFromUCallBack(conv,
    661                         flagCB_fromU,
    662                         context,
    663                         &(context->subCallback),
    664                         &(context->subContext),
    665                         &status);
    666 
    667   U_ASSERT(status);
    668 
    669   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
    670   U_ASSERT(status);
    671 
    672   flagVal = context->flag;  /* it's about to go away when we close the cnv */
    673 
    674   ucnv_close(conv);
    675 
    676   /* print out the original source */
    677   printBytes("bytes", bytes, len2);
    678 
    679   return flagVal; /* true if callback was called */
    680 }
    681 
    682 UErrorCode convsample_20()
    683 {
    684   const char *sample1 = "abc\xdf\xbf";
    685   const char *sample2 = "abc_def";
    686 
    687 
    688   if(convsample_20_didSubstitute(sample1))
    689   {
    690     printf("DID substitute.\n******\n");
    691   }
    692   else
    693   {
    694     printf("Did NOT substitute.\n*****\n");
    695   }
    696 
    697   if(convsample_20_didSubstitute(sample2))
    698   {
    699     printf("DID substitute.\n******\n");
    700   }
    701   else
    702   {
    703     printf("Did NOT substitute.\n*****\n");
    704   }
    705 
    706   return U_ZERO_ERROR;
    707 }
    708 
    709 // 21  - C, callback, with clone and debug
    710 
    711 
    712 
/*
 * Sample 21 helper: like the sample 20 helper, but the windows-1252 converter
 * gets a chain of three callbacks (debug -> flag -> debug), is then cloned
 * with ucnv_safeClone(), and the conversion is run on the clone after the
 * original converter has been closed.
 *
 * source  zero-terminated UTF-8 text
 * returns the `flag` field of the flag context found behind the clone's
 *         callback as read after the conversion, or FALSE if that context
 *         could not be retrieved
 */
UBool convsample_21_didSubstitute(const char *source)
{
  UChar uchars[100];
  char bytes[100];
  UConverter *conv = NULL, *cloneCnv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t len, len2;
  int32_t  cloneLen;   /* NOTE(review): not used anywhere in this function */
  UBool  flagVal = FALSE;
  UConverterFromUCallback junkCB;   /* receives the clone's callback pointer; its value is not used */

  FromUFLAGContext *flagCtx = NULL,
                   *cloneFlagCtx = NULL;

  debugCBContext   *debugCtx1 = NULL,
                   *debugCtx2 = NULL,
                   *cloneDebugCtx = NULL;

  printf("\n\n==============================================\n"
         "Sample 21: C: Test for substitution w/ callbacks & clones \n");

  /* print out the original source */
  printBytes("src", source);
  printf("\n");

  /* First, convert from UTF8 to unicode */
  conv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
  U_ASSERT(status);

  printUChars("uch", uchars, len);
  printf("\n");

  /* Now, close the converter */
  ucnv_close(conv);

  /* Now, convert to windows-1252 */
  conv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Converter starts out with the SUBSTITUTE callback set. */

  /* initialize our callback */
  /* from the 'bottom' innermost, out
   *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */

#if DEBUG_TMI
  printf("flagCB_fromU = %p\n", &flagCB_fromU);
  printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

  debugCtx1 = debugCB_openContext();
   flagCtx  = flagCB_fromU_openContext();
  debugCtx2 = debugCB_openContext();

  debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
  debugCtx1->subContext  =  flagCtx;

  flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
  flagCtx->subContext    =  debugCtx2;

  /* End of the chain; these two fields are also handed to
     ucnv_setFromUCallBack() below as the slots for the converter's previous
     callback and context. */
  debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
  debugCtx2->subContext  = NULL;

  /* Set our special callback */

  ucnv_setFromUCallBack(conv,
                        debugCB_fromU,
                        debugCtx1,
                        &(debugCtx2->subCallback),
                        &(debugCtx2->subContext),
                        &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
         conv, debugCtx1, debugCtx1->subCallback,
         debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

  /* Clone the fully configured converter; only the clone is used from here on. */
  cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
#endif

  ucnv_close(conv);

#if DEBUG_TMI
  printf("%p closed.\n", conv);
#endif

  U_ASSERT(status);
  /* Now, we have to extract the context */
  cloneDebugCtx = NULL;
  cloneFlagCtx  = NULL;

  /* The clone's outermost context is a debug context; the flag context is
     read from that debug context's subContext field. */
  ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
  if(cloneDebugCtx != NULL) {
      cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
  }

  printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
         cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

  len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
  U_ASSERT(status);

  if(cloneFlagCtx != NULL) {
      flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
  } else {
      printf("** Warning, couldn't get the subcallback \n");
  }

  ucnv_close(cloneCnv);

  /* print out the converted bytes */
  printBytes("bytes", bytes, len2);

  return flagVal; /* true if callback was called */
}
    839 
    840 UErrorCode convsample_21()
    841 {
    842   const char *sample1 = "abc\xdf\xbf";
    843   const char *sample2 = "abc_def";
    844 
    845   if(convsample_21_didSubstitute(sample1))
    846   {
    847     printf("DID substitute.\n******\n");
    848   }
    849   else
    850   {
    851     printf("Did NOT substitute.\n*****\n");
    852   }
    853 
    854   if(convsample_21_didSubstitute(sample2))
    855   {
    856     printf("DID substitute.\n******\n");
    857   }
    858   else
    859   {
    860     printf("Did NOT substitute.\n*****\n");
    861   }
    862 
    863   return U_ZERO_ERROR;
    864 }
    865 
    866 
    867 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
    868 
    869 #define BUFFERSIZE 17 /* make it interesting :) */
    870 
    871 UErrorCode convsample_40()
    872 {
    873   printf("\n\n==============================================\n"
    874     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
    875 
    876   FILE *f;
    877   FILE *out;
    878   int32_t count;
    879   char inBuf[BUFFERSIZE];
    880   const char *source;
    881   const char *sourceLimit;
    882   UChar *uBuf;
    883   UChar *target;
    884   UChar *targetLimit;
    885   int32_t uBufSize = 0;
    886   UConverter *conv = NULL;
    887   UErrorCode status = U_ZERO_ERROR;
    888   uint32_t inbytes=0, total=0;
    889 
    890   f = fopen("data02.bin", "rb");
    891   if(!f)
    892   {
    893     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    894     return U_FILE_ACCESS_ERROR;
    895   }
    896 
    897   out = fopen("data40.utf16", "wb");
    898   if(!out)
    899   {
    900     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    901     fclose(f);
    902     return U_FILE_ACCESS_ERROR;
    903   }
    904 
    905   // **************************** START SAMPLE *******************
    906   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
    907   assert(U_SUCCESS(status));
    908 
    909   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    910   printf("input bytes %d / min chars %d = %d UChars\n",
    911          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    912   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    913   assert(uBuf!=NULL);
    914 
    915   // grab another buffer's worth
    916   while((!feof(f)) &&
    917         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    918   {
    919     inbytes += count;
    920 
    921     // Convert bytes to unicode
    922     source = inBuf;
    923     sourceLimit = inBuf + count;
    924 
    925     do
    926     {
    927         target = uBuf;
    928         targetLimit = uBuf + uBufSize;
    929 
    930         ucnv_toUnicode( conv, &target, targetLimit,
    931                        &source, sourceLimit, NULL,
    932                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    933                                    /* is true (when no more data will come) */
    934                          &status);
    935 
    936         if(status == U_BUFFER_OVERFLOW_ERROR)
    937         {
    938           // simply ran out of space - we'll reset the target ptr the next
    939           // time through the loop.
    940           status = U_ZERO_ERROR;
    941         }
    942         else
    943         {
    944           //  Check other errors here.
    945           assert(U_SUCCESS(status));
    946           // Break out of the loop (by force)
    947         }
    948 
    949         // Process the Unicode
    950         // Todo: handle UTF-16/surrogates
    951         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
    952                (size_t)(target-uBuf));
    953         total += (target-uBuf);
    954     } while (source < sourceLimit); // while simply out of space
    955   }
    956 
    957   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
    958 
    959   // ***************************** END SAMPLE ********************
    960   ucnv_close(conv);
    961 
    962   fclose(f);
    963   fclose(out);
    964   printf("\n");
    965 
    966   return U_ZERO_ERROR;
    967 }
    968 #undef BUFFERSIZE
    969 
    970 
    971 
    972 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
    973 
    974 #define BUFFERSIZE 24 /* make it interesting :) */
    975 
    976 UErrorCode convsample_46()
    977 {
    978   printf("\n\n==============================================\n"
    979     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
    980 
    981   FILE *f;
    982   FILE *out;
    983   int32_t count;
    984   UChar inBuf[BUFFERSIZE];
    985   const UChar *source;
    986   const UChar *sourceLimit;
    987   char *buf;
    988   char *target;
    989   char *targetLimit;
    990 
    991   int32_t bufSize = 0;
    992   UConverter *conv = NULL;
    993   UErrorCode status = U_ZERO_ERROR;
    994   uint32_t inchars=0, total=0;
    995 
    996   f = fopen("data40.utf16", "rb");
    997   if(!f)
    998   {
    999     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
   1000     return U_FILE_ACCESS_ERROR;
   1001   }
   1002 
   1003   out = fopen("data46.out", "wb");
   1004   if(!out)
   1005   {
   1006     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
   1007     fclose(f);
   1008     return U_FILE_ACCESS_ERROR;
   1009   }
   1010 
   1011   // **************************** START SAMPLE *******************
   1012   conv = ucnv_open( "iso-8859-2", &status);
   1013   assert(U_SUCCESS(status));
   1014 
   1015   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
   1016   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
   1017          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
   1018   buf = (char*)malloc(bufSize * sizeof(char));
   1019   assert(buf!=NULL);
   1020 
   1021   // grab another buffer's worth
   1022   while((!feof(f)) &&
   1023         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
   1024   {
   1025     inchars += count;
   1026 
   1027     // Convert bytes to unicode
   1028     source = inBuf;
   1029     sourceLimit = inBuf + count;
   1030 
   1031     do
   1032     {
   1033         target = buf;
   1034         targetLimit = buf + bufSize;
   1035 
   1036         ucnv_fromUnicode( conv, &target, targetLimit,
   1037                        &source, sourceLimit, NULL,
   1038                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
   1039                                    /* is true (when no more data will come) */
   1040                          &status);
   1041 
   1042         if(status == U_BUFFER_OVERFLOW_ERROR)
   1043         {
   1044           // simply ran out of space - we'll reset the target ptr the next
   1045           // time through the loop.
   1046           status = U_ZERO_ERROR;
   1047         }
   1048         else
   1049         {
   1050           //  Check other errors here.
   1051           assert(U_SUCCESS(status));
   1052           // Break out of the loop (by force)
   1053         }
   1054 
   1055         // Process the Unicode
   1056         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
   1057                (size_t)(target-buf));
   1058         total += (target-buf);
   1059     } while (source < sourceLimit); // while simply out of space
   1060   }
   1061 
   1062   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
   1063 
   1064   // ***************************** END SAMPLE ********************
   1065   ucnv_close(conv);
   1066 
   1067   fclose(f);
   1068   fclose(out);
   1069   printf("\n");
   1070 
   1071   return U_ZERO_ERROR;
   1072 }
   1073 #undef BUFFERSIZE
   1074 
   1075 #define BUFFERSIZE 219
   1076 
   1077 void convsample_50() {
   1078   printf("\n\n==============================================\n"
   1079          "Sample 50: C: ucnv_detectUnicodeSignature\n");
   1080 
   1081   //! [ucnv_detectUnicodeSignature]
   1082   UErrorCode err = U_ZERO_ERROR;
   1083   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
   1084   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
   1085   int32_t signatureLength = 0;
   1086   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
   1087   UConverter *conv = NULL;
   1088   UChar output[100];
   1089   UChar *target = output, *out;
   1090   const char *source = input;
   1091   if(encoding!=NULL && U_SUCCESS(err)){
   1092     // should signature be discarded ?
   1093     conv = ucnv_open(encoding, &err);
   1094     // do the conversion
   1095     ucnv_toUnicode(conv,
   1096                    &target, output + UPRV_LENGTHOF(output),
   1097                    &source, input + sizeof(input),
   1098                    NULL, TRUE, &err);
   1099     out = output;
   1100     if (discardSignature){
   1101       ++out; // ignore initial U+FEFF
   1102     }
   1103     while(out != target) {
   1104       printf("%04x ", *out++);
   1105     }
   1106     puts("");
   1107   }
   1108   //! [ucnv_detectUnicodeSignature]
   1109   puts("");
   1110 }
   1111 
   1112 
   1113 
   1114 /* main */
   1115 
   1116 int main()
   1117 {
   1118 
   1119   printf("Default Converter=%s\n", ucnv_getDefaultName() );
   1120 
   1121   convsample_02();  // C  , u->koi8r, conv
   1122   convsample_03();  // C,   iterate
   1123 
   1124   convsample_05();  // C,  utf8->u, getNextUChar
   1125   convsample_06(); // C freq counter thingy
   1126 
   1127   convsample_12();  // C,  sjis->u, conv
   1128   convsample_13();  // C,  big5->u, getNextU
   1129 
   1130   convsample_20();  // C, callback
   1131   convsample_21();  // C, callback debug
   1132 
   1133   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
   1134 
   1135   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
   1136 
   1137   convsample_50();  // C, detect unicode signature
   1138 
   1139   printf("End of converter samples.\n");
   1140 
   1141   fflush(stdout);
   1142   fflush(stderr);
   1143 
   1144   return 0;
   1145 }
   1146