Home | History | Annotate | Download | only in ucnv
      1 /**************************************************************************
      2 *
      3 *   Copyright (C) 2000-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *
      6 ***************************************************************************
      7 *   file name:  convsamp.c
      8 *   encoding:   ASCII (7-bit)
      9 *
     10 *   created on: 2000may30
     11 *   created by: Steven R. Loomis
     12 *
     13 *   Sample code for the ICU conversion routines.
     14 *
     15 * Note: Nothing special is needed to build this sample. Link with
     16 *       the icu UC and icu I18N libraries.
     17 *
     18 *       I use 'assert' for error checking, you probably will want
     19 *       something more flexible.  '***BEGIN SAMPLE***' and
     20 *       '***END SAMPLE***' mark pieces suitable for stand alone
     21 *       code snippets.
     22 *
     23 *
*  Each test can define its own BUFFERSIZE
     25 *
     26 */
     27 
     28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
     29 
     30 #include <stdio.h>
     31 #include <ctype.h>            /* for isspace, etc.    */
     32 #include <assert.h>
     33 #include <string.h>
     34 #include <stdlib.h>  /* malloc */
     35 
     36 #include "unicode/utypes.h"   /* Basic ICU data types */
     37 #include "unicode/ucnv.h"     /* C   Converter API    */
     38 #include "unicode/ustring.h"  /* some more string fcns*/
     39 #include "unicode/uchar.h"    /* char names           */
     40 #include "unicode/uloc.h"
     41 #include "unicode/unistr.h"
     42 
     43 #include "flagcb.h"
     44 
     45 /* Some utility functions */
     46 
/* Empty (terminator-only) UChar string; the default 'uch' argument of printUChars(). */
static const UChar kNone[] = { 0x0000 };

/* If the UErrorCode 'x' indicates failure: flush stdout and stderr, print the
   argument's source text and its ICU error name to stderr, then trip assert().
   Expands to a brace block, so 'x' is evaluated more than once. */
#define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
     50 
     51 /* Print a UChar if possible, in seven characters. */
     52 void prettyPrintUChar(UChar c)
     53 {
     54   if(  (c <= 0x007F) &&
     55        (isgraph(c))  ) {
     56     printf(" '%c'   ", (char)(0x00FF&c));
     57   } else if ( c > 0x007F ) {
     58     char buf[1000];
     59     UErrorCode status = U_ZERO_ERROR;
     60     int32_t o;
     61 
     62     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
     63     if(U_SUCCESS(status) && (o>0) ) {
     64       buf[6] = 0;
     65       printf("%7s", buf);
     66     } else {
     67       printf(" ??????");
     68     }
     69   } else {
     70     switch((char)(c & 0x007F)) {
     71     case ' ':
     72       printf(" ' '   ");
     73       break;
     74     case '\t':
     75       printf(" \\t    ");
     76       break;
     77     case '\n':
     78       printf(" \\n    ");
     79       break;
     80     default:
     81       printf("  _    ");
     82       break;
     83     }
     84   }
     85 }
     86 
     87 
     88 void printUChars(const char  *name = "?",
     89                  const UChar *uch  = kNone,
     90                  int32_t     len   = -1 )
     91 {
     92   int32_t i;
     93 
     94   if( (len == -1) && (uch) ) {
     95     len = u_strlen(uch);
     96   }
     97 
     98   printf("%5s: ", name);
     99   for( i = 0; i <len; i++) {
    100     printf("%-6d ", i);
    101   }
    102   printf("\n");
    103 
    104   printf("%5s: ", "uni");
    105   for( i = 0; i <len; i++) {
    106     printf("\\u%04X ", (int)uch[i]);
    107   }
    108   printf("\n");
    109 
    110   printf("%5s:", "ch");
    111   for( i = 0; i <len; i++) {
    112     prettyPrintUChar(uch[i]);
    113   }
    114   printf("\n");
    115 }
    116 
    117 void printBytes(const char  *name = "?",
    118                  const char *uch  = "",
    119                  int32_t     len   = -1 )
    120 {
    121   int32_t i;
    122 
    123   if( (len == -1) && (uch) ) {
    124     len = strlen(uch);
    125   }
    126 
    127   printf("%5s: ", name);
    128   for( i = 0; i <len; i++) {
    129     printf("%-4d ", i);
    130   }
    131   printf("\n");
    132 
    133   printf("%5s: ", "uni");
    134   for( i = 0; i <len; i++) {
    135     printf("\\x%02X ", 0x00FF & (int)uch[i]);
    136   }
    137   printf("\n");
    138 
    139   printf("%5s:", "ch");
    140   for( i = 0; i <len; i++) {
    141     if(isgraph(0x00FF & (int)uch[i])) {
    142       printf(" '%c' ", (char)uch[i]);
    143     } else {
    144       printf("     ");
    145     }
    146   }
    147   printf("\n");
    148 }
    149 
    150 void printUChar(UChar32 ch32)
    151 {
    152     if(ch32 > 0xFFFF) {
    153       printf("ch: U+%06X\n", ch32);
    154     }
    155     else {
    156       UChar ch = (UChar)ch32;
    157       printUChars("C", &ch, 1);
    158     }
    159 }
    160 
    161 /*******************************************************************
    162   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
    163   followed by an exclamation mark (!) into the KOI8-R Russian code page.
    164 
    165   This example first creates a UChar String out of the Unicode chars.
    166 
    167   targetSize must be set to the amount of space available in the target
    168   buffer. After fromUChars is called,
    169   len will contain the number of bytes in target[] which were
    170   used in the resulting codepage.  In this case, there is a 1:1 mapping
    171   between the input and output characters. The exclamation mark has the
    172   same value in both KOI8-R and Unicode.
    173 
    174   src: 0      1      2      3      4      5      6
    175   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
    176    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
    177 
    178  targ:  0    1    2    3    4    5    6
    179   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
    180    ch:                                '!'
    181 
    182 
    183 Converting FROM unicode
    184   to koi8-r.
    185   You must call ucnv_close to clean up the memory used by the
    186   converter.
    187 
    188   'len' returns the number of OUTPUT bytes resulting from the
    189   conversion.
    190  */
    191 
    192 UErrorCode convsample_02()
    193 {
    194   printf("\n\n==============================================\n"
    195          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
    196 
    197 
    198   // **************************** START SAMPLE *******************
    199   // "cat<cat>OK"
    200   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
    201                      0x0430, 0x0021, 0x0000 };
    202   char target[100];
    203   UErrorCode status = U_ZERO_ERROR;
    204   UConverter *conv;
    205   int32_t     len;
    206 
    207   // set up the converter
    208   //! [ucnv_open]
    209   conv = ucnv_open("koi8-r", &status);
    210   //! [ucnv_open]
    211   assert(U_SUCCESS(status));
    212 
    213   // convert to koi8-r
    214   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
    215   assert(U_SUCCESS(status));
    216 
    217   // close the converter
    218   ucnv_close(conv);
    219 
    220   // ***************************** END SAMPLE ********************
    221 
    222   // Print it out
    223   printUChars("src", source);
    224   printf("\n");
    225   printBytes("targ", target, len);
    226 
    227   return U_ZERO_ERROR;
    228 }
    229 
    230 
    231 UErrorCode convsample_03()
    232 {
    233   printf("\n\n==============================================\n"
    234          "Sample 03: C: print out all converters\n");
    235 
    236   int32_t count;
    237   int32_t i;
    238 
    239   // **************************** START SAMPLE *******************
    240   count = ucnv_countAvailable();
    241   printf("Available converters: %d\n", count);
    242 
    243   for(i=0;i<count;i++)
    244   {
    245     printf("%s ", ucnv_getAvailableName(i));
    246   }
    247 
    248   // ***************************** END SAMPLE ********************
    249 
    250   printf("\n");
    251 
    252   return U_ZERO_ERROR;
    253 }
    254 
    255 
    256 
    257 #define BUFFERSIZE 17 /* make it interesting :) */
    258 
    259 /*
    260   Converting from a codepage to Unicode in bulk..
    261   What is the best way to determine the buffer size?
    262 
    263      The 'buffersize' is in bytes of input.
    For a given converter, dividing this by the minimum char size
    gives you the maximum number of Unicode characters that could be
    266     expected for a given number of input bytes.
    267      see: ucnv_getMinCharSize()
    268 
    269      For example, a single byte codepage like 'Latin-3' has a
    270     minimum char size of 1. (It takes at least 1 byte to represent
    271     each Unicode char.) So the unicode buffer has the same number of
    272     UChars as the input buffer has bytes.
    273 
    274      In a strictly double byte codepage such as cp1362 (Windows
    275     Korean), the minimum char size is 2. So, only half as many Unicode
    276     chars as bytes are needed.
    277 
    278      This work to calculate the buffer size is an optimization. Any
    279     size of input and output buffer can be used, as long as the
    280     program handles the following cases: If the input buffer is empty,
    281     the source pointer will be equal to sourceLimit.  If the output
    282     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
    283  */
    284 
    285 UErrorCode convsample_05()
    286 {
    287   printf("\n\n==============================================\n"
    288          "Sample 05: C: count the number of letters in a UTF-8 document\n");
    289 
    290   FILE *f;
    291   int32_t count;
    292   char inBuf[BUFFERSIZE];
    293   const char *source;
    294   const char *sourceLimit;
    295   UChar *uBuf;
    296   UChar *target;
    297   UChar *targetLimit;
    298   UChar *p;
    299   int32_t uBufSize = 0;
    300   UConverter *conv;
    301   UErrorCode status = U_ZERO_ERROR;
    302   uint32_t letters=0, total=0;
    303 
    304   f = fopen("data01.txt", "r");
    305   if(!f)
    306   {
    307     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    308     return U_FILE_ACCESS_ERROR;
    309   }
    310 
    311   // **************************** START SAMPLE *******************
    312   conv = ucnv_open("utf-8", &status);
    313   assert(U_SUCCESS(status));
    314 
    315   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    316   printf("input bytes %d / min chars %d = %d UChars\n",
    317          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    318   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    319   assert(uBuf!=NULL);
    320 
    321   // grab another buffer's worth
    322   while((!feof(f)) &&
    323         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    324   {
    325     // Convert bytes to unicode
    326     source = inBuf;
    327     sourceLimit = inBuf + count;
    328 
    329     do
    330     {
    331         target = uBuf;
    332         targetLimit = uBuf + uBufSize;
    333 
    334         ucnv_toUnicode(conv, &target, targetLimit,
    335                        &source, sourceLimit, NULL,
    336                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    337                                    /* is true (when no more data will come) */
    338                        &status);
    339 
    340         if(status == U_BUFFER_OVERFLOW_ERROR)
    341         {
    342           // simply ran out of space - we'll reset the target ptr the next
    343           // time through the loop.
    344           status = U_ZERO_ERROR;
    345         }
    346         else
    347         {
    348           //  Check other errors here.
    349           assert(U_SUCCESS(status));
    350           // Break out of the loop (by force)
    351         }
    352 
    353         // Process the Unicode
    354         // Todo: handle UTF-16/surrogates
    355 
    356         for(p = uBuf; p<target; p++)
    357         {
    358           if(u_isalpha(*p))
    359             letters++;
    360           total++;
    361         }
    362     } while (source < sourceLimit); // while simply out of space
    363   }
    364 
    365   printf("%d letters out of %d total UChars.\n", letters, total);
    366 
    367   // ***************************** END SAMPLE ********************
    368   ucnv_close(conv);
    369 
    370   printf("\n");
    371 
    372   fclose(f);
    373 
    374   return U_ZERO_ERROR;
    375 }
    376 #undef BUFFERSIZE
    377 
    378 #define BUFFERSIZE 1024
/* One entry of the per-code-point frequency table built by convsample_06(). */
typedef struct
{
  UChar32  codepoint;   /* the code point this entry belongs to */
  uint32_t frequency;   /* how many times that code point has been seen */
} CharFreqInfo;
    384 
    385 UErrorCode convsample_06()
    386 {
    387   printf("\n\n==============================================\n"
    388          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
    389 
    390   FILE *f;
    391   int32_t count;
    392   char inBuf[BUFFERSIZE];
    393   const char *source;
    394   const char *sourceLimit;
    395   int32_t uBufSize = 0;
    396   UConverter *conv;
    397   UErrorCode status = U_ZERO_ERROR;
    398   uint32_t letters=0, total=0;
    399 
    400   CharFreqInfo   *info;
    401   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
    402   UChar32   p;
    403 
    404   uint32_t ie = 0;
    405   uint32_t gh = 0;
    406   UChar32 l = 0;
    407 
    408   f = fopen("data06.txt", "r");
    409   if(!f)
    410   {
    411     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    412     return U_FILE_ACCESS_ERROR;
    413   }
    414 
    415   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
    416   if(!info)
    417   {
    418     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
    419   }
    420 
    421   /* reset frequencies */
    422   for(p=0;p<charCount;p++)
    423   {
    424     info[p].codepoint = p;
    425     info[p].frequency = 0;
    426   }
    427 
    428   // **************************** START SAMPLE *******************
    429   conv = ucnv_open("utf-8", &status);
    430   assert(U_SUCCESS(status));
    431 
    432   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    433   printf("input bytes %d / min chars %d = %d UChars\n",
    434          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    435 
    436   // grab another buffer's worth
    437   while((!feof(f)) &&
    438         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    439   {
    440     // Convert bytes to unicode
    441     source = inBuf;
    442     sourceLimit = inBuf + count;
    443 
    444     while(source < sourceLimit)
    445     {
    446       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
    447       if(U_FAILURE(status))
    448       {
    449         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
    450         status = U_ZERO_ERROR;
    451         continue;
    452       }
    453       U_ASSERT(status);
    454       total++;
    455 
    456       if(u_isalpha(p))
    457         letters++;
    458 
    459       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
    460         ie++;
    461 
    462       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
    463         gh++;
    464 
    465       if(p>charCount)
    466       {
    467         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
    468         free(info);
    469         fclose(f);
    470         ucnv_close(conv);
    471         return U_UNSUPPORTED_ERROR;
    472       }
    473       info[p].frequency++;
    474       l = p;
    475     }
    476   }
    477 
    478   fclose(f);
    479   ucnv_close(conv);
    480 
    481   printf("%d letters out of %d total UChars.\n", letters, total);
    482   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
    483 
    484   // now, we could sort it..
    485 
    486   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
    487 
    488   for(p=0;p<charCount;p++)
    489   {
    490     if(info[p].frequency)
    491     {
    492       printf("% 5d U+%06X ", info[p].frequency, p);
    493       if(p <= 0xFFFF)
    494       {
    495         prettyPrintUChar((UChar)p);
    496       }
    497       printf("\n");
    498     }
    499   }
    500   free(info);
    501   // ***************************** END SAMPLE ********************
    502 
    503   printf("\n");
    504 
    505   return U_ZERO_ERROR;
    506 }
    507 #undef BUFFERSIZE
    508 
    509 
    510 /******************************************************
    511   You must call ucnv_close to clean up the memory used by the
    512   converter.
    513 
  'len' returns the number of OUTPUT UChars resulting from the
  conversion.
    516  */
    517 
    518 UErrorCode convsample_12()
    519 {
    520   printf("\n\n==============================================\n"
    521          "Sample 12: C: simple sjis -> unicode conversion\n");
    522 
    523 
    524   // **************************** START SAMPLE *******************
    525 
    526   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
    527   UChar target[100];
    528   UErrorCode status = U_ZERO_ERROR;
    529   UConverter *conv;
    530   int32_t     len;
    531 
    532   // set up the converter
    533   conv = ucnv_open("shift_jis", &status);
    534   assert(U_SUCCESS(status));
    535 
    536   // convert to Unicode
    537   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
    538   target[6] = 0xFDCA;
    539   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
    540   U_ASSERT(status);
    541   // close the converter
    542   ucnv_close(conv);
    543 
    544   // ***************************** END SAMPLE ********************
    545 
    546   // Print it out
    547   printBytes("src", source, strlen(source) );
    548   printf("\n");
    549   printUChars("targ", target, len);
    550 
    551   return U_ZERO_ERROR;
    552 }
    553 
    554 /******************************************************************
    555    C: Convert from codepage to Unicode one at a time.
    556 */
    557 
    558 UErrorCode convsample_13()
    559 {
    560   printf("\n\n==============================================\n"
    561          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
    562 
    563 
    564   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
    565   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
    566   const char *source, *sourceLimit;
    567   UChar32 target;
    568   UErrorCode status = U_ZERO_ERROR;
    569   UConverter *conv = NULL;
    570   int32_t srcCount=0;
    571   int32_t dstCount=0;
    572 
    573   srcCount = sizeof(sourceChars);
    574 
    575   conv = ucnv_open("Big5", &status);
    576   U_ASSERT(status);
    577 
    578   source = sourceChars;
    579   sourceLimit = sourceChars + sizeof(sourceChars);
    580 
    581   // **************************** START SAMPLE *******************
    582 
    583 
    584   printBytes("src",source,sourceLimit-source);
    585 
    586   while(source < sourceLimit)
    587   {
    588     puts("");
    589     target = ucnv_getNextUChar (conv,
    590                                 &source,
    591                                 sourceLimit,
    592                                 &status);
    593 
    594     //    printBytes("src",source,sourceLimit-source);
    595     U_ASSERT(status);
    596     printUChar(target);
    597     dstCount++;
    598   }
    599 
    600 
    601   // ************************** END SAMPLE *************************
    602 
    603   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
    604   ucnv_close(conv);
    605 
    606   return U_ZERO_ERROR;
    607 }
    608 
    609 
    610 
    611 
    612 UBool convsample_20_didSubstitute(const char *source)
    613 {
    614   UChar uchars[100];
    615   char bytes[100];
    616   UConverter *conv = NULL;
    617   UErrorCode status = U_ZERO_ERROR;
    618   uint32_t len, len2;
    619   UBool  flagVal;
    620 
    621   FromUFLAGContext * context = NULL;
    622 
    623   printf("\n\n==============================================\n"
    624          "Sample 20: C: Test for substitution using callbacks\n");
    625 
    626   /* print out the original source */
    627   printBytes("src", source);
    628   printf("\n");
    629 
    630   /* First, convert from UTF8 to unicode */
    631   conv = ucnv_open("utf-8", &status);
    632   U_ASSERT(status);
    633 
    634   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
    635   U_ASSERT(status);
    636 
    637   printUChars("uch", uchars, len);
    638   printf("\n");
    639 
    640   /* Now, close the converter */
    641   ucnv_close(conv);
    642 
    643   /* Now, convert to windows-1252 */
    644   conv = ucnv_open("windows-1252", &status);
    645   U_ASSERT(status);
    646 
    647   /* Converter starts out with the SUBSTITUTE callback set. */
    648 
    649   /* initialize our callback */
    650   context = flagCB_fromU_openContext();
    651 
    652   /* Set our special callback */
    653   ucnv_setFromUCallBack(conv,
    654                         flagCB_fromU,
    655                         context,
    656                         &(context->subCallback),
    657                         &(context->subContext),
    658                         &status);
    659 
    660   U_ASSERT(status);
    661 
    662   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
    663   U_ASSERT(status);
    664 
    665   flagVal = context->flag;  /* it's about to go away when we close the cnv */
    666 
    667   ucnv_close(conv);
    668 
    669   /* print out the original source */
    670   printBytes("bytes", bytes, len2);
    671 
    672   return flagVal; /* true if callback was called */
    673 }
    674 
    675 UErrorCode convsample_20()
    676 {
    677   const char *sample1 = "abc\xdf\xbf";
    678   const char *sample2 = "abc_def";
    679 
    680 
    681   if(convsample_20_didSubstitute(sample1))
    682   {
    683     printf("DID substitute.\n******\n");
    684   }
    685   else
    686   {
    687     printf("Did NOT substitute.\n*****\n");
    688   }
    689 
    690   if(convsample_20_didSubstitute(sample2))
    691   {
    692     printf("DID substitute.\n******\n");
    693   }
    694   else
    695   {
    696     printf("Did NOT substitute.\n*****\n");
    697   }
    698 
    699   return U_ZERO_ERROR;
    700 }
    701 
    702 // 21  - C, callback, with clone and debug
    703 
    704 
    705 
/*
 * Convert 'source' (UTF-8) to UChars, then to windows-1252 through a chain of
 * callbacks (debug -> flag -> debug -> previous callback). The conversion is
 * run on a clone of the converter after the original has been closed, so the
 * flag has to be read back from the clone's own callback context.
 *
 * Returns the clone's flag context 'flag' value, or FALSE if the clone's
 * callback context could not be retrieved.
 */
UBool convsample_21_didSubstitute(const char *source)
{
  UChar uchars[100];
  char bytes[100];
  UConverter *conv = NULL, *cloneCnv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t len, len2;
  int32_t  cloneLen;
  UBool  flagVal = FALSE;
  UConverterFromUCallback junkCB;   /* receives the clone's callback pointer; not used afterwards */

  FromUFLAGContext *flagCtx = NULL,
                   *cloneFlagCtx = NULL;

  debugCBContext   *debugCtx1 = NULL,
                   *debugCtx2 = NULL,
                   *cloneDebugCtx = NULL;

  printf("\n\n==============================================\n"
         "Sample 21: C: Test for substitution w/ callbacks & clones \n");

  /* print out the original source */
  printBytes("src", source);
  printf("\n");

  /* First, convert from UTF8 to unicode */
  conv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
  U_ASSERT(status);

  printUChars("uch", uchars, len);
  printf("\n");

  /* Now, close the converter */
  ucnv_close(conv);

  /* Now, convert to windows-1252 */
  conv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Converter starts out with the SUBSTITUTE callback set. */

  /* initialize our callback */
  /* from the 'bottom' innermost, out
   *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */

#if DEBUG_TMI
  printf("flagCB_fromU = %p\n", &flagCB_fromU);
  printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

  debugCtx1 = debugCB_openContext();
   flagCtx  = flagCB_fromU_openContext();
  debugCtx2 = debugCB_openContext();

  /* Link the three contexts into a chain by hand. */
  debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
  debugCtx1->subContext  =  flagCtx;

  flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
  flagCtx->subContext    =  debugCtx2;

  debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
  debugCtx2->subContext  = NULL;

  /* Set our special callback: debugCtx1 is the head of the chain. The
     converter's previous callback and context are written into debugCtx2's
     sub-callback fields, the tail of the chain. */

  ucnv_setFromUCallBack(conv,
                        debugCB_fromU,
                        debugCtx1,
                        &(debugCtx2->subCallback),
                        &(debugCtx2->subContext),
                        &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
         conv, debugCtx1, debugCtx1->subCallback,
         debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

  cloneLen = 1; /* but passing in null so it will clone */
  cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
#endif

  /* Close the original; only the clone is used from here on. */
  ucnv_close(conv);

#if DEBUG_TMI
  printf("%p closed.\n", conv);
#endif

  U_ASSERT(status);
  /* Now, we have to extract the context */
  cloneDebugCtx = NULL;
  cloneFlagCtx  = NULL;

  /* The clone's top-level context is the head of its chain; the flag context
     is that context's subContext. */
  ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
  if(cloneDebugCtx != NULL) {
      cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
  }

  printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
         cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

  len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
  U_ASSERT(status);

  if(cloneFlagCtx != NULL) {
      flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
  } else {
      printf("** Warning, couldn't get the subcallback \n");
  }

  ucnv_close(cloneCnv);

  /* print out the converted bytes */
  printBytes("bytes", bytes, len2);

  return flagVal; /* true if callback was called */
}
    833 
    834 UErrorCode convsample_21()
    835 {
    836   const char *sample1 = "abc\xdf\xbf";
    837   const char *sample2 = "abc_def";
    838 
    839   if(convsample_21_didSubstitute(sample1))
    840   {
    841     printf("DID substitute.\n******\n");
    842   }
    843   else
    844   {
    845     printf("Did NOT substitute.\n*****\n");
    846   }
    847 
    848   if(convsample_21_didSubstitute(sample2))
    849   {
    850     printf("DID substitute.\n******\n");
    851   }
    852   else
    853   {
    854     printf("Did NOT substitute.\n*****\n");
    855   }
    856 
    857   return U_ZERO_ERROR;
    858 }
    859 
    860 
    861 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
    862 
    863 #define BUFFERSIZE 17 /* make it interesting :) */
    864 
    865 UErrorCode convsample_40()
    866 {
    867   printf("\n\n==============================================\n"
    868     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
    869 
    870   FILE *f;
    871   FILE *out;
    872   int32_t count;
    873   char inBuf[BUFFERSIZE];
    874   const char *source;
    875   const char *sourceLimit;
    876   UChar *uBuf;
    877   UChar *target;
    878   UChar *targetLimit;
    879   int32_t uBufSize = 0;
    880   UConverter *conv = NULL;
    881   UErrorCode status = U_ZERO_ERROR;
    882   uint32_t inbytes=0, total=0;
    883 
    884   f = fopen("data02.bin", "rb");
    885   if(!f)
    886   {
    887     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    888     return U_FILE_ACCESS_ERROR;
    889   }
    890 
    891   out = fopen("data40.utf16", "wb");
    892   if(!out)
    893   {
    894     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    895     fclose(f);
    896     return U_FILE_ACCESS_ERROR;
    897   }
    898 
    899   // **************************** START SAMPLE *******************
    900   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
    901   assert(U_SUCCESS(status));
    902 
    903   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    904   printf("input bytes %d / min chars %d = %d UChars\n",
    905          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    906   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    907   assert(uBuf!=NULL);
    908 
    909   // grab another buffer's worth
    910   while((!feof(f)) &&
    911         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    912   {
    913     inbytes += count;
    914 
    915     // Convert bytes to unicode
    916     source = inBuf;
    917     sourceLimit = inBuf + count;
    918 
    919     do
    920     {
    921         target = uBuf;
    922         targetLimit = uBuf + uBufSize;
    923 
    924         ucnv_toUnicode( conv, &target, targetLimit,
    925                        &source, sourceLimit, NULL,
    926                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    927                                    /* is true (when no more data will come) */
    928                          &status);
    929 
    930         if(status == U_BUFFER_OVERFLOW_ERROR)
    931         {
    932           // simply ran out of space - we'll reset the target ptr the next
    933           // time through the loop.
    934           status = U_ZERO_ERROR;
    935         }
    936         else
    937         {
    938           //  Check other errors here.
    939           assert(U_SUCCESS(status));
    940           // Break out of the loop (by force)
    941         }
    942 
    943         // Process the Unicode
    944         // Todo: handle UTF-16/surrogates
    945         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
    946                (size_t)(target-uBuf));
    947         total += (target-uBuf);
    948     } while (source < sourceLimit); // while simply out of space
    949   }
    950 
    951   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
    952 
    953   // ***************************** END SAMPLE ********************
    954   ucnv_close(conv);
    955 
    956   fclose(f);
    957   fclose(out);
    958   printf("\n");
    959 
    960   return U_ZERO_ERROR;
    961 }
    962 #undef BUFFERSIZE
    963 
    964 
    965 
    966 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
    967 
    968 #define BUFFERSIZE 24 /* make it interesting :) */
    969 
    970 UErrorCode convsample_46()
    971 {
    972   printf("\n\n==============================================\n"
    973     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
    974 
    975   FILE *f;
    976   FILE *out;
    977   int32_t count;
    978   UChar inBuf[BUFFERSIZE];
    979   const UChar *source;
    980   const UChar *sourceLimit;
    981   char *buf;
    982   char *target;
    983   char *targetLimit;
    984 
    985   int32_t bufSize = 0;
    986   UConverter *conv = NULL;
    987   UErrorCode status = U_ZERO_ERROR;
    988   uint32_t inchars=0, total=0;
    989 
    990   f = fopen("data40.utf16", "rb");
    991   if(!f)
    992   {
    993     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
    994     return U_FILE_ACCESS_ERROR;
    995   }
    996 
    997   out = fopen("data46.out", "wb");
    998   if(!out)
    999   {
   1000     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
   1001     fclose(f);
   1002     return U_FILE_ACCESS_ERROR;
   1003   }
   1004 
   1005   // **************************** START SAMPLE *******************
   1006   conv = ucnv_open( "iso-8859-2", &status);
   1007   assert(U_SUCCESS(status));
   1008 
   1009   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
   1010   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
   1011          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
   1012   buf = (char*)malloc(bufSize * sizeof(char));
   1013   assert(buf!=NULL);
   1014 
   1015   // grab another buffer's worth
   1016   while((!feof(f)) &&
   1017         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
   1018   {
   1019     inchars += count;
   1020 
   1021     // Convert bytes to unicode
   1022     source = inBuf;
   1023     sourceLimit = inBuf + count;
   1024 
   1025     do
   1026     {
   1027         target = buf;
   1028         targetLimit = buf + bufSize;
   1029 
   1030         ucnv_fromUnicode( conv, &target, targetLimit,
   1031                        &source, sourceLimit, NULL,
   1032                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
   1033                                    /* is true (when no more data will come) */
   1034                          &status);
   1035 
   1036         if(status == U_BUFFER_OVERFLOW_ERROR)
   1037         {
   1038           // simply ran out of space - we'll reset the target ptr the next
   1039           // time through the loop.
   1040           status = U_ZERO_ERROR;
   1041         }
   1042         else
   1043         {
   1044           //  Check other errors here.
   1045           assert(U_SUCCESS(status));
   1046           // Break out of the loop (by force)
   1047         }
   1048 
   1049         // Process the Unicode
   1050         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
   1051                (size_t)(target-buf));
   1052         total += (target-buf);
   1053     } while (source < sourceLimit); // while simply out of space
   1054   }
   1055 
   1056   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
   1057 
   1058   // ***************************** END SAMPLE ********************
   1059   ucnv_close(conv);
   1060 
   1061   fclose(f);
   1062   fclose(out);
   1063   printf("\n");
   1064 
   1065   return U_ZERO_ERROR;
   1066 }
   1067 #undef BUFFERSIZE
   1068 
   1069 #define BUFFERSIZE 219
   1070 
   1071 void convsample_50() {
   1072   printf("\n\n==============================================\n"
   1073          "Sample 50: C: ucnv_detectUnicodeSignature\n");
   1074 
   1075   //! [ucnv_detectUnicodeSignature]
   1076   UErrorCode err = U_ZERO_ERROR;
   1077   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
   1078   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
   1079   int32_t signatureLength = 0;
   1080   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
   1081   UConverter *conv = NULL;
   1082   UChar output[100];
   1083   UChar *target = output, *out;
   1084   const char *source = input;
   1085   if(encoding!=NULL && U_SUCCESS(err)){
   1086     // should signature be discarded ?
   1087     conv = ucnv_open(encoding, &err);
   1088     // do the conversion
   1089     ucnv_toUnicode(conv,
   1090                    &target, output + sizeof(output)/U_SIZEOF_UCHAR,
   1091                    &source, input + sizeof(input),
   1092                    NULL, TRUE, &err);
   1093     out = output;
   1094     if (discardSignature){
   1095       ++out; // ignore initial U+FEFF
   1096     }
   1097     while(out != target) {
   1098       printf("%04x ", *out++);
   1099     }
   1100     puts("");
   1101   }
   1102   //! [ucnv_detectUnicodeSignature]
   1103   puts("");
   1104 }
   1105 
   1106 
   1107 
   1108 /* main */
   1109 
   1110 int main()
   1111 {
   1112 
   1113   printf("Default Converter=%s\n", ucnv_getDefaultName() );
   1114 
   1115   convsample_02();  // C  , u->koi8r, conv
   1116   convsample_03();  // C,   iterate
   1117 
   1118   convsample_05();  // C,  utf8->u, getNextUChar
   1119   convsample_06(); // C freq counter thingy
   1120 
   1121   convsample_12();  // C,  sjis->u, conv
   1122   convsample_13();  // C,  big5->u, getNextU
   1123 
   1124   convsample_20();  // C, callback
   1125   convsample_21();  // C, callback debug
   1126 
   1127   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
   1128 
   1129   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
   1130 
   1131   convsample_50();  // C, detect unicode signature
   1132 
   1133   printf("End of converter samples.\n");
   1134 
   1135   fflush(stdout);
   1136   fflush(stderr);
   1137 
   1138   return 0;
   1139 }
   1140