Home | History | Annotate | Download | only in ucnv
      1 /**************************************************************************
      2 *
      3 *   Copyright (C) 2000-2013, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *
      6 ***************************************************************************
      7 *   file name:  convsamp.c
      8 *   encoding:   ASCII (7-bit)
      9 *
     10 *   created on: 2000may30
     11 *   created by: Steven R. Loomis
     12 *
     13 *   Sample code for the ICU conversion routines.
     14 *
     15 * Note: Nothing special is needed to build this sample. Link with
     16 *       the icu UC and icu I18N libraries.
     17 *
     18 *       I use 'assert' for error checking, you probably will want
     19 *       something more flexible.  '***BEGIN SAMPLE***' and
     20 *       '***END SAMPLE***' mark pieces suitable for stand alone
     21 *       code snippets.
     22 *
     23 *
*  Each test can define its own BUFFERSIZE
     25 *
     26 */
     27 
     28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
     29 
     30 #include <stdio.h>
     31 #include <ctype.h>            /* for isspace, etc.    */
     32 #include <assert.h>
     33 #include <string.h>
     34 #include <stdlib.h>  /* malloc */
     35 
     36 #include "unicode/utypes.h"   /* Basic ICU data types */
     37 #include "unicode/ucnv.h"     /* C   Converter API    */
     38 #include "unicode/ustring.h"  /* some more string fcns*/
     39 #include "unicode/uchar.h"    /* char names           */
     40 #include "unicode/uloc.h"
     41 #include "unicode/unistr.h"
     42 
     43 #include "flagcb.h"
     44 
     45 /* Some utility functions */
     46 
     47 static const UChar kNone[] = { 0x0000 };
     48 
     49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
     50 
     51 /* Print a UChar if possible, in seven characters. */
     52 void prettyPrintUChar(UChar c)
     53 {
     54   if(  (c <= 0x007F) &&
     55        (isgraph(c))  ) {
     56     printf(" '%c'   ", (char)(0x00FF&c));
     57   } else if ( c > 0x007F ) {
     58     char buf[1000];
     59     UErrorCode status = U_ZERO_ERROR;
     60     int32_t o;
     61 
     62     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
     63     if(U_SUCCESS(status) && (o>0) ) {
     64       buf[6] = 0;
     65       printf("%7s", buf);
     66     } else {
     67       printf(" ??????");
     68     }
     69   } else {
     70     switch((char)(c & 0x007F)) {
     71     case ' ':
     72       printf(" ' '   ");
     73       break;
     74     case '\t':
     75       printf(" \\t    ");
     76       break;
     77     case '\n':
     78       printf(" \\n    ");
     79       break;
     80     default:
     81       printf("  _    ");
     82       break;
     83     }
     84   }
     85 }
     86 
     87 
     88 void printUChars(const char  *name = "?",
     89                  const UChar *uch  = kNone,
     90                  int32_t     len   = -1 )
     91 {
     92   int32_t i;
     93 
     94   if( (len == -1) && (uch) ) {
     95     len = u_strlen(uch);
     96   }
     97 
     98   printf("%5s: ", name);
     99   for( i = 0; i <len; i++) {
    100     printf("%-6d ", i);
    101   }
    102   printf("\n");
    103 
    104   printf("%5s: ", "uni");
    105   for( i = 0; i <len; i++) {
    106     printf("\\u%04X ", (int)uch[i]);
    107   }
    108   printf("\n");
    109 
    110   printf("%5s:", "ch");
    111   for( i = 0; i <len; i++) {
    112     prettyPrintUChar(uch[i]);
    113   }
    114   printf("\n");
    115 }
    116 
    117 void printBytes(const char  *name = "?",
    118                  const char *uch  = "",
    119                  int32_t     len   = -1 )
    120 {
    121   int32_t i;
    122 
    123   if( (len == -1) && (uch) ) {
    124     len = strlen(uch);
    125   }
    126 
    127   printf("%5s: ", name);
    128   for( i = 0; i <len; i++) {
    129     printf("%-4d ", i);
    130   }
    131   printf("\n");
    132 
    133   printf("%5s: ", "uni");
    134   for( i = 0; i <len; i++) {
    135     printf("\\x%02X ", 0x00FF & (int)uch[i]);
    136   }
    137   printf("\n");
    138 
    139   printf("%5s:", "ch");
    140   for( i = 0; i <len; i++) {
    141     if(isgraph(0x00FF & (int)uch[i])) {
    142       printf(" '%c' ", (char)uch[i]);
    143     } else {
    144       printf("     ");
    145     }
    146   }
    147   printf("\n");
    148 }
    149 
    150 void printUChar(UChar32 ch32)
    151 {
    152     if(ch32 > 0xFFFF) {
    153       printf("ch: U+%06X\n", ch32);
    154     }
    155     else {
    156       UChar ch = (UChar)ch32;
    157       printUChars("C", &ch, 1);
    158     }
    159 }
    160 
    161 /*******************************************************************
    162   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
    163   followed by an exclamation mark (!) into the KOI8-R Russian code page.
    164 
    165   This example first creates a UChar String out of the Unicode chars.
    166 
    167   targetSize must be set to the amount of space available in the target
    168   buffer. After fromUChars is called,
    169   len will contain the number of bytes in target[] which were
    170   used in the resulting codepage.  In this case, there is a 1:1 mapping
    171   between the input and output characters. The exclamation mark has the
    172   same value in both KOI8-R and Unicode.
    173 
    174   src: 0      1      2      3      4      5      6
    175   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
    176    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
    177 
    178  targ:  0    1    2    3    4    5    6
    179   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
    180    ch:                                '!'
    181 
    182 
    183 Converting FROM unicode
    184   to koi8-r.
    185   You must call ucnv_close to clean up the memory used by the
    186   converter.
    187 
    188   'len' returns the number of OUTPUT bytes resulting from the
    189   conversion.
    190  */
    191 
    192 UErrorCode convsample_02()
    193 {
    194   printf("\n\n==============================================\n"
    195          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
    196 
    197 
    198   // **************************** START SAMPLE *******************
    199   // "cat<cat>OK"
    200   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
    201                      0x0430, 0x0021, 0x0000 };
    202   char target[100];
    203   UErrorCode status = U_ZERO_ERROR;
    204   UConverter *conv;
    205   int32_t     len;
    206 
    207   // set up the converter
    208   //! [ucnv_open]
    209   conv = ucnv_open("koi8-r", &status);
    210   //! [ucnv_open]
    211   assert(U_SUCCESS(status));
    212 
    213   // convert to koi8-r
    214   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
    215   assert(U_SUCCESS(status));
    216 
    217   // close the converter
    218   ucnv_close(conv);
    219 
    220   // ***************************** END SAMPLE ********************
    221 
    222   // Print it out
    223   printUChars("src", source);
    224   printf("\n");
    225   printBytes("targ", target, len);
    226 
    227   return U_ZERO_ERROR;
    228 }
    229 
    230 
    231 UErrorCode convsample_03()
    232 {
    233   printf("\n\n==============================================\n"
    234          "Sample 03: C: print out all converters\n");
    235 
    236   int32_t count;
    237   int32_t i;
    238 
    239   // **************************** START SAMPLE *******************
    240   count = ucnv_countAvailable();
    241   printf("Available converters: %d\n", count);
    242 
    243   for(i=0;i<count;i++)
    244   {
    245     printf("%s ", ucnv_getAvailableName(i));
    246   }
    247 
    248   // ***************************** END SAMPLE ********************
    249 
    250   printf("\n");
    251 
    252   return U_ZERO_ERROR;
    253 }
    254 
    255 
    256 
    257 #define BUFFERSIZE 17 /* make it interesting :) */
    258 
    259 /*
    260   Converting from a codepage to Unicode in bulk..
    261   What is the best way to determine the buffer size?
    262 
     The 'buffersize' is in bytes of input.
    For a given converter, dividing this by the minimum char size
    gives you the maximum number of Unicode characters that could be
    expected for a given number of input bytes.
     see: ucnv_getMinCharSize()
    268 
    269      For example, a single byte codepage like 'Latin-3' has a
    270     minimum char size of 1. (It takes at least 1 byte to represent
    271     each Unicode char.) So the unicode buffer has the same number of
    272     UChars as the input buffer has bytes.
    273 
    274      In a strictly double byte codepage such as cp1362 (Windows
    275     Korean), the minimum char size is 2. So, only half as many Unicode
    276     chars as bytes are needed.
    277 
    278      This work to calculate the buffer size is an optimization. Any
    279     size of input and output buffer can be used, as long as the
    280     program handles the following cases: If the input buffer is empty,
    281     the source pointer will be equal to sourceLimit.  If the output
    282     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
    283  */
    284 
    285 UErrorCode convsample_05()
    286 {
    287   printf("\n\n==============================================\n"
    288          "Sample 05: C: count the number of letters in a UTF-8 document\n");
    289 
    290   FILE *f;
    291   int32_t count;
    292   char inBuf[BUFFERSIZE];
    293   const char *source;
    294   const char *sourceLimit;
    295   UChar *uBuf;
    296   UChar *target;
    297   UChar *targetLimit;
    298   UChar *p;
    299   int32_t uBufSize = 0;
    300   UConverter *conv;
    301   UErrorCode status = U_ZERO_ERROR;
    302   uint32_t letters=0, total=0;
    303 
    304   f = fopen("data01.txt", "r");
    305   if(!f)
    306   {
    307     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    308     return U_FILE_ACCESS_ERROR;
    309   }
    310 
    311   // **************************** START SAMPLE *******************
    312   conv = ucnv_open("utf-8", &status);
    313   assert(U_SUCCESS(status));
    314 
    315   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    316   printf("input bytes %d / min chars %d = %d UChars\n",
    317          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    318   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    319   assert(uBuf!=NULL);
    320 
    321   // grab another buffer's worth
    322   while((!feof(f)) &&
    323         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    324   {
    325     // Convert bytes to unicode
    326     source = inBuf;
    327     sourceLimit = inBuf + count;
    328 
    329     do
    330     {
    331         target = uBuf;
    332         targetLimit = uBuf + uBufSize;
    333 
    334         ucnv_toUnicode(conv, &target, targetLimit,
    335                        &source, sourceLimit, NULL,
    336                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    337                                    /* is true (when no more data will come) */
    338                        &status);
    339 
    340         if(status == U_BUFFER_OVERFLOW_ERROR)
    341         {
    342           // simply ran out of space - we'll reset the target ptr the next
    343           // time through the loop.
    344           status = U_ZERO_ERROR;
    345         }
    346         else
    347         {
    348           //  Check other errors here.
    349           assert(U_SUCCESS(status));
    350           // Break out of the loop (by force)
    351         }
    352 
    353         // Process the Unicode
    354         // Todo: handle UTF-16/surrogates
    355 
    356         for(p = uBuf; p<target; p++)
    357         {
    358           if(u_isalpha(*p))
    359             letters++;
    360           total++;
    361         }
    362     } while (source < sourceLimit); // while simply out of space
    363   }
    364 
    365   printf("%d letters out of %d total UChars.\n", letters, total);
    366 
    367   // ***************************** END SAMPLE ********************
    368   ucnv_close(conv);
    369 
    370   printf("\n");
    371 
    372   fclose(f);
    373 
    374   return U_ZERO_ERROR;
    375 }
    376 #undef BUFFERSIZE
    377 
    378 #define BUFFERSIZE 1024
    379 typedef struct
    380 {
    381   UChar32  codepoint;
    382   uint32_t frequency;
    383 } CharFreqInfo;
    384 
    385 UErrorCode convsample_06()
    386 {
    387   printf("\n\n==============================================\n"
    388          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
    389 
    390   FILE *f;
    391   int32_t count;
    392   char inBuf[BUFFERSIZE];
    393   const char *source;
    394   const char *sourceLimit;
    395   int32_t uBufSize = 0;
    396   UConverter *conv;
    397   UErrorCode status = U_ZERO_ERROR;
    398   uint32_t letters=0, total=0;
    399 
    400   CharFreqInfo   *info;
    401   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
    402   UChar32   p;
    403 
    404   uint32_t ie = 0;
    405   uint32_t gh = 0;
    406   UChar32 l = 0;
    407 
    408   f = fopen("data06.txt", "r");
    409   if(!f)
    410   {
    411     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    412     return U_FILE_ACCESS_ERROR;
    413   }
    414 
    415   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
    416   if(!info)
    417   {
    418     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
    419   }
    420 
    421   /* reset frequencies */
    422   for(p=0;p<charCount;p++)
    423   {
    424     info[p].codepoint = p;
    425     info[p].frequency = 0;
    426   }
    427 
    428   // **************************** START SAMPLE *******************
    429   conv = ucnv_open("utf-8", &status);
    430   assert(U_SUCCESS(status));
    431 
    432   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    433   printf("input bytes %d / min chars %d = %d UChars\n",
    434          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    435 
    436   // grab another buffer's worth
    437   while((!feof(f)) &&
    438         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    439   {
    440     // Convert bytes to unicode
    441     source = inBuf;
    442     sourceLimit = inBuf + count;
    443 
    444     while(source < sourceLimit)
    445     {
    446       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
    447       if(U_FAILURE(status))
    448       {
    449         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
    450         status = U_ZERO_ERROR;
    451         continue;
    452       }
    453       U_ASSERT(status);
    454       total++;
    455 
    456       if(u_isalpha(p))
    457         letters++;
    458 
    459       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
    460         ie++;
    461 
    462       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
    463         gh++;
    464 
    465       if(p>charCount)
    466       {
    467         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
    468         free(info);
    469         fclose(f);
    470         ucnv_close(conv);
    471         return U_UNSUPPORTED_ERROR;
    472       }
    473       info[p].frequency++;
    474       l = p;
    475     }
    476   }
    477 
    478   fclose(f);
    479   ucnv_close(conv);
    480 
    481   printf("%d letters out of %d total UChars.\n", letters, total);
    482   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
    483 
    484   // now, we could sort it..
    485 
    486   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
    487 
    488   for(p=0;p<charCount;p++)
    489   {
    490     if(info[p].frequency)
    491     {
    492       printf("% 5d U+%06X ", info[p].frequency, p);
    493       if(p <= 0xFFFF)
    494       {
    495         prettyPrintUChar((UChar)p);
    496       }
    497       printf("\n");
    498     }
    499   }
    500   free(info);
    501   // ***************************** END SAMPLE ********************
    502 
    503   printf("\n");
    504 
    505   return U_ZERO_ERROR;
    506 }
    507 #undef BUFFERSIZE
    508 
    509 
    510 /******************************************************
    511   You must call ucnv_close to clean up the memory used by the
    512   converter.
    513 
  'len' returns the number of OUTPUT UChars resulting from the
  conversion.
    516  */
    517 
    518 UErrorCode convsample_12()
    519 {
    520   printf("\n\n==============================================\n"
    521          "Sample 12: C: simple sjis -> unicode conversion\n");
    522 
    523 
    524   // **************************** START SAMPLE *******************
    525 
    526   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
    527   UChar target[100];
    528   UErrorCode status = U_ZERO_ERROR;
    529   UConverter *conv;
    530   int32_t     len;
    531 
    532   // set up the converter
    533   conv = ucnv_open("shift_jis", &status);
    534   assert(U_SUCCESS(status));
    535 
    536   // convert to Unicode
    537   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
    538   target[6] = 0xFDCA;
    539   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
    540   U_ASSERT(status);
    541   // close the converter
    542   ucnv_close(conv);
    543 
    544   // ***************************** END SAMPLE ********************
    545 
    546   // Print it out
    547   printBytes("src", source, strlen(source) );
    548   printf("\n");
    549   printUChars("targ", target, len);
    550 
    551   return U_ZERO_ERROR;
    552 }
    553 
    554 /******************************************************************
    555    C: Convert from codepage to Unicode one at a time.
    556 */
    557 
    558 UErrorCode convsample_13()
    559 {
    560   printf("\n\n==============================================\n"
    561          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
    562 
    563 
    564   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
    565   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
    566   const char *source, *sourceLimit;
    567   UChar32 target;
    568   UErrorCode status = U_ZERO_ERROR;
    569   UConverter *conv = NULL;
    570   int32_t srcCount=0;
    571   int32_t dstCount=0;
    572 
    573   srcCount = sizeof(sourceChars);
    574 
    575   conv = ucnv_open("Big5", &status);
    576   U_ASSERT(status);
    577 
    578   source = sourceChars;
    579   sourceLimit = sourceChars + sizeof(sourceChars);
    580 
    581   // **************************** START SAMPLE *******************
    582 
    583 
    584   printBytes("src",source,sourceLimit-source);
    585 
    586   while(source < sourceLimit)
    587   {
    588     puts("");
    589     target = ucnv_getNextUChar (conv,
    590                                 &source,
    591                                 sourceLimit,
    592                                 &status);
    593 
    594     //    printBytes("src",source,sourceLimit-source);
    595     U_ASSERT(status);
    596     printUChar(target);
    597     dstCount++;
    598   }
    599 
    600 
    601   // ************************** END SAMPLE *************************
    602 
    603   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
    604   ucnv_close(conv);
    605 
    606   return U_ZERO_ERROR;
    607 }
    608 
    609 
    610 
    611 
    612 UBool convsample_20_didSubstitute(const char *source)
    613 {
    614   UChar uchars[100];
    615   char bytes[100];
    616   UConverter *conv = NULL;
    617   UErrorCode status = U_ZERO_ERROR;
    618   uint32_t len, len2;
    619   UBool  flagVal;
    620 
    621   FromUFLAGContext * context = NULL;
    622 
    623   printf("\n\n==============================================\n"
    624          "Sample 20: C: Test for substitution using callbacks\n");
    625 
    626   /* print out the original source */
    627   printBytes("src", source);
    628   printf("\n");
    629 
    630   /* First, convert from UTF8 to unicode */
    631   conv = ucnv_open("utf-8", &status);
    632   U_ASSERT(status);
    633 
    634   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
    635   U_ASSERT(status);
    636 
    637   printUChars("uch", uchars, len);
    638   printf("\n");
    639 
    640   /* Now, close the converter */
    641   ucnv_close(conv);
    642 
    643   /* Now, convert to windows-1252 */
    644   conv = ucnv_open("windows-1252", &status);
    645   U_ASSERT(status);
    646 
    647   /* Converter starts out with the SUBSTITUTE callback set. */
    648 
    649   /* initialize our callback */
    650   context = flagCB_fromU_openContext();
    651 
    652   /* Set our special callback */
    653   ucnv_setFromUCallBack(conv,
    654                         flagCB_fromU,
    655                         context,
    656                         &(context->subCallback),
    657                         &(context->subContext),
    658                         &status);
    659 
    660   U_ASSERT(status);
    661 
    662   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
    663   U_ASSERT(status);
    664 
    665   flagVal = context->flag;  /* it's about to go away when we close the cnv */
    666 
    667   ucnv_close(conv);
    668 
    669   /* print out the original source */
    670   printBytes("bytes", bytes, len2);
    671 
    672   return flagVal; /* true if callback was called */
    673 }
    674 
    675 UErrorCode convsample_20()
    676 {
    677   const char *sample1 = "abc\xdf\xbf";
    678   const char *sample2 = "abc_def";
    679 
    680 
    681   if(convsample_20_didSubstitute(sample1))
    682   {
    683     printf("DID substitute.\n******\n");
    684   }
    685   else
    686   {
    687     printf("Did NOT substitute.\n*****\n");
    688   }
    689 
    690   if(convsample_20_didSubstitute(sample2))
    691   {
    692     printf("DID substitute.\n******\n");
    693   }
    694   else
    695   {
    696     printf("Did NOT substitute.\n*****\n");
    697   }
    698 
    699   return U_ZERO_ERROR;
    700 }
    701 
    702 // 21  - C, callback, with clone and debug
    703 
    704 
    705 
    706 UBool convsample_21_didSubstitute(const char *source)
    707 {
    708   UChar uchars[100];
    709   char bytes[100];
    710   UConverter *conv = NULL, *cloneCnv = NULL;
    711   UErrorCode status = U_ZERO_ERROR;
    712   uint32_t len, len2;
    713   int32_t  cloneLen;
    714   UBool  flagVal = FALSE;
    715   UConverterFromUCallback junkCB;
    716 
    717   FromUFLAGContext *flagCtx = NULL,
    718                    *cloneFlagCtx = NULL;
    719 
    720   debugCBContext   *debugCtx1 = NULL,
    721                    *debugCtx2 = NULL,
    722                    *cloneDebugCtx = NULL;
    723 
    724   printf("\n\n==============================================\n"
    725          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
    726 
    727   /* print out the original source */
    728   printBytes("src", source);
    729   printf("\n");
    730 
    731   /* First, convert from UTF8 to unicode */
    732   conv = ucnv_open("utf-8", &status);
    733   U_ASSERT(status);
    734 
    735   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
    736   U_ASSERT(status);
    737 
    738   printUChars("uch", uchars, len);
    739   printf("\n");
    740 
    741   /* Now, close the converter */
    742   ucnv_close(conv);
    743 
    744   /* Now, convert to windows-1252 */
    745   conv = ucnv_open("windows-1252", &status);
    746   U_ASSERT(status);
    747 
    748   /* Converter starts out with the SUBSTITUTE callback set. */
    749 
    750   /* initialize our callback */
    751   /* from the 'bottom' innermost, out
    752    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
    753 
    754 #if DEBUG_TMI
    755   printf("flagCB_fromU = %p\n", &flagCB_fromU);
    756   printf("debugCB_fromU = %p\n", &debugCB_fromU);
    757 #endif
    758 
    759   debugCtx1 = debugCB_openContext();
    760    flagCtx  = flagCB_fromU_openContext();
    761   debugCtx2 = debugCB_openContext();
    762 
    763   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
    764   debugCtx1->subContext  =  flagCtx;
    765 
    766   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
    767   flagCtx->subContext    =  debugCtx2;
    768 
    769   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
    770   debugCtx2->subContext  = NULL;
    771 
    772   /* Set our special callback */
    773 
    774   ucnv_setFromUCallBack(conv,
    775                         debugCB_fromU,
    776                         debugCtx1,
    777                         &(debugCtx2->subCallback),
    778                         &(debugCtx2->subContext),
    779                         &status);
    780 
    781   U_ASSERT(status);
    782 
    783 #if DEBUG_TMI
    784   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
    785          conv, debugCtx1, debugCtx1->subCallback,
    786          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
    787 #endif
    788 
    789   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
    790 
    791   U_ASSERT(status);
    792 
    793 #if DEBUG_TMI
    794   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
    795 #endif
    796 
    797   ucnv_close(conv);
    798 
    799 #if DEBUG_TMI
    800   printf("%p closed.\n", conv);
    801 #endif
    802 
    803   U_ASSERT(status);
    804   /* Now, we have to extract the context */
    805   cloneDebugCtx = NULL;
    806   cloneFlagCtx  = NULL;
    807 
    808   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
    809   if(cloneDebugCtx != NULL) {
    810       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
    811   }
    812 
    813   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
    814          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
    815 
    816   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
    817   U_ASSERT(status);
    818 
    819   if(cloneFlagCtx != NULL) {
    820       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
    821   } else {
    822       printf("** Warning, couldn't get the subcallback \n");
    823   }
    824 
    825   ucnv_close(cloneCnv);
    826 
    827   /* print out the original source */
    828   printBytes("bytes", bytes, len2);
    829 
    830   return flagVal; /* true if callback was called */
    831 }
    832 
    833 UErrorCode convsample_21()
    834 {
    835   const char *sample1 = "abc\xdf\xbf";
    836   const char *sample2 = "abc_def";
    837 
    838   if(convsample_21_didSubstitute(sample1))
    839   {
    840     printf("DID substitute.\n******\n");
    841   }
    842   else
    843   {
    844     printf("Did NOT substitute.\n*****\n");
    845   }
    846 
    847   if(convsample_21_didSubstitute(sample2))
    848   {
    849     printf("DID substitute.\n******\n");
    850   }
    851   else
    852   {
    853     printf("Did NOT substitute.\n*****\n");
    854   }
    855 
    856   return U_ZERO_ERROR;
    857 }
    858 
    859 
    860 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
    861 
    862 #define BUFFERSIZE 17 /* make it interesting :) */
    863 
    864 UErrorCode convsample_40()
    865 {
    866   printf("\n\n==============================================\n"
    867     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
    868 
    869   FILE *f;
    870   FILE *out;
    871   int32_t count;
    872   char inBuf[BUFFERSIZE];
    873   const char *source;
    874   const char *sourceLimit;
    875   UChar *uBuf;
    876   UChar *target;
    877   UChar *targetLimit;
    878   int32_t uBufSize = 0;
    879   UConverter *conv = NULL;
    880   UErrorCode status = U_ZERO_ERROR;
    881   uint32_t inbytes=0, total=0;
    882 
    883   f = fopen("data02.bin", "rb");
    884   if(!f)
    885   {
    886     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    887     return U_FILE_ACCESS_ERROR;
    888   }
    889 
    890   out = fopen("data40.utf16", "wb");
    891   if(!out)
    892   {
    893     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    894     fclose(f);
    895     return U_FILE_ACCESS_ERROR;
    896   }
    897 
    898   // **************************** START SAMPLE *******************
    899   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
    900   assert(U_SUCCESS(status));
    901 
    902   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    903   printf("input bytes %d / min chars %d = %d UChars\n",
    904          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    905   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    906   assert(uBuf!=NULL);
    907 
    908   // grab another buffer's worth
    909   while((!feof(f)) &&
    910         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    911   {
    912     inbytes += count;
    913 
    914     // Convert bytes to unicode
    915     source = inBuf;
    916     sourceLimit = inBuf + count;
    917 
    918     do
    919     {
    920         target = uBuf;
    921         targetLimit = uBuf + uBufSize;
    922 
    923         ucnv_toUnicode( conv, &target, targetLimit,
    924                        &source, sourceLimit, NULL,
    925                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    926                                    /* is true (when no more data will come) */
    927                          &status);
    928 
    929         if(status == U_BUFFER_OVERFLOW_ERROR)
    930         {
    931           // simply ran out of space - we'll reset the target ptr the next
    932           // time through the loop.
    933           status = U_ZERO_ERROR;
    934         }
    935         else
    936         {
    937           //  Check other errors here.
    938           assert(U_SUCCESS(status));
    939           // Break out of the loop (by force)
    940         }
    941 
    942         // Process the Unicode
    943         // Todo: handle UTF-16/surrogates
    944         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
    945                (size_t)(target-uBuf));
    946         total += (target-uBuf);
    947     } while (source < sourceLimit); // while simply out of space
    948   }
    949 
    950   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
    951 
    952   // ***************************** END SAMPLE ********************
    953   ucnv_close(conv);
    954 
    955   fclose(f);
    956   fclose(out);
    957   printf("\n");
    958 
    959   return U_ZERO_ERROR;
    960 }
    961 #undef BUFFERSIZE
    962 
    963 
    964 
    965 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
    966 
    967 #define BUFFERSIZE 24 /* make it interesting :) */
    968 
    969 UErrorCode convsample_46()
    970 {
    971   printf("\n\n==============================================\n"
    972     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
    973 
    974   FILE *f;
    975   FILE *out;
    976   int32_t count;
    977   UChar inBuf[BUFFERSIZE];
    978   const UChar *source;
    979   const UChar *sourceLimit;
    980   char *buf;
    981   char *target;
    982   char *targetLimit;
    983 
    984   int32_t bufSize = 0;
    985   UConverter *conv = NULL;
    986   UErrorCode status = U_ZERO_ERROR;
    987   uint32_t inchars=0, total=0;
    988 
    989   f = fopen("data40.utf16", "rb");
    990   if(!f)
    991   {
    992     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
    993     return U_FILE_ACCESS_ERROR;
    994   }
    995 
    996   out = fopen("data46.out", "wb");
    997   if(!out)
    998   {
    999     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
   1000     fclose(f);
   1001     return U_FILE_ACCESS_ERROR;
   1002   }
   1003 
   1004   // **************************** START SAMPLE *******************
   1005   conv = ucnv_open( "iso-8859-2", &status);
   1006   assert(U_SUCCESS(status));
   1007 
   1008   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
   1009   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
   1010          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
   1011   buf = (char*)malloc(bufSize * sizeof(char));
   1012   assert(buf!=NULL);
   1013 
   1014   // grab another buffer's worth
   1015   while((!feof(f)) &&
   1016         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
   1017   {
   1018     inchars += count;
   1019 
   1020     // Convert bytes to unicode
   1021     source = inBuf;
   1022     sourceLimit = inBuf + count;
   1023 
   1024     do
   1025     {
   1026         target = buf;
   1027         targetLimit = buf + bufSize;
   1028 
   1029         ucnv_fromUnicode( conv, &target, targetLimit,
   1030                        &source, sourceLimit, NULL,
   1031                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
   1032                                    /* is true (when no more data will come) */
   1033                          &status);
   1034 
   1035         if(status == U_BUFFER_OVERFLOW_ERROR)
   1036         {
   1037           // simply ran out of space - we'll reset the target ptr the next
   1038           // time through the loop.
   1039           status = U_ZERO_ERROR;
   1040         }
   1041         else
   1042         {
   1043           //  Check other errors here.
   1044           assert(U_SUCCESS(status));
   1045           // Break out of the loop (by force)
   1046         }
   1047 
   1048         // Process the Unicode
   1049         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
   1050                (size_t)(target-buf));
   1051         total += (target-buf);
   1052     } while (source < sourceLimit); // while simply out of space
   1053   }
   1054 
   1055   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
   1056 
   1057   // ***************************** END SAMPLE ********************
   1058   ucnv_close(conv);
   1059 
   1060   fclose(f);
   1061   fclose(out);
   1062   printf("\n");
   1063 
   1064   return U_ZERO_ERROR;
   1065 }
   1066 #undef BUFFERSIZE
   1067 
   1068 #define BUFFERSIZE 219
   1069 
   1070 void convsample_50() {
   1071   printf("\n\n==============================================\n"
   1072          "Sample 50: C: ucnv_detectUnicodeSignature\n");
   1073 
   1074   //! [ucnv_detectUnicodeSignature]
   1075   UErrorCode err = U_ZERO_ERROR;
   1076   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
   1077   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
   1078   int32_t signatureLength = 0;
   1079   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
   1080   UConverter *conv = NULL;
   1081   UChar output[100];
   1082   UChar *target = output, *out;
   1083   const char *source = input;
   1084   if(encoding!=NULL && U_SUCCESS(err)){
   1085     // should signature be discarded ?
   1086     conv = ucnv_open(encoding, &err);
   1087     // do the conversion
   1088     ucnv_toUnicode(conv,
   1089                    &target, output + sizeof(output)/U_SIZEOF_UCHAR,
   1090                    &source, input + sizeof(input),
   1091                    NULL, TRUE, &err);
   1092     out = output;
   1093     if (discardSignature){
   1094       ++out; // ignore initial U+FEFF
   1095     }
   1096     while(out != target) {
   1097       printf("%04x ", *out++);
   1098     }
   1099     puts("");
   1100   }
   1101   //! [ucnv_detectUnicodeSignature]
   1102   puts("");
   1103 }
   1104 
   1105 
   1106 
   1107 /* main */
   1108 
   1109 int main()
   1110 {
   1111 
   1112   printf("Default Converter=%s\n", ucnv_getDefaultName() );
   1113 
   1114   convsample_02();  // C  , u->koi8r, conv
   1115   convsample_03();  // C,   iterate
   1116 
   1117   convsample_05();  // C,  utf8->u, getNextUChar
   1118   convsample_06(); // C freq counter thingy
   1119 
   1120   convsample_12();  // C,  sjis->u, conv
   1121   convsample_13();  // C,  big5->u, getNextU
   1122 
   1123   convsample_20();  // C, callback
   1124   convsample_21();  // C, callback debug
   1125 
   1126   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
   1127 
   1128   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
   1129 
   1130   convsample_50();  // C, detect unicode signature
   1131 
   1132   printf("End of converter samples.\n");
   1133 
   1134   fflush(stdout);
   1135   fflush(stderr);
   1136 
   1137   return 0;
   1138 }
   1139