Home | History | Annotate | Download | only in ucnv
      1 /**************************************************************************
      2 *
      3 *   Copyright (C) 2000-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *
      6 ***************************************************************************
      7 *   file name:  convsamp.c
      8 *   encoding:   ASCII (7-bit)
      9 *
     10 *   created on: 2000may30
     11 *   created by: Steven R. Loomis
     12 *
     13 *   Sample code for the ICU conversion routines.
     14 *
     15 * Note: Nothing special is needed to build this sample. Link with
     16 *       the icu UC and icu I18N libraries.
     17 *
     18 *       I use 'assert' for error checking, you probably will want
     19 *       something more flexible.  '***BEGIN SAMPLE***' and
     20 *       '***END SAMPLE***' mark pieces suitable for stand alone
     21 *       code snippets.
     22 *
     23 *
*  Each test can define its own BUFFERSIZE
     25 *
     26 */
     27 
     28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
     29 
     30 #include <stdio.h>
     31 #include <ctype.h>            /* for isspace, etc.    */
     32 #include <assert.h>
     33 #include <string.h>
     34 #include <stdlib.h>  /* malloc */
     35 
     36 #include "unicode/utypes.h"   /* Basic ICU data types */
     37 #include "unicode/ucnv.h"     /* C   Converter API    */
     38 #include "unicode/ustring.h"  /* some more string fcns*/
     39 #include "unicode/uchar.h"    /* char names           */
     40 #include "unicode/uloc.h"
     41 #include "unicode/unistr.h"
     42 
     43 #include "flagcb.h"
     44 
     45 /* Some utility functions */
     46 
     47 static const UChar kNone[] = { 0x0000 };
     48 
     49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
     50 
     51 /* Print a UChar if possible, in seven characters. */
     52 void prettyPrintUChar(UChar c)
     53 {
     54   if(  (c <= 0x007F) &&
     55        (isgraph(c))  ) {
     56     printf(" '%c'   ", (char)(0x00FF&c));
     57   } else if ( c > 0x007F ) {
     58     char buf[1000];
     59     UErrorCode status = U_ZERO_ERROR;
     60     int32_t o;
     61 
     62     o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status);
     63     if(U_SUCCESS(status) && (o>0) ) {
     64       buf[6] = 0;
     65       printf("%7s", buf);
     66     } else {
     67       o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status);
     68       if(U_SUCCESS(status) && (o>0)) {
     69         buf[5] = 0;
     70         printf("~%6s", buf);
     71       }
     72       else {
     73         printf(" ??????");
     74       }
     75     }
     76   } else {
     77     switch((char)(c & 0x007F)) {
     78     case ' ':
     79       printf(" ' '   ");
     80       break;
     81     case '\t':
     82       printf(" \\t    ");
     83       break;
     84     case '\n':
     85       printf(" \\n    ");
     86       break;
     87     default:
     88       printf("  _    ");
     89       break;
     90     }
     91   }
     92 }
     93 
     94 
     95 void printUChars(const char  *name = "?",
     96                  const UChar *uch  = kNone,
     97                  int32_t     len   = -1 )
     98 {
     99   int32_t i;
    100 
    101   if( (len == -1) && (uch) ) {
    102     len = u_strlen(uch);
    103   }
    104 
    105   printf("%5s: ", name);
    106   for( i = 0; i <len; i++) {
    107     printf("%-6d ", i);
    108   }
    109   printf("\n");
    110 
    111   printf("%5s: ", "uni");
    112   for( i = 0; i <len; i++) {
    113     printf("\\u%04X ", (int)uch[i]);
    114   }
    115   printf("\n");
    116 
    117   printf("%5s:", "ch");
    118   for( i = 0; i <len; i++) {
    119     prettyPrintUChar(uch[i]);
    120   }
    121   printf("\n");
    122 }
    123 
    124 void printBytes(const char  *name = "?",
    125                  const char *uch  = "",
    126                  int32_t     len   = -1 )
    127 {
    128   int32_t i;
    129 
    130   if( (len == -1) && (uch) ) {
    131     len = strlen(uch);
    132   }
    133 
    134   printf("%5s: ", name);
    135   for( i = 0; i <len; i++) {
    136     printf("%-4d ", i);
    137   }
    138   printf("\n");
    139 
    140   printf("%5s: ", "uni");
    141   for( i = 0; i <len; i++) {
    142     printf("\\x%02X ", 0x00FF & (int)uch[i]);
    143   }
    144   printf("\n");
    145 
    146   printf("%5s:", "ch");
    147   for( i = 0; i <len; i++) {
    148     if(isgraph(0x00FF & (int)uch[i])) {
    149       printf(" '%c' ", (char)uch[i]);
    150     } else {
    151       printf("     ");
    152     }
    153   }
    154   printf("\n");
    155 }
    156 
    157 void printUChar(UChar32 ch32)
    158 {
    159     if(ch32 > 0xFFFF) {
    160       printf("ch: U+%06X\n", ch32);
    161     }
    162     else {
    163       UChar ch = (UChar)ch32;
    164       printUChars("C", &ch, 1);
    165     }
    166 }
    167 
    168 /*******************************************************************
    169   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
    170   followed by an exclamation mark (!) into the KOI8-R Russian code page.
    171 
    172   This example first creates a UChar String out of the Unicode chars.
    173 
    174   targetSize must be set to the amount of space available in the target
    175   buffer. After fromUChars is called,
    176   len will contain the number of bytes in target[] which were
    177   used in the resulting codepage.  In this case, there is a 1:1 mapping
    178   between the input and output characters. The exclamation mark has the
    179   same value in both KOI8-R and Unicode.
    180 
    181   src: 0      1      2      3      4      5      6
    182   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
    183    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
    184 
    185  targ:  0    1    2    3    4    5    6
    186   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
    187    ch:                                '!'
    188 
    189 
    190 Converting FROM unicode
    191   to koi8-r.
    192   You must call ucnv_close to clean up the memory used by the
    193   converter.
    194 
    195   'len' returns the number of OUTPUT bytes resulting from the
    196   conversion.
    197  */
    198 
    199 UErrorCode convsample_02()
    200 {
    201   printf("\n\n==============================================\n"
    202          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
    203 
    204 
    205   // **************************** START SAMPLE *******************
    206   // "cat<cat>OK"
    207   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
    208                      0x0430, 0x0021, 0x0000 };
    209   char target[100];
    210   UErrorCode status = U_ZERO_ERROR;
    211   UConverter *conv;
    212   int32_t     len;
    213 
    214   // set up the converter
    215   conv = ucnv_open("koi8-r", &status);
    216   assert(U_SUCCESS(status));
    217 
    218   // convert to koi8-r
    219   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
    220   assert(U_SUCCESS(status));
    221 
    222   // close the converter
    223   ucnv_close(conv);
    224 
    225   // ***************************** END SAMPLE ********************
    226 
    227   // Print it out
    228   printUChars("src", source);
    229   printf("\n");
    230   printBytes("targ", target, len);
    231 
    232   return U_ZERO_ERROR;
    233 }
    234 
    235 
    236 UErrorCode convsample_03()
    237 {
    238   printf("\n\n==============================================\n"
    239          "Sample 03: C: print out all converters\n");
    240 
    241   int32_t count;
    242   int32_t i;
    243 
    244   // **************************** START SAMPLE *******************
    245   count = ucnv_countAvailable();
    246   printf("Available converters: %d\n", count);
    247 
    248   for(i=0;i<count;i++)
    249   {
    250     printf("%s ", ucnv_getAvailableName(i));
    251   }
    252 
    253   // ***************************** END SAMPLE ********************
    254 
    255   printf("\n");
    256 
    257   return U_ZERO_ERROR;
    258 }
    259 
    260 
    261 
    262 #define BUFFERSIZE 17 /* make it interesting :) */
    263 
    264 /*
    265   Converting from a codepage to Unicode in bulk..
    266   What is the best way to determine the buffer size?
    267 
    268      The 'buffersize' is in bytes of input.
    For a given converter, dividing this by the minimum char size
    gives you the maximum number of Unicode characters that could be
    271     expected for a given number of input bytes.
    272      see: ucnv_getMinCharSize()
    273 
    274      For example, a single byte codepage like 'Latin-3' has a
    275     minimum char size of 1. (It takes at least 1 byte to represent
    276     each Unicode char.) So the unicode buffer has the same number of
    277     UChars as the input buffer has bytes.
    278 
    279      In a strictly double byte codepage such as cp1362 (Windows
    280     Korean), the minimum char size is 2. So, only half as many Unicode
    281     chars as bytes are needed.
    282 
    283      This work to calculate the buffer size is an optimization. Any
    284     size of input and output buffer can be used, as long as the
    285     program handles the following cases: If the input buffer is empty,
    286     the source pointer will be equal to sourceLimit.  If the output
    287     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
    288  */
    289 
    290 UErrorCode convsample_05()
    291 {
    292   printf("\n\n==============================================\n"
    293          "Sample 05: C: count the number of letters in a UTF-8 document\n");
    294 
    295   FILE *f;
    296   int32_t count;
    297   char inBuf[BUFFERSIZE];
    298   const char *source;
    299   const char *sourceLimit;
    300   UChar *uBuf;
    301   UChar *target;
    302   UChar *targetLimit;
    303   UChar *p;
    304   int32_t uBufSize = 0;
    305   UConverter *conv;
    306   UErrorCode status = U_ZERO_ERROR;
    307   uint32_t letters=0, total=0;
    308 
    309   f = fopen("data01.txt", "r");
    310   if(!f)
    311   {
    312     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    313     return U_FILE_ACCESS_ERROR;
    314   }
    315 
    316   // **************************** START SAMPLE *******************
    317   conv = ucnv_open("utf-8", &status);
    318   assert(U_SUCCESS(status));
    319 
    320   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    321   printf("input bytes %d / min chars %d = %d UChars\n",
    322          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    323   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    324   assert(uBuf!=NULL);
    325 
    326   // grab another buffer's worth
    327   while((!feof(f)) &&
    328         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    329   {
    330     // Convert bytes to unicode
    331     source = inBuf;
    332     sourceLimit = inBuf + count;
    333 
    334     do
    335     {
    336         target = uBuf;
    337         targetLimit = uBuf + uBufSize;
    338 
    339         ucnv_toUnicode(conv, &target, targetLimit,
    340                        &source, sourceLimit, NULL,
    341                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    342                                    /* is true (when no more data will come) */
    343                        &status);
    344 
    345         if(status == U_BUFFER_OVERFLOW_ERROR)
    346         {
    347           // simply ran out of space - we'll reset the target ptr the next
    348           // time through the loop.
    349           status = U_ZERO_ERROR;
    350         }
    351         else
    352         {
    353           //  Check other errors here.
    354           assert(U_SUCCESS(status));
    355           // Break out of the loop (by force)
    356         }
    357 
    358         // Process the Unicode
    359         // Todo: handle UTF-16/surrogates
    360 
    361         for(p = uBuf; p<target; p++)
    362         {
    363           if(u_isalpha(*p))
    364             letters++;
    365           total++;
    366         }
    367     } while (source < sourceLimit); // while simply out of space
    368   }
    369 
    370   printf("%d letters out of %d total UChars.\n", letters, total);
    371 
    372   // ***************************** END SAMPLE ********************
    373   ucnv_close(conv);
    374 
    375   printf("\n");
    376 
    377   fclose(f);
    378 
    379   return U_ZERO_ERROR;
    380 }
    381 #undef BUFFERSIZE
    382 
    383 #define BUFFERSIZE 1024
/* One entry of the per-code-point frequency table used by convsample_06. */
typedef struct
{
  UChar32  codepoint;   /* the code point this entry describes */
  uint32_t frequency;   /* how many times that code point was seen */
} CharFreqInfo;
    389 
    390 UErrorCode convsample_06()
    391 {
    392   printf("\n\n==============================================\n"
    393          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
    394 
    395   FILE *f;
    396   int32_t count;
    397   char inBuf[BUFFERSIZE];
    398   const char *source;
    399   const char *sourceLimit;
    400   int32_t uBufSize = 0;
    401   UConverter *conv;
    402   UErrorCode status = U_ZERO_ERROR;
    403   uint32_t letters=0, total=0;
    404 
    405   CharFreqInfo   *info;
    406   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
    407   UChar32   p;
    408 
    409   uint32_t ie = 0;
    410   uint32_t gh = 0;
    411   UChar32 l = 0;
    412 
    413   f = fopen("data06.txt", "r");
    414   if(!f)
    415   {
    416     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    417     return U_FILE_ACCESS_ERROR;
    418   }
    419 
    420   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
    421   if(!info)
    422   {
    423     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
    424   }
    425 
    426   /* reset frequencies */
    427   for(p=0;p<charCount;p++)
    428   {
    429     info[p].codepoint = p;
    430     info[p].frequency = 0;
    431   }
    432 
    433   // **************************** START SAMPLE *******************
    434   conv = ucnv_open("utf-8", &status);
    435   assert(U_SUCCESS(status));
    436 
    437   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    438   printf("input bytes %d / min chars %d = %d UChars\n",
    439          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    440 
    441   // grab another buffer's worth
    442   while((!feof(f)) &&
    443         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    444   {
    445     // Convert bytes to unicode
    446     source = inBuf;
    447     sourceLimit = inBuf + count;
    448 
    449     while(source < sourceLimit)
    450     {
    451       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
    452       if(U_FAILURE(status))
    453       {
    454         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
    455         status = U_ZERO_ERROR;
    456         continue;
    457       }
    458       U_ASSERT(status);
    459       total++;
    460 
    461       if(u_isalpha(p))
    462         letters++;
    463 
    464       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
    465         ie++;
    466 
    467       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
    468         gh++;
    469 
    470       if(p>charCount)
    471       {
    472         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
    473         free(info);
    474         fclose(f);
    475         ucnv_close(conv);
    476         return U_UNSUPPORTED_ERROR;
    477       }
    478       info[p].frequency++;
    479       l = p;
    480     }
    481   }
    482 
    483   fclose(f);
    484   ucnv_close(conv);
    485 
    486   printf("%d letters out of %d total UChars.\n", letters, total);
    487   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
    488 
    489   // now, we could sort it..
    490 
    491   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
    492 
    493   for(p=0;p<charCount;p++)
    494   {
    495     if(info[p].frequency)
    496     {
    497       printf("% 5d U+%06X ", info[p].frequency, p);
    498       if(p <= 0xFFFF)
    499       {
    500         prettyPrintUChar((UChar)p);
    501       }
    502       printf("\n");
    503     }
    504   }
    505   free(info);
    506   // ***************************** END SAMPLE ********************
    507 
    508   printf("\n");
    509 
    510   return U_ZERO_ERROR;
    511 }
    512 #undef BUFFERSIZE
    513 
    514 
    515 /******************************************************
    516   You must call ucnv_close to clean up the memory used by the
    517   converter.
    518 
  'len' returns the number of OUTPUT UChars resulting from the
  conversion.
    521  */
    522 
    523 UErrorCode convsample_12()
    524 {
    525   printf("\n\n==============================================\n"
    526          "Sample 12: C: simple sjis -> unicode conversion\n");
    527 
    528 
    529   // **************************** START SAMPLE *******************
    530 
    531   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
    532   UChar target[100];
    533   UErrorCode status = U_ZERO_ERROR;
    534   UConverter *conv;
    535   int32_t     len;
    536 
    537   // set up the converter
    538   conv = ucnv_open("shift_jis", &status);
    539   assert(U_SUCCESS(status));
    540 
    541   // convert to Unicode
    542   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
    543   target[6] = 0xFDCA;
    544   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
    545   U_ASSERT(status);
    546   // close the converter
    547   ucnv_close(conv);
    548 
    549   // ***************************** END SAMPLE ********************
    550 
    551   // Print it out
    552   printBytes("src", source, strlen(source) );
    553   printf("\n");
    554   printUChars("targ", target, len);
    555 
    556   return U_ZERO_ERROR;
    557 }
    558 
    559 /******************************************************************
    560    C: Convert from codepage to Unicode one at a time.
    561 */
    562 
    563 UErrorCode convsample_13()
    564 {
    565   printf("\n\n==============================================\n"
    566          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
    567 
    568 
    569   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
    570   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
    571   const char *source, *sourceLimit;
    572   UChar32 target;
    573   UErrorCode status = U_ZERO_ERROR;
    574   UConverter *conv = NULL;
    575   int32_t srcCount=0;
    576   int32_t dstCount=0;
    577 
    578   srcCount = sizeof(sourceChars);
    579 
    580   conv = ucnv_open("Big5", &status);
    581   U_ASSERT(status);
    582 
    583   source = sourceChars;
    584   sourceLimit = sourceChars + sizeof(sourceChars);
    585 
    586   // **************************** START SAMPLE *******************
    587 
    588 
    589   printBytes("src",source,sourceLimit-source);
    590 
    591   while(source < sourceLimit)
    592   {
    593     puts("");
    594     target = ucnv_getNextUChar (conv,
    595                                 &source,
    596                                 sourceLimit,
    597                                 &status);
    598 
    599     //    printBytes("src",source,sourceLimit-source);
    600     U_ASSERT(status);
    601     printUChar(target);
    602     dstCount++;
    603   }
    604 
    605 
    606   // ************************** END SAMPLE *************************
    607 
    608   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
    609   ucnv_close(conv);
    610 
    611   return U_ZERO_ERROR;
    612 }
    613 
    614 
    615 
    616 
    617 UBool convsample_20_didSubstitute(const char *source)
    618 {
    619   UChar uchars[100];
    620   char bytes[100];
    621   UConverter *conv = NULL;
    622   UErrorCode status = U_ZERO_ERROR;
    623   uint32_t len, len2;
    624   UBool  flagVal;
    625 
    626   FromUFLAGContext * context = NULL;
    627 
    628   printf("\n\n==============================================\n"
    629          "Sample 20: C: Test for substitution using callbacks\n");
    630 
    631   /* print out the original source */
    632   printBytes("src", source);
    633   printf("\n");
    634 
    635   /* First, convert from UTF8 to unicode */
    636   conv = ucnv_open("utf-8", &status);
    637   U_ASSERT(status);
    638 
    639   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
    640   U_ASSERT(status);
    641 
    642   printUChars("uch", uchars, len);
    643   printf("\n");
    644 
    645   /* Now, close the converter */
    646   ucnv_close(conv);
    647 
    648   /* Now, convert to windows-1252 */
    649   conv = ucnv_open("windows-1252", &status);
    650   U_ASSERT(status);
    651 
    652   /* Converter starts out with the SUBSTITUTE callback set. */
    653 
    654   /* initialize our callback */
    655   context = flagCB_fromU_openContext();
    656 
    657   /* Set our special callback */
    658   ucnv_setFromUCallBack(conv,
    659                         flagCB_fromU,
    660                         context,
    661                         &(context->subCallback),
    662                         &(context->subContext),
    663                         &status);
    664 
    665   U_ASSERT(status);
    666 
    667   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
    668   U_ASSERT(status);
    669 
    670   flagVal = context->flag;  /* it's about to go away when we close the cnv */
    671 
    672   ucnv_close(conv);
    673 
    674   /* print out the original source */
    675   printBytes("bytes", bytes, len2);
    676 
    677   return flagVal; /* true if callback was called */
    678 }
    679 
    680 UErrorCode convsample_20()
    681 {
    682   const char *sample1 = "abc\xdf\xbf";
    683   const char *sample2 = "abc_def";
    684 
    685 
    686   if(convsample_20_didSubstitute(sample1))
    687   {
    688     printf("DID substitute.\n******\n");
    689   }
    690   else
    691   {
    692     printf("Did NOT substitute.\n*****\n");
    693   }
    694 
    695   if(convsample_20_didSubstitute(sample2))
    696   {
    697     printf("DID substitute.\n******\n");
    698   }
    699   else
    700   {
    701     printf("Did NOT substitute.\n*****\n");
    702   }
    703 
    704   return U_ZERO_ERROR;
    705 }
    706 
    707 // 21  - C, callback, with clone and debug
    708 
    709 
    710 
/*
 * Sample 21 helper: like sample 20, but with a chain of callbacks
 * (debug -> flag -> debug) installed on the windows-1252 converter, which is
 * then cloned with ucnv_safeClone(). The original converter is closed and
 * the conversion is done with the clone; the flag is read back from the
 * context retrieved from the clone.
 *
 * Returns the flag from the clone's flag context, or FALSE when that context
 * could not be retrieved.
 */
UBool convsample_21_didSubstitute(const char *source)
{
  UChar uchars[100];
  char bytes[100];
  UConverter *conv = NULL, *cloneCnv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t len, len2;
  int32_t  cloneLen;
  UBool  flagVal = FALSE;
  UConverterFromUCallback junkCB;   /* receives the clone's callback pointer; not used afterwards */

  FromUFLAGContext *flagCtx = NULL,
                   *cloneFlagCtx = NULL;

  debugCBContext   *debugCtx1 = NULL,
                   *debugCtx2 = NULL,
                   *cloneDebugCtx = NULL;

  printf("\n\n==============================================\n"
         "Sample 21: C: Test for substitution w/ callbacks & clones \n");

  /* print out the original source */
  printBytes("src", source);
  printf("\n");

  /* First, convert from UTF8 to unicode */
  conv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
  U_ASSERT(status);

  printUChars("uch", uchars, len);
  printf("\n");

  /* Now, close the converter */
  ucnv_close(conv);

  /* Now, convert to windows-1252 */
  conv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Converter starts out with the SUBSTITUTE callback set. */

  /* initialize our callback */
  /* from the 'bottom' innermost, out
   *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */

#if DEBUG_TMI
  printf("flagCB_fromU = %p\n", &flagCB_fromU);
  printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

  debugCtx1 = debugCB_openContext();
   flagCtx  = flagCB_fromU_openContext();
  debugCtx2 = debugCB_openContext();

  /* Link the three contexts into a chain by hand. */
  debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
  debugCtx1->subContext  =  flagCtx;

  flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
  flagCtx->subContext    =  debugCtx2;

  debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
  debugCtx2->subContext  = NULL;

  /* Set our special callback */
  /* The head of the chain (debugCtx1) is installed on the converter.
   * NOTE(review): the last two pointer arguments are, per the
   * ucnv_setFromUCallBack API, where the previously installed callback and
   * context are written - so debugCtx2's sub* fields assigned just above are
   * presumably overwritten with the converter's prior callback. Confirm. */

  ucnv_setFromUCallBack(conv,
                        debugCB_fromU,
                        debugCtx1,
                        &(debugCtx2->subCallback),
                        &(debugCtx2->subContext),
                        &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
         conv, debugCtx1, debugCtx1->subCallback,
         debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

  cloneLen = 1; /* but passing in null so it will clone */
  cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
#endif

  /* From here on only the clone is used. */
  ucnv_close(conv);

#if DEBUG_TMI
  printf("%p closed.\n", conv);
#endif

  U_ASSERT(status);
  /* Now, we have to extract the context */
  cloneDebugCtx = NULL;
  cloneFlagCtx  = NULL;

  /* The clone's installed context is the head of its chain (a debug
   * context); the flag context is taken from that context's subContext. */
  ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
  if(cloneDebugCtx != NULL) {
      cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
  }

  printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
         cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

  len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
  U_ASSERT(status);

  /* Read the flag before closing the clone. */
  if(cloneFlagCtx != NULL) {
      flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
  } else {
      printf("** Warning, couldn't get the subcallback \n");
  }

  ucnv_close(cloneCnv);

  /* print out the converted bytes */
  printBytes("bytes", bytes, len2);

  return flagVal; /* true if callback was called */
}
    838 
    839 UErrorCode convsample_21()
    840 {
    841   const char *sample1 = "abc\xdf\xbf";
    842   const char *sample2 = "abc_def";
    843 
    844   if(convsample_21_didSubstitute(sample1))
    845   {
    846     printf("DID substitute.\n******\n");
    847   }
    848   else
    849   {
    850     printf("Did NOT substitute.\n*****\n");
    851   }
    852 
    853   if(convsample_21_didSubstitute(sample2))
    854   {
    855     printf("DID substitute.\n******\n");
    856   }
    857   else
    858   {
    859     printf("Did NOT substitute.\n*****\n");
    860   }
    861 
    862   return U_ZERO_ERROR;
    863 }
    864 
    865 
    866 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
    867 
    868 #define BUFFERSIZE 17 /* make it interesting :) */
    869 
    870 UErrorCode convsample_40()
    871 {
    872   printf("\n\n==============================================\n"
    873     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
    874 
    875   FILE *f;
    876   FILE *out;
    877   int32_t count;
    878   char inBuf[BUFFERSIZE];
    879   const char *source;
    880   const char *sourceLimit;
    881   UChar *uBuf;
    882   UChar *target;
    883   UChar *targetLimit;
    884   int32_t uBufSize = 0;
    885   UConverter *conv = NULL;
    886   UErrorCode status = U_ZERO_ERROR;
    887   uint32_t inbytes=0, total=0;
    888 
    889   f = fopen("data02.bin", "rb");
    890   if(!f)
    891   {
    892     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    893     return U_FILE_ACCESS_ERROR;
    894   }
    895 
    896   out = fopen("data40.utf16", "wb");
    897   if(!out)
    898   {
    899     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    900     fclose(f);
    901     return U_FILE_ACCESS_ERROR;
    902   }
    903 
    904   // **************************** START SAMPLE *******************
    905   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
    906   assert(U_SUCCESS(status));
    907 
    908   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    909   printf("input bytes %d / min chars %d = %d UChars\n",
    910          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    911   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    912   assert(uBuf!=NULL);
    913 
    914   // grab another buffer's worth
    915   while((!feof(f)) &&
    916         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
    917   {
    918     inbytes += count;
    919 
    920     // Convert bytes to unicode
    921     source = inBuf;
    922     sourceLimit = inBuf + count;
    923 
    924     do
    925     {
    926         target = uBuf;
    927         targetLimit = uBuf + uBufSize;
    928 
    929         ucnv_toUnicode( conv, &target, targetLimit,
    930                        &source, sourceLimit, NULL,
    931                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    932                                    /* is true (when no more data will come) */
    933                          &status);
    934 
    935         if(status == U_BUFFER_OVERFLOW_ERROR)
    936         {
    937           // simply ran out of space - we'll reset the target ptr the next
    938           // time through the loop.
    939           status = U_ZERO_ERROR;
    940         }
    941         else
    942         {
    943           //  Check other errors here.
    944           assert(U_SUCCESS(status));
    945           // Break out of the loop (by force)
    946         }
    947 
    948         // Process the Unicode
    949         // Todo: handle UTF-16/surrogates
    950         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
    951                (size_t)(target-uBuf));
    952         total += (target-uBuf);
    953     } while (source < sourceLimit); // while simply out of space
    954   }
    955 
    956   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
    957 
    958   // ***************************** END SAMPLE ********************
    959   ucnv_close(conv);
    960 
    961   fclose(f);
    962   fclose(out);
    963   printf("\n");
    964 
    965   return U_ZERO_ERROR;
    966 }
    967 #undef BUFFERSIZE
    968 
    969 
    970 
    971 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
    972 
    973 #define BUFFERSIZE 24 /* make it interesting :) */
    974 
    975 UErrorCode convsample_46()
    976 {
    977   printf("\n\n==============================================\n"
    978     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
    979 
    980   FILE *f;
    981   FILE *out;
    982   int32_t count;
    983   UChar inBuf[BUFFERSIZE];
    984   const UChar *source;
    985   const UChar *sourceLimit;
    986   char *buf;
    987   char *target;
    988   char *targetLimit;
    989 
    990   int32_t bufSize = 0;
    991   UConverter *conv = NULL;
    992   UErrorCode status = U_ZERO_ERROR;
    993   uint32_t inchars=0, total=0;
    994 
    995   f = fopen("data40.utf16", "rb");
    996   if(!f)
    997   {
    998     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
    999     return U_FILE_ACCESS_ERROR;
   1000   }
   1001 
   1002   out = fopen("data46.out", "wb");
   1003   if(!out)
   1004   {
   1005     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
   1006     fclose(f);
   1007     return U_FILE_ACCESS_ERROR;
   1008   }
   1009 
   1010   // **************************** START SAMPLE *******************
   1011   conv = ucnv_open( "iso-8859-2", &status);
   1012   assert(U_SUCCESS(status));
   1013 
   1014   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
   1015   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
   1016          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
   1017   buf = (char*)malloc(bufSize * sizeof(char));
   1018   assert(buf!=NULL);
   1019 
   1020   // grab another buffer's worth
   1021   while((!feof(f)) &&
   1022         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
   1023   {
   1024     inchars += count;
   1025 
   1026     // Convert bytes to unicode
   1027     source = inBuf;
   1028     sourceLimit = inBuf + count;
   1029 
   1030     do
   1031     {
   1032         target = buf;
   1033         targetLimit = buf + bufSize;
   1034 
   1035         ucnv_fromUnicode( conv, &target, targetLimit,
   1036                        &source, sourceLimit, NULL,
   1037                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
   1038                                    /* is true (when no more data will come) */
   1039                          &status);
   1040 
   1041         if(status == U_BUFFER_OVERFLOW_ERROR)
   1042         {
   1043           // simply ran out of space - we'll reset the target ptr the next
   1044           // time through the loop.
   1045           status = U_ZERO_ERROR;
   1046         }
   1047         else
   1048         {
   1049           //  Check other errors here.
   1050           assert(U_SUCCESS(status));
   1051           // Break out of the loop (by force)
   1052         }
   1053 
   1054         // Process the Unicode
   1055         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
   1056                (size_t)(target-buf));
   1057         total += (target-buf);
   1058     } while (source < sourceLimit); // while simply out of space
   1059   }
   1060 
   1061   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
   1062 
   1063   // ***************************** END SAMPLE ********************
   1064   ucnv_close(conv);
   1065 
   1066   fclose(f);
   1067   fclose(out);
   1068   printf("\n");
   1069 
   1070   return U_ZERO_ERROR;
   1071 }
   1072 #undef BUFFERSIZE
   1073 
   1074 #define BUFFERSIZE 219
   1075 
   1076 
   1077 /* main */
   1078 
   1079 int main()
   1080 {
   1081 
   1082   printf("Default Converter=%s\n", ucnv_getDefaultName() );
   1083 
   1084   convsample_02();  // C  , u->koi8r, conv
   1085   convsample_03();  // C,   iterate
   1086 
   1087   convsample_05();  // C,  utf8->u, getNextUChar
   1088   convsample_06(); // C freq counter thingy
   1089 
   1090   convsample_12();  // C,  sjis->u, conv
   1091   convsample_13();  // C,  big5->u, getNextU
   1092 
   1093   convsample_20();  // C, callback
   1094   convsample_21();  // C, callback debug
   1095 
   1096   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
   1097 
   1098   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
   1099 
   1100   printf("End of converter samples.\n");
   1101 
   1102   fflush(stdout);
   1103   fflush(stderr);
   1104 
   1105   return 0;
   1106 }
   1107