Home | History | Annotate | Download | only in ucnv
      1 /*************************************************************************
      2 *
      3 *    2016 and later: Unicode, Inc. and others.
      4 *   License & terms of use: http://www.unicode.org/copyright.html#License
      5 *
      6 **************************************************************************
      7 **************************************************************************
      8 *
      9 *   Copyright (C) 2000-2016, International Business Machines
     10 *   Corporation and others.  All Rights Reserved.
     11 *
     12 ***************************************************************************
     13 *   file name:  convsamp.c
     14 *   encoding:   ASCII (7-bit)
     15 *
     16 *   created on: 2000may30
     17 *   created by: Steven R. Loomis
     18 *
     19 *   Sample code for the ICU conversion routines.
     20 *
     21 * Note: Nothing special is needed to build this sample. Link with
     22 *       the icu UC and icu I18N libraries.
     23 *
     24 *       I use 'assert' for error checking, you probably will want
     25 *       something more flexible.  '***BEGIN SAMPLE***' and
     26 *       '***END SAMPLE***' mark pieces suitable for stand alone
     27 *       code snippets.
     28 *
     29 *
     30 *  Each test can define its own BUFFERSIZE
     31 *
     32 */
     33 
     34 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
     35 
     36 #include <stdio.h>
     37 #include <ctype.h>            /* for isspace, etc.    */
     38 #include <assert.h>
     39 #include <string.h>
     40 #include <stdlib.h>  /* malloc */
     41 
     42 #include "unicode/utypes.h"   /* Basic ICU data types */
     43 #include "unicode/ucnv.h"     /* C   Converter API    */
     44 #include "unicode/ustring.h"  /* some more string fcns*/
     45 #include "unicode/uchar.h"    /* char names           */
     46 #include "unicode/uloc.h"
     47 #include "unicode/unistr.h"
     48 
     49 #include "flagcb.h"
     50 
     51 /* Some utility functions */
     52 #ifndef UPRV_LENGTHOF
     53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     54 #endif
     55 
     56 static const UChar kNone[] = { 0x0000 };
     57 
     58 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
     59 
     60 /* Print a UChar if possible, in seven characters. */
     61 void prettyPrintUChar(UChar c)
     62 {
     63   if(  (c <= 0x007F) &&
     64        (isgraph(c))  ) {
     65     printf(" '%c'   ", (char)(0x00FF&c));
     66   } else if ( c > 0x007F ) {
     67     char buf[1000];
     68     UErrorCode status = U_ZERO_ERROR;
     69     int32_t o;
     70 
     71     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
     72     if(U_SUCCESS(status) && (o>0) ) {
     73       buf[6] = 0;
     74       printf("%7s", buf);
     75     } else {
     76       printf(" ??????");
     77     }
     78   } else {
     79     switch((char)(c & 0x007F)) {
     80     case ' ':
     81       printf(" ' '   ");
     82       break;
     83     case '\t':
     84       printf(" \\t    ");
     85       break;
     86     case '\n':
     87       printf(" \\n    ");
     88       break;
     89     default:
     90       printf("  _    ");
     91       break;
     92     }
     93   }
     94 }
     95 
     96 
     97 void printUChars(const char  *name = "?",
     98                  const UChar *uch  = kNone,
     99                  int32_t     len   = -1 )
    100 {
    101   int32_t i;
    102 
    103   if( (len == -1) && (uch) ) {
    104     len = u_strlen(uch);
    105   }
    106 
    107   printf("%5s: ", name);
    108   for( i = 0; i <len; i++) {
    109     printf("%-6d ", i);
    110   }
    111   printf("\n");
    112 
    113   printf("%5s: ", "uni");
    114   for( i = 0; i <len; i++) {
    115     printf("\\u%04X ", (int)uch[i]);
    116   }
    117   printf("\n");
    118 
    119   printf("%5s:", "ch");
    120   for( i = 0; i <len; i++) {
    121     prettyPrintUChar(uch[i]);
    122   }
    123   printf("\n");
    124 }
    125 
    126 void printBytes(const char  *name = "?",
    127                  const char *uch  = "",
    128                  int32_t     len   = -1 )
    129 {
    130   int32_t i;
    131 
    132   if( (len == -1) && (uch) ) {
    133     len = static_cast<int32_t>(strlen(uch));
    134   }
    135 
    136   printf("%5s: ", name);
    137   for( i = 0; i <len; i++) {
    138     printf("%-4d ", i);
    139   }
    140   printf("\n");
    141 
    142   printf("%5s: ", "uni");
    143   for( i = 0; i <len; i++) {
    144     printf("\\x%02X ", 0x00FF & (int)uch[i]);
    145   }
    146   printf("\n");
    147 
    148   printf("%5s:", "ch");
    149   for( i = 0; i <len; i++) {
    150     if(isgraph(0x00FF & (int)uch[i])) {
    151       printf(" '%c' ", (char)uch[i]);
    152     } else {
    153       printf("     ");
    154     }
    155   }
    156   printf("\n");
    157 }
    158 
    159 void printUChar(UChar32 ch32)
    160 {
    161     if(ch32 > 0xFFFF) {
    162       printf("ch: U+%06X\n", ch32);
    163     }
    164     else {
    165       UChar ch = (UChar)ch32;
    166       printUChars("C", &ch, 1);
    167     }
    168 }
    169 
    170 /*******************************************************************
    171   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
    172   followed by an exclamation mark (!) into the KOI8-R Russian code page.
    173 
    174   This example first creates a UChar String out of the Unicode chars.
    175 
    176   targetSize must be set to the amount of space available in the target
    177   buffer. After fromUChars is called,
    178   len will contain the number of bytes in target[] which were
    179   used in the resulting codepage.  In this case, there is a 1:1 mapping
    180   between the input and output characters. The exclamation mark has the
    181   same value in both KOI8-R and Unicode.
    182 
    183   src: 0      1      2      3      4      5      6
    184   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
    185    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
    186 
    187  targ:  0    1    2    3    4    5    6
    188   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
    189    ch:                                '!'
    190 
    191 
    192 Converting FROM unicode
    193   to koi8-r.
    194   You must call ucnv_close to clean up the memory used by the
    195   converter.
    196 
    197   'len' returns the number of OUTPUT bytes resulting from the
    198   conversion.
    199  */
    200 
    201 UErrorCode convsample_02()
    202 {
    203   printf("\n\n==============================================\n"
    204          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
    205 
    206 
    207   // **************************** START SAMPLE *******************
    208   // "cat<cat>OK"
    209   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
    210                      0x0430, 0x0021, 0x0000 };
    211   char target[100];
    212   UErrorCode status = U_ZERO_ERROR;
    213   UConverter *conv;
    214   int32_t     len;
    215 
    216   // set up the converter
    217   //! [ucnv_open]
    218   conv = ucnv_open("koi8-r", &status);
    219   //! [ucnv_open]
    220   assert(U_SUCCESS(status));
    221 
    222   // convert to koi8-r
    223   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
    224   assert(U_SUCCESS(status));
    225 
    226   // close the converter
    227   ucnv_close(conv);
    228 
    229   // ***************************** END SAMPLE ********************
    230 
    231   // Print it out
    232   printUChars("src", source);
    233   printf("\n");
    234   printBytes("targ", target, len);
    235 
    236   return U_ZERO_ERROR;
    237 }
    238 
    239 
    240 UErrorCode convsample_03()
    241 {
    242   printf("\n\n==============================================\n"
    243          "Sample 03: C: print out all converters\n");
    244 
    245   int32_t count;
    246   int32_t i;
    247 
    248   // **************************** START SAMPLE *******************
    249   count = ucnv_countAvailable();
    250   printf("Available converters: %d\n", count);
    251 
    252   for(i=0;i<count;i++)
    253   {
    254     printf("%s ", ucnv_getAvailableName(i));
    255   }
    256 
    257   // ***************************** END SAMPLE ********************
    258 
    259   printf("\n");
    260 
    261   return U_ZERO_ERROR;
    262 }
    263 
    264 
    265 
    266 #define BUFFERSIZE 17 /* make it interesting :) */
    267 
    268 /*
    269   Converting from a codepage to Unicode in bulk..
    270   What is the best way to determine the buffer size?
    271 
    272      The 'buffersize' is in bytes of input.
    273     For a given converter, dividing this by the minimum char size
    274     gives you the maximum number of Unicode characters that could be
    275     expected for a given number of input bytes.
    276      see: ucnv_getMinCharSize()
    277 
    278      For example, a single byte codepage like 'Latin-3' has a
    279     minimum char size of 1. (It takes at least 1 byte to represent
    280     each Unicode char.) So the unicode buffer has the same number of
    281     UChars as the input buffer has bytes.
    282 
    283      In a strictly double byte codepage such as cp1362 (Windows
    284     Korean), the minimum char size is 2. So, only half as many Unicode
    285     chars as bytes are needed.
    286 
    287      This work to calculate the buffer size is an optimization. Any
    288     size of input and output buffer can be used, as long as the
    289     program handles the following cases: If the input buffer is empty,
    290     the source pointer will be equal to sourceLimit.  If the output
    291     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
    292  */
    293 
    294 UErrorCode convsample_05()
    295 {
    296   printf("\n\n==============================================\n"
    297          "Sample 05: C: count the number of letters in a UTF-8 document\n");
    298 
    299   FILE *f;
    300   int32_t count;
    301   char inBuf[BUFFERSIZE];
    302   const char *source;
    303   const char *sourceLimit;
    304   UChar *uBuf;
    305   UChar *target;
    306   UChar *targetLimit;
    307   UChar *p;
    308   int32_t uBufSize = 0;
    309   UConverter *conv;
    310   UErrorCode status = U_ZERO_ERROR;
    311   uint32_t letters=0, total=0;
    312 
    313   f = fopen("data01.txt", "r");
    314   if(!f)
    315   {
    316     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    317     return U_FILE_ACCESS_ERROR;
    318   }
    319 
    320   // **************************** START SAMPLE *******************
    321   conv = ucnv_open("utf-8", &status);
    322   assert(U_SUCCESS(status));
    323 
    324   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    325   printf("input bytes %d / min chars %d = %d UChars\n",
    326          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    327   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    328   assert(uBuf!=NULL);
    329 
    330   // grab another buffer's worth
    331   while((!feof(f)) &&
    332         ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) )
    333   {
    334     // Convert bytes to unicode
    335     source = inBuf;
    336     sourceLimit = inBuf + count;
    337 
    338     do
    339     {
    340         target = uBuf;
    341         targetLimit = uBuf + uBufSize;
    342 
    343         ucnv_toUnicode(conv, &target, targetLimit,
    344                        &source, sourceLimit, NULL,
    345                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    346                                    /* is true (when no more data will come) */
    347                        &status);
    348 
    349         if(status == U_BUFFER_OVERFLOW_ERROR)
    350         {
    351           // simply ran out of space - we'll reset the target ptr the next
    352           // time through the loop.
    353           status = U_ZERO_ERROR;
    354         }
    355         else
    356         {
    357           //  Check other errors here.
    358           assert(U_SUCCESS(status));
    359           // Break out of the loop (by force)
    360         }
    361 
    362         // Process the Unicode
    363         // Todo: handle UTF-16/surrogates
    364 
    365         for(p = uBuf; p<target; p++)
    366         {
    367           if(u_isalpha(*p))
    368             letters++;
    369           total++;
    370         }
    371     } while (source < sourceLimit); // while simply out of space
    372   }
    373 
    374   printf("%d letters out of %d total UChars.\n", letters, total);
    375 
    376   // ***************************** END SAMPLE ********************
    377   ucnv_close(conv);
    378 
    379   printf("\n");
    380 
    381   fclose(f);
    382 
    383   return U_ZERO_ERROR;
    384 }
    385 #undef BUFFERSIZE
    386 
    387 #define BUFFERSIZE 1024
/* One entry of the frequency table filled in by convsample_06(). */
typedef struct
{
  UChar32  codepoint;   /* the code point this entry stands for */
  uint32_t frequency;   /* how many times that code point has been seen */
} CharFreqInfo;
    393 
    394 UErrorCode convsample_06()
    395 {
    396   printf("\n\n==============================================\n"
    397          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
    398 
    399   FILE *f;
    400   int32_t count;
    401   char inBuf[BUFFERSIZE];
    402   const char *source;
    403   const char *sourceLimit;
    404   int32_t uBufSize = 0;
    405   UConverter *conv;
    406   UErrorCode status = U_ZERO_ERROR;
    407   uint32_t letters=0, total=0;
    408 
    409   CharFreqInfo   *info;
    410   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
    411   UChar32   p;
    412 
    413   uint32_t ie = 0;
    414   uint32_t gh = 0;
    415   UChar32 l = 0;
    416 
    417   f = fopen("data06.txt", "r");
    418   if(!f)
    419   {
    420     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    421     return U_FILE_ACCESS_ERROR;
    422   }
    423 
    424   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
    425   if(!info)
    426   {
    427     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", static_cast<int>(sizeof(CharFreqInfo)*charCount));
    428   }
    429 
    430   /* reset frequencies */
    431   for(p=0;p<charCount;p++)
    432   {
    433     info[p].codepoint = p;
    434     info[p].frequency = 0;
    435   }
    436 
    437   // **************************** START SAMPLE *******************
    438   conv = ucnv_open("utf-8", &status);
    439   assert(U_SUCCESS(status));
    440 
    441   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    442   printf("input bytes %d / min chars %d = %d UChars\n",
    443          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    444 
    445   // grab another buffer's worth
    446   while((!feof(f)) &&
    447         ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) )
    448   {
    449     // Convert bytes to unicode
    450     source = inBuf;
    451     sourceLimit = inBuf + count;
    452 
    453     while(source < sourceLimit)
    454     {
    455       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
    456       if(U_FAILURE(status))
    457       {
    458         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
    459         status = U_ZERO_ERROR;
    460         continue;
    461       }
    462       U_ASSERT(status);
    463       total++;
    464 
    465       if(u_isalpha(p))
    466         letters++;
    467 
    468       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
    469         ie++;
    470 
    471       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
    472         gh++;
    473 
    474       if(p>charCount)
    475       {
    476         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
    477         free(info);
    478         fclose(f);
    479         ucnv_close(conv);
    480         return U_UNSUPPORTED_ERROR;
    481       }
    482       info[p].frequency++;
    483       l = p;
    484     }
    485   }
    486 
    487   fclose(f);
    488   ucnv_close(conv);
    489 
    490   printf("%d letters out of %d total UChars.\n", letters, total);
    491   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
    492 
    493   // now, we could sort it..
    494 
    495   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
    496 
    497   for(p=0;p<charCount;p++)
    498   {
    499     if(info[p].frequency)
    500     {
    501       printf("% 5d U+%06X ", info[p].frequency, p);
    502       if(p <= 0xFFFF)
    503       {
    504         prettyPrintUChar((UChar)p);
    505       }
    506       printf("\n");
    507     }
    508   }
    509   free(info);
    510   // ***************************** END SAMPLE ********************
    511 
    512   printf("\n");
    513 
    514   return U_ZERO_ERROR;
    515 }
    516 #undef BUFFERSIZE
    517 
    518 
    519 /******************************************************
    520   You must call ucnv_close to clean up the memory used by the
    521   converter.
    522 
    523   'len' returns the number of OUTPUT UChars resulting from the
    524   conversion.
    525  */
    526 
    527 UErrorCode convsample_12()
    528 {
    529   printf("\n\n==============================================\n"
    530          "Sample 12: C: simple sjis -> unicode conversion\n");
    531 
    532 
    533   // **************************** START SAMPLE *******************
    534 
    535   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
    536   UChar target[100];
    537   UErrorCode status = U_ZERO_ERROR;
    538   UConverter *conv;
    539   int32_t     len;
    540 
    541   // set up the converter
    542   conv = ucnv_open("shift_jis", &status);
    543   assert(U_SUCCESS(status));
    544 
    545   // convert to Unicode
    546   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
    547   target[6] = 0xFDCA;
    548   len = ucnv_toUChars(conv, target, 100, source, static_cast<int32_t>(strlen(source)), &status);
    549   U_ASSERT(status);
    550   // close the converter
    551   ucnv_close(conv);
    552 
    553   // ***************************** END SAMPLE ********************
    554 
    555   // Print it out
    556   printBytes("src", source, static_cast<int32_t>(strlen(source)) );
    557   printf("\n");
    558   printUChars("targ", target, len);
    559 
    560   return U_ZERO_ERROR;
    561 }
    562 
    563 /******************************************************************
    564    C: Convert from codepage to Unicode one at a time.
    565 */
    566 
    567 UErrorCode convsample_13()
    568 {
    569   printf("\n\n==============================================\n"
    570          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
    571 
    572 
    573   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
    574   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
    575   const char *source, *sourceLimit;
    576   UChar32 target;
    577   UErrorCode status = U_ZERO_ERROR;
    578   UConverter *conv = NULL;
    579   int32_t srcCount=0;
    580   int32_t dstCount=0;
    581 
    582   srcCount = sizeof(sourceChars);
    583 
    584   conv = ucnv_open("Big5", &status);
    585   U_ASSERT(status);
    586 
    587   source = sourceChars;
    588   sourceLimit = sourceChars + sizeof(sourceChars);
    589 
    590   // **************************** START SAMPLE *******************
    591 
    592 
    593   printBytes("src", source, static_cast<int32_t>(sourceLimit - source));
    594 
    595   while(source < sourceLimit)
    596   {
    597     puts("");
    598     target = ucnv_getNextUChar (conv,
    599                                 &source,
    600                                 sourceLimit,
    601                                 &status);
    602 
    603     //    printBytes("src",source,sourceLimit-source);
    604     U_ASSERT(status);
    605     printUChar(target);
    606     dstCount++;
    607   }
    608 
    609 
    610   // ************************** END SAMPLE *************************
    611 
    612   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
    613   ucnv_close(conv);
    614 
    615   return U_ZERO_ERROR;
    616 }
    617 
    618 
    619 
    620 
    621 UBool convsample_20_didSubstitute(const char *source)
    622 {
    623   UChar uchars[100];
    624   char bytes[100];
    625   UConverter *conv = NULL;
    626   UErrorCode status = U_ZERO_ERROR;
    627   uint32_t len, len2;
    628   UBool  flagVal;
    629 
    630   FromUFLAGContext * context = NULL;
    631 
    632   printf("\n\n==============================================\n"
    633          "Sample 20: C: Test for substitution using callbacks\n");
    634 
    635   /* print out the original source */
    636   printBytes("src", source);
    637   printf("\n");
    638 
    639   /* First, convert from UTF8 to unicode */
    640   conv = ucnv_open("utf-8", &status);
    641   U_ASSERT(status);
    642 
    643   len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status);
    644   U_ASSERT(status);
    645 
    646   printUChars("uch", uchars, len);
    647   printf("\n");
    648 
    649   /* Now, close the converter */
    650   ucnv_close(conv);
    651 
    652   /* Now, convert to windows-1252 */
    653   conv = ucnv_open("windows-1252", &status);
    654   U_ASSERT(status);
    655 
    656   /* Converter starts out with the SUBSTITUTE callback set. */
    657 
    658   /* initialize our callback */
    659   context = flagCB_fromU_openContext();
    660 
    661   /* Set our special callback */
    662   ucnv_setFromUCallBack(conv,
    663                         flagCB_fromU,
    664                         context,
    665                         &(context->subCallback),
    666                         &(context->subContext),
    667                         &status);
    668 
    669   U_ASSERT(status);
    670 
    671   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
    672   U_ASSERT(status);
    673 
    674   flagVal = context->flag;  /* it's about to go away when we close the cnv */
    675 
    676   ucnv_close(conv);
    677 
    678   /* print out the original source */
    679   printBytes("bytes", bytes, len2);
    680 
    681   return flagVal; /* true if callback was called */
    682 }
    683 
    684 UErrorCode convsample_20()
    685 {
    686   const char *sample1 = "abc\xdf\xbf";
    687   const char *sample2 = "abc_def";
    688 
    689 
    690   if(convsample_20_didSubstitute(sample1))
    691   {
    692     printf("DID substitute.\n******\n");
    693   }
    694   else
    695   {
    696     printf("Did NOT substitute.\n*****\n");
    697   }
    698 
    699   if(convsample_20_didSubstitute(sample2))
    700   {
    701     printf("DID substitute.\n******\n");
    702   }
    703   else
    704   {
    705     printf("Did NOT substitute.\n*****\n");
    706   }
    707 
    708   return U_ZERO_ERROR;
    709 }
    710 
    711 // 21  - C, callback, with clone and debug
    712 
    713 
    714 
    715 UBool convsample_21_didSubstitute(const char *source)
    716 {
    717   UChar uchars[100];
    718   char bytes[100];
    719   UConverter *conv = NULL, *cloneCnv = NULL;
    720   UErrorCode status = U_ZERO_ERROR;
    721   uint32_t len, len2;
    722   UBool  flagVal = FALSE;
    723   UConverterFromUCallback junkCB;
    724 
    725   FromUFLAGContext *flagCtx = NULL,
    726                    *cloneFlagCtx = NULL;
    727 
    728   debugCBContext   *debugCtx1 = NULL,
    729                    *debugCtx2 = NULL,
    730                    *cloneDebugCtx = NULL;
    731 
    732   printf("\n\n==============================================\n"
    733          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
    734 
    735   /* print out the original source */
    736   printBytes("src", source);
    737   printf("\n");
    738 
    739   /* First, convert from UTF8 to unicode */
    740   conv = ucnv_open("utf-8", &status);
    741   U_ASSERT(status);
    742 
    743   len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status);
    744   U_ASSERT(status);
    745 
    746   printUChars("uch", uchars, len);
    747   printf("\n");
    748 
    749   /* Now, close the converter */
    750   ucnv_close(conv);
    751 
    752   /* Now, convert to windows-1252 */
    753   conv = ucnv_open("windows-1252", &status);
    754   U_ASSERT(status);
    755 
    756   /* Converter starts out with the SUBSTITUTE callback set. */
    757 
    758   /* initialize our callback */
    759   /* from the 'bottom' innermost, out
    760    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
    761 
    762 #if DEBUG_TMI
    763   printf("flagCB_fromU = %p\n", &flagCB_fromU);
    764   printf("debugCB_fromU = %p\n", &debugCB_fromU);
    765 #endif
    766 
    767   debugCtx1 = debugCB_openContext();
    768    flagCtx  = flagCB_fromU_openContext();
    769   debugCtx2 = debugCB_openContext();
    770 
    771   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
    772   debugCtx1->subContext  =  flagCtx;
    773 
    774   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
    775   flagCtx->subContext    =  debugCtx2;
    776 
    777   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
    778   debugCtx2->subContext  = NULL;
    779 
    780   /* Set our special callback */
    781 
    782   ucnv_setFromUCallBack(conv,
    783                         debugCB_fromU,
    784                         debugCtx1,
    785                         &(debugCtx2->subCallback),
    786                         &(debugCtx2->subContext),
    787                         &status);
    788 
    789   U_ASSERT(status);
    790 
    791 #if DEBUG_TMI
    792   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
    793          conv, debugCtx1, debugCtx1->subCallback,
    794          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
    795 #endif
    796 
    797   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
    798 
    799   U_ASSERT(status);
    800 
    801 #if DEBUG_TMI
    802   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
    803 #endif
    804 
    805   ucnv_close(conv);
    806 
    807 #if DEBUG_TMI
    808   printf("%p closed.\n", conv);
    809 #endif
    810 
    811   U_ASSERT(status);
    812   /* Now, we have to extract the context */
    813   cloneDebugCtx = NULL;
    814   cloneFlagCtx  = NULL;
    815 
    816   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
    817   if(cloneDebugCtx != NULL) {
    818       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
    819   }
    820 
    821   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
    822          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
    823 
    824   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
    825   U_ASSERT(status);
    826 
    827   if(cloneFlagCtx != NULL) {
    828       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
    829   } else {
    830       printf("** Warning, couldn't get the subcallback \n");
    831   }
    832 
    833   ucnv_close(cloneCnv);
    834 
    835   /* print out the original source */
    836   printBytes("bytes", bytes, len2);
    837 
    838   return flagVal; /* true if callback was called */
    839 }
    840 
    841 UErrorCode convsample_21()
    842 {
    843   const char *sample1 = "abc\xdf\xbf";
    844   const char *sample2 = "abc_def";
    845 
    846   if(convsample_21_didSubstitute(sample1))
    847   {
    848     printf("DID substitute.\n******\n");
    849   }
    850   else
    851   {
    852     printf("Did NOT substitute.\n*****\n");
    853   }
    854 
    855   if(convsample_21_didSubstitute(sample2))
    856   {
    857     printf("DID substitute.\n******\n");
    858   }
    859   else
    860   {
    861     printf("Did NOT substitute.\n*****\n");
    862   }
    863 
    864   return U_ZERO_ERROR;
    865 }
    866 
    867 
    868 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
    869 
    870 #define BUFFERSIZE 17 /* make it interesting :) */
    871 
    872 UErrorCode convsample_40()
    873 {
    874   printf("\n\n==============================================\n"
    875     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
    876 
    877   FILE *f;
    878   FILE *out;
    879   int32_t count;
    880   char inBuf[BUFFERSIZE];
    881   const char *source;
    882   const char *sourceLimit;
    883   UChar *uBuf;
    884   UChar *target;
    885   UChar *targetLimit;
    886   int32_t uBufSize = 0;
    887   UConverter *conv = NULL;
    888   UErrorCode status = U_ZERO_ERROR;
    889   uint32_t inbytes=0, total=0;
    890 
    891   f = fopen("data02.bin", "rb");
    892   if(!f)
    893   {
    894     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    895     return U_FILE_ACCESS_ERROR;
    896   }
    897 
    898   out = fopen("data40.utf16", "wb");
    899   if(!out)
    900   {
    901     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    902     fclose(f);
    903     return U_FILE_ACCESS_ERROR;
    904   }
    905 
    906   // **************************** START SAMPLE *******************
    907   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
    908   assert(U_SUCCESS(status));
    909 
    910   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
    911   printf("input bytes %d / min chars %d = %d UChars\n",
    912          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
    913   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
    914   assert(uBuf!=NULL);
    915 
    916   // grab another buffer's worth
    917   while((!feof(f)) &&
    918         ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) )
    919   {
    920     inbytes += count;
    921 
    922     // Convert bytes to unicode
    923     source = inBuf;
    924     sourceLimit = inBuf + count;
    925 
    926     do
    927     {
    928         target = uBuf;
    929         targetLimit = uBuf + uBufSize;
    930 
    931         ucnv_toUnicode( conv, &target, targetLimit,
    932                        &source, sourceLimit, NULL,
    933                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
    934                                    /* is true (when no more data will come) */
    935                          &status);
    936 
    937         if(status == U_BUFFER_OVERFLOW_ERROR)
    938         {
    939           // simply ran out of space - we'll reset the target ptr the next
    940           // time through the loop.
    941           status = U_ZERO_ERROR;
    942         }
    943         else
    944         {
    945           //  Check other errors here.
    946           assert(U_SUCCESS(status));
    947           // Break out of the loop (by force)
    948         }
    949 
    950         // Process the Unicode
    951         // Todo: handle UTF-16/surrogates
    952         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == (size_t)(target-uBuf));
    953         total += static_cast<uint32_t>((target-uBuf));
    954     } while (source < sourceLimit); // while simply out of space
    955   }
    956 
    957   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
    958 
    959   // ***************************** END SAMPLE ********************
    960   ucnv_close(conv);
    961 
    962   fclose(f);
    963   fclose(out);
    964   printf("\n");
    965 
    966   return U_ZERO_ERROR;
    967 }
    968 #undef BUFFERSIZE
    969 
    970 
    971 
    972 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
    973 
    974 #define BUFFERSIZE 24 /* make it interesting :) */
    975 
    976 UErrorCode convsample_46()
    977 {
    978   printf("\n\n==============================================\n"
    979     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
    980 
    981   FILE *f;
    982   FILE *out;
    983   int32_t count;
    984   UChar inBuf[BUFFERSIZE];
    985   const UChar *source;
    986   const UChar *sourceLimit;
    987   char *buf;
    988   char *target;
    989   char *targetLimit;
    990 
    991   int32_t bufSize = 0;
    992   UConverter *conv = NULL;
    993   UErrorCode status = U_ZERO_ERROR;
    994   uint32_t inchars=0, total=0;
    995 
    996   f = fopen("data40.utf16", "rb");
    997   if(!f)
    998   {
    999     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
   1000     return U_FILE_ACCESS_ERROR;
   1001   }
   1002 
   1003   out = fopen("data46.out", "wb");
   1004   if(!out)
   1005   {
   1006     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
   1007     fclose(f);
   1008     return U_FILE_ACCESS_ERROR;
   1009   }
   1010 
   1011   // **************************** START SAMPLE *******************
   1012   conv = ucnv_open( "iso-8859-2", &status);
   1013   assert(U_SUCCESS(status));
   1014 
   1015   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
   1016   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
   1017          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
   1018   buf = (char*)malloc(bufSize * sizeof(char));
   1019   assert(buf!=NULL);
   1020 
   1021   // grab another buffer's worth
   1022   while((!feof(f)) &&
   1023         ((count=static_cast<int32_t>(fread(inBuf, sizeof(UChar), BUFFERSIZE , f))) > 0) )
   1024   {
   1025     inchars += count;
   1026 
   1027     // Convert bytes to unicode
   1028     source = inBuf;
   1029     sourceLimit = inBuf + count;
   1030 
   1031     do
   1032     {
   1033         target = buf;
   1034         targetLimit = buf + bufSize;
   1035 
   1036         ucnv_fromUnicode( conv, &target, targetLimit,
   1037                        &source, sourceLimit, NULL,
   1038                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
   1039                                    /* is true (when no more data will come) */
   1040                          &status);
   1041 
   1042         if(status == U_BUFFER_OVERFLOW_ERROR)
   1043         {
   1044           // simply ran out of space - we'll reset the target ptr the next
   1045           // time through the loop.
   1046           status = U_ZERO_ERROR;
   1047         }
   1048         else
   1049         {
   1050           //  Check other errors here.
   1051           assert(U_SUCCESS(status));
   1052           // Break out of the loop (by force)
   1053         }
   1054 
   1055         // Process the Unicode
   1056         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == (size_t)(target-buf));
   1057         total += static_cast<uint32_t>((target-buf));
   1058     } while (source < sourceLimit); // while simply out of space
   1059   }
   1060 
   1061   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, static_cast<int>(inchars * sizeof(UChar)), total);
   1062 
   1063   // ***************************** END SAMPLE ********************
   1064   ucnv_close(conv);
   1065 
   1066   fclose(f);
   1067   fclose(out);
   1068   printf("\n");
   1069 
   1070   return U_ZERO_ERROR;
   1071 }
   1072 #undef BUFFERSIZE
   1073 
   1074 #define BUFFERSIZE 219
   1075 
   1076 void convsample_50() {
   1077   printf("\n\n==============================================\n"
   1078          "Sample 50: C: ucnv_detectUnicodeSignature\n");
   1079 
   1080   //! [ucnv_detectUnicodeSignature]
   1081   UErrorCode err = U_ZERO_ERROR;
   1082   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
   1083   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
   1084   int32_t signatureLength = 0;
   1085   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
   1086   UConverter *conv = NULL;
   1087   UChar output[100];
   1088   UChar *target = output, *out;
   1089   const char *source = input;
   1090   if(encoding!=NULL && U_SUCCESS(err)){
   1091     // should signature be discarded ?
   1092     conv = ucnv_open(encoding, &err);
   1093     // do the conversion
   1094     ucnv_toUnicode(conv,
   1095                    &target, output + UPRV_LENGTHOF(output),
   1096                    &source, input + sizeof(input),
   1097                    NULL, TRUE, &err);
   1098     out = output;
   1099     if (discardSignature){
   1100       ++out; // ignore initial U+FEFF
   1101     }
   1102     while(out != target) {
   1103       printf("%04x ", *out++);
   1104     }
   1105     puts("");
   1106   }
   1107   //! [ucnv_detectUnicodeSignature]
   1108   puts("");
   1109 }
   1110 
   1111 
   1112 
   1113 /* main */
   1114 
   1115 int main()
   1116 {
   1117 
   1118   printf("Default Converter=%s\n", ucnv_getDefaultName() );
   1119 
   1120   convsample_02();  // C  , u->koi8r, conv
   1121   convsample_03();  // C,   iterate
   1122 
   1123   convsample_05();  // C,  utf8->u, getNextUChar
   1124   convsample_06(); // C freq counter thingy
   1125 
   1126   convsample_12();  // C,  sjis->u, conv
   1127   convsample_13();  // C,  big5->u, getNextU
   1128 
   1129   convsample_20();  // C, callback
   1130   convsample_21();  // C, callback debug
   1131 
   1132   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
   1133 
   1134   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
   1135 
   1136   convsample_50();  // C, detect unicode signature
   1137 
   1138   printf("End of converter samples.\n");
   1139 
   1140   fflush(stdout);
   1141   fflush(stderr);
   1142 
   1143   return 0;
   1144 }
   1145