Home | History | Annotate | Download | only in uciter8
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003-2007, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uciter8.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003jan10
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file contains sample code that illustrates reading
     17 *   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
     18 *   and also accepting single surrogates.
     19 */
     20 
     21 #include <stdio.h>
     22 #include <string.h>
     23 #include "unicode/utypes.h"
     24 #include "unicode/uiter.h"
     25 #include "uit_len8.h"
     26 
     27 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
     28 
     29 #define log_err printf
     30 
     31 /* UCharIterator test ------------------------------------------------------- */
     32 
     33 /*
     34  * The following code is a copy of the UCharIterator test code in
     35  * source/test/cintltst/custrtst.c,
     36  * testing the lenient-8 iterator instead of the UTF-8 one.
     37  */
     38 
     39 /*
     40  * Compare results from two iterators, should be same.
     41  * Assume that the text is not empty and that
     42  * iteration start==0 and iteration limit==length.
     43  */
     44 static void
     45 compareIterators(UCharIterator *iter1, const char *n1,
     46                  UCharIterator *iter2, const char *n2) {
     47     int32_t i, pos1, pos2, middle, length;
     48     UChar32 c1, c2;
     49 
     50     /* compare lengths */
     51     length=iter1->getIndex(iter1, UITER_LENGTH);
     52     pos2=iter2->getIndex(iter2, UITER_LENGTH);
     53     if(length!=pos2) {
     54         log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
     55         return;
     56     }
     57 
     58     /* set into the middle */
     59     middle=length/2;
     60 
     61     pos1=iter1->move(iter1, middle, UITER_ZERO);
     62     if(pos1!=middle) {
     63         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
     64         return;
     65     }
     66 
     67     pos2=iter2->move(iter2, middle, UITER_ZERO);
     68     if(pos2!=middle) {
     69         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
     70         return;
     71     }
     72 
     73     /* test current() */
     74     c1=iter1->current(iter1);
     75     c2=iter2->current(iter2);
     76     if(c1!=c2) {
     77         log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
     78         return;
     79     }
     80 
     81     /* move forward 3 UChars */
     82     for(i=0; i<3; ++i) {
     83         c1=iter1->next(iter1);
     84         c2=iter2->next(iter2);
     85         if(c1!=c2) {
     86             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
     87             return;
     88         }
     89     }
     90 
     91     /* move backward 5 UChars */
     92     for(i=0; i<5; ++i) {
     93         c1=iter1->previous(iter1);
     94         c2=iter2->previous(iter2);
     95         if(c1!=c2) {
     96             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
     97             return;
     98         }
     99     }
    100 
    101     /* iterate forward from the beginning */
    102     pos1=iter1->move(iter1, 0, UITER_START);
    103     if(pos1<0) {
    104         log_err("%s->move(start) failed\n", n1);
    105         return;
    106     }
    107     if(!iter1->hasNext(iter1)) {
    108         log_err("%s->hasNext() at the start returns FALSE\n", n1);
    109         return;
    110     }
    111 
    112     pos2=iter2->move(iter2, 0, UITER_START);
    113     if(pos2<0) {
    114         log_err("%s->move(start) failed\n", n2);
    115         return;
    116     }
    117     if(!iter2->hasNext(iter2)) {
    118         log_err("%s->hasNext() at the start returns FALSE\n", n2);
    119         return;
    120     }
    121 
    122     do {
    123         c1=iter1->next(iter1);
    124         c2=iter2->next(iter2);
    125         if(c1!=c2) {
    126             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
    127             return;
    128         }
    129     } while(c1>=0);
    130 
    131     if(iter1->hasNext(iter1)) {
    132         log_err("%s->hasNext() at the end returns TRUE\n", n1);
    133         return;
    134     }
    135     if(iter2->hasNext(iter2)) {
    136         log_err("%s->hasNext() at the end returns TRUE\n", n2);
    137         return;
    138     }
    139 
    140     /* back to the middle */
    141     pos1=iter1->move(iter1, middle, UITER_ZERO);
    142     if(pos1!=middle) {
    143         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
    144         return;
    145     }
    146 
    147     pos2=iter2->move(iter2, middle, UITER_ZERO);
    148     if(pos2!=middle) {
    149         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
    150         return;
    151     }
    152 
    153     /* move to index 1 */
    154     pos1=iter1->move(iter1, 1, UITER_ZERO);
    155     if(pos1!=1) {
    156         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
    157         return;
    158     }
    159 
    160     pos2=iter2->move(iter2, 1, UITER_ZERO);
    161     if(pos2!=1) {
    162         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
    163         return;
    164     }
    165 
    166     /* iterate backward from the end */
    167     pos1=iter1->move(iter1, 0, UITER_LIMIT);
    168     if(pos1<0) {
    169         log_err("%s->move(limit) failed\n", n1);
    170         return;
    171     }
    172     if(!iter1->hasPrevious(iter1)) {
    173         log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
    174         return;
    175     }
    176 
    177     pos2=iter2->move(iter2, 0, UITER_LIMIT);
    178     if(pos2<0) {
    179         log_err("%s->move(limit) failed\n", n2);
    180         return;
    181     }
    182     if(!iter2->hasPrevious(iter2)) {
    183         log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
    184         return;
    185     }
    186 
    187     do {
    188         c1=iter1->previous(iter1);
    189         c2=iter2->previous(iter2);
    190         if(c1!=c2) {
    191             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
    192             return;
    193         }
    194     } while(c1>=0);
    195 
    196     if(iter1->hasPrevious(iter1)) {
    197         log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
    198         return;
    199     }
    200     if(iter2->hasPrevious(iter2)) {
    201         log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
    202         return;
    203     }
    204 }
    205 
    206 /*
    207  * Test the iterator's getState() and setState() functions.
    208  * iter1 and iter2 must be set up for the same iterator type and the same string
    209  * but may be physically different structs (different addresses).
    210  *
    211  * Assume that the text is not empty and that
    212  * iteration start==0 and iteration limit==length.
    213  * It must be 2<=middle<=length-2.
    214  */
    215 static void
    216 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
    217     UChar32 u[4];
    218 
    219     UErrorCode errorCode;
    220     UChar32 c;
    221     uint32_t state;
    222     int32_t i, j;
    223 
    224     /* get four UChars from the middle of the string */
    225     iter1->move(iter1, middle-2, UITER_ZERO);
    226     for(i=0; i<4; ++i) {
    227         c=iter1->next(iter1);
    228         if(c<0) {
    229             /* the test violates the assumptions, see comment above */
    230             log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
    231             return;
    232         }
    233         u[i]=c;
    234     }
    235 
    236     /* move to the middle and get the state */
    237     iter1->move(iter1, -2, UITER_CURRENT);
    238     state=uiter_getState(iter1);
    239 
    240     /* set the state into the second iterator and compare the results */
    241     errorCode=U_ZERO_ERROR;
    242     uiter_setState(iter2, state, &errorCode);
    243     if(U_FAILURE(errorCode)) {
    244         log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
    245         return;
    246     }
    247 
    248     c=iter2->current(iter2);
    249     if(c!=u[2]) {
    250         log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
    251     }
    252 
    253     c=iter2->previous(iter2);
    254     if(c!=u[1]) {
    255         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
    256     }
    257 
    258     iter2->move(iter2, 2, UITER_CURRENT);
    259     c=iter2->next(iter2);
    260     if(c!=u[3]) {
    261         log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
    262     }
    263 
    264     iter2->move(iter2, -3, UITER_CURRENT);
    265     c=iter2->previous(iter2);
    266     if(c!=u[0]) {
    267         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
    268     }
    269 
    270     /* move the second iterator back to the middle */
    271     iter2->move(iter2, 1, UITER_CURRENT);
    272     iter2->next(iter2);
    273 
    274     /* check that both are in the middle */
    275     i=iter1->getIndex(iter1, UITER_CURRENT);
    276     j=iter2->getIndex(iter2, UITER_CURRENT);
    277     if(i!=middle) {
    278         log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
    279     }
    280     if(i!=j) {
    281         log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
    282     }
    283 
    284     /* compare lengths */
    285     i=iter1->getIndex(iter1, UITER_LENGTH);
    286     j=iter2->getIndex(iter2, UITER_LENGTH);
    287     if(i!=j) {
    288         log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
    289     }
    290 }
    291 
    292 static void
    293 TestLenient8Iterator() {
    294     static const UChar text[]={
    295         0x61, 0x62, 0x63,
    296         /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
    297         0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
    298         0x78, 0x79, 0x7a, 0
    299     };
    300     static const uint8_t bytes[]={
    301         0x61, 0x62, 0x63,
    302         /* dffd            107fd                    d801               dffd - mixture */
    303         0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
    304         0x78, 0x79, 0x7a, 0
    305     };
    306 
    307     UCharIterator iter1, iter2;
    308     UChar32 c1, c2;
    309     int32_t length;
    310 
    311     puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
    312 
    313     /* compare the same string between UTF-16 and lenient-8 UCharIterators */
    314     uiter_setString(&iter1, text, -1);
    315     uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
    316     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
    317 
    318     /* try again with length=-1 */
    319     uiter_setLenient8(&iter2, (const char *)bytes, -1);
    320     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
    321 
    322     /* test get/set state */
    323     length=LENGTHOF(text)-1;
    324     uiter_setLenient8(&iter1, (const char*)bytes, -1);
    325     testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
    326     testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
    327 
    328     /* ---------------------------------------------------------------------- */
    329 
    330     puts("no output so far means that the lenient-8 iterator works fine");
    331 
    332     puts("iterate forward:\nUTF-16\tlenient-8");
    333     uiter_setString(&iter1, text, -1);
    334     iter1.move(&iter1, 0, UITER_START);
    335     iter2.move(&iter2, 0, UITER_START);
    336     for(;;) {
    337         c1=iter1.next(&iter1);
    338         c2=iter2.next(&iter2);
    339         if(c1<0 && c2<0) {
    340             break;
    341         }
    342         if(c1<0) {
    343             printf("\t%04x\n", c2);
    344         } else if(c2<0) {
    345             printf("%04x\n", c1);
    346         } else {
    347             printf("%04x\t%04x\n", c1, c2);
    348         }
    349     }
    350 }
    351 
    352 extern int
    353 main(int argc, const char *argv[]) {
    354     TestLenient8Iterator();
    355     return 0;
    356 }
    357