Home | History | Annotate | Download | only in uciter8
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2016 and later: Unicode, Inc. and others.
      5 *   License & terms of use: http://www.unicode.org/copyright.html#License
      6 *
      7 *******************************************************************************
      8 *******************************************************************************
      9 *
     10 *   Copyright (C) 2003-2014, International Business Machines
     11 *   Corporation and others.  All Rights Reserved.
     12 *
     13 *******************************************************************************
     14 *   file name:  uciter8.c
     15 *   encoding:   US-ASCII
     16 *   tab size:   8 (not used)
     17 *   indentation:4
     18 *
     19 *   created on: 2003jan10
     20 *   created by: Markus W. Scherer
     21 *
     22 *   This file contains sample code that illustrates reading
     23 *   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
     24 *   and also accepting single surrogates.
     25 */
     26 
     27 #include <stdio.h>
     28 #include <string.h>
     29 #include "unicode/utypes.h"
     30 #include "unicode/uiter.h"
     31 #include "uit_len8.h"
     32 
     33 #define log_err printf
     34 
     35 /* UCharIterator test ------------------------------------------------------- */
     36 
     37 /*
     38  * The following code is a copy of the UCharIterator test code in
     39  * source/test/cintltst/custrtst.c,
     40  * testing the lenient-8 iterator instead of the UTF-8 one.
     41  */
     42 
     43 /*
     44  * Compare results from two iterators, should be same.
     45  * Assume that the text is not empty and that
     46  * iteration start==0 and iteration limit==length.
     47  */
     48 static void
     49 compareIterators(UCharIterator *iter1, const char *n1,
     50                  UCharIterator *iter2, const char *n2) {
     51     int32_t i, pos1, pos2, middle, length;
     52     UChar32 c1, c2;
     53 
     54     /* compare lengths */
     55     length=iter1->getIndex(iter1, UITER_LENGTH);
     56     pos2=iter2->getIndex(iter2, UITER_LENGTH);
     57     if(length!=pos2) {
     58         log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
     59         return;
     60     }
     61 
     62     /* set into the middle */
     63     middle=length/2;
     64 
     65     pos1=iter1->move(iter1, middle, UITER_ZERO);
     66     if(pos1!=middle) {
     67         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
     68         return;
     69     }
     70 
     71     pos2=iter2->move(iter2, middle, UITER_ZERO);
     72     if(pos2!=middle) {
     73         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
     74         return;
     75     }
     76 
     77     /* test current() */
     78     c1=iter1->current(iter1);
     79     c2=iter2->current(iter2);
     80     if(c1!=c2) {
     81         log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
     82         return;
     83     }
     84 
     85     /* move forward 3 UChars */
     86     for(i=0; i<3; ++i) {
     87         c1=iter1->next(iter1);
     88         c2=iter2->next(iter2);
     89         if(c1!=c2) {
     90             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
     91             return;
     92         }
     93     }
     94 
     95     /* move backward 5 UChars */
     96     for(i=0; i<5; ++i) {
     97         c1=iter1->previous(iter1);
     98         c2=iter2->previous(iter2);
     99         if(c1!=c2) {
    100             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
    101             return;
    102         }
    103     }
    104 
    105     /* iterate forward from the beginning */
    106     pos1=iter1->move(iter1, 0, UITER_START);
    107     if(pos1<0) {
    108         log_err("%s->move(start) failed\n", n1);
    109         return;
    110     }
    111     if(!iter1->hasNext(iter1)) {
    112         log_err("%s->hasNext() at the start returns FALSE\n", n1);
    113         return;
    114     }
    115 
    116     pos2=iter2->move(iter2, 0, UITER_START);
    117     if(pos2<0) {
    118         log_err("%s->move(start) failed\n", n2);
    119         return;
    120     }
    121     if(!iter2->hasNext(iter2)) {
    122         log_err("%s->hasNext() at the start returns FALSE\n", n2);
    123         return;
    124     }
    125 
    126     do {
    127         c1=iter1->next(iter1);
    128         c2=iter2->next(iter2);
    129         if(c1!=c2) {
    130             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
    131             return;
    132         }
    133     } while(c1>=0);
    134 
    135     if(iter1->hasNext(iter1)) {
    136         log_err("%s->hasNext() at the end returns TRUE\n", n1);
    137         return;
    138     }
    139     if(iter2->hasNext(iter2)) {
    140         log_err("%s->hasNext() at the end returns TRUE\n", n2);
    141         return;
    142     }
    143 
    144     /* back to the middle */
    145     pos1=iter1->move(iter1, middle, UITER_ZERO);
    146     if(pos1!=middle) {
    147         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
    148         return;
    149     }
    150 
    151     pos2=iter2->move(iter2, middle, UITER_ZERO);
    152     if(pos2!=middle) {
    153         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
    154         return;
    155     }
    156 
    157     /* move to index 1 */
    158     pos1=iter1->move(iter1, 1, UITER_ZERO);
    159     if(pos1!=1) {
    160         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
    161         return;
    162     }
    163 
    164     pos2=iter2->move(iter2, 1, UITER_ZERO);
    165     if(pos2!=1) {
    166         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
    167         return;
    168     }
    169 
    170     /* iterate backward from the end */
    171     pos1=iter1->move(iter1, 0, UITER_LIMIT);
    172     if(pos1<0) {
    173         log_err("%s->move(limit) failed\n", n1);
    174         return;
    175     }
    176     if(!iter1->hasPrevious(iter1)) {
    177         log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
    178         return;
    179     }
    180 
    181     pos2=iter2->move(iter2, 0, UITER_LIMIT);
    182     if(pos2<0) {
    183         log_err("%s->move(limit) failed\n", n2);
    184         return;
    185     }
    186     if(!iter2->hasPrevious(iter2)) {
    187         log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
    188         return;
    189     }
    190 
    191     do {
    192         c1=iter1->previous(iter1);
    193         c2=iter2->previous(iter2);
    194         if(c1!=c2) {
    195             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
    196             return;
    197         }
    198     } while(c1>=0);
    199 
    200     if(iter1->hasPrevious(iter1)) {
    201         log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
    202         return;
    203     }
    204     if(iter2->hasPrevious(iter2)) {
    205         log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
    206         return;
    207     }
    208 }
    209 
    210 /*
    211  * Test the iterator's getState() and setState() functions.
    212  * iter1 and iter2 must be set up for the same iterator type and the same string
    213  * but may be physically different structs (different addresses).
    214  *
    215  * Assume that the text is not empty and that
    216  * iteration start==0 and iteration limit==length.
    217  * It must be 2<=middle<=length-2.
    218  */
    219 static void
    220 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
    221     UChar32 u[4];
    222 
    223     UErrorCode errorCode;
    224     UChar32 c;
    225     uint32_t state;
    226     int32_t i, j;
    227 
    228     /* get four UChars from the middle of the string */
    229     iter1->move(iter1, middle-2, UITER_ZERO);
    230     for(i=0; i<4; ++i) {
    231         c=iter1->next(iter1);
    232         if(c<0) {
    233             /* the test violates the assumptions, see comment above */
    234             log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
    235             return;
    236         }
    237         u[i]=c;
    238     }
    239 
    240     /* move to the middle and get the state */
    241     iter1->move(iter1, -2, UITER_CURRENT);
    242     state=uiter_getState(iter1);
    243 
    244     /* set the state into the second iterator and compare the results */
    245     errorCode=U_ZERO_ERROR;
    246     uiter_setState(iter2, state, &errorCode);
    247     if(U_FAILURE(errorCode)) {
    248         log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
    249         return;
    250     }
    251 
    252     c=iter2->current(iter2);
    253     if(c!=u[2]) {
    254         log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
    255     }
    256 
    257     c=iter2->previous(iter2);
    258     if(c!=u[1]) {
    259         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
    260     }
    261 
    262     iter2->move(iter2, 2, UITER_CURRENT);
    263     c=iter2->next(iter2);
    264     if(c!=u[3]) {
    265         log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
    266     }
    267 
    268     iter2->move(iter2, -3, UITER_CURRENT);
    269     c=iter2->previous(iter2);
    270     if(c!=u[0]) {
    271         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
    272     }
    273 
    274     /* move the second iterator back to the middle */
    275     iter2->move(iter2, 1, UITER_CURRENT);
    276     iter2->next(iter2);
    277 
    278     /* check that both are in the middle */
    279     i=iter1->getIndex(iter1, UITER_CURRENT);
    280     j=iter2->getIndex(iter2, UITER_CURRENT);
    281     if(i!=middle) {
    282         log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
    283     }
    284     if(i!=j) {
    285         log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
    286     }
    287 
    288     /* compare lengths */
    289     i=iter1->getIndex(iter1, UITER_LENGTH);
    290     j=iter2->getIndex(iter2, UITER_LENGTH);
    291     if(i!=j) {
    292         log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
    293     }
    294 }
    295 
    296 static void
    297 TestLenient8Iterator() {
    298     static const UChar text[]={
    299         0x61, 0x62, 0x63,
    300         /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
    301         0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
    302         0x78, 0x79, 0x7a, 0
    303     };
    304     static const uint8_t bytes[]={
    305         0x61, 0x62, 0x63,
    306         /* dffd            107fd                    d801               dffd - mixture */
    307         0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
    308         0x78, 0x79, 0x7a, 0
    309     };
    310 
    311     UCharIterator iter1, iter2;
    312     UChar32 c1, c2;
    313     int32_t length;
    314 
    315     puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
    316 
    317     /* compare the same string between UTF-16 and lenient-8 UCharIterators */
    318     uiter_setString(&iter1, text, -1);
    319     uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
    320     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
    321 
    322     /* try again with length=-1 */
    323     uiter_setLenient8(&iter2, (const char *)bytes, -1);
    324     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
    325 
    326     /* test get/set state */
    327     length=UPRV_LENGTHOF(text)-1;
    328     uiter_setLenient8(&iter1, (const char*)bytes, -1);
    329     testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
    330     testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
    331 
    332     /* ---------------------------------------------------------------------- */
    333 
    334     puts("no output so far means that the lenient-8 iterator works fine");
    335 
    336     puts("iterate forward:\nUTF-16\tlenient-8");
    337     uiter_setString(&iter1, text, -1);
    338     iter1.move(&iter1, 0, UITER_START);
    339     iter2.move(&iter2, 0, UITER_START);
    340     for(;;) {
    341         c1=iter1.next(&iter1);
    342         c2=iter2.next(&iter2);
    343         if(c1<0 && c2<0) {
    344             break;
    345         }
    346         if(c1<0) {
    347             printf("\t%04x\n", c2);
    348         } else if(c2<0) {
    349             printf("%04x\n", c1);
    350         } else {
    351             printf("%04x\t%04x\n", c1, c2);
    352         }
    353     }
    354 }
    355 
    /*
     * Program entry point: runs the lenient-8 iterator test and always
     * returns 0. The command-line arguments are not used.
     */
    extern int
    main(int argc, const char *argv[]) {
        (void)argc;
        (void)argv;

        TestLenient8Iterator();
        return 0;
    }
    361