Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  unorm_it.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003jan21
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/uiter.h"
     22 #include "unicode/unorm.h"
     23 #include "unicode/utf.h"
     24 #include "unorm_it.h"
     25 #include "cmemory.h"
     26 
     27 /* UNormIterator ------------------------------------------------------------ */
     28 
     29 enum {
     30     INITIAL_CAPACITY=100
     31 };
     32 
     33 struct UNormIterator {
     34     UCharIterator api;
     35     UCharIterator *iter;
     36 
     37     /*
     38      * chars and states either use the static buffers
     39      * or are allocated in the same memory block
     40      *
     41      * They are parallel arrays with states[] holding the getState() values
     42      * from normalization boundaries, and UITER_NO_STATE in between.
     43      */
     44     UChar *chars;
     45     uint32_t *states;
     46 
     47     /*
     48      * api.start: first valid character & state in the arrays
     49      * api.index: current position
     50      * api.limit: one past the last valid character in chars[], but states[limit] is valid
     51      * capacity: length of allocated arrays
     52      */
     53     int32_t capacity;
     54 
     55     /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
     56     uint32_t state;
     57 
     58     /* there are UChars available before start or after limit? */
     59     UBool hasPrevious, hasNext, isStackAllocated;
     60 
     61     UNormalizationMode mode;
     62 
     63     UChar charsBuffer[INITIAL_CAPACITY];
     64     uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
     65 };
     66 
     67 static void
     68 initIndexes(UNormIterator *uni, UCharIterator *iter) {
     69     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
     70     UCharIterator *api=&uni->api;
     71 
     72     if(!iter->hasPrevious(iter)) {
     73         /* set indexes to the beginning of the arrays */
     74         api->start=api->index=api->limit=0;
     75         uni->hasPrevious=FALSE;
     76         uni->hasNext=iter->hasNext(iter);
     77     } else if(!iter->hasNext(iter)) {
     78         /* set indexes to the end of the arrays */
     79         api->start=api->index=api->limit=uni->capacity;
     80         uni->hasNext=FALSE;
     81         uni->hasPrevious=iter->hasPrevious(iter);
     82     } else {
     83         /* set indexes into the middle of the arrays */
     84         api->start=api->index=api->limit=uni->capacity/2;
     85         uni->hasPrevious=uni->hasNext=TRUE;
     86     }
     87 }
     88 
     89 static UBool
     90 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
     91     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
     92     UCharIterator *api=&uni->api;
     93 
     94     uint32_t *states;
     95     UChar *chars;
     96     int32_t start, limit;
     97 
     98     states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
     99     if(states==NULL) {
    100         return FALSE;
    101     }
    102 
    103     chars=(UChar *)(states+(capacity+1));
    104     uni->capacity=capacity;
    105 
    106     start=api->start;
    107     limit=api->limit;
    108 
    109     if(addAtStart) {
    110         /* copy old contents to the end of the new arrays */
    111         int32_t delta;
    112 
    113         delta=capacity-uni->capacity;
    114         uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
    115         uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
    116 
    117         api->start=start+delta;
    118         api->index+=delta;
    119         api->limit=limit+delta;
    120     } else {
    121         /* copy old contents to the beginning of the new arrays */
    122         uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
    123         uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
    124     }
    125 
    126     uni->chars=chars;
    127     uni->states=states;
    128 
    129     return TRUE;
    130 }
    131 
    132 static void
    133 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    134     /* move array contents up to make room */
    135     int32_t srcIndex, destIndex, limit;
    136 
    137     limit=api->limit;
    138     srcIndex=delta;
    139     if(srcIndex>api->start) {
    140         /* look for a position in the arrays with a known state */
    141         while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
    142             ++srcIndex;
    143         }
    144     }
    145 
    146     /* now actually move the array contents */
    147     api->start=destIndex=0;
    148     while(srcIndex<limit) {
    149         chars[destIndex]=chars[srcIndex];
    150         states[destIndex++]=states[srcIndex++];
    151     }
    152 
    153     /* copy states[limit] as well! */
    154     states[destIndex]=states[srcIndex];
    155 
    156     api->limit=destIndex;
    157 }
    158 
    159 static void
    160 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    161     /* move array contents up to make room */
    162     int32_t srcIndex, destIndex, start;
    163 
    164     start=api->start;
    165     destIndex=((UNormIterator *)api)->capacity;
    166     srcIndex=destIndex-delta;
    167     if(srcIndex<api->limit) {
    168         /* look for a position in the arrays with a known state */
    169         while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
    170             --srcIndex;
    171         }
    172     }
    173 
    174     /* now actually move the array contents */
    175     api->limit=destIndex;
    176 
    177     /* copy states[limit] as well! */
    178     states[destIndex]=states[srcIndex];
    179 
    180     while(srcIndex>start) {
    181         chars[--destIndex]=chars[--srcIndex];
    182         states[destIndex]=states[srcIndex];
    183     }
    184 
    185     api->start=destIndex;
    186 }
    187 
    188 /* normalize forward from the limit, assume hasNext is true */
    189 static UBool
    190 readNext(UNormIterator *uni, UCharIterator *iter) {
    191     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    192     UCharIterator *api=&uni->api;
    193 
    194     /* make capacity/4 room at the end of the arrays */
    195     int32_t limit, capacity, room;
    196     UErrorCode errorCode;
    197 
    198     limit=api->limit;
    199     capacity=uni->capacity;
    200     room=capacity/4;
    201     if(room>(capacity-limit)) {
    202         /* move array contents to make room */
    203         moveContentsTowardStart(api, uni->chars, uni->states, room);
    204         api->index=limit=api->limit;
    205         uni->hasPrevious=TRUE;
    206     }
    207 
    208     /* normalize starting from the limit position */
    209     errorCode=U_ZERO_ERROR;
    210     if(uni->state!=uni->states[limit]) {
    211         uiter_setState(iter, uni->states[limit], &errorCode);
    212         if(U_FAILURE(errorCode)) {
    213             uni->state=UITER_NO_STATE;
    214             uni->hasNext=FALSE;
    215             return FALSE;
    216         }
    217     }
    218 
    219     room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    220     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    221         if(room<=capacity) {
    222             /* empty and re-use the arrays */
    223             uni->states[0]=uni->states[limit];
    224             api->start=api->index=api->limit=limit=0;
    225             uni->hasPrevious=TRUE;
    226         } else {
    227             capacity+=room+100;
    228             if(!reallocArrays(uni, capacity, FALSE)) {
    229                 uni->state=UITER_NO_STATE;
    230                 uni->hasNext=FALSE;
    231                 return FALSE;
    232             }
    233             limit=api->limit;
    234         }
    235 
    236         errorCode=U_ZERO_ERROR;
    237         uiter_setState(iter, uni->states[limit], &errorCode);
    238         room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    239     }
    240     if(U_FAILURE(errorCode) || room==0) {
    241         uni->state=UITER_NO_STATE;
    242         uni->hasNext=FALSE;
    243         return FALSE;
    244     }
    245 
    246     /* room>0 */
    247     ++limit; /* leave the known states[limit] alone */
    248     for(--room; room>0; --room) {
    249         /* set unknown states for all but the normalization boundaries */
    250         uni->states[limit++]=UITER_NO_STATE;
    251     }
    252     uni->states[limit]=uni->state=uiter_getState(iter);
    253     uni->hasNext=iter->hasNext(iter);
    254     api->limit=limit;
    255     return TRUE;
    256 }
    257 
    258 /* normalize backward from the start, assume hasPrevious is true */
    259 static UBool
    260 readPrevious(UNormIterator *uni, UCharIterator *iter) {
    261     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    262     UCharIterator *api=&uni->api;
    263 
    264     /* make capacity/4 room at the start of the arrays */
    265     int32_t start, capacity, room;
    266     UErrorCode errorCode;
    267 
    268     start=api->start;
    269     capacity=uni->capacity;
    270     room=capacity/4;
    271     if(room>start) {
    272         /* move array contents to make room */
    273         moveContentsTowardEnd(api, uni->chars, uni->states, room);
    274         api->index=start=api->start;
    275         uni->hasNext=TRUE;
    276     }
    277 
    278     /* normalize ending at the start position */
    279     errorCode=U_ZERO_ERROR;
    280     if(uni->state!=uni->states[start]) {
    281         uiter_setState(iter, uni->states[start], &errorCode);
    282         if(U_FAILURE(errorCode)) {
    283             uni->state=UITER_NO_STATE;
    284             uni->hasPrevious=FALSE;
    285             return FALSE;
    286         }
    287     }
    288 
    289     room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    291         if(room<=capacity) {
    292             /* empty and re-use the arrays */
    293             uni->states[capacity]=uni->states[start];
    294             api->start=api->index=api->limit=start=capacity;
    295             uni->hasNext=TRUE;
    296         } else {
    297             capacity+=room+100;
    298             if(!reallocArrays(uni, capacity, TRUE)) {
    299                 uni->state=UITER_NO_STATE;
    300                 uni->hasPrevious=FALSE;
    301                 return FALSE;
    302             }
    303             start=api->start;
    304         }
    305 
    306         errorCode=U_ZERO_ERROR;
    307         uiter_setState(iter, uni->states[start], &errorCode);
    308         room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    309     }
    310     if(U_FAILURE(errorCode) || room==0) {
    311         uni->state=UITER_NO_STATE;
    312         uni->hasPrevious=FALSE;
    313         return FALSE;
    314     }
    315 
    316     /* room>0 */
    317     do {
    318         /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
    319         uni->chars[--start]=uni->chars[--room];
    320         /* set unknown states for all but the normalization boundaries */
    321         uni->states[start]=UITER_NO_STATE;
    322     } while(room>0);
    323     uni->states[start]=uni->state=uiter_getState(iter);
    324     uni->hasPrevious=iter->hasPrevious(iter);
    325     api->start=start;
    326     return TRUE;
    327 }
    328 
    329 /* Iterator runtime API functions ------------------------------------------- */
    330 
    331 static int32_t U_CALLCONV
    332 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
    333     switch(origin) {
    334     case UITER_ZERO:
    335     case UITER_START:
    336         return 0;
    337     case UITER_CURRENT:
    338     case UITER_LIMIT:
    339     case UITER_LENGTH:
    340         return UITER_UNKNOWN_INDEX;
    341     default:
    342         /* not a valid origin */
    343         /* Should never get here! */
    344         return -1;
    345     }
    346 }
    347 
    348 static int32_t U_CALLCONV
    349 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
    350     UNormIterator *uni=(UNormIterator *)api;
    351     UCharIterator *iter=uni->iter;
    352     int32_t pos;
    353 
    354     switch(origin) {
    355     case UITER_ZERO:
    356     case UITER_START:
    357         /* restart from the beginning */
    358         if(uni->hasPrevious) {
    359             iter->move(iter, 0, UITER_START);
    360             api->start=api->index=api->limit=0;
    361             uni->states[api->limit]=uni->state=uiter_getState(iter);
    362             uni->hasPrevious=FALSE;
    363             uni->hasNext=iter->hasNext(iter);
    364         } else {
    365             /* we already have the beginning of the normalized text */
    366             api->index=api->start;
    367         }
    368         break;
    369     case UITER_CURRENT:
    370         break;
    371     case UITER_LIMIT:
    372     case UITER_LENGTH:
    373         /* restart from the end */
    374         if(uni->hasNext) {
    375             iter->move(iter, 0, UITER_LIMIT);
    376             api->start=api->index=api->limit=uni->capacity;
    377             uni->states[api->limit]=uni->state=uiter_getState(iter);
    378             uni->hasPrevious=iter->hasPrevious(iter);
    379             uni->hasNext=FALSE;
    380         } else {
    381             /* we already have the end of the normalized text */
    382             api->index=api->limit;
    383         }
    384         break;
    385     default:
    386         return -1;  /* Error */
    387     }
    388 
    389     /* move relative to the current position by delta normalized UChars */
    390     if(delta==0) {
    391         /* nothing to do */
    392     } else if(delta>0) {
    393         /* go forward until the requested position is in the buffer */
    394         for(;;) {
    395             pos=api->index+delta;   /* requested position */
    396             delta=pos-api->limit;   /* remainder beyond buffered text */
    397             if(delta<=0) {
    398                 api->index=pos;     /* position reached */
    399                 break;
    400             }
    401 
    402             /* go to end of buffer and normalize further */
    403             api->index=api->limit;
    404             if(!uni->hasNext || !readNext(uni, iter)) {
    405                 break;              /* reached end of text */
    406             }
    407         }
    408     } else /* delta<0 */ {
    409         /* go backward until the requested position is in the buffer */
    410         for(;;) {
    411             pos=api->index+delta;   /* requested position */
    412             delta=pos-api->start;   /* remainder beyond buffered text */
    413             if(delta>=0) {
    414                 api->index=pos;     /* position reached */
    415                 break;
    416             }
    417 
    418             /* go to start of buffer and normalize further */
    419             api->index=api->start;
    420             if(!uni->hasPrevious || !readPrevious(uni, iter)) {
    421                 break;              /* reached start of text */
    422             }
    423         }
    424     }
    425 
    426     if(api->index==api->start && !uni->hasPrevious) {
    427         return 0;
    428     } else {
    429         return UITER_UNKNOWN_INDEX;
    430     }
    431 }
    432 
    433 static UBool U_CALLCONV
    434 unormIteratorHasNext(UCharIterator *api) {
    435     return api->index<api->limit || ((UNormIterator *)api)->hasNext;
    436 }
    437 
    438 static UBool U_CALLCONV
    439 unormIteratorHasPrevious(UCharIterator *api) {
    440     return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
    441 }
    442 
    443 static UChar32 U_CALLCONV
    444 unormIteratorCurrent(UCharIterator *api) {
    445     UNormIterator *uni=(UNormIterator *)api;
    446 
    447     if( api->index<api->limit ||
    448         (uni->hasNext && readNext(uni, uni->iter))
    449     ) {
    450         return uni->chars[api->index];
    451     } else {
    452         return U_SENTINEL;
    453     }
    454 }
    455 
    456 static UChar32 U_CALLCONV
    457 unormIteratorNext(UCharIterator *api) {
    458     UNormIterator *uni=(UNormIterator *)api;
    459 
    460     if( api->index<api->limit ||
    461         (uni->hasNext && readNext(uni, uni->iter))
    462     ) {
    463         return uni->chars[api->index++];
    464     } else {
    465         return U_SENTINEL;
    466     }
    467 }
    468 
    469 static UChar32 U_CALLCONV
    470 unormIteratorPrevious(UCharIterator *api) {
    471     UNormIterator *uni=(UNormIterator *)api;
    472 
    473     if( api->index>api->start ||
    474         (uni->hasPrevious && readPrevious(uni, uni->iter))
    475     ) {
    476         return uni->chars[--api->index];
    477     } else {
    478         return U_SENTINEL;
    479     }
    480 }
    481 
    482 static uint32_t U_CALLCONV
    483 unormIteratorGetState(const UCharIterator *api) {
    484     /* not uni->state because that may not be at api->index */
    485     return ((UNormIterator *)api)->states[api->index];
    486 }
    487 
    488 static void U_CALLCONV
    489 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
    490     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    491         /* do nothing */
    492     } else if(api==NULL) {
    493         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    494     } else if(state==UITER_NO_STATE) {
    495         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    496     } else {
    497         UNormIterator *uni=(UNormIterator *)api;
    498         UCharIterator *iter=((UNormIterator *)api)->iter;
    499         if(state!=uni->state) {
    500             uni->state=state;
    501             uiter_setState(iter, state, pErrorCode);
    502         }
    503 
    504         /*
    505          * Try shortcuts: If the requested state is in the array contents
    506          * then just set the index there.
    507          *
    508          * We assume that the state is unique per position!
    509          */
    510         if(state==uni->states[api->index]) {
    511             return;
    512         } else if(state==uni->states[api->limit]) {
    513             api->index=api->limit;
    514             return;
    515         } else {
    516             /* search for the index with this state */
    517             int32_t i;
    518 
    519             for(i=api->start; i<api->limit; ++i) {
    520                 if(state==uni->states[i]) {
    521                     api->index=i;
    522                     return;
    523                 }
    524             }
    525         }
    526 
    527         /* there is no array index for this state, reset for fresh contents */
    528         initIndexes((UNormIterator *)api, iter);
    529         uni->states[api->limit]=state;
    530     }
    531 }
    532 
    533 static const UCharIterator unormIterator={
    534     NULL, 0, 0, 0, 0, 0,
    535     unormIteratorGetIndex,
    536     unormIteratorMove,
    537     unormIteratorHasNext,
    538     unormIteratorHasPrevious,
    539     unormIteratorCurrent,
    540     unormIteratorNext,
    541     unormIteratorPrevious,
    542     NULL,
    543     unormIteratorGetState,
    544     unormIteratorSetState
    545 };
    546 
    547 /* Setup functions ---------------------------------------------------------- */
    548 
    549 U_CAPI UNormIterator * U_EXPORT2
    550 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
    551     UNormIterator *uni;
    552 
    553     /* argument checking */
    554     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    555         return NULL;
    556     }
    557 
    558     /* allocate */
    559     uni=NULL;
    560     if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
    561         if(U_ALIGNMENT_OFFSET(stackMem)==0) {
    562             /* already aligned */
    563             uni=(UNormIterator *)stackMem;
    564         } else {
    565             int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
    566             if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
    567                 /* needs alignment */
    568                 uni=(UNormIterator *)((char *)stackMem+align);
    569             }
    570         }
    571         /* else does not fit */
    572     }
    573 
    574     if(uni!=NULL) {
    575         uni->isStackAllocated=TRUE;
    576     } else {
    577         uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
    578         if(uni==NULL) {
    579             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    580             return NULL;
    581         }
    582         uni->isStackAllocated=FALSE;
    583     }
    584 
    585     /*
    586      * initialize
    587      * do not memset because that would unnecessarily initialize the arrays
    588      */
    589     uni->iter=NULL;
    590     uni->chars=uni->charsBuffer;
    591     uni->states=uni->statesBuffer;
    592     uni->capacity=INITIAL_CAPACITY;
    593     uni->state=UITER_NO_STATE;
    594     uni->hasPrevious=uni->hasNext=FALSE;
    595     uni->mode=UNORM_NONE;
    596 
    597     /* set a no-op iterator into the api */
    598     uiter_setString(&uni->api, NULL, 0);
    599     return uni;
    600 }
    601 
    602 U_CAPI void U_EXPORT2
    603 unorm_closeIter(UNormIterator *uni) {
    604     if(uni!=NULL) {
    605         if(uni->states!=uni->statesBuffer) {
    606             /* chars and states are allocated in the same memory block */
    607             uprv_free(uni->states);
    608         }
    609         if(!uni->isStackAllocated) {
    610             uprv_free(uni);
    611         }
    612     }
    613 }
    614 
    615 U_CAPI UCharIterator * U_EXPORT2
    616 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
    617     /* argument checking */
    618     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    619         return NULL;
    620     }
    621     if(uni==NULL) {
    622         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    623         return NULL;
    624     }
    625     if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
    626         mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
    627     ) {
    628         /* set a no-op iterator into the api */
    629         uiter_setString(&uni->api, NULL, 0);
    630         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    631         return NULL;
    632     }
    633 
    634     /* set the iterator and initialize */
    635     uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
    636 
    637     uni->iter=iter;
    638     uni->mode=mode;
    639 
    640     initIndexes(uni, iter);
    641     uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
    642 
    643     return &uni->api;
    644 }
    645 
    646 #endif /* uconfig.h switches */
    647