Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  unorm_it.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003jan21
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/uiter.h"
     22 #include "unicode/unorm.h"
     23 #include "unorm_it.h"
     24 #include "cmemory.h"
     25 
     26 /* UNormIterator ------------------------------------------------------------ */
     27 
     28 enum {
     29     INITIAL_CAPACITY=100
     30 };
     31 
     32 struct UNormIterator {
     33     UCharIterator api;
     34     UCharIterator *iter;
     35 
     36     /*
     37      * chars and states either use the static buffers
     38      * or are allocated in the same memory block
     39      *
     40      * They are parallel arrays with states[] holding the getState() values
     41      * from normalization boundaries, and UITER_NO_STATE in between.
     42      */
     43     UChar *chars;
     44     uint32_t *states;
     45 
     46     /*
     47      * api.start: first valid character & state in the arrays
     48      * api.index: current position
     49      * api.limit: one past the last valid character in chars[], but states[limit] is valid
     50      * capacity: length of allocated arrays
     51      */
     52     int32_t capacity;
     53 
     54     /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
     55     uint32_t state;
     56 
     57     /* there are UChars available before start or after limit? */
     58     UBool hasPrevious, hasNext, isStackAllocated;
     59 
     60     UNormalizationMode mode;
     61 
     62     UChar charsBuffer[INITIAL_CAPACITY];
     63     uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
     64 };
     65 
     66 static void
     67 initIndexes(UNormIterator *uni, UCharIterator *iter) {
     68     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
     69     UCharIterator *api=&uni->api;
     70 
     71     if(!iter->hasPrevious(iter)) {
     72         /* set indexes to the beginning of the arrays */
     73         api->start=api->index=api->limit=0;
     74         uni->hasPrevious=FALSE;
     75         uni->hasNext=iter->hasNext(iter);
     76     } else if(!iter->hasNext(iter)) {
     77         /* set indexes to the end of the arrays */
     78         api->start=api->index=api->limit=uni->capacity;
     79         uni->hasNext=FALSE;
     80         uni->hasPrevious=iter->hasPrevious(iter);
     81     } else {
     82         /* set indexes into the middle of the arrays */
     83         api->start=api->index=api->limit=uni->capacity/2;
     84         uni->hasPrevious=uni->hasNext=TRUE;
     85     }
     86 }
     87 
     88 static UBool
     89 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
     90     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
     91     UCharIterator *api=&uni->api;
     92 
     93     uint32_t *states;
     94     UChar *chars;
     95     int32_t start, limit;
     96 
     97     states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
     98     if(states==NULL) {
     99         return FALSE;
    100     }
    101 
    102     chars=(UChar *)(states+(capacity+1));
    103     uni->capacity=capacity;
    104 
    105     start=api->start;
    106     limit=api->limit;
    107 
    108     if(addAtStart) {
    109         /* copy old contents to the end of the new arrays */
    110         int32_t delta;
    111 
    112         delta=capacity-uni->capacity;
    113         uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
    114         uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
    115 
    116         api->start=start+delta;
    117         api->index+=delta;
    118         api->limit=limit+delta;
    119     } else {
    120         /* copy old contents to the beginning of the new arrays */
    121         uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
    122         uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
    123     }
    124 
    125     uni->chars=chars;
    126     uni->states=states;
    127 
    128     return TRUE;
    129 }
    130 
    131 static void
    132 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    133     /* move array contents up to make room */
    134     int32_t srcIndex, destIndex, limit;
    135 
    136     limit=api->limit;
    137     srcIndex=delta;
    138     if(srcIndex>api->start) {
    139         /* look for a position in the arrays with a known state */
    140         while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
    141             ++srcIndex;
    142         }
    143     }
    144 
    145     /* now actually move the array contents */
    146     api->start=destIndex=0;
    147     while(srcIndex<limit) {
    148         chars[destIndex]=chars[srcIndex];
    149         states[destIndex++]=states[srcIndex++];
    150     }
    151 
    152     /* copy states[limit] as well! */
    153     states[destIndex]=states[srcIndex];
    154 
    155     api->limit=destIndex;
    156 }
    157 
    158 static void
    159 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    160     /* move array contents up to make room */
    161     int32_t srcIndex, destIndex, start;
    162 
    163     start=api->start;
    164     destIndex=((UNormIterator *)api)->capacity;
    165     srcIndex=destIndex-delta;
    166     if(srcIndex<api->limit) {
    167         /* look for a position in the arrays with a known state */
    168         while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
    169             --srcIndex;
    170         }
    171     }
    172 
    173     /* now actually move the array contents */
    174     api->limit=destIndex;
    175 
    176     /* copy states[limit] as well! */
    177     states[destIndex]=states[srcIndex];
    178 
    179     while(srcIndex>start) {
    180         chars[--destIndex]=chars[--srcIndex];
    181         states[destIndex]=states[srcIndex];
    182     }
    183 
    184     api->start=destIndex;
    185 }
    186 
    187 /* normalize forward from the limit, assume hasNext is true */
    188 static UBool
    189 readNext(UNormIterator *uni, UCharIterator *iter) {
    190     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    191     UCharIterator *api=&uni->api;
    192 
    193     /* make capacity/4 room at the end of the arrays */
    194     int32_t limit, capacity, room;
    195     UErrorCode errorCode;
    196 
    197     limit=api->limit;
    198     capacity=uni->capacity;
    199     room=capacity/4;
    200     if(room>(capacity-limit)) {
    201         /* move array contents to make room */
    202         moveContentsTowardStart(api, uni->chars, uni->states, room);
    203         api->index=limit=api->limit;
    204         uni->hasPrevious=TRUE;
    205     }
    206 
    207     /* normalize starting from the limit position */
    208     errorCode=U_ZERO_ERROR;
    209     if(uni->state!=uni->states[limit]) {
    210         uiter_setState(iter, uni->states[limit], &errorCode);
    211         if(U_FAILURE(errorCode)) {
    212             uni->state=UITER_NO_STATE;
    213             uni->hasNext=FALSE;
    214             return FALSE;
    215         }
    216     }
    217 
    218     room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    219     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    220         if(room<=capacity) {
    221             /* empty and re-use the arrays */
    222             uni->states[0]=uni->states[limit];
    223             api->start=api->index=api->limit=limit=0;
    224             uni->hasPrevious=TRUE;
    225         } else {
    226             capacity+=room+100;
    227             if(!reallocArrays(uni, capacity, FALSE)) {
    228                 uni->state=UITER_NO_STATE;
    229                 uni->hasNext=FALSE;
    230                 return FALSE;
    231             }
    232             limit=api->limit;
    233         }
    234 
    235         errorCode=U_ZERO_ERROR;
    236         uiter_setState(iter, uni->states[limit], &errorCode);
    237         room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    238     }
    239     if(U_FAILURE(errorCode) || room==0) {
    240         uni->state=UITER_NO_STATE;
    241         uni->hasNext=FALSE;
    242         return FALSE;
    243     }
    244 
    245     /* room>0 */
    246     ++limit; /* leave the known states[limit] alone */
    247     for(--room; room>0; --room) {
    248         /* set unknown states for all but the normalization boundaries */
    249         uni->states[limit++]=UITER_NO_STATE;
    250     }
    251     uni->states[limit]=uni->state=uiter_getState(iter);
    252     uni->hasNext=iter->hasNext(iter);
    253     api->limit=limit;
    254     return TRUE;
    255 }
    256 
    257 /* normalize backward from the start, assume hasPrevious is true */
    258 static UBool
    259 readPrevious(UNormIterator *uni, UCharIterator *iter) {
    260     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    261     UCharIterator *api=&uni->api;
    262 
    263     /* make capacity/4 room at the start of the arrays */
    264     int32_t start, capacity, room;
    265     UErrorCode errorCode;
    266 
    267     start=api->start;
    268     capacity=uni->capacity;
    269     room=capacity/4;
    270     if(room>start) {
    271         /* move array contents to make room */
    272         moveContentsTowardEnd(api, uni->chars, uni->states, room);
    273         api->index=start=api->start;
    274         uni->hasNext=TRUE;
    275     }
    276 
    277     /* normalize ending at the start position */
    278     errorCode=U_ZERO_ERROR;
    279     if(uni->state!=uni->states[start]) {
    280         uiter_setState(iter, uni->states[start], &errorCode);
    281         if(U_FAILURE(errorCode)) {
    282             uni->state=UITER_NO_STATE;
    283             uni->hasPrevious=FALSE;
    284             return FALSE;
    285         }
    286     }
    287 
    288     room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    289     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    290         if(room<=capacity) {
    291             /* empty and re-use the arrays */
    292             uni->states[capacity]=uni->states[start];
    293             api->start=api->index=api->limit=start=capacity;
    294             uni->hasNext=TRUE;
    295         } else {
    296             capacity+=room+100;
    297             if(!reallocArrays(uni, capacity, TRUE)) {
    298                 uni->state=UITER_NO_STATE;
    299                 uni->hasPrevious=FALSE;
    300                 return FALSE;
    301             }
    302             start=api->start;
    303         }
    304 
    305         errorCode=U_ZERO_ERROR;
    306         uiter_setState(iter, uni->states[start], &errorCode);
    307         room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    308     }
    309     if(U_FAILURE(errorCode) || room==0) {
    310         uni->state=UITER_NO_STATE;
    311         uni->hasPrevious=FALSE;
    312         return FALSE;
    313     }
    314 
    315     /* room>0 */
    316     do {
    317         /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
    318         uni->chars[--start]=uni->chars[--room];
    319         /* set unknown states for all but the normalization boundaries */
    320         uni->states[start]=UITER_NO_STATE;
    321     } while(room>0);
    322     uni->states[start]=uni->state=uiter_getState(iter);
    323     uni->hasPrevious=iter->hasPrevious(iter);
    324     api->start=start;
    325     return TRUE;
    326 }
    327 
    328 /* Iterator runtime API functions ------------------------------------------- */
    329 
    330 static int32_t U_CALLCONV
    331 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
    332     switch(origin) {
    333     case UITER_ZERO:
    334     case UITER_START:
    335         return 0;
    336     case UITER_CURRENT:
    337     case UITER_LIMIT:
    338     case UITER_LENGTH:
    339         return UITER_UNKNOWN_INDEX;
    340     default:
    341         /* not a valid origin */
    342         /* Should never get here! */
    343         return -1;
    344     }
    345 }
    346 
    347 static int32_t U_CALLCONV
    348 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
    349     UNormIterator *uni=(UNormIterator *)api;
    350     UCharIterator *iter=uni->iter;
    351     int32_t pos;
    352 
    353     switch(origin) {
    354     case UITER_ZERO:
    355     case UITER_START:
    356         /* restart from the beginning */
    357         if(uni->hasPrevious) {
    358             iter->move(iter, 0, UITER_START);
    359             api->start=api->index=api->limit=0;
    360             uni->states[api->limit]=uni->state=uiter_getState(iter);
    361             uni->hasPrevious=FALSE;
    362             uni->hasNext=iter->hasNext(iter);
    363         } else {
    364             /* we already have the beginning of the normalized text */
    365             api->index=api->start;
    366         }
    367         break;
    368     case UITER_CURRENT:
    369         break;
    370     case UITER_LIMIT:
    371     case UITER_LENGTH:
    372         /* restart from the end */
    373         if(uni->hasNext) {
    374             iter->move(iter, 0, UITER_LIMIT);
    375             api->start=api->index=api->limit=uni->capacity;
    376             uni->states[api->limit]=uni->state=uiter_getState(iter);
    377             uni->hasPrevious=iter->hasPrevious(iter);
    378             uni->hasNext=FALSE;
    379         } else {
    380             /* we already have the end of the normalized text */
    381             api->index=api->limit;
    382         }
    383         break;
    384     default:
    385         return -1;  /* Error */
    386     }
    387 
    388     /* move relative to the current position by delta normalized UChars */
    389     if(delta==0) {
    390         /* nothing to do */
    391     } else if(delta>0) {
    392         /* go forward until the requested position is in the buffer */
    393         for(;;) {
    394             pos=api->index+delta;   /* requested position */
    395             delta=pos-api->limit;   /* remainder beyond buffered text */
    396             if(delta<=0) {
    397                 api->index=pos;     /* position reached */
    398                 break;
    399             }
    400 
    401             /* go to end of buffer and normalize further */
    402             api->index=api->limit;
    403             if(!uni->hasNext || !readNext(uni, iter)) {
    404                 break;              /* reached end of text */
    405             }
    406         }
    407     } else /* delta<0 */ {
    408         /* go backward until the requested position is in the buffer */
    409         for(;;) {
    410             pos=api->index+delta;   /* requested position */
    411             delta=pos-api->start;   /* remainder beyond buffered text */
    412             if(delta>=0) {
    413                 api->index=pos;     /* position reached */
    414                 break;
    415             }
    416 
    417             /* go to start of buffer and normalize further */
    418             api->index=api->start;
    419             if(!uni->hasPrevious || !readPrevious(uni, iter)) {
    420                 break;              /* reached start of text */
    421             }
    422         }
    423     }
    424 
    425     if(api->index==api->start && !uni->hasPrevious) {
    426         return 0;
    427     } else {
    428         return UITER_UNKNOWN_INDEX;
    429     }
    430 }
    431 
    432 static UBool U_CALLCONV
    433 unormIteratorHasNext(UCharIterator *api) {
    434     return api->index<api->limit || ((UNormIterator *)api)->hasNext;
    435 }
    436 
    437 static UBool U_CALLCONV
    438 unormIteratorHasPrevious(UCharIterator *api) {
    439     return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
    440 }
    441 
    442 static UChar32 U_CALLCONV
    443 unormIteratorCurrent(UCharIterator *api) {
    444     UNormIterator *uni=(UNormIterator *)api;
    445 
    446     if( api->index<api->limit ||
    447         (uni->hasNext && readNext(uni, uni->iter))
    448     ) {
    449         return uni->chars[api->index];
    450     } else {
    451         return U_SENTINEL;
    452     }
    453 }
    454 
    455 static UChar32 U_CALLCONV
    456 unormIteratorNext(UCharIterator *api) {
    457     UNormIterator *uni=(UNormIterator *)api;
    458 
    459     if( api->index<api->limit ||
    460         (uni->hasNext && readNext(uni, uni->iter))
    461     ) {
    462         return uni->chars[api->index++];
    463     } else {
    464         return U_SENTINEL;
    465     }
    466 }
    467 
    468 static UChar32 U_CALLCONV
    469 unormIteratorPrevious(UCharIterator *api) {
    470     UNormIterator *uni=(UNormIterator *)api;
    471 
    472     if( api->index>api->start ||
    473         (uni->hasPrevious && readPrevious(uni, uni->iter))
    474     ) {
    475         return uni->chars[--api->index];
    476     } else {
    477         return U_SENTINEL;
    478     }
    479 }
    480 
    481 static uint32_t U_CALLCONV
    482 unormIteratorGetState(const UCharIterator *api) {
    483     /* not uni->state because that may not be at api->index */
    484     return ((UNormIterator *)api)->states[api->index];
    485 }
    486 
    487 static void U_CALLCONV
    488 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
    489     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    490         /* do nothing */
    491     } else if(api==NULL) {
    492         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    493     } else if(state==UITER_NO_STATE) {
    494         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    495     } else {
    496         UNormIterator *uni=(UNormIterator *)api;
    497         UCharIterator *iter=((UNormIterator *)api)->iter;
    498         if(state!=uni->state) {
    499             uni->state=state;
    500             uiter_setState(iter, state, pErrorCode);
    501         }
    502 
    503         /*
    504          * Try shortcuts: If the requested state is in the array contents
    505          * then just set the index there.
    506          *
    507          * We assume that the state is unique per position!
    508          */
    509         if(state==uni->states[api->index]) {
    510             return;
    511         } else if(state==uni->states[api->limit]) {
    512             api->index=api->limit;
    513             return;
    514         } else {
    515             /* search for the index with this state */
    516             int32_t i;
    517 
    518             for(i=api->start; i<api->limit; ++i) {
    519                 if(state==uni->states[i]) {
    520                     api->index=i;
    521                     return;
    522                 }
    523             }
    524         }
    525 
    526         /* there is no array index for this state, reset for fresh contents */
    527         initIndexes((UNormIterator *)api, iter);
    528         uni->states[api->limit]=state;
    529     }
    530 }
    531 
    532 static const UCharIterator unormIterator={
    533     NULL, 0, 0, 0, 0, 0,
    534     unormIteratorGetIndex,
    535     unormIteratorMove,
    536     unormIteratorHasNext,
    537     unormIteratorHasPrevious,
    538     unormIteratorCurrent,
    539     unormIteratorNext,
    540     unormIteratorPrevious,
    541     NULL,
    542     unormIteratorGetState,
    543     unormIteratorSetState
    544 };
    545 
    546 /* Setup functions ---------------------------------------------------------- */
    547 
    548 U_CAPI UNormIterator * U_EXPORT2
    549 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
    550     UNormIterator *uni;
    551 
    552     /* argument checking */
    553     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    554         return NULL;
    555     }
    556 
    557     /* allocate */
    558     uni=NULL;
    559     if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
    560         if(U_ALIGNMENT_OFFSET(stackMem)==0) {
    561             /* already aligned */
    562             uni=(UNormIterator *)stackMem;
    563         } else {
    564             int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
    565             if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
    566                 /* needs alignment */
    567                 uni=(UNormIterator *)((char *)stackMem+align);
    568             }
    569         }
    570         /* else does not fit */
    571     }
    572 
    573     if(uni!=NULL) {
    574         uni->isStackAllocated=TRUE;
    575     } else {
    576         uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
    577         if(uni==NULL) {
    578             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    579             return NULL;
    580         }
    581         uni->isStackAllocated=FALSE;
    582     }
    583 
    584     /*
    585      * initialize
    586      * do not memset because that would unnecessarily initialize the arrays
    587      */
    588     uni->iter=NULL;
    589     uni->chars=uni->charsBuffer;
    590     uni->states=uni->statesBuffer;
    591     uni->capacity=INITIAL_CAPACITY;
    592     uni->state=UITER_NO_STATE;
    593     uni->hasPrevious=uni->hasNext=FALSE;
    594     uni->mode=UNORM_NONE;
    595 
    596     /* set a no-op iterator into the api */
    597     uiter_setString(&uni->api, NULL, 0);
    598     return uni;
    599 }
    600 
    601 U_CAPI void U_EXPORT2
    602 unorm_closeIter(UNormIterator *uni) {
    603     if(uni!=NULL) {
    604         if(uni->states!=uni->statesBuffer) {
    605             /* chars and states are allocated in the same memory block */
    606             uprv_free(uni->states);
    607         }
    608         if(!uni->isStackAllocated) {
    609             uprv_free(uni);
    610         }
    611     }
    612 }
    613 
    614 U_CAPI UCharIterator * U_EXPORT2
    615 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
    616     /* argument checking */
    617     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    618         return NULL;
    619     }
    620     if(uni==NULL) {
    621         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    622         return NULL;
    623     }
    624     if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
    625         mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
    626     ) {
    627         /* set a no-op iterator into the api */
    628         uiter_setString(&uni->api, NULL, 0);
    629         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    630         return NULL;
    631     }
    632 
    633     /* set the iterator and initialize */
    634     uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
    635 
    636     uni->iter=iter;
    637     uni->mode=mode;
    638 
    639     initIndexes(uni, iter);
    640     uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
    641 
    642     return &uni->api;
    643 }
    644 
    645 #endif /* uconfig.h switches */
    646