Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  propsvec.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002feb22
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store bits (Unicode character properties) in bit set vectors.
     17 */
     18 
     19 #include <stdlib.h>
     20 #include "unicode/utypes.h"
     21 #include "cmemory.h"
     22 #include "utrie.h"
     23 #include "utrie2.h"
     24 #include "uarrsort.h"
     25 #include "propsvec.h"
     26 
     27 struct UPropsVectors {
     28     uint32_t *v;
     29     int32_t columns;  /* number of columns, plus two for start & limit values */
     30     int32_t maxRows;
     31     int32_t rows;
     32     int32_t prevRow;  /* search optimization: remember last row seen */
     33     UBool isCompacted;
     34 };
     35 
     36 #define UPVEC_INITIAL_ROWS (1<<12)
     37 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
     38 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
     39 
     40 U_CAPI UPropsVectors * U_EXPORT2
     41 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
     42     UPropsVectors *pv;
     43     uint32_t *v, *row;
     44     uint32_t cp;
     45 
     46     if(U_FAILURE(*pErrorCode)) {
     47         return NULL;
     48     }
     49     if(columns<1) {
     50         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
     51         return NULL;
     52     }
     53     columns+=2; /* count range start and limit columns */
     54 
     55     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
     56     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
     57     if(pv==NULL || v==NULL) {
     58         uprv_free(pv);
     59         uprv_free(v);
     60         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
     61         return NULL;
     62     }
     63     uprv_memset(pv, 0, sizeof(UPropsVectors));
     64     pv->v=v;
     65     pv->columns=columns;
     66     pv->maxRows=UPVEC_INITIAL_ROWS;
     67     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
     68 
     69     /* set the all-Unicode row and the special-value rows */
     70     row=pv->v;
     71     uprv_memset(row, 0, pv->rows*columns*4);
     72     row[0]=0;
     73     row[1]=0x110000;
     74     row+=columns;
     75     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
     76         row[0]=cp;
     77         row[1]=cp+1;
     78         row+=columns;
     79     }
     80     return pv;
     81 }
     82 
     83 U_CAPI void U_EXPORT2
     84 upvec_close(UPropsVectors *pv) {
     85     if(pv!=NULL) {
     86         uprv_free(pv->v);
     87         uprv_free(pv);
     88     }
     89 }
     90 
     91 static uint32_t *
     92 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
     93     uint32_t *row;
     94     int32_t columns, i, start, limit, prevRow, rows;
     95 
     96     columns=pv->columns;
     97     rows=limit=pv->rows;
     98     prevRow=pv->prevRow;
     99 
    100     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
    101     row=pv->v+prevRow*columns;
    102     if(rangeStart>=(UChar32)row[0]) {
    103         if(rangeStart<(UChar32)row[1]) {
    104             /* same row as last seen */
    105             return row;
    106         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    107             /* next row after the last one */
    108             pv->prevRow=prevRow+1;
    109             return row;
    110         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    111             /* second row after the last one */
    112             pv->prevRow=prevRow+2;
    113             return row;
    114         } else if((rangeStart-(UChar32)row[1])<10) {
    115             /* we are close, continue looping */
    116             prevRow+=2;
    117             do {
    118                 ++prevRow;
    119                 row+=columns;
    120             } while(rangeStart>=(UChar32)row[1]);
    121             pv->prevRow=prevRow;
    122             return row;
    123         }
    124     } else if(rangeStart<(UChar32)pv->v[1]) {
    125         /* the very first row */
    126         pv->prevRow=0;
    127         return pv->v;
    128     }
    129 
    130     /* do a binary search for the start of the range */
    131     start=0;
    132     while(start<limit-1) {
    133         i=(start+limit)/2;
    134         row=pv->v+i*columns;
    135         if(rangeStart<(UChar32)row[0]) {
    136             limit=i;
    137         } else if(rangeStart<(UChar32)row[1]) {
    138             pv->prevRow=i;
    139             return row;
    140         } else {
    141             start=i;
    142         }
    143     }
    144 
    145     /* must be found because all ranges together always cover all of Unicode */
    146     pv->prevRow=start;
    147     return pv->v+start*columns;
    148 }
    149 
    150 U_CAPI void U_EXPORT2
    151 upvec_setValue(UPropsVectors *pv,
    152                UChar32 start, UChar32 end,
    153                int32_t column,
    154                uint32_t value, uint32_t mask,
    155                UErrorCode *pErrorCode) {
    156     uint32_t *firstRow, *lastRow;
    157     int32_t columns;
    158     UChar32 limit;
    159     UBool splitFirstRow, splitLastRow;
    160 
    161     /* argument checking */
    162     if(U_FAILURE(*pErrorCode)) {
    163         return;
    164     }
    165     if( pv==NULL ||
    166         start<0 || start>end || end>UPVEC_MAX_CP ||
    167         column<0 || column>=(pv->columns-2)
    168     ) {
    169         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    170         return;
    171     }
    172     if(pv->isCompacted) {
    173         *pErrorCode=U_NO_WRITE_PERMISSION;
    174         return;
    175     }
    176     limit=end+1;
    177 
    178     /* initialize */
    179     columns=pv->columns;
    180     column+=2; /* skip range start and limit columns */
    181     value&=mask;
    182 
    183     /* find the rows whose ranges overlap with the input range */
    184 
    185     /* find the first and last rows, always successful */
    186     firstRow=_findRow(pv, start);
    187     lastRow=_findRow(pv, end);
    188 
    189     /*
    190      * Rows need to be split if they partially overlap with the
    191      * input range (only possible for the first and last rows)
    192      * and if their value differs from the input value.
    193      */
    194     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
    195     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
    196 
    197     /* split first/last rows if necessary */
    198     if(splitFirstRow || splitLastRow) {
    199         int32_t count, rows;
    200 
    201         rows=pv->rows;
    202         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
    203             uint32_t *newVectors;
    204             int32_t newMaxRows;
    205 
    206             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
    207                 newMaxRows=UPVEC_MEDIUM_ROWS;
    208             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
    209                 newMaxRows=UPVEC_MAX_ROWS;
    210             } else {
    211                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
    212                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    213                 return;
    214             }
    215             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
    216             if(newVectors==NULL) {
    217                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    218                 return;
    219             }
    220             uprv_memcpy(newVectors, pv->v, rows*columns*4);
    221             firstRow=newVectors+(firstRow-pv->v);
    222             lastRow=newVectors+(lastRow-pv->v);
    223             uprv_free(pv->v);
    224             pv->v=newVectors;
    225             pv->maxRows=newMaxRows;
    226         }
    227 
    228         /* count the number of row cells to move after the last row, and move them */
    229         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
    230         if(count>0) {
    231             uprv_memmove(
    232                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
    233                 lastRow+columns,
    234                 count*4);
    235         }
    236         pv->rows=rows+splitFirstRow+splitLastRow;
    237 
    238         /* split the first row, and move the firstRow pointer to the second part */
    239         if(splitFirstRow) {
    240             /* copy all affected rows up one and move the lastRow pointer */
    241             count = (int32_t)((lastRow-firstRow)+columns);
    242             uprv_memmove(firstRow+columns, firstRow, count*4);
    243             lastRow+=columns;
    244 
    245             /* split the range and move the firstRow pointer */
    246             firstRow[1]=firstRow[columns]=(uint32_t)start;
    247             firstRow+=columns;
    248         }
    249 
    250         /* split the last row */
    251         if(splitLastRow) {
    252             /* copy the last row data */
    253             uprv_memcpy(lastRow+columns, lastRow, columns*4);
    254 
    255             /* split the range and move the firstRow pointer */
    256             lastRow[1]=lastRow[columns]=(uint32_t)limit;
    257         }
    258     }
    259 
    260     /* set the "row last seen" to the last row for the range */
    261     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
    262 
    263     /* set the input value in all remaining rows */
    264     firstRow+=column;
    265     lastRow+=column;
    266     mask=~mask;
    267     for(;;) {
    268         *firstRow=(*firstRow&mask)|value;
    269         if(firstRow==lastRow) {
    270             break;
    271         }
    272         firstRow+=columns;
    273     }
    274 }
    275 
    276 U_CAPI uint32_t U_EXPORT2
    277 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
    278     uint32_t *row;
    279 
    280     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
    281         return 0;
    282     }
    283     row=_findRow((UPropsVectors *)pv, c);
    284     return row[2+column];
    285 }
    286 
    287 U_CAPI uint32_t * U_EXPORT2
    288 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
    289              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
    290     uint32_t *row;
    291     int32_t columns;
    292 
    293     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
    294         return NULL;
    295     }
    296 
    297     columns=pv->columns;
    298     row=pv->v+rowIndex*columns;
    299     if(pRangeStart!=NULL) {
    300         *pRangeStart=(UChar32)row[0];
    301     }
    302     if(pRangeEnd!=NULL) {
    303         *pRangeEnd=(UChar32)row[1]-1;
    304     }
    305     return row+2;
    306 }
    307 
    308 static int32_t U_CALLCONV
    309 upvec_compareRows(const void *context, const void *l, const void *r) {
    310     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
    311     const UPropsVectors *pv=(const UPropsVectors *)context;
    312     int32_t i, count, columns;
    313 
    314     count=columns=pv->columns; /* includes start/limit columns */
    315 
    316     /* start comparing after start/limit but wrap around to them */
    317     i=2;
    318     do {
    319         if(left[i]!=right[i]) {
    320             return left[i]<right[i] ? -1 : 1;
    321         }
    322         if(++i==columns) {
    323             i=0;
    324         }
    325     } while(--count>0);
    326 
    327     return 0;
    328 }
    329 
    330 U_CAPI void U_EXPORT2
    331 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
    332     uint32_t *row;
    333     int32_t i, columns, valueColumns, rows, count;
    334     UChar32 start, limit;
    335 
    336     /* argument checking */
    337     if(U_FAILURE(*pErrorCode)) {
    338         return;
    339     }
    340     if(handler==NULL) {
    341         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    342         return;
    343     }
    344     if(pv->isCompacted) {
    345         return;
    346     }
    347 
    348     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
    349     pv->isCompacted=TRUE;
    350 
    351     rows=pv->rows;
    352     columns=pv->columns;
    353     valueColumns=columns-2; /* not counting start & limit */
    354 
    355     /* sort the properties vectors to find unique vector values */
    356     uprv_sortArray(pv->v, rows, columns*4,
    357                    upvec_compareRows, pv, FALSE, pErrorCode);
    358     if(U_FAILURE(*pErrorCode)) {
    359         return;
    360     }
    361 
    362     /*
    363      * Find and set the special values.
    364      * This has to do almost the same work as the compaction below,
    365      * to find the indexes where the special-value rows will move.
    366      */
    367     row=pv->v;
    368     count=-valueColumns;
    369     for(i=0; i<rows; ++i) {
    370         start=(UChar32)row[0];
    371 
    372         /* count a new values vector if it is different from the current one */
    373         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
    374             count+=valueColumns;
    375         }
    376 
    377         if(start>=UPVEC_FIRST_SPECIAL_CP) {
    378             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
    379             if(U_FAILURE(*pErrorCode)) {
    380                 return;
    381             }
    382         }
    383 
    384         row+=columns;
    385     }
    386 
    387     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
    388     count+=valueColumns;
    389 
    390     /* Call the handler once more to signal the start of delivering real values. */
    391     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
    392             count, row-valueColumns, valueColumns, pErrorCode);
    393     if(U_FAILURE(*pErrorCode)) {
    394         return;
    395     }
    396 
    397     /*
    398      * Move vector contents up to a contiguous array with only unique
    399      * vector values, and call the handler function for each vector.
    400      *
    401      * This destroys the Properties Vector structure and replaces it
    402      * with an array of just vector values.
    403      */
    404     row=pv->v;
    405     count=-valueColumns;
    406     for(i=0; i<rows; ++i) {
    407         /* fetch these first before memmove() may overwrite them */
    408         start=(UChar32)row[0];
    409         limit=(UChar32)row[1];
    410 
    411         /* add a new values vector if it is different from the current one */
    412         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
    413             count+=valueColumns;
    414             uprv_memmove(pv->v+count, row+2, valueColumns*4);
    415         }
    416 
    417         if(start<UPVEC_FIRST_SPECIAL_CP) {
    418             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
    419             if(U_FAILURE(*pErrorCode)) {
    420                 return;
    421             }
    422         }
    423 
    424         row+=columns;
    425     }
    426 
    427     /* count is at the beginning of the last vector, add one to include that last vector */
    428     pv->rows=count/valueColumns+1;
    429 }
    430 
    431 U_CAPI const uint32_t * U_EXPORT2
    432 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
    433     if(!pv->isCompacted) {
    434         return NULL;
    435     }
    436     if(pRows!=NULL) {
    437         *pRows=pv->rows;
    438     }
    439     if(pColumns!=NULL) {
    440         *pColumns=pv->columns-2;
    441     }
    442     return pv->v;
    443 }
    444 
    445 U_CAPI uint32_t * U_EXPORT2
    446 upvec_cloneArray(const UPropsVectors *pv,
    447                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
    448     uint32_t *clonedArray;
    449     int32_t byteLength;
    450 
    451     if(U_FAILURE(*pErrorCode)) {
    452         return NULL;
    453     }
    454     if(!pv->isCompacted) {
    455         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    456         return NULL;
    457     }
    458     byteLength=pv->rows*(pv->columns-2)*4;
    459     clonedArray=(uint32_t *)uprv_malloc(byteLength);
    460     if(clonedArray==NULL) {
    461         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    462         return NULL;
    463     }
    464     uprv_memcpy(clonedArray, pv->v, byteLength);
    465     if(pRows!=NULL) {
    466         *pRows=pv->rows;
    467     }
    468     if(pColumns!=NULL) {
    469         *pColumns=pv->columns-2;
    470     }
    471     return clonedArray;
    472 }
    473 
    474 U_CAPI UTrie2 * U_EXPORT2
    475 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    476     UPVecToUTrie2Context toUTrie2={ NULL };
    477     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
    478     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
    479     if(U_FAILURE(*pErrorCode)) {
    480         utrie2_close(toUTrie2.trie);
    481         toUTrie2.trie=NULL;
    482     }
    483     return toUTrie2.trie;
    484 }
    485 
    486 /*
    487  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
    488  * some 16-bit field and builds and returns a UTrie2.
    489  */
    490 
    491 U_CAPI void U_CALLCONV
    492 upvec_compactToUTrieHandler(void *context,
    493                             UChar32 start, UChar32 end,
    494                             int32_t rowIndex, uint32_t *row, int32_t columns,
    495                             UErrorCode *pErrorCode) {
    496     UPVecToUTrieContext *toUTrie=(UPVecToUTrieContext *)context;
    497     if(start<UPVEC_FIRST_SPECIAL_CP) {
    498         if(!utrie_setRange32(toUTrie->newTrie, start, end+1, (uint32_t)rowIndex, TRUE)) {
    499             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    500         }
    501     } else {
    502         switch(start) {
    503         case UPVEC_INITIAL_VALUE_CP:
    504             toUTrie->initialValue=rowIndex;
    505             break;
    506         case UPVEC_START_REAL_VALUES_CP:
    507             if(rowIndex>0xffff) {
    508                 /* too many rows for a 16-bit trie */
    509                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    510             } else {
    511                 toUTrie->newTrie=utrie_open(NULL, NULL, toUTrie->capacity,
    512                                             toUTrie->initialValue, toUTrie->initialValue,
    513                                             toUTrie->latin1Linear);
    514                 if(toUTrie->newTrie==NULL) {
    515                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    516                 }
    517             }
    518             break;
    519         default:
    520             break;
    521         }
    522     }
    523 }
    524 
    525 U_CAPI void U_CALLCONV
    526 upvec_compactToUTrie2Handler(void *context,
    527                              UChar32 start, UChar32 end,
    528                              int32_t rowIndex, uint32_t *row, int32_t columns,
    529                              UErrorCode *pErrorCode) {
    530     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
    531     if(start<UPVEC_FIRST_SPECIAL_CP) {
    532         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
    533     } else {
    534         switch(start) {
    535         case UPVEC_INITIAL_VALUE_CP:
    536             toUTrie2->initialValue=rowIndex;
    537             break;
    538         case UPVEC_ERROR_VALUE_CP:
    539             toUTrie2->errorValue=rowIndex;
    540             break;
    541         case UPVEC_START_REAL_VALUES_CP:
    542             toUTrie2->maxValue=rowIndex;
    543             if(rowIndex>0xffff) {
    544                 /* too many rows for a 16-bit trie */
    545                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    546             } else {
    547                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
    548                                            toUTrie2->errorValue, pErrorCode);
    549             }
    550             break;
    551         default:
    552             break;
    553         }
    554     }
    555 }
    556