Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  propsvec.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002feb22
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store bits (Unicode character properties) in bit set vectors.
     17 */
     18 
     19 #include <stdlib.h>
     20 #include "unicode/utypes.h"
     21 #include "cmemory.h"
     22 #include "utrie.h"
     23 #include "utrie2.h"
     24 #include "uarrsort.h"
     25 #include "propsvec.h"
     26 #include "uassert.h"
     27 
     28 struct UPropsVectors {
     29     uint32_t *v;
     30     int32_t columns;  /* number of columns, plus two for start & limit values */
     31     int32_t maxRows;
     32     int32_t rows;
     33     int32_t prevRow;  /* search optimization: remember last row seen */
     34     UBool isCompacted;
     35 };
     36 
     37 #define UPVEC_INITIAL_ROWS (1<<12)
     38 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
     39 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
     40 
     41 U_CAPI UPropsVectors * U_EXPORT2
     42 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
     43     UPropsVectors *pv;
     44     uint32_t *v, *row;
     45     uint32_t cp;
     46 
     47     if(U_FAILURE(*pErrorCode)) {
     48         return NULL;
     49     }
     50     if(columns<1) {
     51         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
     52         return NULL;
     53     }
     54     columns+=2; /* count range start and limit columns */
     55 
     56     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
     57     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
     58     if(pv==NULL || v==NULL) {
     59         uprv_free(pv);
     60         uprv_free(v);
     61         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
     62         return NULL;
     63     }
     64     uprv_memset(pv, 0, sizeof(UPropsVectors));
     65     pv->v=v;
     66     pv->columns=columns;
     67     pv->maxRows=UPVEC_INITIAL_ROWS;
     68     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
     69 
     70     /* set the all-Unicode row and the special-value rows */
     71     row=pv->v;
     72     uprv_memset(row, 0, pv->rows*columns*4);
     73     row[0]=0;
     74     row[1]=0x110000;
     75     row+=columns;
     76     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
     77         row[0]=cp;
     78         row[1]=cp+1;
     79         row+=columns;
     80     }
     81     return pv;
     82 }
     83 
     84 U_CAPI void U_EXPORT2
     85 upvec_close(UPropsVectors *pv) {
     86     if(pv!=NULL) {
     87         uprv_free(pv->v);
     88         uprv_free(pv);
     89     }
     90 }
     91 
     92 static uint32_t *
     93 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
     94     uint32_t *row;
     95     int32_t columns, i, start, limit, prevRow;
     96 
     97     columns=pv->columns;
     98     limit=pv->rows;
     99     prevRow=pv->prevRow;
    100 
    101     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
    102     row=pv->v+prevRow*columns;
    103     if(rangeStart>=(UChar32)row[0]) {
    104         if(rangeStart<(UChar32)row[1]) {
    105             /* same row as last seen */
    106             return row;
    107         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    108             /* next row after the last one */
    109             pv->prevRow=prevRow+1;
    110             return row;
    111         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    112             /* second row after the last one */
    113             pv->prevRow=prevRow+2;
    114             return row;
    115         } else if((rangeStart-(UChar32)row[1])<10) {
    116             /* we are close, continue looping */
    117             prevRow+=2;
    118             do {
    119                 ++prevRow;
    120                 row+=columns;
    121             } while(rangeStart>=(UChar32)row[1]);
    122             pv->prevRow=prevRow;
    123             return row;
    124         }
    125     } else if(rangeStart<(UChar32)pv->v[1]) {
    126         /* the very first row */
    127         pv->prevRow=0;
    128         return pv->v;
    129     }
    130 
    131     /* do a binary search for the start of the range */
    132     start=0;
    133     while(start<limit-1) {
    134         i=(start+limit)/2;
    135         row=pv->v+i*columns;
    136         if(rangeStart<(UChar32)row[0]) {
    137             limit=i;
    138         } else if(rangeStart<(UChar32)row[1]) {
    139             pv->prevRow=i;
    140             return row;
    141         } else {
    142             start=i;
    143         }
    144     }
    145 
    146     /* must be found because all ranges together always cover all of Unicode */
    147     pv->prevRow=start;
    148     return pv->v+start*columns;
    149 }
    150 
    151 U_CAPI void U_EXPORT2
    152 upvec_setValue(UPropsVectors *pv,
    153                UChar32 start, UChar32 end,
    154                int32_t column,
    155                uint32_t value, uint32_t mask,
    156                UErrorCode *pErrorCode) {
    157     uint32_t *firstRow, *lastRow;
    158     int32_t columns;
    159     UChar32 limit;
    160     UBool splitFirstRow, splitLastRow;
    161 
    162     /* argument checking */
    163     if(U_FAILURE(*pErrorCode)) {
    164         return;
    165     }
    166     if( pv==NULL ||
    167         start<0 || start>end || end>UPVEC_MAX_CP ||
    168         column<0 || column>=(pv->columns-2)
    169     ) {
    170         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    171         return;
    172     }
    173     if(pv->isCompacted) {
    174         *pErrorCode=U_NO_WRITE_PERMISSION;
    175         return;
    176     }
    177     limit=end+1;
    178 
    179     /* initialize */
    180     columns=pv->columns;
    181     column+=2; /* skip range start and limit columns */
    182     value&=mask;
    183 
    184     /* find the rows whose ranges overlap with the input range */
    185 
    186     /* find the first and last rows, always successful */
    187     firstRow=_findRow(pv, start);
    188     lastRow=_findRow(pv, end);
    189 
    190     /*
    191      * Rows need to be split if they partially overlap with the
    192      * input range (only possible for the first and last rows)
    193      * and if their value differs from the input value.
    194      */
    195     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
    196     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
    197 
    198     /* split first/last rows if necessary */
    199     if(splitFirstRow || splitLastRow) {
    200         int32_t count, rows;
    201 
    202         rows=pv->rows;
    203         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
    204             uint32_t *newVectors;
    205             int32_t newMaxRows;
    206 
    207             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
    208                 newMaxRows=UPVEC_MEDIUM_ROWS;
    209             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
    210                 newMaxRows=UPVEC_MAX_ROWS;
    211             } else {
    212                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
    213                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    214                 return;
    215             }
    216             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
    217             if(newVectors==NULL) {
    218                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    219                 return;
    220             }
    221             uprv_memcpy(newVectors, pv->v, rows*columns*4);
    222             firstRow=newVectors+(firstRow-pv->v);
    223             lastRow=newVectors+(lastRow-pv->v);
    224             uprv_free(pv->v);
    225             pv->v=newVectors;
    226             pv->maxRows=newMaxRows;
    227         }
    228 
    229         /* count the number of row cells to move after the last row, and move them */
    230         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
    231         if(count>0) {
    232             uprv_memmove(
    233                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
    234                 lastRow+columns,
    235                 count*4);
    236         }
    237         pv->rows=rows+splitFirstRow+splitLastRow;
    238 
    239         /* split the first row, and move the firstRow pointer to the second part */
    240         if(splitFirstRow) {
    241             /* copy all affected rows up one and move the lastRow pointer */
    242             count = (int32_t)((lastRow-firstRow)+columns);
    243             uprv_memmove(firstRow+columns, firstRow, count*4);
    244             lastRow+=columns;
    245 
    246             /* split the range and move the firstRow pointer */
    247             firstRow[1]=firstRow[columns]=(uint32_t)start;
    248             firstRow+=columns;
    249         }
    250 
    251         /* split the last row */
    252         if(splitLastRow) {
    253             /* copy the last row data */
    254             uprv_memcpy(lastRow+columns, lastRow, columns*4);
    255 
    256             /* split the range and move the firstRow pointer */
    257             lastRow[1]=lastRow[columns]=(uint32_t)limit;
    258         }
    259     }
    260 
    261     /* set the "row last seen" to the last row for the range */
    262     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
    263 
    264     /* set the input value in all remaining rows */
    265     firstRow+=column;
    266     lastRow+=column;
    267     mask=~mask;
    268     for(;;) {
    269         *firstRow=(*firstRow&mask)|value;
    270         if(firstRow==lastRow) {
    271             break;
    272         }
    273         firstRow+=columns;
    274     }
    275 }
    276 
    277 U_CAPI uint32_t U_EXPORT2
    278 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
    279     uint32_t *row;
    280     UPropsVectors *ncpv;
    281 
    282     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
    283         return 0;
    284     }
    285     ncpv=(UPropsVectors *)pv;
    286     row=_findRow(ncpv, c);
    287     return row[2+column];
    288 }
    289 
    290 U_CAPI uint32_t * U_EXPORT2
    291 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
    292              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
    293     uint32_t *row;
    294     int32_t columns;
    295 
    296     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
    297         return NULL;
    298     }
    299 
    300     columns=pv->columns;
    301     row=pv->v+rowIndex*columns;
    302     if(pRangeStart!=NULL) {
    303         *pRangeStart=(UChar32)row[0];
    304     }
    305     if(pRangeEnd!=NULL) {
    306         *pRangeEnd=(UChar32)row[1]-1;
    307     }
    308     return row+2;
    309 }
    310 
    311 static int32_t U_CALLCONV
    312 upvec_compareRows(const void *context, const void *l, const void *r) {
    313     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
    314     const UPropsVectors *pv=(const UPropsVectors *)context;
    315     int32_t i, count, columns;
    316 
    317     count=columns=pv->columns; /* includes start/limit columns */
    318 
    319     /* start comparing after start/limit but wrap around to them */
    320     i=2;
    321     do {
    322         if(left[i]!=right[i]) {
    323             return left[i]<right[i] ? -1 : 1;
    324         }
    325         if(++i==columns) {
    326             i=0;
    327         }
    328     } while(--count>0);
    329 
    330     return 0;
    331 }
    332 
    333 U_CAPI void U_EXPORT2
    334 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
    335     uint32_t *row;
    336     int32_t i, columns, valueColumns, rows, count;
    337     UChar32 start, limit;
    338 
    339     /* argument checking */
    340     if(U_FAILURE(*pErrorCode)) {
    341         return;
    342     }
    343     if(handler==NULL) {
    344         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    345         return;
    346     }
    347     if(pv->isCompacted) {
    348         return;
    349     }
    350 
    351     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
    352     pv->isCompacted=TRUE;
    353 
    354     rows=pv->rows;
    355     columns=pv->columns;
    356     U_ASSERT(columns>=3); /* upvec_open asserts this */
    357     valueColumns=columns-2; /* not counting start & limit */
    358 
    359     /* sort the properties vectors to find unique vector values */
    360     uprv_sortArray(pv->v, rows, columns*4,
    361                    upvec_compareRows, pv, FALSE, pErrorCode);
    362     if(U_FAILURE(*pErrorCode)) {
    363         return;
    364     }
    365 
    366     /*
    367      * Find and set the special values.
    368      * This has to do almost the same work as the compaction below,
    369      * to find the indexes where the special-value rows will move.
    370      */
    371     row=pv->v;
    372     count=-valueColumns;
    373     for(i=0; i<rows; ++i) {
    374         start=(UChar32)row[0];
    375 
    376         /* count a new values vector if it is different from the current one */
    377         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
    378             count+=valueColumns;
    379         }
    380 
    381         if(start>=UPVEC_FIRST_SPECIAL_CP) {
    382             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
    383             if(U_FAILURE(*pErrorCode)) {
    384                 return;
    385             }
    386         }
    387 
    388         row+=columns;
    389     }
    390 
    391     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
    392     count+=valueColumns;
    393 
    394     /* Call the handler once more to signal the start of delivering real values. */
    395     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
    396             count, row-valueColumns, valueColumns, pErrorCode);
    397     if(U_FAILURE(*pErrorCode)) {
    398         return;
    399     }
    400 
    401     /*
    402      * Move vector contents up to a contiguous array with only unique
    403      * vector values, and call the handler function for each vector.
    404      *
    405      * This destroys the Properties Vector structure and replaces it
    406      * with an array of just vector values.
    407      */
    408     row=pv->v;
    409     count=-valueColumns;
    410     for(i=0; i<rows; ++i) {
    411         /* fetch these first before memmove() may overwrite them */
    412         start=(UChar32)row[0];
    413         limit=(UChar32)row[1];
    414 
    415         /* add a new values vector if it is different from the current one */
    416         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
    417             count+=valueColumns;
    418             uprv_memmove(pv->v+count, row+2, valueColumns*4);
    419         }
    420 
    421         if(start<UPVEC_FIRST_SPECIAL_CP) {
    422             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
    423             if(U_FAILURE(*pErrorCode)) {
    424                 return;
    425             }
    426         }
    427 
    428         row+=columns;
    429     }
    430 
    431     /* count is at the beginning of the last vector, add one to include that last vector */
    432     pv->rows=count/valueColumns+1;
    433 }
    434 
    435 U_CAPI const uint32_t * U_EXPORT2
    436 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
    437     if(!pv->isCompacted) {
    438         return NULL;
    439     }
    440     if(pRows!=NULL) {
    441         *pRows=pv->rows;
    442     }
    443     if(pColumns!=NULL) {
    444         *pColumns=pv->columns-2;
    445     }
    446     return pv->v;
    447 }
    448 
    449 U_CAPI uint32_t * U_EXPORT2
    450 upvec_cloneArray(const UPropsVectors *pv,
    451                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
    452     uint32_t *clonedArray;
    453     int32_t byteLength;
    454 
    455     if(U_FAILURE(*pErrorCode)) {
    456         return NULL;
    457     }
    458     if(!pv->isCompacted) {
    459         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    460         return NULL;
    461     }
    462     byteLength=pv->rows*(pv->columns-2)*4;
    463     clonedArray=(uint32_t *)uprv_malloc(byteLength);
    464     if(clonedArray==NULL) {
    465         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    466         return NULL;
    467     }
    468     uprv_memcpy(clonedArray, pv->v, byteLength);
    469     if(pRows!=NULL) {
    470         *pRows=pv->rows;
    471     }
    472     if(pColumns!=NULL) {
    473         *pColumns=pv->columns-2;
    474     }
    475     return clonedArray;
    476 }
    477 
    478 U_CAPI UTrie2 * U_EXPORT2
    479 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    480     UPVecToUTrie2Context toUTrie2={ NULL };
    481     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
    482     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
    483     if(U_FAILURE(*pErrorCode)) {
    484         utrie2_close(toUTrie2.trie);
    485         toUTrie2.trie=NULL;
    486     }
    487     return toUTrie2.trie;
    488 }
    489 
    490 /*
    491  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
    492  * some 16-bit field and builds and returns a UTrie2.
    493  */
    494 
    495 U_CAPI void U_CALLCONV
    496 upvec_compactToUTrie2Handler(void *context,
    497                              UChar32 start, UChar32 end,
    498                              int32_t rowIndex, uint32_t *row, int32_t columns,
    499                              UErrorCode *pErrorCode) {
    500     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
    501     if(start<UPVEC_FIRST_SPECIAL_CP) {
    502         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
    503     } else {
    504         switch(start) {
    505         case UPVEC_INITIAL_VALUE_CP:
    506             toUTrie2->initialValue=rowIndex;
    507             break;
    508         case UPVEC_ERROR_VALUE_CP:
    509             toUTrie2->errorValue=rowIndex;
    510             break;
    511         case UPVEC_START_REAL_VALUES_CP:
    512             toUTrie2->maxValue=rowIndex;
    513             if(rowIndex>0xffff) {
    514                 /* too many rows for a 16-bit trie */
    515                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    516             } else {
    517                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
    518                                            toUTrie2->errorValue, pErrorCode);
    519             }
    520             break;
    521         default:
    522             break;
    523         }
    524     }
    525 }
    526