Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  propsvec.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002feb22
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store bits (Unicode character properties) in bit set vectors.
     17 */
     18 
     19 #include <stdlib.h>
     20 #include "unicode/utypes.h"
     21 #include "cmemory.h"
     22 #include "utrie.h"
     23 #include "utrie2.h"
     24 #include "uarrsort.h"
     25 #include "propsvec.h"
     26 
     27 struct UPropsVectors {
     28     uint32_t *v;
     29     int32_t columns;  /* number of columns, plus two for start & limit values */
     30     int32_t maxRows;
     31     int32_t rows;
     32     int32_t prevRow;  /* search optimization: remember last row seen */
     33     UBool isCompacted;
     34 };
     35 
     36 #define UPVEC_INITIAL_ROWS (1<<12)
     37 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
     38 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
     39 
     40 U_CAPI UPropsVectors * U_EXPORT2
     41 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
     42     UPropsVectors *pv;
     43     uint32_t *v, *row;
     44     uint32_t cp;
     45 
     46     if(U_FAILURE(*pErrorCode)) {
     47         return NULL;
     48     }
     49     if(columns<1) {
     50         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
     51         return NULL;
     52     }
     53     columns+=2; /* count range start and limit columns */
     54 
     55     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
     56     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
     57     if(pv==NULL || v==NULL) {
     58         uprv_free(pv);
     59         uprv_free(v);
     60         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
     61         return NULL;
     62     }
     63     uprv_memset(pv, 0, sizeof(UPropsVectors));
     64     pv->v=v;
     65     pv->columns=columns;
     66     pv->maxRows=UPVEC_INITIAL_ROWS;
     67     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
     68 
     69     /* set the all-Unicode row and the special-value rows */
     70     row=pv->v;
     71     uprv_memset(row, 0, pv->rows*columns*4);
     72     row[0]=0;
     73     row[1]=0x110000;
     74     row+=columns;
     75     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
     76         row[0]=cp;
     77         row[1]=cp+1;
     78         row+=columns;
     79     }
     80     return pv;
     81 }
     82 
     83 U_CAPI void U_EXPORT2
     84 upvec_close(UPropsVectors *pv) {
     85     if(pv!=NULL) {
     86         uprv_free(pv->v);
     87         uprv_free(pv);
     88     }
     89 }
     90 
     91 static uint32_t *
     92 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
     93     uint32_t *row;
     94     int32_t columns, i, start, limit, prevRow, rows;
     95 
     96     columns=pv->columns;
     97     rows=limit=pv->rows;
     98     prevRow=pv->prevRow;
     99 
    100     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
    101     row=pv->v+prevRow*columns;
    102     if(rangeStart>=(UChar32)row[0]) {
    103         if(rangeStart<(UChar32)row[1]) {
    104             /* same row as last seen */
    105             return row;
    106         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    107             /* next row after the last one */
    108             pv->prevRow=prevRow+1;
    109             return row;
    110         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    111             /* second row after the last one */
    112             pv->prevRow=prevRow+2;
    113             return row;
    114         } else if((rangeStart-(UChar32)row[1])<10) {
    115             /* we are close, continue looping */
    116             prevRow+=2;
    117             do {
    118                 ++prevRow;
    119                 row+=columns;
    120             } while(rangeStart>=(UChar32)row[1]);
    121             pv->prevRow=prevRow;
    122             return row;
    123         }
    124     } else if(rangeStart<(UChar32)pv->v[1]) {
    125         /* the very first row */
    126         pv->prevRow=0;
    127         return pv->v;
    128     }
    129 
    130     /* do a binary search for the start of the range */
    131     start=0;
    132     while(start<limit-1) {
    133         i=(start+limit)/2;
    134         row=pv->v+i*columns;
    135         if(rangeStart<(UChar32)row[0]) {
    136             limit=i;
    137         } else if(rangeStart<(UChar32)row[1]) {
    138             pv->prevRow=i;
    139             return row;
    140         } else {
    141             start=i;
    142         }
    143     }
    144 
    145     /* must be found because all ranges together always cover all of Unicode */
    146     pv->prevRow=start;
    147     return pv->v+start*columns;
    148 }
    149 
    150 U_CAPI void U_EXPORT2
    151 upvec_setValue(UPropsVectors *pv,
    152                UChar32 start, UChar32 end,
    153                int32_t column,
    154                uint32_t value, uint32_t mask,
    155                UErrorCode *pErrorCode) {
    156     uint32_t *firstRow, *lastRow;
    157     int32_t columns;
    158     UChar32 limit;
    159     UBool splitFirstRow, splitLastRow;
    160 
    161     /* argument checking */
    162     if(U_FAILURE(*pErrorCode)) {
    163         return;
    164     }
    165     if( pv==NULL ||
    166         start<0 || start>end || end>UPVEC_MAX_CP ||
    167         column<0 || column>=(pv->columns-2)
    168     ) {
    169         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    170         return;
    171     }
    172     if(pv->isCompacted) {
    173         *pErrorCode=U_NO_WRITE_PERMISSION;
    174         return;
    175     }
    176     limit=end+1;
    177 
    178     /* initialize */
    179     columns=pv->columns;
    180     column+=2; /* skip range start and limit columns */
    181     value&=mask;
    182 
    183     /* find the rows whose ranges overlap with the input range */
    184 
    185     /* find the first and last rows, always successful */
    186     firstRow=_findRow(pv, start);
    187     lastRow=_findRow(pv, end);
    188 
    189     /*
    190      * Rows need to be split if they partially overlap with the
    191      * input range (only possible for the first and last rows)
    192      * and if their value differs from the input value.
    193      */
    194     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
    195     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
    196 
    197     /* split first/last rows if necessary */
    198     if(splitFirstRow || splitLastRow) {
    199         int32_t count, rows;
    200 
    201         rows=pv->rows;
    202         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
    203             uint32_t *newVectors;
    204             int32_t newMaxRows;
    205 
    206             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
    207                 newMaxRows=UPVEC_MEDIUM_ROWS;
    208             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
    209                 newMaxRows=UPVEC_MAX_ROWS;
    210             } else {
    211                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
    212                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    213                 return;
    214             }
    215             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
    216             if(newVectors==NULL) {
    217                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    218                 return;
    219             }
    220             uprv_memcpy(newVectors, pv->v, rows*columns*4);
    221             firstRow=newVectors+(firstRow-pv->v);
    222             lastRow=newVectors+(lastRow-pv->v);
    223             uprv_free(pv->v);
    224             pv->v=newVectors;
    225             pv->maxRows=newMaxRows;
    226         }
    227 
    228         /* count the number of row cells to move after the last row, and move them */
    229         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
    230         if(count>0) {
    231             uprv_memmove(
    232                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
    233                 lastRow+columns,
    234                 count*4);
    235         }
    236         pv->rows=rows+splitFirstRow+splitLastRow;
    237 
    238         /* split the first row, and move the firstRow pointer to the second part */
    239         if(splitFirstRow) {
    240             /* copy all affected rows up one and move the lastRow pointer */
    241             count = (int32_t)((lastRow-firstRow)+columns);
    242             uprv_memmove(firstRow+columns, firstRow, count*4);
    243             lastRow+=columns;
    244 
    245             /* split the range and move the firstRow pointer */
    246             firstRow[1]=firstRow[columns]=(uint32_t)start;
    247             firstRow+=columns;
    248         }
    249 
    250         /* split the last row */
    251         if(splitLastRow) {
    252             /* copy the last row data */
    253             uprv_memcpy(lastRow+columns, lastRow, columns*4);
    254 
    255             /* split the range and move the firstRow pointer */
    256             lastRow[1]=lastRow[columns]=(uint32_t)limit;
    257         }
    258     }
    259 
    260     /* set the "row last seen" to the last row for the range */
    261     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
    262 
    263     /* set the input value in all remaining rows */
    264     firstRow+=column;
    265     lastRow+=column;
    266     mask=~mask;
    267     for(;;) {
    268         *firstRow=(*firstRow&mask)|value;
    269         if(firstRow==lastRow) {
    270             break;
    271         }
    272         firstRow+=columns;
    273     }
    274 }
    275 
    276 U_CAPI uint32_t U_EXPORT2
    277 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
    278     uint32_t *row;
    279     UPropsVectors *ncpv;
    280 
    281     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
    282         return 0;
    283     }
    284     ncpv=(UPropsVectors *)pv;
    285     row=_findRow(ncpv, c);
    286     return row[2+column];
    287 }
    288 
    289 U_CAPI uint32_t * U_EXPORT2
    290 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
    291              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
    292     uint32_t *row;
    293     int32_t columns;
    294 
    295     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
    296         return NULL;
    297     }
    298 
    299     columns=pv->columns;
    300     row=pv->v+rowIndex*columns;
    301     if(pRangeStart!=NULL) {
    302         *pRangeStart=(UChar32)row[0];
    303     }
    304     if(pRangeEnd!=NULL) {
    305         *pRangeEnd=(UChar32)row[1]-1;
    306     }
    307     return row+2;
    308 }
    309 
    310 static int32_t U_CALLCONV
    311 upvec_compareRows(const void *context, const void *l, const void *r) {
    312     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
    313     const UPropsVectors *pv=(const UPropsVectors *)context;
    314     int32_t i, count, columns;
    315 
    316     count=columns=pv->columns; /* includes start/limit columns */
    317 
    318     /* start comparing after start/limit but wrap around to them */
    319     i=2;
    320     do {
    321         if(left[i]!=right[i]) {
    322             return left[i]<right[i] ? -1 : 1;
    323         }
    324         if(++i==columns) {
    325             i=0;
    326         }
    327     } while(--count>0);
    328 
    329     return 0;
    330 }
    331 
    332 U_CAPI void U_EXPORT2
    333 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
    334     uint32_t *row;
    335     int32_t i, columns, valueColumns, rows, count;
    336     UChar32 start, limit;
    337 
    338     /* argument checking */
    339     if(U_FAILURE(*pErrorCode)) {
    340         return;
    341     }
    342     if(handler==NULL) {
    343         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    344         return;
    345     }
    346     if(pv->isCompacted) {
    347         return;
    348     }
    349 
    350     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
    351     pv->isCompacted=TRUE;
    352 
    353     rows=pv->rows;
    354     columns=pv->columns;
    355     valueColumns=columns-2; /* not counting start & limit */
    356 
    357     /* sort the properties vectors to find unique vector values */
    358     uprv_sortArray(pv->v, rows, columns*4,
    359                    upvec_compareRows, pv, FALSE, pErrorCode);
    360     if(U_FAILURE(*pErrorCode)) {
    361         return;
    362     }
    363 
    364     /*
    365      * Find and set the special values.
    366      * This has to do almost the same work as the compaction below,
    367      * to find the indexes where the special-value rows will move.
    368      */
    369     row=pv->v;
    370     count=-valueColumns;
    371     for(i=0; i<rows; ++i) {
    372         start=(UChar32)row[0];
    373 
    374         /* count a new values vector if it is different from the current one */
    375         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
    376             count+=valueColumns;
    377         }
    378 
    379         if(start>=UPVEC_FIRST_SPECIAL_CP) {
    380             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
    381             if(U_FAILURE(*pErrorCode)) {
    382                 return;
    383             }
    384         }
    385 
    386         row+=columns;
    387     }
    388 
    389     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
    390     count+=valueColumns;
    391 
    392     /* Call the handler once more to signal the start of delivering real values. */
    393     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
    394             count, row-valueColumns, valueColumns, pErrorCode);
    395     if(U_FAILURE(*pErrorCode)) {
    396         return;
    397     }
    398 
    399     /*
    400      * Move vector contents up to a contiguous array with only unique
    401      * vector values, and call the handler function for each vector.
    402      *
    403      * This destroys the Properties Vector structure and replaces it
    404      * with an array of just vector values.
    405      */
    406     row=pv->v;
    407     count=-valueColumns;
    408     for(i=0; i<rows; ++i) {
    409         /* fetch these first before memmove() may overwrite them */
    410         start=(UChar32)row[0];
    411         limit=(UChar32)row[1];
    412 
    413         /* add a new values vector if it is different from the current one */
    414         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
    415             count+=valueColumns;
    416             uprv_memmove(pv->v+count, row+2, valueColumns*4);
    417         }
    418 
    419         if(start<UPVEC_FIRST_SPECIAL_CP) {
    420             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
    421             if(U_FAILURE(*pErrorCode)) {
    422                 return;
    423             }
    424         }
    425 
    426         row+=columns;
    427     }
    428 
    429     /* count is at the beginning of the last vector, add one to include that last vector */
    430     pv->rows=count/valueColumns+1;
    431 }
    432 
    433 U_CAPI const uint32_t * U_EXPORT2
    434 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
    435     if(!pv->isCompacted) {
    436         return NULL;
    437     }
    438     if(pRows!=NULL) {
    439         *pRows=pv->rows;
    440     }
    441     if(pColumns!=NULL) {
    442         *pColumns=pv->columns-2;
    443     }
    444     return pv->v;
    445 }
    446 
    447 U_CAPI uint32_t * U_EXPORT2
    448 upvec_cloneArray(const UPropsVectors *pv,
    449                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
    450     uint32_t *clonedArray;
    451     int32_t byteLength;
    452 
    453     if(U_FAILURE(*pErrorCode)) {
    454         return NULL;
    455     }
    456     if(!pv->isCompacted) {
    457         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    458         return NULL;
    459     }
    460     byteLength=pv->rows*(pv->columns-2)*4;
    461     clonedArray=(uint32_t *)uprv_malloc(byteLength);
    462     if(clonedArray==NULL) {
    463         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    464         return NULL;
    465     }
    466     uprv_memcpy(clonedArray, pv->v, byteLength);
    467     if(pRows!=NULL) {
    468         *pRows=pv->rows;
    469     }
    470     if(pColumns!=NULL) {
    471         *pColumns=pv->columns-2;
    472     }
    473     return clonedArray;
    474 }
    475 
    476 U_CAPI UTrie2 * U_EXPORT2
    477 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    478     UPVecToUTrie2Context toUTrie2={ NULL };
    479     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
    480     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
    481     if(U_FAILURE(*pErrorCode)) {
    482         utrie2_close(toUTrie2.trie);
    483         toUTrie2.trie=NULL;
    484     }
    485     return toUTrie2.trie;
    486 }
    487 
    488 /*
    489  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
    490  * some 16-bit field and builds and returns a UTrie2.
    491  */
    492 
    493 U_CAPI void U_CALLCONV
    494 upvec_compactToUTrie2Handler(void *context,
    495                              UChar32 start, UChar32 end,
    496                              int32_t rowIndex, uint32_t *row, int32_t columns,
    497                              UErrorCode *pErrorCode) {
    498     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
    499     if(start<UPVEC_FIRST_SPECIAL_CP) {
    500         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
    501     } else {
    502         switch(start) {
    503         case UPVEC_INITIAL_VALUE_CP:
    504             toUTrie2->initialValue=rowIndex;
    505             break;
    506         case UPVEC_ERROR_VALUE_CP:
    507             toUTrie2->errorValue=rowIndex;
    508             break;
    509         case UPVEC_START_REAL_VALUES_CP:
    510             toUTrie2->maxValue=rowIndex;
    511             if(rowIndex>0xffff) {
    512                 /* too many rows for a 16-bit trie */
    513                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    514             } else {
    515                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
    516                                            toUTrie2->errorValue, pErrorCode);
    517             }
    518             break;
    519         default:
    520             break;
    521         }
    522     }
    523 }
    524