Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2002-2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  propsvec.c
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002feb22
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Store bits (Unicode character properties) in bit set vectors.
     19 */
     20 
     21 #include <stdlib.h>
     22 #include "unicode/utypes.h"
     23 #include "cmemory.h"
     24 #include "utrie.h"
     25 #include "utrie2.h"
     26 #include "uarrsort.h"
     27 #include "propsvec.h"
     28 #include "uassert.h"
     29 
     30 struct UPropsVectors {
     31     uint32_t *v;
     32     int32_t columns;  /* number of columns, plus two for start & limit values */
     33     int32_t maxRows;
     34     int32_t rows;
     35     int32_t prevRow;  /* search optimization: remember last row seen */
     36     UBool isCompacted;
     37 };
     38 
     39 #define UPVEC_INITIAL_ROWS (1<<12)
     40 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
     41 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
     42 
     43 U_CAPI UPropsVectors * U_EXPORT2
     44 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
     45     UPropsVectors *pv;
     46     uint32_t *v, *row;
     47     uint32_t cp;
     48 
     49     if(U_FAILURE(*pErrorCode)) {
     50         return NULL;
     51     }
     52     if(columns<1) {
     53         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
     54         return NULL;
     55     }
     56     columns+=2; /* count range start and limit columns */
     57 
     58     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
     59     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
     60     if(pv==NULL || v==NULL) {
     61         uprv_free(pv);
     62         uprv_free(v);
     63         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
     64         return NULL;
     65     }
     66     uprv_memset(pv, 0, sizeof(UPropsVectors));
     67     pv->v=v;
     68     pv->columns=columns;
     69     pv->maxRows=UPVEC_INITIAL_ROWS;
     70     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
     71 
     72     /* set the all-Unicode row and the special-value rows */
     73     row=pv->v;
     74     uprv_memset(row, 0, pv->rows*columns*4);
     75     row[0]=0;
     76     row[1]=0x110000;
     77     row+=columns;
     78     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
     79         row[0]=cp;
     80         row[1]=cp+1;
     81         row+=columns;
     82     }
     83     return pv;
     84 }
     85 
     86 U_CAPI void U_EXPORT2
     87 upvec_close(UPropsVectors *pv) {
     88     if(pv!=NULL) {
     89         uprv_free(pv->v);
     90         uprv_free(pv);
     91     }
     92 }
     93 
     94 static uint32_t *
     95 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
     96     uint32_t *row;
     97     int32_t columns, i, start, limit, prevRow;
     98 
     99     columns=pv->columns;
    100     limit=pv->rows;
    101     prevRow=pv->prevRow;
    102 
    103     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
    104     row=pv->v+prevRow*columns;
    105     if(rangeStart>=(UChar32)row[0]) {
    106         if(rangeStart<(UChar32)row[1]) {
    107             /* same row as last seen */
    108             return row;
    109         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    110             /* next row after the last one */
    111             pv->prevRow=prevRow+1;
    112             return row;
    113         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
    114             /* second row after the last one */
    115             pv->prevRow=prevRow+2;
    116             return row;
    117         } else if((rangeStart-(UChar32)row[1])<10) {
    118             /* we are close, continue looping */
    119             prevRow+=2;
    120             do {
    121                 ++prevRow;
    122                 row+=columns;
    123             } while(rangeStart>=(UChar32)row[1]);
    124             pv->prevRow=prevRow;
    125             return row;
    126         }
    127     } else if(rangeStart<(UChar32)pv->v[1]) {
    128         /* the very first row */
    129         pv->prevRow=0;
    130         return pv->v;
    131     }
    132 
    133     /* do a binary search for the start of the range */
    134     start=0;
    135     while(start<limit-1) {
    136         i=(start+limit)/2;
    137         row=pv->v+i*columns;
    138         if(rangeStart<(UChar32)row[0]) {
    139             limit=i;
    140         } else if(rangeStart<(UChar32)row[1]) {
    141             pv->prevRow=i;
    142             return row;
    143         } else {
    144             start=i;
    145         }
    146     }
    147 
    148     /* must be found because all ranges together always cover all of Unicode */
    149     pv->prevRow=start;
    150     return pv->v+start*columns;
    151 }
    152 
    153 U_CAPI void U_EXPORT2
    154 upvec_setValue(UPropsVectors *pv,
    155                UChar32 start, UChar32 end,
    156                int32_t column,
    157                uint32_t value, uint32_t mask,
    158                UErrorCode *pErrorCode) {
    159     uint32_t *firstRow, *lastRow;
    160     int32_t columns;
    161     UChar32 limit;
    162     UBool splitFirstRow, splitLastRow;
    163 
    164     /* argument checking */
    165     if(U_FAILURE(*pErrorCode)) {
    166         return;
    167     }
    168     if( pv==NULL ||
    169         start<0 || start>end || end>UPVEC_MAX_CP ||
    170         column<0 || column>=(pv->columns-2)
    171     ) {
    172         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    173         return;
    174     }
    175     if(pv->isCompacted) {
    176         *pErrorCode=U_NO_WRITE_PERMISSION;
    177         return;
    178     }
    179     limit=end+1;
    180 
    181     /* initialize */
    182     columns=pv->columns;
    183     column+=2; /* skip range start and limit columns */
    184     value&=mask;
    185 
    186     /* find the rows whose ranges overlap with the input range */
    187 
    188     /* find the first and last rows, always successful */
    189     firstRow=_findRow(pv, start);
    190     lastRow=_findRow(pv, end);
    191 
    192     /*
    193      * Rows need to be split if they partially overlap with the
    194      * input range (only possible for the first and last rows)
    195      * and if their value differs from the input value.
    196      */
    197     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
    198     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
    199 
    200     /* split first/last rows if necessary */
    201     if(splitFirstRow || splitLastRow) {
    202         int32_t count, rows;
    203 
    204         rows=pv->rows;
    205         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
    206             uint32_t *newVectors;
    207             int32_t newMaxRows;
    208 
    209             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
    210                 newMaxRows=UPVEC_MEDIUM_ROWS;
    211             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
    212                 newMaxRows=UPVEC_MAX_ROWS;
    213             } else {
    214                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
    215                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    216                 return;
    217             }
    218             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
    219             if(newVectors==NULL) {
    220                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    221                 return;
    222             }
    223             uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
    224             firstRow=newVectors+(firstRow-pv->v);
    225             lastRow=newVectors+(lastRow-pv->v);
    226             uprv_free(pv->v);
    227             pv->v=newVectors;
    228             pv->maxRows=newMaxRows;
    229         }
    230 
    231         /* count the number of row cells to move after the last row, and move them */
    232         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
    233         if(count>0) {
    234             uprv_memmove(
    235                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
    236                 lastRow+columns,
    237                 count*4);
    238         }
    239         pv->rows=rows+splitFirstRow+splitLastRow;
    240 
    241         /* split the first row, and move the firstRow pointer to the second part */
    242         if(splitFirstRow) {
    243             /* copy all affected rows up one and move the lastRow pointer */
    244             count = (int32_t)((lastRow-firstRow)+columns);
    245             uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
    246             lastRow+=columns;
    247 
    248             /* split the range and move the firstRow pointer */
    249             firstRow[1]=firstRow[columns]=(uint32_t)start;
    250             firstRow+=columns;
    251         }
    252 
    253         /* split the last row */
    254         if(splitLastRow) {
    255             /* copy the last row data */
    256             uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);
    257 
    258             /* split the range and move the firstRow pointer */
    259             lastRow[1]=lastRow[columns]=(uint32_t)limit;
    260         }
    261     }
    262 
    263     /* set the "row last seen" to the last row for the range */
    264     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
    265 
    266     /* set the input value in all remaining rows */
    267     firstRow+=column;
    268     lastRow+=column;
    269     mask=~mask;
    270     for(;;) {
    271         *firstRow=(*firstRow&mask)|value;
    272         if(firstRow==lastRow) {
    273             break;
    274         }
    275         firstRow+=columns;
    276     }
    277 }
    278 
    279 U_CAPI uint32_t U_EXPORT2
    280 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
    281     uint32_t *row;
    282     UPropsVectors *ncpv;
    283 
    284     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
    285         return 0;
    286     }
    287     ncpv=(UPropsVectors *)pv;
    288     row=_findRow(ncpv, c);
    289     return row[2+column];
    290 }
    291 
    292 U_CAPI uint32_t * U_EXPORT2
    293 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
    294              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
    295     uint32_t *row;
    296     int32_t columns;
    297 
    298     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
    299         return NULL;
    300     }
    301 
    302     columns=pv->columns;
    303     row=pv->v+rowIndex*columns;
    304     if(pRangeStart!=NULL) {
    305         *pRangeStart=(UChar32)row[0];
    306     }
    307     if(pRangeEnd!=NULL) {
    308         *pRangeEnd=(UChar32)row[1]-1;
    309     }
    310     return row+2;
    311 }
    312 
    313 static int32_t U_CALLCONV
    314 upvec_compareRows(const void *context, const void *l, const void *r) {
    315     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
    316     const UPropsVectors *pv=(const UPropsVectors *)context;
    317     int32_t i, count, columns;
    318 
    319     count=columns=pv->columns; /* includes start/limit columns */
    320 
    321     /* start comparing after start/limit but wrap around to them */
    322     i=2;
    323     do {
    324         if(left[i]!=right[i]) {
    325             return left[i]<right[i] ? -1 : 1;
    326         }
    327         if(++i==columns) {
    328             i=0;
    329         }
    330     } while(--count>0);
    331 
    332     return 0;
    333 }
    334 
    335 U_CAPI void U_EXPORT2
    336 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
    337     uint32_t *row;
    338     int32_t i, columns, valueColumns, rows, count;
    339     UChar32 start, limit;
    340 
    341     /* argument checking */
    342     if(U_FAILURE(*pErrorCode)) {
    343         return;
    344     }
    345     if(handler==NULL) {
    346         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    347         return;
    348     }
    349     if(pv->isCompacted) {
    350         return;
    351     }
    352 
    353     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
    354     pv->isCompacted=TRUE;
    355 
    356     rows=pv->rows;
    357     columns=pv->columns;
    358     U_ASSERT(columns>=3); /* upvec_open asserts this */
    359     valueColumns=columns-2; /* not counting start & limit */
    360 
    361     /* sort the properties vectors to find unique vector values */
    362     uprv_sortArray(pv->v, rows, columns*4,
    363                    upvec_compareRows, pv, FALSE, pErrorCode);
    364     if(U_FAILURE(*pErrorCode)) {
    365         return;
    366     }
    367 
    368     /*
    369      * Find and set the special values.
    370      * This has to do almost the same work as the compaction below,
    371      * to find the indexes where the special-value rows will move.
    372      */
    373     row=pv->v;
    374     count=-valueColumns;
    375     for(i=0; i<rows; ++i) {
    376         start=(UChar32)row[0];
    377 
    378         /* count a new values vector if it is different from the current one */
    379         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
    380             count+=valueColumns;
    381         }
    382 
    383         if(start>=UPVEC_FIRST_SPECIAL_CP) {
    384             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
    385             if(U_FAILURE(*pErrorCode)) {
    386                 return;
    387             }
    388         }
    389 
    390         row+=columns;
    391     }
    392 
    393     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
    394     count+=valueColumns;
    395 
    396     /* Call the handler once more to signal the start of delivering real values. */
    397     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
    398             count, row-valueColumns, valueColumns, pErrorCode);
    399     if(U_FAILURE(*pErrorCode)) {
    400         return;
    401     }
    402 
    403     /*
    404      * Move vector contents up to a contiguous array with only unique
    405      * vector values, and call the handler function for each vector.
    406      *
    407      * This destroys the Properties Vector structure and replaces it
    408      * with an array of just vector values.
    409      */
    410     row=pv->v;
    411     count=-valueColumns;
    412     for(i=0; i<rows; ++i) {
    413         /* fetch these first before memmove() may overwrite them */
    414         start=(UChar32)row[0];
    415         limit=(UChar32)row[1];
    416 
    417         /* add a new values vector if it is different from the current one */
    418         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
    419             count+=valueColumns;
    420             uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
    421         }
    422 
    423         if(start<UPVEC_FIRST_SPECIAL_CP) {
    424             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
    425             if(U_FAILURE(*pErrorCode)) {
    426                 return;
    427             }
    428         }
    429 
    430         row+=columns;
    431     }
    432 
    433     /* count is at the beginning of the last vector, add one to include that last vector */
    434     pv->rows=count/valueColumns+1;
    435 }
    436 
    437 U_CAPI const uint32_t * U_EXPORT2
    438 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
    439     if(!pv->isCompacted) {
    440         return NULL;
    441     }
    442     if(pRows!=NULL) {
    443         *pRows=pv->rows;
    444     }
    445     if(pColumns!=NULL) {
    446         *pColumns=pv->columns-2;
    447     }
    448     return pv->v;
    449 }
    450 
    451 U_CAPI uint32_t * U_EXPORT2
    452 upvec_cloneArray(const UPropsVectors *pv,
    453                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
    454     uint32_t *clonedArray;
    455     int32_t byteLength;
    456 
    457     if(U_FAILURE(*pErrorCode)) {
    458         return NULL;
    459     }
    460     if(!pv->isCompacted) {
    461         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    462         return NULL;
    463     }
    464     byteLength=pv->rows*(pv->columns-2)*4;
    465     clonedArray=(uint32_t *)uprv_malloc(byteLength);
    466     if(clonedArray==NULL) {
    467         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    468         return NULL;
    469     }
    470     uprv_memcpy(clonedArray, pv->v, byteLength);
    471     if(pRows!=NULL) {
    472         *pRows=pv->rows;
    473     }
    474     if(pColumns!=NULL) {
    475         *pColumns=pv->columns-2;
    476     }
    477     return clonedArray;
    478 }
    479 
    480 U_CAPI UTrie2 * U_EXPORT2
    481 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    482     UPVecToUTrie2Context toUTrie2={ NULL, 0, 0, 0 };
    483     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
    484     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
    485     if(U_FAILURE(*pErrorCode)) {
    486         utrie2_close(toUTrie2.trie);
    487         toUTrie2.trie=NULL;
    488     }
    489     return toUTrie2.trie;
    490 }
    491 
    492 /*
    493  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
    494  * some 16-bit field and builds and returns a UTrie2.
    495  */
    496 
    497 U_CAPI void U_CALLCONV
    498 upvec_compactToUTrie2Handler(void *context,
    499                              UChar32 start, UChar32 end,
    500                              int32_t rowIndex, uint32_t *row, int32_t columns,
    501                              UErrorCode *pErrorCode) {
    502     (void)row;
    503     (void)columns;
    504     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
    505     if(start<UPVEC_FIRST_SPECIAL_CP) {
    506         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
    507     } else {
    508         switch(start) {
    509         case UPVEC_INITIAL_VALUE_CP:
    510             toUTrie2->initialValue=rowIndex;
    511             break;
    512         case UPVEC_ERROR_VALUE_CP:
    513             toUTrie2->errorValue=rowIndex;
    514             break;
    515         case UPVEC_START_REAL_VALUES_CP:
    516             toUTrie2->maxValue=rowIndex;
    517             if(rowIndex>0xffff) {
    518                 /* too many rows for a 16-bit trie */
    519                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    520             } else {
    521                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
    522                                            toUTrie2->errorValue, pErrorCode);
    523             }
    524             break;
    525         default:
    526             break;
    527         }
    528     }
    529 }
    530