1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: propsvec.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002feb22 14 * created by: Markus W. Scherer 15 * 16 * Store bits (Unicode character properties) in bit set vectors. 17 */ 18 19 #include <stdlib.h> 20 #include "unicode/utypes.h" 21 #include "cmemory.h" 22 #include "utrie.h" 23 #include "utrie2.h" 24 #include "uarrsort.h" 25 #include "propsvec.h" 26 27 struct UPropsVectors { 28 uint32_t *v; 29 int32_t columns; /* number of columns, plus two for start & limit values */ 30 int32_t maxRows; 31 int32_t rows; 32 int32_t prevRow; /* search optimization: remember last row seen */ 33 UBool isCompacted; 34 }; 35 36 #define UPVEC_INITIAL_ROWS (1<<12) 37 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) 38 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) 39 40 U_CAPI UPropsVectors * U_EXPORT2 41 upvec_open(int32_t columns, UErrorCode *pErrorCode) { 42 UPropsVectors *pv; 43 uint32_t *v, *row; 44 uint32_t cp; 45 46 if(U_FAILURE(*pErrorCode)) { 47 return NULL; 48 } 49 if(columns<1) { 50 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 51 return NULL; 52 } 53 columns+=2; /* count range start and limit columns */ 54 55 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); 56 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); 57 if(pv==NULL || v==NULL) { 58 uprv_free(pv); 59 uprv_free(v); 60 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 61 return NULL; 62 } 63 uprv_memset(pv, 0, sizeof(UPropsVectors)); 64 pv->v=v; 65 pv->columns=columns; 66 pv->maxRows=UPVEC_INITIAL_ROWS; 67 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); 68 69 /* set the all-Unicode row and the special-value rows */ 70 row=pv->v; 71 uprv_memset(row, 0, pv->rows*columns*4); 72 row[0]=0; 73 row[1]=0x110000; 74 row+=columns; 75 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { 76 row[0]=cp; 77 row[1]=cp+1; 78 row+=columns; 79 } 80 return pv; 81 } 82 83 U_CAPI void U_EXPORT2 84 upvec_close(UPropsVectors *pv) { 85 if(pv!=NULL) { 86 uprv_free(pv->v); 87 uprv_free(pv); 88 } 89 } 90 91 static uint32_t * 92 _findRow(UPropsVectors *pv, UChar32 rangeStart) { 93 uint32_t *row; 94 int32_t columns, i, start, limit, prevRow, rows; 95 96 columns=pv->columns; 97 rows=limit=pv->rows; 98 prevRow=pv->prevRow; 99 100 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ 101 row=pv->v+prevRow*columns; 102 if(rangeStart>=(UChar32)row[0]) { 103 if(rangeStart<(UChar32)row[1]) { 104 /* same row as last seen */ 105 return row; 106 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 107 /* next row after the last one */ 108 pv->prevRow=prevRow+1; 109 return row; 110 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 111 /* second row after the last one */ 112 pv->prevRow=prevRow+2; 113 return row; 114 } else if((rangeStart-(UChar32)row[1])<10) { 115 /* we are close, continue looping */ 116 prevRow+=2; 117 do { 118 ++prevRow; 119 row+=columns; 120 } while(rangeStart>=(UChar32)row[1]); 121 pv->prevRow=prevRow; 122 return row; 123 } 124 } else if(rangeStart<(UChar32)pv->v[1]) { 125 /* the very first row */ 126 pv->prevRow=0; 127 return pv->v; 128 } 129 130 /* do a binary search for the start of the range */ 131 start=0; 132 while(start<limit-1) { 133 i=(start+limit)/2; 134 row=pv->v+i*columns; 135 if(rangeStart<(UChar32)row[0]) { 136 limit=i; 137 } else if(rangeStart<(UChar32)row[1]) { 138 pv->prevRow=i; 139 return row; 140 } else { 141 start=i; 142 } 143 } 144 145 /* must be found because all ranges together always cover all of Unicode */ 146 pv->prevRow=start; 147 return pv->v+start*columns; 148 } 149 150 U_CAPI void U_EXPORT2 151 upvec_setValue(UPropsVectors *pv, 152 UChar32 start, UChar32 end, 153 int32_t column, 154 uint32_t value, uint32_t mask, 155 UErrorCode *pErrorCode) { 156 uint32_t *firstRow, *lastRow; 157 int32_t columns; 158 UChar32 limit; 159 UBool splitFirstRow, splitLastRow; 160 161 /* argument checking */ 162 if(U_FAILURE(*pErrorCode)) { 163 return; 164 } 165 if( pv==NULL || 166 start<0 || start>end || end>UPVEC_MAX_CP || 167 column<0 || column>=(pv->columns-2) 168 ) { 169 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 170 return; 171 } 172 if(pv->isCompacted) { 173 *pErrorCode=U_NO_WRITE_PERMISSION; 174 return; 175 } 176 limit=end+1; 177 178 /* initialize */ 179 columns=pv->columns; 180 column+=2; /* skip range start and limit columns */ 181 value&=mask; 182 183 /* find the rows whose ranges overlap with the input range */ 184 185 /* find the first and last rows, always successful */ 186 firstRow=_findRow(pv, start); 187 lastRow=_findRow(pv, end); 188 189 /* 190 * Rows need to be split if they partially overlap with the 191 * input range (only possible for the first and last rows) 192 * and if their value differs from the input value. 193 */ 194 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); 195 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); 196 197 /* split first/last rows if necessary */ 198 if(splitFirstRow || splitLastRow) { 199 int32_t count, rows; 200 201 rows=pv->rows; 202 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { 203 uint32_t *newVectors; 204 int32_t newMaxRows; 205 206 if(pv->maxRows<UPVEC_MEDIUM_ROWS) { 207 newMaxRows=UPVEC_MEDIUM_ROWS; 208 } else if(pv->maxRows<UPVEC_MAX_ROWS) { 209 newMaxRows=UPVEC_MAX_ROWS; 210 } else { 211 /* Implementation bug, or UPVEC_MAX_ROWS too low. */ 212 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 213 return; 214 } 215 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); 216 if(newVectors==NULL) { 217 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 218 return; 219 } 220 uprv_memcpy(newVectors, pv->v, rows*columns*4); 221 firstRow=newVectors+(firstRow-pv->v); 222 lastRow=newVectors+(lastRow-pv->v); 223 uprv_free(pv->v); 224 pv->v=newVectors; 225 pv->maxRows=newMaxRows; 226 } 227 228 /* count the number of row cells to move after the last row, and move them */ 229 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); 230 if(count>0) { 231 uprv_memmove( 232 lastRow+(1+splitFirstRow+splitLastRow)*columns, 233 lastRow+columns, 234 count*4); 235 } 236 pv->rows=rows+splitFirstRow+splitLastRow; 237 238 /* split the first row, and move the firstRow pointer to the second part */ 239 if(splitFirstRow) { 240 /* copy all affected rows up one and move the lastRow pointer */ 241 count = (int32_t)((lastRow-firstRow)+columns); 242 uprv_memmove(firstRow+columns, firstRow, count*4); 243 lastRow+=columns; 244 245 /* split the range and move the firstRow pointer */ 246 firstRow[1]=firstRow[columns]=(uint32_t)start; 247 firstRow+=columns; 248 } 249 250 /* split the last row */ 251 if(splitLastRow) { 252 /* copy the last row data */ 253 uprv_memcpy(lastRow+columns, lastRow, columns*4); 254 255 /* split the range and move the firstRow pointer */ 256 lastRow[1]=lastRow[columns]=(uint32_t)limit; 257 } 258 } 259 260 /* set the "row last seen" to the last row for the range */ 261 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); 262 263 /* set the input value in all remaining rows */ 264 firstRow+=column; 265 lastRow+=column; 266 mask=~mask; 267 for(;;) { 268 *firstRow=(*firstRow&mask)|value; 269 if(firstRow==lastRow) { 270 break; 271 } 272 firstRow+=columns; 273 } 274 } 275 276 U_CAPI uint32_t U_EXPORT2 277 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { 278 uint32_t *row; 279 UPropsVectors *ncpv; 280 281 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { 282 return 0; 283 } 284 ncpv=(UPropsVectors *)pv; 285 row=_findRow(ncpv, c); 286 return row[2+column]; 287 } 288 289 U_CAPI uint32_t * U_EXPORT2 290 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, 291 UChar32 *pRangeStart, UChar32 *pRangeEnd) { 292 uint32_t *row; 293 int32_t columns; 294 295 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { 296 return NULL; 297 } 298 299 columns=pv->columns; 300 row=pv->v+rowIndex*columns; 301 if(pRangeStart!=NULL) { 302 *pRangeStart=(UChar32)row[0]; 303 } 304 if(pRangeEnd!=NULL) { 305 *pRangeEnd=(UChar32)row[1]-1; 306 } 307 return row+2; 308 } 309 310 static int32_t U_CALLCONV 311 upvec_compareRows(const void *context, const void *l, const void *r) { 312 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; 313 const UPropsVectors *pv=(const UPropsVectors *)context; 314 int32_t i, count, columns; 315 316 count=columns=pv->columns; /* includes start/limit columns */ 317 318 /* start comparing after start/limit but wrap around to them */ 319 i=2; 320 do { 321 if(left[i]!=right[i]) { 322 return left[i]<right[i] ? -1 : 1; 323 } 324 if(++i==columns) { 325 i=0; 326 } 327 } while(--count>0); 328 329 return 0; 330 } 331 332 U_CAPI void U_EXPORT2 333 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { 334 uint32_t *row; 335 int32_t i, columns, valueColumns, rows, count; 336 UChar32 start, limit; 337 338 /* argument checking */ 339 if(U_FAILURE(*pErrorCode)) { 340 return; 341 } 342 if(handler==NULL) { 343 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 344 return; 345 } 346 if(pv->isCompacted) { 347 return; 348 } 349 350 /* Set the flag now: Sorting and compacting destroys the builder data structure. */ 351 pv->isCompacted=TRUE; 352 353 rows=pv->rows; 354 columns=pv->columns; 355 valueColumns=columns-2; /* not counting start & limit */ 356 357 /* sort the properties vectors to find unique vector values */ 358 uprv_sortArray(pv->v, rows, columns*4, 359 upvec_compareRows, pv, FALSE, pErrorCode); 360 if(U_FAILURE(*pErrorCode)) { 361 return; 362 } 363 364 /* 365 * Find and set the special values. 366 * This has to do almost the same work as the compaction below, 367 * to find the indexes where the special-value rows will move. 368 */ 369 row=pv->v; 370 count=-valueColumns; 371 for(i=0; i<rows; ++i) { 372 start=(UChar32)row[0]; 373 374 /* count a new values vector if it is different from the current one */ 375 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { 376 count+=valueColumns; 377 } 378 379 if(start>=UPVEC_FIRST_SPECIAL_CP) { 380 handler(context, start, start, count, row+2, valueColumns, pErrorCode); 381 if(U_FAILURE(*pErrorCode)) { 382 return; 383 } 384 } 385 386 row+=columns; 387 } 388 389 /* count is at the beginning of the last vector, add valueColumns to include that last vector */ 390 count+=valueColumns; 391 392 /* Call the handler once more to signal the start of delivering real values. */ 393 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, 394 count, row-valueColumns, valueColumns, pErrorCode); 395 if(U_FAILURE(*pErrorCode)) { 396 return; 397 } 398 399 /* 400 * Move vector contents up to a contiguous array with only unique 401 * vector values, and call the handler function for each vector. 402 * 403 * This destroys the Properties Vector structure and replaces it 404 * with an array of just vector values. 405 */ 406 row=pv->v; 407 count=-valueColumns; 408 for(i=0; i<rows; ++i) { 409 /* fetch these first before memmove() may overwrite them */ 410 start=(UChar32)row[0]; 411 limit=(UChar32)row[1]; 412 413 /* add a new values vector if it is different from the current one */ 414 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { 415 count+=valueColumns; 416 uprv_memmove(pv->v+count, row+2, valueColumns*4); 417 } 418 419 if(start<UPVEC_FIRST_SPECIAL_CP) { 420 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); 421 if(U_FAILURE(*pErrorCode)) { 422 return; 423 } 424 } 425 426 row+=columns; 427 } 428 429 /* count is at the beginning of the last vector, add one to include that last vector */ 430 pv->rows=count/valueColumns+1; 431 } 432 433 U_CAPI const uint32_t * U_EXPORT2 434 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { 435 if(!pv->isCompacted) { 436 return NULL; 437 } 438 if(pRows!=NULL) { 439 *pRows=pv->rows; 440 } 441 if(pColumns!=NULL) { 442 *pColumns=pv->columns-2; 443 } 444 return pv->v; 445 } 446 447 U_CAPI uint32_t * U_EXPORT2 448 upvec_cloneArray(const UPropsVectors *pv, 449 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { 450 uint32_t *clonedArray; 451 int32_t byteLength; 452 453 if(U_FAILURE(*pErrorCode)) { 454 return NULL; 455 } 456 if(!pv->isCompacted) { 457 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 458 return NULL; 459 } 460 byteLength=pv->rows*(pv->columns-2)*4; 461 clonedArray=(uint32_t *)uprv_malloc(byteLength); 462 if(clonedArray==NULL) { 463 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 464 return NULL; 465 } 466 uprv_memcpy(clonedArray, pv->v, byteLength); 467 if(pRows!=NULL) { 468 *pRows=pv->rows; 469 } 470 if(pColumns!=NULL) { 471 *pColumns=pv->columns-2; 472 } 473 return clonedArray; 474 } 475 476 U_CAPI UTrie2 * U_EXPORT2 477 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { 478 UPVecToUTrie2Context toUTrie2={ NULL }; 479 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); 480 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); 481 if(U_FAILURE(*pErrorCode)) { 482 utrie2_close(toUTrie2.trie); 483 toUTrie2.trie=NULL; 484 } 485 return toUTrie2.trie; 486 } 487 488 /* 489 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts 490 * some 16-bit field and builds and returns a UTrie2. 491 */ 492 493 U_CAPI void U_CALLCONV 494 upvec_compactToUTrie2Handler(void *context, 495 UChar32 start, UChar32 end, 496 int32_t rowIndex, uint32_t *row, int32_t columns, 497 UErrorCode *pErrorCode) { 498 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; 499 if(start<UPVEC_FIRST_SPECIAL_CP) { 500 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); 501 } else { 502 switch(start) { 503 case UPVEC_INITIAL_VALUE_CP: 504 toUTrie2->initialValue=rowIndex; 505 break; 506 case UPVEC_ERROR_VALUE_CP: 507 toUTrie2->errorValue=rowIndex; 508 break; 509 case UPVEC_START_REAL_VALUES_CP: 510 toUTrie2->maxValue=rowIndex; 511 if(rowIndex>0xffff) { 512 /* too many rows for a 16-bit trie */ 513 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 514 } else { 515 toUTrie2->trie=utrie2_open(toUTrie2->initialValue, 516 toUTrie2->errorValue, pErrorCode); 517 } 518 break; 519 default: 520 break; 521 } 522 } 523 } 524