1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: propsvec.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002feb22 14 * created by: Markus W. Scherer 15 * 16 * Store bits (Unicode character properties) in bit set vectors. 17 */ 18 19 #include <stdlib.h> 20 #include "unicode/utypes.h" 21 #include "cmemory.h" 22 #include "utrie.h" 23 #include "utrie2.h" 24 #include "uarrsort.h" 25 #include "propsvec.h" 26 #include "uassert.h" 27 28 struct UPropsVectors { 29 uint32_t *v; 30 int32_t columns; /* number of columns, plus two for start & limit values */ 31 int32_t maxRows; 32 int32_t rows; 33 int32_t prevRow; /* search optimization: remember last row seen */ 34 UBool isCompacted; 35 }; 36 37 #define UPVEC_INITIAL_ROWS (1<<12) 38 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) 39 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) 40 41 U_CAPI UPropsVectors * U_EXPORT2 42 upvec_open(int32_t columns, UErrorCode *pErrorCode) { 43 UPropsVectors *pv; 44 uint32_t *v, *row; 45 uint32_t cp; 46 47 if(U_FAILURE(*pErrorCode)) { 48 return NULL; 49 } 50 if(columns<1) { 51 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 52 return NULL; 53 } 54 columns+=2; /* count range start and limit columns */ 55 56 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); 57 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); 58 if(pv==NULL || v==NULL) { 59 uprv_free(pv); 60 uprv_free(v); 61 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 62 return NULL; 63 } 64 uprv_memset(pv, 0, sizeof(UPropsVectors)); 65 pv->v=v; 66 pv->columns=columns; 67 pv->maxRows=UPVEC_INITIAL_ROWS; 68 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); 69 70 /* set the all-Unicode row and the special-value rows */ 71 row=pv->v; 72 uprv_memset(row, 0, pv->rows*columns*4); 73 row[0]=0; 74 row[1]=0x110000; 75 row+=columns; 76 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { 77 row[0]=cp; 78 row[1]=cp+1; 79 row+=columns; 80 } 81 return pv; 82 } 83 84 U_CAPI void U_EXPORT2 85 upvec_close(UPropsVectors *pv) { 86 if(pv!=NULL) { 87 uprv_free(pv->v); 88 uprv_free(pv); 89 } 90 } 91 92 static uint32_t * 93 _findRow(UPropsVectors *pv, UChar32 rangeStart) { 94 uint32_t *row; 95 int32_t columns, i, start, limit, prevRow; 96 97 columns=pv->columns; 98 limit=pv->rows; 99 prevRow=pv->prevRow; 100 101 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ 102 row=pv->v+prevRow*columns; 103 if(rangeStart>=(UChar32)row[0]) { 104 if(rangeStart<(UChar32)row[1]) { 105 /* same row as last seen */ 106 return row; 107 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 108 /* next row after the last one */ 109 pv->prevRow=prevRow+1; 110 return row; 111 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 112 /* second row after the last one */ 113 pv->prevRow=prevRow+2; 114 return row; 115 } else if((rangeStart-(UChar32)row[1])<10) { 116 /* we are close, continue looping */ 117 prevRow+=2; 118 do { 119 ++prevRow; 120 row+=columns; 121 } while(rangeStart>=(UChar32)row[1]); 122 pv->prevRow=prevRow; 123 return row; 124 } 125 } else if(rangeStart<(UChar32)pv->v[1]) { 126 /* the very first row */ 127 pv->prevRow=0; 128 return pv->v; 129 } 130 131 /* do a binary search for the start of the range */ 132 start=0; 133 while(start<limit-1) { 134 i=(start+limit)/2; 135 row=pv->v+i*columns; 136 if(rangeStart<(UChar32)row[0]) { 137 limit=i; 138 } else if(rangeStart<(UChar32)row[1]) { 139 pv->prevRow=i; 140 return row; 141 } else { 142 start=i; 143 } 144 } 145 146 /* must be found because all ranges together always cover all of Unicode */ 147 pv->prevRow=start; 148 return pv->v+start*columns; 149 } 150 151 U_CAPI void U_EXPORT2 152 upvec_setValue(UPropsVectors *pv, 153 UChar32 start, UChar32 end, 154 int32_t column, 155 uint32_t value, uint32_t mask, 156 UErrorCode *pErrorCode) { 157 uint32_t *firstRow, *lastRow; 158 int32_t columns; 159 UChar32 limit; 160 UBool splitFirstRow, splitLastRow; 161 162 /* argument checking */ 163 if(U_FAILURE(*pErrorCode)) { 164 return; 165 } 166 if( pv==NULL || 167 start<0 || start>end || end>UPVEC_MAX_CP || 168 column<0 || column>=(pv->columns-2) 169 ) { 170 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 171 return; 172 } 173 if(pv->isCompacted) { 174 *pErrorCode=U_NO_WRITE_PERMISSION; 175 return; 176 } 177 limit=end+1; 178 179 /* initialize */ 180 columns=pv->columns; 181 column+=2; /* skip range start and limit columns */ 182 value&=mask; 183 184 /* find the rows whose ranges overlap with the input range */ 185 186 /* find the first and last rows, always successful */ 187 firstRow=_findRow(pv, start); 188 lastRow=_findRow(pv, end); 189 190 /* 191 * Rows need to be split if they partially overlap with the 192 * input range (only possible for the first and last rows) 193 * and if their value differs from the input value. 194 */ 195 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); 196 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); 197 198 /* split first/last rows if necessary */ 199 if(splitFirstRow || splitLastRow) { 200 int32_t count, rows; 201 202 rows=pv->rows; 203 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { 204 uint32_t *newVectors; 205 int32_t newMaxRows; 206 207 if(pv->maxRows<UPVEC_MEDIUM_ROWS) { 208 newMaxRows=UPVEC_MEDIUM_ROWS; 209 } else if(pv->maxRows<UPVEC_MAX_ROWS) { 210 newMaxRows=UPVEC_MAX_ROWS; 211 } else { 212 /* Implementation bug, or UPVEC_MAX_ROWS too low. */ 213 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 214 return; 215 } 216 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); 217 if(newVectors==NULL) { 218 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 219 return; 220 } 221 uprv_memcpy(newVectors, pv->v, rows*columns*4); 222 firstRow=newVectors+(firstRow-pv->v); 223 lastRow=newVectors+(lastRow-pv->v); 224 uprv_free(pv->v); 225 pv->v=newVectors; 226 pv->maxRows=newMaxRows; 227 } 228 229 /* count the number of row cells to move after the last row, and move them */ 230 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); 231 if(count>0) { 232 uprv_memmove( 233 lastRow+(1+splitFirstRow+splitLastRow)*columns, 234 lastRow+columns, 235 count*4); 236 } 237 pv->rows=rows+splitFirstRow+splitLastRow; 238 239 /* split the first row, and move the firstRow pointer to the second part */ 240 if(splitFirstRow) { 241 /* copy all affected rows up one and move the lastRow pointer */ 242 count = (int32_t)((lastRow-firstRow)+columns); 243 uprv_memmove(firstRow+columns, firstRow, count*4); 244 lastRow+=columns; 245 246 /* split the range and move the firstRow pointer */ 247 firstRow[1]=firstRow[columns]=(uint32_t)start; 248 firstRow+=columns; 249 } 250 251 /* split the last row */ 252 if(splitLastRow) { 253 /* copy the last row data */ 254 uprv_memcpy(lastRow+columns, lastRow, columns*4); 255 256 /* split the range and move the firstRow pointer */ 257 lastRow[1]=lastRow[columns]=(uint32_t)limit; 258 } 259 } 260 261 /* set the "row last seen" to the last row for the range */ 262 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); 263 264 /* set the input value in all remaining rows */ 265 firstRow+=column; 266 lastRow+=column; 267 mask=~mask; 268 for(;;) { 269 *firstRow=(*firstRow&mask)|value; 270 if(firstRow==lastRow) { 271 break; 272 } 273 firstRow+=columns; 274 } 275 } 276 277 U_CAPI uint32_t U_EXPORT2 278 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { 279 uint32_t *row; 280 UPropsVectors *ncpv; 281 282 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { 283 return 0; 284 } 285 ncpv=(UPropsVectors *)pv; 286 row=_findRow(ncpv, c); 287 return row[2+column]; 288 } 289 290 U_CAPI uint32_t * U_EXPORT2 291 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, 292 UChar32 *pRangeStart, UChar32 *pRangeEnd) { 293 uint32_t *row; 294 int32_t columns; 295 296 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { 297 return NULL; 298 } 299 300 columns=pv->columns; 301 row=pv->v+rowIndex*columns; 302 if(pRangeStart!=NULL) { 303 *pRangeStart=(UChar32)row[0]; 304 } 305 if(pRangeEnd!=NULL) { 306 *pRangeEnd=(UChar32)row[1]-1; 307 } 308 return row+2; 309 } 310 311 static int32_t U_CALLCONV 312 upvec_compareRows(const void *context, const void *l, const void *r) { 313 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; 314 const UPropsVectors *pv=(const UPropsVectors *)context; 315 int32_t i, count, columns; 316 317 count=columns=pv->columns; /* includes start/limit columns */ 318 319 /* start comparing after start/limit but wrap around to them */ 320 i=2; 321 do { 322 if(left[i]!=right[i]) { 323 return left[i]<right[i] ? -1 : 1; 324 } 325 if(++i==columns) { 326 i=0; 327 } 328 } while(--count>0); 329 330 return 0; 331 } 332 333 U_CAPI void U_EXPORT2 334 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { 335 uint32_t *row; 336 int32_t i, columns, valueColumns, rows, count; 337 UChar32 start, limit; 338 339 /* argument checking */ 340 if(U_FAILURE(*pErrorCode)) { 341 return; 342 } 343 if(handler==NULL) { 344 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 345 return; 346 } 347 if(pv->isCompacted) { 348 return; 349 } 350 351 /* Set the flag now: Sorting and compacting destroys the builder data structure. */ 352 pv->isCompacted=TRUE; 353 354 rows=pv->rows; 355 columns=pv->columns; 356 U_ASSERT(columns>=3); /* upvec_open asserts this */ 357 valueColumns=columns-2; /* not counting start & limit */ 358 359 /* sort the properties vectors to find unique vector values */ 360 uprv_sortArray(pv->v, rows, columns*4, 361 upvec_compareRows, pv, FALSE, pErrorCode); 362 if(U_FAILURE(*pErrorCode)) { 363 return; 364 } 365 366 /* 367 * Find and set the special values. 368 * This has to do almost the same work as the compaction below, 369 * to find the indexes where the special-value rows will move. 370 */ 371 row=pv->v; 372 count=-valueColumns; 373 for(i=0; i<rows; ++i) { 374 start=(UChar32)row[0]; 375 376 /* count a new values vector if it is different from the current one */ 377 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { 378 count+=valueColumns; 379 } 380 381 if(start>=UPVEC_FIRST_SPECIAL_CP) { 382 handler(context, start, start, count, row+2, valueColumns, pErrorCode); 383 if(U_FAILURE(*pErrorCode)) { 384 return; 385 } 386 } 387 388 row+=columns; 389 } 390 391 /* count is at the beginning of the last vector, add valueColumns to include that last vector */ 392 count+=valueColumns; 393 394 /* Call the handler once more to signal the start of delivering real values. */ 395 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, 396 count, row-valueColumns, valueColumns, pErrorCode); 397 if(U_FAILURE(*pErrorCode)) { 398 return; 399 } 400 401 /* 402 * Move vector contents up to a contiguous array with only unique 403 * vector values, and call the handler function for each vector. 404 * 405 * This destroys the Properties Vector structure and replaces it 406 * with an array of just vector values. 407 */ 408 row=pv->v; 409 count=-valueColumns; 410 for(i=0; i<rows; ++i) { 411 /* fetch these first before memmove() may overwrite them */ 412 start=(UChar32)row[0]; 413 limit=(UChar32)row[1]; 414 415 /* add a new values vector if it is different from the current one */ 416 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { 417 count+=valueColumns; 418 uprv_memmove(pv->v+count, row+2, valueColumns*4); 419 } 420 421 if(start<UPVEC_FIRST_SPECIAL_CP) { 422 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); 423 if(U_FAILURE(*pErrorCode)) { 424 return; 425 } 426 } 427 428 row+=columns; 429 } 430 431 /* count is at the beginning of the last vector, add one to include that last vector */ 432 pv->rows=count/valueColumns+1; 433 } 434 435 U_CAPI const uint32_t * U_EXPORT2 436 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { 437 if(!pv->isCompacted) { 438 return NULL; 439 } 440 if(pRows!=NULL) { 441 *pRows=pv->rows; 442 } 443 if(pColumns!=NULL) { 444 *pColumns=pv->columns-2; 445 } 446 return pv->v; 447 } 448 449 U_CAPI uint32_t * U_EXPORT2 450 upvec_cloneArray(const UPropsVectors *pv, 451 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { 452 uint32_t *clonedArray; 453 int32_t byteLength; 454 455 if(U_FAILURE(*pErrorCode)) { 456 return NULL; 457 } 458 if(!pv->isCompacted) { 459 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 460 return NULL; 461 } 462 byteLength=pv->rows*(pv->columns-2)*4; 463 clonedArray=(uint32_t *)uprv_malloc(byteLength); 464 if(clonedArray==NULL) { 465 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 466 return NULL; 467 } 468 uprv_memcpy(clonedArray, pv->v, byteLength); 469 if(pRows!=NULL) { 470 *pRows=pv->rows; 471 } 472 if(pColumns!=NULL) { 473 *pColumns=pv->columns-2; 474 } 475 return clonedArray; 476 } 477 478 U_CAPI UTrie2 * U_EXPORT2 479 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { 480 UPVecToUTrie2Context toUTrie2={ NULL }; 481 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); 482 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); 483 if(U_FAILURE(*pErrorCode)) { 484 utrie2_close(toUTrie2.trie); 485 toUTrie2.trie=NULL; 486 } 487 return toUTrie2.trie; 488 } 489 490 /* 491 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts 492 * some 16-bit field and builds and returns a UTrie2. 493 */ 494 495 U_CAPI void U_CALLCONV 496 upvec_compactToUTrie2Handler(void *context, 497 UChar32 start, UChar32 end, 498 int32_t rowIndex, uint32_t *row, int32_t columns, 499 UErrorCode *pErrorCode) { 500 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; 501 if(start<UPVEC_FIRST_SPECIAL_CP) { 502 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); 503 } else { 504 switch(start) { 505 case UPVEC_INITIAL_VALUE_CP: 506 toUTrie2->initialValue=rowIndex; 507 break; 508 case UPVEC_ERROR_VALUE_CP: 509 toUTrie2->errorValue=rowIndex; 510 break; 511 case UPVEC_START_REAL_VALUES_CP: 512 toUTrie2->maxValue=rowIndex; 513 if(rowIndex>0xffff) { 514 /* too many rows for a 16-bit trie */ 515 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 516 } else { 517 toUTrie2->trie=utrie2_open(toUTrie2->initialValue, 518 toUTrie2->errorValue, pErrorCode); 519 } 520 break; 521 default: 522 break; 523 } 524 } 525 } 526