1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: propsvec.c 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002feb22 16 * created by: Markus W. Scherer 17 * 18 * Store bits (Unicode character properties) in bit set vectors. 19 */ 20 21 #include <stdlib.h> 22 #include "unicode/utypes.h" 23 #include "cmemory.h" 24 #include "utrie.h" 25 #include "utrie2.h" 26 #include "uarrsort.h" 27 #include "propsvec.h" 28 #include "uassert.h" 29 30 struct UPropsVectors { 31 uint32_t *v; 32 int32_t columns; /* number of columns, plus two for start & limit values */ 33 int32_t maxRows; 34 int32_t rows; 35 int32_t prevRow; /* search optimization: remember last row seen */ 36 UBool isCompacted; 37 }; 38 39 #define UPVEC_INITIAL_ROWS (1<<12) 40 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) 41 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) 42 43 U_CAPI UPropsVectors * U_EXPORT2 44 upvec_open(int32_t columns, UErrorCode *pErrorCode) { 45 UPropsVectors *pv; 46 uint32_t *v, *row; 47 uint32_t cp; 48 49 if(U_FAILURE(*pErrorCode)) { 50 return NULL; 51 } 52 if(columns<1) { 53 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 54 return NULL; 55 } 56 columns+=2; /* count range start and limit columns */ 57 58 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); 59 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); 60 if(pv==NULL || v==NULL) { 61 uprv_free(pv); 62 uprv_free(v); 63 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 64 return NULL; 65 } 66 uprv_memset(pv, 0, sizeof(UPropsVectors)); 67 pv->v=v; 68 pv->columns=columns; 69 pv->maxRows=UPVEC_INITIAL_ROWS; 70 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); 71 72 /* set the all-Unicode row and the special-value rows */ 73 row=pv->v; 74 uprv_memset(row, 0, pv->rows*columns*4); 75 row[0]=0; 76 row[1]=0x110000; 77 row+=columns; 78 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { 79 row[0]=cp; 80 row[1]=cp+1; 81 row+=columns; 82 } 83 return pv; 84 } 85 86 U_CAPI void U_EXPORT2 87 upvec_close(UPropsVectors *pv) { 88 if(pv!=NULL) { 89 uprv_free(pv->v); 90 uprv_free(pv); 91 } 92 } 93 94 static uint32_t * 95 _findRow(UPropsVectors *pv, UChar32 rangeStart) { 96 uint32_t *row; 97 int32_t columns, i, start, limit, prevRow; 98 99 columns=pv->columns; 100 limit=pv->rows; 101 prevRow=pv->prevRow; 102 103 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ 104 row=pv->v+prevRow*columns; 105 if(rangeStart>=(UChar32)row[0]) { 106 if(rangeStart<(UChar32)row[1]) { 107 /* same row as last seen */ 108 return row; 109 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 110 /* next row after the last one */ 111 pv->prevRow=prevRow+1; 112 return row; 113 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 114 /* second row after the last one */ 115 pv->prevRow=prevRow+2; 116 return row; 117 } else if((rangeStart-(UChar32)row[1])<10) { 118 /* we are close, continue looping */ 119 prevRow+=2; 120 do { 121 ++prevRow; 122 row+=columns; 123 } while(rangeStart>=(UChar32)row[1]); 124 pv->prevRow=prevRow; 125 return row; 126 } 127 } else if(rangeStart<(UChar32)pv->v[1]) { 128 /* the very first row */ 129 pv->prevRow=0; 130 return pv->v; 131 } 132 133 /* do a binary search for the start of the range */ 134 start=0; 135 while(start<limit-1) { 136 i=(start+limit)/2; 137 row=pv->v+i*columns; 138 if(rangeStart<(UChar32)row[0]) { 139 limit=i; 140 } else if(rangeStart<(UChar32)row[1]) { 141 pv->prevRow=i; 142 return row; 143 } else { 144 start=i; 145 } 146 } 147 148 /* must be found because all ranges together always cover all of Unicode */ 149 pv->prevRow=start; 150 return pv->v+start*columns; 151 } 152 153 U_CAPI void U_EXPORT2 154 upvec_setValue(UPropsVectors *pv, 155 UChar32 start, UChar32 end, 156 int32_t column, 157 uint32_t value, uint32_t mask, 158 UErrorCode *pErrorCode) { 159 uint32_t *firstRow, *lastRow; 160 int32_t columns; 161 UChar32 limit; 162 UBool splitFirstRow, splitLastRow; 163 164 /* argument checking */ 165 if(U_FAILURE(*pErrorCode)) { 166 return; 167 } 168 if( pv==NULL || 169 start<0 || start>end || end>UPVEC_MAX_CP || 170 column<0 || column>=(pv->columns-2) 171 ) { 172 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 173 return; 174 } 175 if(pv->isCompacted) { 176 *pErrorCode=U_NO_WRITE_PERMISSION; 177 return; 178 } 179 limit=end+1; 180 181 /* initialize */ 182 columns=pv->columns; 183 column+=2; /* skip range start and limit columns */ 184 value&=mask; 185 186 /* find the rows whose ranges overlap with the input range */ 187 188 /* find the first and last rows, always successful */ 189 firstRow=_findRow(pv, start); 190 lastRow=_findRow(pv, end); 191 192 /* 193 * Rows need to be split if they partially overlap with the 194 * input range (only possible for the first and last rows) 195 * and if their value differs from the input value. 196 */ 197 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); 198 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); 199 200 /* split first/last rows if necessary */ 201 if(splitFirstRow || splitLastRow) { 202 int32_t count, rows; 203 204 rows=pv->rows; 205 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { 206 uint32_t *newVectors; 207 int32_t newMaxRows; 208 209 if(pv->maxRows<UPVEC_MEDIUM_ROWS) { 210 newMaxRows=UPVEC_MEDIUM_ROWS; 211 } else if(pv->maxRows<UPVEC_MAX_ROWS) { 212 newMaxRows=UPVEC_MAX_ROWS; 213 } else { 214 /* Implementation bug, or UPVEC_MAX_ROWS too low. */ 215 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 216 return; 217 } 218 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); 219 if(newVectors==NULL) { 220 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 221 return; 222 } 223 uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4); 224 firstRow=newVectors+(firstRow-pv->v); 225 lastRow=newVectors+(lastRow-pv->v); 226 uprv_free(pv->v); 227 pv->v=newVectors; 228 pv->maxRows=newMaxRows; 229 } 230 231 /* count the number of row cells to move after the last row, and move them */ 232 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); 233 if(count>0) { 234 uprv_memmove( 235 lastRow+(1+splitFirstRow+splitLastRow)*columns, 236 lastRow+columns, 237 count*4); 238 } 239 pv->rows=rows+splitFirstRow+splitLastRow; 240 241 /* split the first row, and move the firstRow pointer to the second part */ 242 if(splitFirstRow) { 243 /* copy all affected rows up one and move the lastRow pointer */ 244 count = (int32_t)((lastRow-firstRow)+columns); 245 uprv_memmove(firstRow+columns, firstRow, (size_t)count*4); 246 lastRow+=columns; 247 248 /* split the range and move the firstRow pointer */ 249 firstRow[1]=firstRow[columns]=(uint32_t)start; 250 firstRow+=columns; 251 } 252 253 /* split the last row */ 254 if(splitLastRow) { 255 /* copy the last row data */ 256 uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4); 257 258 /* split the range and move the firstRow pointer */ 259 lastRow[1]=lastRow[columns]=(uint32_t)limit; 260 } 261 } 262 263 /* set the "row last seen" to the last row for the range */ 264 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); 265 266 /* set the input value in all remaining rows */ 267 firstRow+=column; 268 lastRow+=column; 269 mask=~mask; 270 for(;;) { 271 *firstRow=(*firstRow&mask)|value; 272 if(firstRow==lastRow) { 273 break; 274 } 275 firstRow+=columns; 276 } 277 } 278 279 U_CAPI uint32_t U_EXPORT2 280 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { 281 uint32_t *row; 282 UPropsVectors *ncpv; 283 284 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { 285 return 0; 286 } 287 ncpv=(UPropsVectors *)pv; 288 row=_findRow(ncpv, c); 289 return row[2+column]; 290 } 291 292 U_CAPI uint32_t * U_EXPORT2 293 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, 294 UChar32 *pRangeStart, UChar32 *pRangeEnd) { 295 uint32_t *row; 296 int32_t columns; 297 298 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { 299 return NULL; 300 } 301 302 columns=pv->columns; 303 row=pv->v+rowIndex*columns; 304 if(pRangeStart!=NULL) { 305 *pRangeStart=(UChar32)row[0]; 306 } 307 if(pRangeEnd!=NULL) { 308 *pRangeEnd=(UChar32)row[1]-1; 309 } 310 return row+2; 311 } 312 313 static int32_t U_CALLCONV 314 upvec_compareRows(const void *context, const void *l, const void *r) { 315 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; 316 const UPropsVectors *pv=(const UPropsVectors *)context; 317 int32_t i, count, columns; 318 319 count=columns=pv->columns; /* includes start/limit columns */ 320 321 /* start comparing after start/limit but wrap around to them */ 322 i=2; 323 do { 324 if(left[i]!=right[i]) { 325 return left[i]<right[i] ? -1 : 1; 326 } 327 if(++i==columns) { 328 i=0; 329 } 330 } while(--count>0); 331 332 return 0; 333 } 334 335 U_CAPI void U_EXPORT2 336 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { 337 uint32_t *row; 338 int32_t i, columns, valueColumns, rows, count; 339 UChar32 start, limit; 340 341 /* argument checking */ 342 if(U_FAILURE(*pErrorCode)) { 343 return; 344 } 345 if(handler==NULL) { 346 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 347 return; 348 } 349 if(pv->isCompacted) { 350 return; 351 } 352 353 /* Set the flag now: Sorting and compacting destroys the builder data structure. */ 354 pv->isCompacted=TRUE; 355 356 rows=pv->rows; 357 columns=pv->columns; 358 U_ASSERT(columns>=3); /* upvec_open asserts this */ 359 valueColumns=columns-2; /* not counting start & limit */ 360 361 /* sort the properties vectors to find unique vector values */ 362 uprv_sortArray(pv->v, rows, columns*4, 363 upvec_compareRows, pv, FALSE, pErrorCode); 364 if(U_FAILURE(*pErrorCode)) { 365 return; 366 } 367 368 /* 369 * Find and set the special values. 370 * This has to do almost the same work as the compaction below, 371 * to find the indexes where the special-value rows will move. 372 */ 373 row=pv->v; 374 count=-valueColumns; 375 for(i=0; i<rows; ++i) { 376 start=(UChar32)row[0]; 377 378 /* count a new values vector if it is different from the current one */ 379 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { 380 count+=valueColumns; 381 } 382 383 if(start>=UPVEC_FIRST_SPECIAL_CP) { 384 handler(context, start, start, count, row+2, valueColumns, pErrorCode); 385 if(U_FAILURE(*pErrorCode)) { 386 return; 387 } 388 } 389 390 row+=columns; 391 } 392 393 /* count is at the beginning of the last vector, add valueColumns to include that last vector */ 394 count+=valueColumns; 395 396 /* Call the handler once more to signal the start of delivering real values. */ 397 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, 398 count, row-valueColumns, valueColumns, pErrorCode); 399 if(U_FAILURE(*pErrorCode)) { 400 return; 401 } 402 403 /* 404 * Move vector contents up to a contiguous array with only unique 405 * vector values, and call the handler function for each vector. 406 * 407 * This destroys the Properties Vector structure and replaces it 408 * with an array of just vector values. 409 */ 410 row=pv->v; 411 count=-valueColumns; 412 for(i=0; i<rows; ++i) { 413 /* fetch these first before memmove() may overwrite them */ 414 start=(UChar32)row[0]; 415 limit=(UChar32)row[1]; 416 417 /* add a new values vector if it is different from the current one */ 418 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { 419 count+=valueColumns; 420 uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4); 421 } 422 423 if(start<UPVEC_FIRST_SPECIAL_CP) { 424 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); 425 if(U_FAILURE(*pErrorCode)) { 426 return; 427 } 428 } 429 430 row+=columns; 431 } 432 433 /* count is at the beginning of the last vector, add one to include that last vector */ 434 pv->rows=count/valueColumns+1; 435 } 436 437 U_CAPI const uint32_t * U_EXPORT2 438 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { 439 if(!pv->isCompacted) { 440 return NULL; 441 } 442 if(pRows!=NULL) { 443 *pRows=pv->rows; 444 } 445 if(pColumns!=NULL) { 446 *pColumns=pv->columns-2; 447 } 448 return pv->v; 449 } 450 451 U_CAPI uint32_t * U_EXPORT2 452 upvec_cloneArray(const UPropsVectors *pv, 453 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { 454 uint32_t *clonedArray; 455 int32_t byteLength; 456 457 if(U_FAILURE(*pErrorCode)) { 458 return NULL; 459 } 460 if(!pv->isCompacted) { 461 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 462 return NULL; 463 } 464 byteLength=pv->rows*(pv->columns-2)*4; 465 clonedArray=(uint32_t *)uprv_malloc(byteLength); 466 if(clonedArray==NULL) { 467 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 468 return NULL; 469 } 470 uprv_memcpy(clonedArray, pv->v, byteLength); 471 if(pRows!=NULL) { 472 *pRows=pv->rows; 473 } 474 if(pColumns!=NULL) { 475 *pColumns=pv->columns-2; 476 } 477 return clonedArray; 478 } 479 480 U_CAPI UTrie2 * U_EXPORT2 481 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { 482 UPVecToUTrie2Context toUTrie2={ NULL, 0, 0, 0 }; 483 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); 484 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); 485 if(U_FAILURE(*pErrorCode)) { 486 utrie2_close(toUTrie2.trie); 487 toUTrie2.trie=NULL; 488 } 489 return toUTrie2.trie; 490 } 491 492 /* 493 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts 494 * some 16-bit field and builds and returns a UTrie2. 495 */ 496 497 U_CAPI void U_CALLCONV 498 upvec_compactToUTrie2Handler(void *context, 499 UChar32 start, UChar32 end, 500 int32_t rowIndex, uint32_t *row, int32_t columns, 501 UErrorCode *pErrorCode) { 502 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; 503 if(start<UPVEC_FIRST_SPECIAL_CP) { 504 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); 505 } else { 506 switch(start) { 507 case UPVEC_INITIAL_VALUE_CP: 508 toUTrie2->initialValue=rowIndex; 509 break; 510 case UPVEC_ERROR_VALUE_CP: 511 toUTrie2->errorValue=rowIndex; 512 break; 513 case UPVEC_START_REAL_VALUES_CP: 514 toUTrie2->maxValue=rowIndex; 515 if(rowIndex>0xffff) { 516 /* too many rows for a 16-bit trie */ 517 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 518 } else { 519 toUTrie2->trie=utrie2_open(toUTrie2->initialValue, 520 toUTrie2->errorValue, pErrorCode); 521 } 522 break; 523 default: 524 break; 525 } 526 } 527 } 528