1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: unorm_it.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jan21 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/uiter.h" 22 #include "unicode/unorm.h" 23 #include "unicode/utf.h" 24 #include "unorm_it.h" 25 #include "cmemory.h" 26 27 /* UNormIterator ------------------------------------------------------------ */ 28 29 enum { 30 INITIAL_CAPACITY=100 31 }; 32 33 struct UNormIterator { 34 UCharIterator api; 35 UCharIterator *iter; 36 37 /* 38 * chars and states either use the static buffers 39 * or are allocated in the same memory block 40 * 41 * They are parallel arrays with states[] holding the getState() values 42 * from normalization boundaries, and UITER_NO_STATE in between. 43 */ 44 UChar *chars; 45 uint32_t *states; 46 47 /* 48 * api.start: first valid character & state in the arrays 49 * api.index: current position 50 * api.limit: one past the last valid character in chars[], but states[limit] is valid 51 * capacity: length of allocated arrays 52 */ 53 int32_t capacity; 54 55 /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */ 56 uint32_t state; 57 58 /* there are UChars available before start or after limit? */ 59 UBool hasPrevious, hasNext, isStackAllocated; 60 61 UNormalizationMode mode; 62 63 UChar charsBuffer[INITIAL_CAPACITY]; 64 uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */ 65 }; 66 67 static void 68 initIndexes(UNormIterator *uni, UCharIterator *iter) { 69 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 70 UCharIterator *api=&uni->api; 71 72 if(!iter->hasPrevious(iter)) { 73 /* set indexes to the beginning of the arrays */ 74 api->start=api->index=api->limit=0; 75 uni->hasPrevious=FALSE; 76 uni->hasNext=iter->hasNext(iter); 77 } else if(!iter->hasNext(iter)) { 78 /* set indexes to the end of the arrays */ 79 api->start=api->index=api->limit=uni->capacity; 80 uni->hasNext=FALSE; 81 uni->hasPrevious=iter->hasPrevious(iter); 82 } else { 83 /* set indexes into the middle of the arrays */ 84 api->start=api->index=api->limit=uni->capacity/2; 85 uni->hasPrevious=uni->hasNext=TRUE; 86 } 87 } 88 89 static UBool 90 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) { 91 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 92 UCharIterator *api=&uni->api; 93 94 uint32_t *states; 95 UChar *chars; 96 int32_t start, limit; 97 98 states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2); 99 if(states==NULL) { 100 return FALSE; 101 } 102 103 chars=(UChar *)(states+(capacity+1)); 104 uni->capacity=capacity; 105 106 start=api->start; 107 limit=api->limit; 108 109 if(addAtStart) { 110 /* copy old contents to the end of the new arrays */ 111 int32_t delta; 112 113 delta=capacity-uni->capacity; 114 uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4); 115 uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4); 116 117 api->start=start+delta; 118 api->index+=delta; 119 api->limit=limit+delta; 120 } else { 121 /* copy old contents to the beginning of the new arrays */ 122 uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4); 123 uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4); 124 } 125 126 uni->chars=chars; 127 uni->states=states; 128 129 return TRUE; 130 } 131 132 static void 133 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) { 134 /* move array contents up to make room */ 135 int32_t srcIndex, destIndex, limit; 136 137 limit=api->limit; 138 srcIndex=delta; 139 if(srcIndex>api->start) { 140 /* look for a position in the arrays with a known state */ 141 while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) { 142 ++srcIndex; 143 } 144 } 145 146 /* now actually move the array contents */ 147 api->start=destIndex=0; 148 while(srcIndex<limit) { 149 chars[destIndex]=chars[srcIndex]; 150 states[destIndex++]=states[srcIndex++]; 151 } 152 153 /* copy states[limit] as well! */ 154 states[destIndex]=states[srcIndex]; 155 156 api->limit=destIndex; 157 } 158 159 static void 160 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) { 161 /* move array contents up to make room */ 162 int32_t srcIndex, destIndex, start; 163 164 start=api->start; 165 destIndex=((UNormIterator *)api)->capacity; 166 srcIndex=destIndex-delta; 167 if(srcIndex<api->limit) { 168 /* look for a position in the arrays with a known state */ 169 while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) { 170 --srcIndex; 171 } 172 } 173 174 /* now actually move the array contents */ 175 api->limit=destIndex; 176 177 /* copy states[limit] as well! */ 178 states[destIndex]=states[srcIndex]; 179 180 while(srcIndex>start) { 181 chars[--destIndex]=chars[--srcIndex]; 182 states[destIndex]=states[srcIndex]; 183 } 184 185 api->start=destIndex; 186 } 187 188 /* normalize forward from the limit, assume hasNext is true */ 189 static UBool 190 readNext(UNormIterator *uni, UCharIterator *iter) { 191 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 192 UCharIterator *api=&uni->api; 193 194 /* make capacity/4 room at the end of the arrays */ 195 int32_t limit, capacity, room; 196 UErrorCode errorCode; 197 198 limit=api->limit; 199 capacity=uni->capacity; 200 room=capacity/4; 201 if(room>(capacity-limit)) { 202 /* move array contents to make room */ 203 moveContentsTowardStart(api, uni->chars, uni->states, room); 204 api->index=limit=api->limit; 205 uni->hasPrevious=TRUE; 206 } 207 208 /* normalize starting from the limit position */ 209 errorCode=U_ZERO_ERROR; 210 if(uni->state!=uni->states[limit]) { 211 uiter_setState(iter, uni->states[limit], &errorCode); 212 if(U_FAILURE(errorCode)) { 213 uni->state=UITER_NO_STATE; 214 uni->hasNext=FALSE; 215 return FALSE; 216 } 217 } 218 219 room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode); 220 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 221 if(room<=capacity) { 222 /* empty and re-use the arrays */ 223 uni->states[0]=uni->states[limit]; 224 api->start=api->index=api->limit=limit=0; 225 uni->hasPrevious=TRUE; 226 } else { 227 capacity+=room+100; 228 if(!reallocArrays(uni, capacity, FALSE)) { 229 uni->state=UITER_NO_STATE; 230 uni->hasNext=FALSE; 231 return FALSE; 232 } 233 limit=api->limit; 234 } 235 236 errorCode=U_ZERO_ERROR; 237 uiter_setState(iter, uni->states[limit], &errorCode); 238 room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode); 239 } 240 if(U_FAILURE(errorCode) || room==0) { 241 uni->state=UITER_NO_STATE; 242 uni->hasNext=FALSE; 243 return FALSE; 244 } 245 246 /* room>0 */ 247 ++limit; /* leave the known states[limit] alone */ 248 for(--room; room>0; --room) { 249 /* set unknown states for all but the normalization boundaries */ 250 uni->states[limit++]=UITER_NO_STATE; 251 } 252 uni->states[limit]=uni->state=uiter_getState(iter); 253 uni->hasNext=iter->hasNext(iter); 254 api->limit=limit; 255 return TRUE; 256 } 257 258 /* normalize backward from the start, assume hasPrevious is true */ 259 static UBool 260 readPrevious(UNormIterator *uni, UCharIterator *iter) { 261 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 262 UCharIterator *api=&uni->api; 263 264 /* make capacity/4 room at the start of the arrays */ 265 int32_t start, capacity, room; 266 UErrorCode errorCode; 267 268 start=api->start; 269 capacity=uni->capacity; 270 room=capacity/4; 271 if(room>start) { 272 /* move array contents to make room */ 273 moveContentsTowardEnd(api, uni->chars, uni->states, room); 274 api->index=start=api->start; 275 uni->hasNext=TRUE; 276 } 277 278 /* normalize ending at the start position */ 279 errorCode=U_ZERO_ERROR; 280 if(uni->state!=uni->states[start]) { 281 uiter_setState(iter, uni->states[start], &errorCode); 282 if(U_FAILURE(errorCode)) { 283 uni->state=UITER_NO_STATE; 284 uni->hasPrevious=FALSE; 285 return FALSE; 286 } 287 } 288 289 room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode); 290 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 291 if(room<=capacity) { 292 /* empty and re-use the arrays */ 293 uni->states[capacity]=uni->states[start]; 294 api->start=api->index=api->limit=start=capacity; 295 uni->hasNext=TRUE; 296 } else { 297 capacity+=room+100; 298 if(!reallocArrays(uni, capacity, TRUE)) { 299 uni->state=UITER_NO_STATE; 300 uni->hasPrevious=FALSE; 301 return FALSE; 302 } 303 start=api->start; 304 } 305 306 errorCode=U_ZERO_ERROR; 307 uiter_setState(iter, uni->states[start], &errorCode); 308 room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode); 309 } 310 if(U_FAILURE(errorCode) || room==0) { 311 uni->state=UITER_NO_STATE; 312 uni->hasPrevious=FALSE; 313 return FALSE; 314 } 315 316 /* room>0 */ 317 do { 318 /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */ 319 uni->chars[--start]=uni->chars[--room]; 320 /* set unknown states for all but the normalization boundaries */ 321 uni->states[start]=UITER_NO_STATE; 322 } while(room>0); 323 uni->states[start]=uni->state=uiter_getState(iter); 324 uni->hasPrevious=iter->hasPrevious(iter); 325 api->start=start; 326 return TRUE; 327 } 328 329 /* Iterator runtime API functions ------------------------------------------- */ 330 331 static int32_t U_CALLCONV 332 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) { 333 switch(origin) { 334 case UITER_ZERO: 335 case UITER_START: 336 return 0; 337 case UITER_CURRENT: 338 case UITER_LIMIT: 339 case UITER_LENGTH: 340 return UITER_UNKNOWN_INDEX; 341 default: 342 /* not a valid origin */ 343 /* Should never get here! */ 344 return -1; 345 } 346 } 347 348 static int32_t U_CALLCONV 349 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) { 350 UNormIterator *uni=(UNormIterator *)api; 351 UCharIterator *iter=uni->iter; 352 int32_t pos; 353 354 switch(origin) { 355 case UITER_ZERO: 356 case UITER_START: 357 /* restart from the beginning */ 358 if(uni->hasPrevious) { 359 iter->move(iter, 0, UITER_START); 360 api->start=api->index=api->limit=0; 361 uni->states[api->limit]=uni->state=uiter_getState(iter); 362 uni->hasPrevious=FALSE; 363 uni->hasNext=iter->hasNext(iter); 364 } else { 365 /* we already have the beginning of the normalized text */ 366 api->index=api->start; 367 } 368 break; 369 case UITER_CURRENT: 370 break; 371 case UITER_LIMIT: 372 case UITER_LENGTH: 373 /* restart from the end */ 374 if(uni->hasNext) { 375 iter->move(iter, 0, UITER_LIMIT); 376 api->start=api->index=api->limit=uni->capacity; 377 uni->states[api->limit]=uni->state=uiter_getState(iter); 378 uni->hasPrevious=iter->hasPrevious(iter); 379 uni->hasNext=FALSE; 380 } else { 381 /* we already have the end of the normalized text */ 382 api->index=api->limit; 383 } 384 break; 385 default: 386 return -1; /* Error */ 387 } 388 389 /* move relative to the current position by delta normalized UChars */ 390 if(delta==0) { 391 /* nothing to do */ 392 } else if(delta>0) { 393 /* go forward until the requested position is in the buffer */ 394 for(;;) { 395 pos=api->index+delta; /* requested position */ 396 delta=pos-api->limit; /* remainder beyond buffered text */ 397 if(delta<=0) { 398 api->index=pos; /* position reached */ 399 break; 400 } 401 402 /* go to end of buffer and normalize further */ 403 api->index=api->limit; 404 if(!uni->hasNext || !readNext(uni, iter)) { 405 break; /* reached end of text */ 406 } 407 } 408 } else /* delta<0 */ { 409 /* go backward until the requested position is in the buffer */ 410 for(;;) { 411 pos=api->index+delta; /* requested position */ 412 delta=pos-api->start; /* remainder beyond buffered text */ 413 if(delta>=0) { 414 api->index=pos; /* position reached */ 415 break; 416 } 417 418 /* go to start of buffer and normalize further */ 419 api->index=api->start; 420 if(!uni->hasPrevious || !readPrevious(uni, iter)) { 421 break; /* reached start of text */ 422 } 423 } 424 } 425 426 if(api->index==api->start && !uni->hasPrevious) { 427 return 0; 428 } else { 429 return UITER_UNKNOWN_INDEX; 430 } 431 } 432 433 static UBool U_CALLCONV 434 unormIteratorHasNext(UCharIterator *api) { 435 return api->index<api->limit || ((UNormIterator *)api)->hasNext; 436 } 437 438 static UBool U_CALLCONV 439 unormIteratorHasPrevious(UCharIterator *api) { 440 return api->index>api->start || ((UNormIterator *)api)->hasPrevious; 441 } 442 443 static UChar32 U_CALLCONV 444 unormIteratorCurrent(UCharIterator *api) { 445 UNormIterator *uni=(UNormIterator *)api; 446 447 if( api->index<api->limit || 448 (uni->hasNext && readNext(uni, uni->iter)) 449 ) { 450 return uni->chars[api->index]; 451 } else { 452 return U_SENTINEL; 453 } 454 } 455 456 static UChar32 U_CALLCONV 457 unormIteratorNext(UCharIterator *api) { 458 UNormIterator *uni=(UNormIterator *)api; 459 460 if( api->index<api->limit || 461 (uni->hasNext && readNext(uni, uni->iter)) 462 ) { 463 return uni->chars[api->index++]; 464 } else { 465 return U_SENTINEL; 466 } 467 } 468 469 static UChar32 U_CALLCONV 470 unormIteratorPrevious(UCharIterator *api) { 471 UNormIterator *uni=(UNormIterator *)api; 472 473 if( api->index>api->start || 474 (uni->hasPrevious && readPrevious(uni, uni->iter)) 475 ) { 476 return uni->chars[--api->index]; 477 } else { 478 return U_SENTINEL; 479 } 480 } 481 482 static uint32_t U_CALLCONV 483 unormIteratorGetState(const UCharIterator *api) { 484 /* not uni->state because that may not be at api->index */ 485 return ((UNormIterator *)api)->states[api->index]; 486 } 487 488 static void U_CALLCONV 489 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) { 490 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 491 /* do nothing */ 492 } else if(api==NULL) { 493 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 494 } else if(state==UITER_NO_STATE) { 495 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 496 } else { 497 UNormIterator *uni=(UNormIterator *)api; 498 UCharIterator *iter=((UNormIterator *)api)->iter; 499 if(state!=uni->state) { 500 uni->state=state; 501 uiter_setState(iter, state, pErrorCode); 502 } 503 504 /* 505 * Try shortcuts: If the requested state is in the array contents 506 * then just set the index there. 507 * 508 * We assume that the state is unique per position! 509 */ 510 if(state==uni->states[api->index]) { 511 return; 512 } else if(state==uni->states[api->limit]) { 513 api->index=api->limit; 514 return; 515 } else { 516 /* search for the index with this state */ 517 int32_t i; 518 519 for(i=api->start; i<api->limit; ++i) { 520 if(state==uni->states[i]) { 521 api->index=i; 522 return; 523 } 524 } 525 } 526 527 /* there is no array index for this state, reset for fresh contents */ 528 initIndexes((UNormIterator *)api, iter); 529 uni->states[api->limit]=state; 530 } 531 } 532 533 static const UCharIterator unormIterator={ 534 NULL, 0, 0, 0, 0, 0, 535 unormIteratorGetIndex, 536 unormIteratorMove, 537 unormIteratorHasNext, 538 unormIteratorHasPrevious, 539 unormIteratorCurrent, 540 unormIteratorNext, 541 unormIteratorPrevious, 542 NULL, 543 unormIteratorGetState, 544 unormIteratorSetState 545 }; 546 547 /* Setup functions ---------------------------------------------------------- */ 548 549 U_CAPI UNormIterator * U_EXPORT2 550 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) { 551 UNormIterator *uni; 552 553 /* argument checking */ 554 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 555 return NULL; 556 } 557 558 /* allocate */ 559 uni=NULL; 560 if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) { 561 if(U_ALIGNMENT_OFFSET(stackMem)==0) { 562 /* already aligned */ 563 uni=(UNormIterator *)stackMem; 564 } else { 565 int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem); 566 if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) { 567 /* needs alignment */ 568 uni=(UNormIterator *)((char *)stackMem+align); 569 } 570 } 571 /* else does not fit */ 572 } 573 574 if(uni!=NULL) { 575 uni->isStackAllocated=TRUE; 576 } else { 577 uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator)); 578 if(uni==NULL) { 579 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 580 return NULL; 581 } 582 uni->isStackAllocated=FALSE; 583 } 584 585 /* 586 * initialize 587 * do not memset because that would unnecessarily initialize the arrays 588 */ 589 uni->iter=NULL; 590 uni->chars=uni->charsBuffer; 591 uni->states=uni->statesBuffer; 592 uni->capacity=INITIAL_CAPACITY; 593 uni->state=UITER_NO_STATE; 594 uni->hasPrevious=uni->hasNext=FALSE; 595 uni->mode=UNORM_NONE; 596 597 /* set a no-op iterator into the api */ 598 uiter_setString(&uni->api, NULL, 0); 599 return uni; 600 } 601 602 U_CAPI void U_EXPORT2 603 unorm_closeIter(UNormIterator *uni) { 604 if(uni!=NULL) { 605 if(uni->states!=uni->statesBuffer) { 606 /* chars and states are allocated in the same memory block */ 607 uprv_free(uni->states); 608 } 609 if(!uni->isStackAllocated) { 610 uprv_free(uni); 611 } 612 } 613 } 614 615 U_CAPI UCharIterator * U_EXPORT2 616 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) { 617 /* argument checking */ 618 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 619 return NULL; 620 } 621 if(uni==NULL) { 622 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 623 return NULL; 624 } 625 if( iter==NULL || iter->getState==NULL || iter->setState==NULL || 626 mode<UNORM_NONE || UNORM_MODE_COUNT<=mode 627 ) { 628 /* set a no-op iterator into the api */ 629 uiter_setString(&uni->api, NULL, 0); 630 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 631 return NULL; 632 } 633 634 /* set the iterator and initialize */ 635 uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator)); 636 637 uni->iter=iter; 638 uni->mode=mode; 639 640 initIndexes(uni, iter); 641 uni->states[uni->api.limit]=uni->state=uiter_getState(iter); 642 643 return &uni->api; 644 } 645 646 #endif /* uconfig.h switches */ 647