1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: unorm_it.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jan21 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 20 21 #include "unicode/uiter.h" 22 #include "unicode/unorm.h" 23 #include "unorm_it.h" 24 #include "cmemory.h" 25 26 /* UNormIterator ------------------------------------------------------------ */ 27 28 enum { 29 INITIAL_CAPACITY=100 30 }; 31 32 struct UNormIterator { 33 UCharIterator api; 34 UCharIterator *iter; 35 36 /* 37 * chars and states either use the static buffers 38 * or are allocated in the same memory block 39 * 40 * They are parallel arrays with states[] holding the getState() values 41 * from normalization boundaries, and UITER_NO_STATE in between. 42 */ 43 UChar *chars; 44 uint32_t *states; 45 46 /* 47 * api.start: first valid character & state in the arrays 48 * api.index: current position 49 * api.limit: one past the last valid character in chars[], but states[limit] is valid 50 * capacity: length of allocated arrays 51 */ 52 int32_t capacity; 53 54 /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */ 55 uint32_t state; 56 57 /* there are UChars available before start or after limit? */ 58 UBool hasPrevious, hasNext, isStackAllocated; 59 60 UNormalizationMode mode; 61 62 UChar charsBuffer[INITIAL_CAPACITY]; 63 uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */ 64 }; 65 66 static void 67 initIndexes(UNormIterator *uni, UCharIterator *iter) { 68 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 69 UCharIterator *api=&uni->api; 70 71 if(!iter->hasPrevious(iter)) { 72 /* set indexes to the beginning of the arrays */ 73 api->start=api->index=api->limit=0; 74 uni->hasPrevious=FALSE; 75 uni->hasNext=iter->hasNext(iter); 76 } else if(!iter->hasNext(iter)) { 77 /* set indexes to the end of the arrays */ 78 api->start=api->index=api->limit=uni->capacity; 79 uni->hasNext=FALSE; 80 uni->hasPrevious=iter->hasPrevious(iter); 81 } else { 82 /* set indexes into the middle of the arrays */ 83 api->start=api->index=api->limit=uni->capacity/2; 84 uni->hasPrevious=uni->hasNext=TRUE; 85 } 86 } 87 88 static UBool 89 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) { 90 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 91 UCharIterator *api=&uni->api; 92 93 uint32_t *states; 94 UChar *chars; 95 int32_t start, limit; 96 97 states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2); 98 if(states==NULL) { 99 return FALSE; 100 } 101 102 chars=(UChar *)(states+(capacity+1)); 103 uni->capacity=capacity; 104 105 start=api->start; 106 limit=api->limit; 107 108 if(addAtStart) { 109 /* copy old contents to the end of the new arrays */ 110 int32_t delta; 111 112 delta=capacity-uni->capacity; 113 uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4); 114 uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4); 115 116 api->start=start+delta; 117 api->index+=delta; 118 api->limit=limit+delta; 119 } else { 120 /* copy old contents to the beginning of the new arrays */ 121 uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4); 122 uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4); 123 } 124 125 uni->chars=chars; 126 uni->states=states; 127 128 return TRUE; 129 } 130 131 static void 132 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) { 133 /* move array contents up to make room */ 134 int32_t srcIndex, destIndex, limit; 135 136 limit=api->limit; 137 srcIndex=delta; 138 if(srcIndex>api->start) { 139 /* look for a position in the arrays with a known state */ 140 while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) { 141 ++srcIndex; 142 } 143 } 144 145 /* now actually move the array contents */ 146 api->start=destIndex=0; 147 while(srcIndex<limit) { 148 chars[destIndex]=chars[srcIndex]; 149 states[destIndex++]=states[srcIndex++]; 150 } 151 152 /* copy states[limit] as well! */ 153 states[destIndex]=states[srcIndex]; 154 155 api->limit=destIndex; 156 } 157 158 static void 159 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) { 160 /* move array contents up to make room */ 161 int32_t srcIndex, destIndex, start; 162 163 start=api->start; 164 destIndex=((UNormIterator *)api)->capacity; 165 srcIndex=destIndex-delta; 166 if(srcIndex<api->limit) { 167 /* look for a position in the arrays with a known state */ 168 while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) { 169 --srcIndex; 170 } 171 } 172 173 /* now actually move the array contents */ 174 api->limit=destIndex; 175 176 /* copy states[limit] as well! */ 177 states[destIndex]=states[srcIndex]; 178 179 while(srcIndex>start) { 180 chars[--destIndex]=chars[--srcIndex]; 181 states[destIndex]=states[srcIndex]; 182 } 183 184 api->start=destIndex; 185 } 186 187 /* normalize forward from the limit, assume hasNext is true */ 188 static UBool 189 readNext(UNormIterator *uni, UCharIterator *iter) { 190 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 191 UCharIterator *api=&uni->api; 192 193 /* make capacity/4 room at the end of the arrays */ 194 int32_t limit, capacity, room; 195 UErrorCode errorCode; 196 197 limit=api->limit; 198 capacity=uni->capacity; 199 room=capacity/4; 200 if(room>(capacity-limit)) { 201 /* move array contents to make room */ 202 moveContentsTowardStart(api, uni->chars, uni->states, room); 203 api->index=limit=api->limit; 204 uni->hasPrevious=TRUE; 205 } 206 207 /* normalize starting from the limit position */ 208 errorCode=U_ZERO_ERROR; 209 if(uni->state!=uni->states[limit]) { 210 uiter_setState(iter, uni->states[limit], &errorCode); 211 if(U_FAILURE(errorCode)) { 212 uni->state=UITER_NO_STATE; 213 uni->hasNext=FALSE; 214 return FALSE; 215 } 216 } 217 218 room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode); 219 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 220 if(room<=capacity) { 221 /* empty and re-use the arrays */ 222 uni->states[0]=uni->states[limit]; 223 api->start=api->index=api->limit=limit=0; 224 uni->hasPrevious=TRUE; 225 } else { 226 capacity+=room+100; 227 if(!reallocArrays(uni, capacity, FALSE)) { 228 uni->state=UITER_NO_STATE; 229 uni->hasNext=FALSE; 230 return FALSE; 231 } 232 limit=api->limit; 233 } 234 235 errorCode=U_ZERO_ERROR; 236 uiter_setState(iter, uni->states[limit], &errorCode); 237 room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode); 238 } 239 if(U_FAILURE(errorCode) || room==0) { 240 uni->state=UITER_NO_STATE; 241 uni->hasNext=FALSE; 242 return FALSE; 243 } 244 245 /* room>0 */ 246 ++limit; /* leave the known states[limit] alone */ 247 for(--room; room>0; --room) { 248 /* set unknown states for all but the normalization boundaries */ 249 uni->states[limit++]=UITER_NO_STATE; 250 } 251 uni->states[limit]=uni->state=uiter_getState(iter); 252 uni->hasNext=iter->hasNext(iter); 253 api->limit=limit; 254 return TRUE; 255 } 256 257 /* normalize backward from the start, assume hasPrevious is true */ 258 static UBool 259 readPrevious(UNormIterator *uni, UCharIterator *iter) { 260 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */ 261 UCharIterator *api=&uni->api; 262 263 /* make capacity/4 room at the start of the arrays */ 264 int32_t start, capacity, room; 265 UErrorCode errorCode; 266 267 start=api->start; 268 capacity=uni->capacity; 269 room=capacity/4; 270 if(room>start) { 271 /* move array contents to make room */ 272 moveContentsTowardEnd(api, uni->chars, uni->states, room); 273 api->index=start=api->start; 274 uni->hasNext=TRUE; 275 } 276 277 /* normalize ending at the start position */ 278 errorCode=U_ZERO_ERROR; 279 if(uni->state!=uni->states[start]) { 280 uiter_setState(iter, uni->states[start], &errorCode); 281 if(U_FAILURE(errorCode)) { 282 uni->state=UITER_NO_STATE; 283 uni->hasPrevious=FALSE; 284 return FALSE; 285 } 286 } 287 288 room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode); 289 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 290 if(room<=capacity) { 291 /* empty and re-use the arrays */ 292 uni->states[capacity]=uni->states[start]; 293 api->start=api->index=api->limit=start=capacity; 294 uni->hasNext=TRUE; 295 } else { 296 capacity+=room+100; 297 if(!reallocArrays(uni, capacity, TRUE)) { 298 uni->state=UITER_NO_STATE; 299 uni->hasPrevious=FALSE; 300 return FALSE; 301 } 302 start=api->start; 303 } 304 305 errorCode=U_ZERO_ERROR; 306 uiter_setState(iter, uni->states[start], &errorCode); 307 room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode); 308 } 309 if(U_FAILURE(errorCode) || room==0) { 310 uni->state=UITER_NO_STATE; 311 uni->hasPrevious=FALSE; 312 return FALSE; 313 } 314 315 /* room>0 */ 316 do { 317 /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */ 318 uni->chars[--start]=uni->chars[--room]; 319 /* set unknown states for all but the normalization boundaries */ 320 uni->states[start]=UITER_NO_STATE; 321 } while(room>0); 322 uni->states[start]=uni->state=uiter_getState(iter); 323 uni->hasPrevious=iter->hasPrevious(iter); 324 api->start=start; 325 return TRUE; 326 } 327 328 /* Iterator runtime API functions ------------------------------------------- */ 329 330 static int32_t U_CALLCONV 331 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) { 332 switch(origin) { 333 case UITER_ZERO: 334 case UITER_START: 335 return 0; 336 case UITER_CURRENT: 337 case UITER_LIMIT: 338 case UITER_LENGTH: 339 return UITER_UNKNOWN_INDEX; 340 default: 341 /* not a valid origin */ 342 /* Should never get here! */ 343 return -1; 344 } 345 } 346 347 static int32_t U_CALLCONV 348 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) { 349 UNormIterator *uni=(UNormIterator *)api; 350 UCharIterator *iter=uni->iter; 351 int32_t pos; 352 353 switch(origin) { 354 case UITER_ZERO: 355 case UITER_START: 356 /* restart from the beginning */ 357 if(uni->hasPrevious) { 358 iter->move(iter, 0, UITER_START); 359 api->start=api->index=api->limit=0; 360 uni->states[api->limit]=uni->state=uiter_getState(iter); 361 uni->hasPrevious=FALSE; 362 uni->hasNext=iter->hasNext(iter); 363 } else { 364 /* we already have the beginning of the normalized text */ 365 api->index=api->start; 366 } 367 break; 368 case UITER_CURRENT: 369 break; 370 case UITER_LIMIT: 371 case UITER_LENGTH: 372 /* restart from the end */ 373 if(uni->hasNext) { 374 iter->move(iter, 0, UITER_LIMIT); 375 api->start=api->index=api->limit=uni->capacity; 376 uni->states[api->limit]=uni->state=uiter_getState(iter); 377 uni->hasPrevious=iter->hasPrevious(iter); 378 uni->hasNext=FALSE; 379 } else { 380 /* we already have the end of the normalized text */ 381 api->index=api->limit; 382 } 383 break; 384 default: 385 return -1; /* Error */ 386 } 387 388 /* move relative to the current position by delta normalized UChars */ 389 if(delta==0) { 390 /* nothing to do */ 391 } else if(delta>0) { 392 /* go forward until the requested position is in the buffer */ 393 for(;;) { 394 pos=api->index+delta; /* requested position */ 395 delta=pos-api->limit; /* remainder beyond buffered text */ 396 if(delta<=0) { 397 api->index=pos; /* position reached */ 398 break; 399 } 400 401 /* go to end of buffer and normalize further */ 402 api->index=api->limit; 403 if(!uni->hasNext || !readNext(uni, iter)) { 404 break; /* reached end of text */ 405 } 406 } 407 } else /* delta<0 */ { 408 /* go backward until the requested position is in the buffer */ 409 for(;;) { 410 pos=api->index+delta; /* requested position */ 411 delta=pos-api->start; /* remainder beyond buffered text */ 412 if(delta>=0) { 413 api->index=pos; /* position reached */ 414 break; 415 } 416 417 /* go to start of buffer and normalize further */ 418 api->index=api->start; 419 if(!uni->hasPrevious || !readPrevious(uni, iter)) { 420 break; /* reached start of text */ 421 } 422 } 423 } 424 425 if(api->index==api->start && !uni->hasPrevious) { 426 return 0; 427 } else { 428 return UITER_UNKNOWN_INDEX; 429 } 430 } 431 432 static UBool U_CALLCONV 433 unormIteratorHasNext(UCharIterator *api) { 434 return api->index<api->limit || ((UNormIterator *)api)->hasNext; 435 } 436 437 static UBool U_CALLCONV 438 unormIteratorHasPrevious(UCharIterator *api) { 439 return api->index>api->start || ((UNormIterator *)api)->hasPrevious; 440 } 441 442 static UChar32 U_CALLCONV 443 unormIteratorCurrent(UCharIterator *api) { 444 UNormIterator *uni=(UNormIterator *)api; 445 446 if( api->index<api->limit || 447 (uni->hasNext && readNext(uni, uni->iter)) 448 ) { 449 return uni->chars[api->index]; 450 } else { 451 return U_SENTINEL; 452 } 453 } 454 455 static UChar32 U_CALLCONV 456 unormIteratorNext(UCharIterator *api) { 457 UNormIterator *uni=(UNormIterator *)api; 458 459 if( api->index<api->limit || 460 (uni->hasNext && readNext(uni, uni->iter)) 461 ) { 462 return uni->chars[api->index++]; 463 } else { 464 return U_SENTINEL; 465 } 466 } 467 468 static UChar32 U_CALLCONV 469 unormIteratorPrevious(UCharIterator *api) { 470 UNormIterator *uni=(UNormIterator *)api; 471 472 if( api->index>api->start || 473 (uni->hasPrevious && readPrevious(uni, uni->iter)) 474 ) { 475 return uni->chars[--api->index]; 476 } else { 477 return U_SENTINEL; 478 } 479 } 480 481 static uint32_t U_CALLCONV 482 unormIteratorGetState(const UCharIterator *api) { 483 /* not uni->state because that may not be at api->index */ 484 return ((UNormIterator *)api)->states[api->index]; 485 } 486 487 static void U_CALLCONV 488 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) { 489 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 490 /* do nothing */ 491 } else if(api==NULL) { 492 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 493 } else if(state==UITER_NO_STATE) { 494 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 495 } else { 496 UNormIterator *uni=(UNormIterator *)api; 497 UCharIterator *iter=((UNormIterator *)api)->iter; 498 if(state!=uni->state) { 499 uni->state=state; 500 uiter_setState(iter, state, pErrorCode); 501 } 502 503 /* 504 * Try shortcuts: If the requested state is in the array contents 505 * then just set the index there. 506 * 507 * We assume that the state is unique per position! 508 */ 509 if(state==uni->states[api->index]) { 510 return; 511 } else if(state==uni->states[api->limit]) { 512 api->index=api->limit; 513 return; 514 } else { 515 /* search for the index with this state */ 516 int32_t i; 517 518 for(i=api->start; i<api->limit; ++i) { 519 if(state==uni->states[i]) { 520 api->index=i; 521 return; 522 } 523 } 524 } 525 526 /* there is no array index for this state, reset for fresh contents */ 527 initIndexes((UNormIterator *)api, iter); 528 uni->states[api->limit]=state; 529 } 530 } 531 532 static const UCharIterator unormIterator={ 533 NULL, 0, 0, 0, 0, 0, 534 unormIteratorGetIndex, 535 unormIteratorMove, 536 unormIteratorHasNext, 537 unormIteratorHasPrevious, 538 unormIteratorCurrent, 539 unormIteratorNext, 540 unormIteratorPrevious, 541 NULL, 542 unormIteratorGetState, 543 unormIteratorSetState 544 }; 545 546 /* Setup functions ---------------------------------------------------------- */ 547 548 U_CAPI UNormIterator * U_EXPORT2 549 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) { 550 UNormIterator *uni; 551 552 /* argument checking */ 553 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 554 return NULL; 555 } 556 557 /* allocate */ 558 uni=NULL; 559 if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) { 560 if(U_ALIGNMENT_OFFSET(stackMem)==0) { 561 /* already aligned */ 562 uni=(UNormIterator *)stackMem; 563 } else { 564 int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem); 565 if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) { 566 /* needs alignment */ 567 uni=(UNormIterator *)((char *)stackMem+align); 568 } 569 } 570 /* else does not fit */ 571 } 572 573 if(uni!=NULL) { 574 uni->isStackAllocated=TRUE; 575 } else { 576 uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator)); 577 if(uni==NULL) { 578 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 579 return NULL; 580 } 581 uni->isStackAllocated=FALSE; 582 } 583 584 /* 585 * initialize 586 * do not memset because that would unnecessarily initialize the arrays 587 */ 588 uni->iter=NULL; 589 uni->chars=uni->charsBuffer; 590 uni->states=uni->statesBuffer; 591 uni->capacity=INITIAL_CAPACITY; 592 uni->state=UITER_NO_STATE; 593 uni->hasPrevious=uni->hasNext=FALSE; 594 uni->mode=UNORM_NONE; 595 596 /* set a no-op iterator into the api */ 597 uiter_setString(&uni->api, NULL, 0); 598 return uni; 599 } 600 601 U_CAPI void U_EXPORT2 602 unorm_closeIter(UNormIterator *uni) { 603 if(uni!=NULL) { 604 if(uni->states!=uni->statesBuffer) { 605 /* chars and states are allocated in the same memory block */ 606 uprv_free(uni->states); 607 } 608 if(!uni->isStackAllocated) { 609 uprv_free(uni); 610 } 611 } 612 } 613 614 U_CAPI UCharIterator * U_EXPORT2 615 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) { 616 /* argument checking */ 617 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 618 return NULL; 619 } 620 if(uni==NULL) { 621 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 622 return NULL; 623 } 624 if( iter==NULL || iter->getState==NULL || iter->setState==NULL || 625 mode<UNORM_NONE || UNORM_MODE_COUNT<=mode 626 ) { 627 /* set a no-op iterator into the api */ 628 uiter_setString(&uni->api, NULL, 0); 629 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 630 return NULL; 631 } 632 633 /* set the iterator and initialize */ 634 uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator)); 635 636 uni->iter=iter; 637 uni->mode=mode; 638 639 initIndexes(uni, iter); 640 uni->states[uni->api.limit]=uni->state=uiter_getState(iter); 641 642 return &uni->api; 643 } 644 645 #endif /* uconfig.h switches */ 646