1 /* 2 ********************************************************************** 3 * Copyright (C) 2001-2009 IBM and others. All rights reserved. 4 ********************************************************************** 5 * Date Name Description 6 * 07/02/2001 synwee Creation. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/usearch.h" 15 #include "unicode/ustring.h" 16 #include "unicode/uchar.h" 17 #include "unormimp.h" 18 #include "ucol_imp.h" 19 #include "usrchimp.h" 20 #include "cmemory.h" 21 #include "ucln_in.h" 22 #include "uassert.h" 23 24 U_NAMESPACE_USE 25 26 // don't use Boyer-Moore 27 #define BOYER_MOORE 0 28 29 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 30 31 // internal definition --------------------------------------------------- 32 33 #define LAST_BYTE_MASK_ 0xFF 34 #define SECOND_LAST_BYTE_SHIFT_ 8 35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 36 37 static const uint16_t *fcdTrieIndex = NULL; 38 static UChar32 fcdHighStart = 0; 39 40 // internal methods ------------------------------------------------- 41 42 /** 43 * Fast collation element iterator setOffset. 44 * This function does not check for bounds. 45 * @param coleiter collation element iterator 46 * @param offset to set 47 */ 48 static 49 inline void setColEIterOffset(UCollationElements *elems, 50 int32_t offset) 51 { 52 collIterate *ci = &(elems->iteratordata_); 53 ci->pos = ci->string + offset; 54 ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs; 55 if (ci->flags & UCOL_ITER_INNORMBUF) { 56 ci->flags = ci->origFlags; 57 } 58 ci->fcdPosition = NULL; 59 60 ci->offsetReturn = NULL; 61 ci->offsetStore = ci->offsetBuffer; 62 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 63 } 64 65 /** 66 * Getting the mask for collation strength 67 * @param strength collation strength 68 * @return collation element mask 69 */ 70 static 71 inline uint32_t getMask(UCollationStrength strength) 72 { 73 switch (strength) 74 { 75 case UCOL_PRIMARY: 76 return UCOL_PRIMARYORDERMASK; 77 case UCOL_SECONDARY: 78 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 79 default: 80 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 81 UCOL_PRIMARYORDERMASK; 82 } 83 } 84 85 /** 86 * This is to squeeze the 21bit ces into a 256 table 87 * @param ce collation element 88 * @return collapsed version of the collation element 89 */ 90 static 91 inline int hash(uint32_t ce) 92 { 93 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work 94 // well with the new collation where most of the latin 1 characters 95 // are of the value xx000xxx. their hashes will most of the time be 0 96 // to be discussed on the hash algo. 97 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; 98 } 99 100 U_CDECL_BEGIN 101 static UBool U_CALLCONV 102 usearch_cleanup(void) { 103 fcdTrieIndex = NULL; 104 return TRUE; 105 } 106 U_CDECL_END 107 108 /** 109 * Initializing the fcd tables. 110 * Internal method, status assumed to be a success. 111 * @param status output error if any, caller to check status before calling 112 * method, status assumed to be success when passed in. 113 */ 114 static 115 inline void initializeFCD(UErrorCode *status) 116 { 117 if (fcdTrieIndex == NULL) { 118 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 119 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 120 } 121 } 122 123 /** 124 * Gets the fcd value for a character at the argument index. 125 * This method takes into accounts of the supplementary characters. 126 * @param str UTF16 string where character for fcd retrieval resides 127 * @param offset position of the character whose fcd is to be retrieved, to be 128 * overwritten with the next character position, taking 129 * surrogate characters into consideration. 130 * @param strlength length of the argument string 131 * @return fcd value 132 */ 133 static 134 uint16_t getFCD(const UChar *str, int32_t *offset, 135 int32_t strlength) 136 { 137 const UChar *temp = str + *offset; 138 uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength); 139 *offset = (int32_t)(temp - str); 140 return result; 141 } 142 143 /** 144 * Getting the modified collation elements taking into account the collation 145 * attributes 146 * @param strsrch string search data 147 * @param sourcece 148 * @return the modified collation element 149 */ 150 static 151 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 152 { 153 // note for tertiary we can't use the collator->tertiaryMask, that 154 // is a preprocessed mask that takes into account case options. since 155 // we are only concerned with exact matches, we don't need that. 156 sourcece &= strsrch->ceMask; 157 158 if (strsrch->toShift) { 159 // alternate handling here, since only the 16 most significant digits 160 // is only used, we can safely do a compare without masking 161 // if the ce is a variable, we mask and get only the primary values 162 // no shifting to quartenary is required since all primary values 163 // less than variabletop will need to be masked off anyway. 164 if (strsrch->variableTop > sourcece) { 165 if (strsrch->strength >= UCOL_QUATERNARY) { 166 sourcece &= UCOL_PRIMARYORDERMASK; 167 } 168 else { 169 sourcece = UCOL_IGNORABLE; 170 } 171 } 172 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 173 sourcece = 0xFFFF; 174 } 175 176 return sourcece; 177 } 178 179 /** 180 * Allocate a memory and returns NULL if it failed. 181 * Internal method, status assumed to be a success. 182 * @param size to allocate 183 * @param status output error if any, caller to check status before calling 184 * method, status assumed to be success when passed in. 185 * @return newly allocated array, NULL otherwise 186 */ 187 static 188 inline void * allocateMemory(uint32_t size, UErrorCode *status) 189 { 190 uint32_t *result = (uint32_t *)uprv_malloc(size); 191 if (result == NULL) { 192 *status = U_MEMORY_ALLOCATION_ERROR; 193 } 194 return result; 195 } 196 197 /** 198 * Adds a uint32_t value to a destination array. 199 * Creates a new array if we run out of space. The caller will have to 200 * manually deallocate the newly allocated array. 201 * Internal method, status assumed to be success, caller has to check status 202 * before calling this method. destination not to be NULL and has at least 203 * size destinationlength. 204 * @param destination target array 205 * @param offset destination offset to add value 206 * @param destinationlength target array size, return value for the new size 207 * @param value to be added 208 * @param increments incremental size expected 209 * @param status output error if any, caller to check status before calling 210 * method, status assumed to be success when passed in. 211 * @return new destination array, destination if there was no new allocation 212 */ 213 static 214 inline int32_t * addTouint32_tArray(int32_t *destination, 215 uint32_t offset, 216 uint32_t *destinationlength, 217 uint32_t value, 218 uint32_t increments, 219 UErrorCode *status) 220 { 221 uint32_t newlength = *destinationlength; 222 if (offset + 1 == newlength) { 223 newlength += increments; 224 int32_t *temp = (int32_t *)allocateMemory( 225 sizeof(int32_t) * newlength, status); 226 if (U_FAILURE(*status)) { 227 return NULL; 228 } 229 uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 230 *destinationlength = newlength; 231 destination = temp; 232 } 233 destination[offset] = value; 234 return destination; 235 } 236 237 /** 238 * Adds a uint64_t value to a destination array. 239 * Creates a new array if we run out of space. The caller will have to 240 * manually deallocate the newly allocated array. 241 * Internal method, status assumed to be success, caller has to check status 242 * before calling this method. destination not to be NULL and has at least 243 * size destinationlength. 244 * @param destination target array 245 * @param offset destination offset to add value 246 * @param destinationlength target array size, return value for the new size 247 * @param value to be added 248 * @param increments incremental size expected 249 * @param status output error if any, caller to check status before calling 250 * method, status assumed to be success when passed in. 251 * @return new destination array, destination if there was no new allocation 252 */ 253 static 254 inline int64_t * addTouint64_tArray(int64_t *destination, 255 uint32_t offset, 256 uint32_t *destinationlength, 257 uint64_t value, 258 uint32_t increments, 259 UErrorCode *status) 260 { 261 uint32_t newlength = *destinationlength; 262 if (offset + 1 == newlength) { 263 newlength += increments; 264 int64_t *temp = (int64_t *)allocateMemory( 265 sizeof(int64_t) * newlength, status); 266 267 if (U_FAILURE(*status)) { 268 return NULL; 269 } 270 271 uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 272 *destinationlength = newlength; 273 destination = temp; 274 } 275 276 destination[offset] = value; 277 278 return destination; 279 } 280 281 /** 282 * Initializing the ce table for a pattern. 283 * Stores non-ignorable collation keys. 284 * Table size will be estimated by the size of the pattern text. Table 285 * expansion will be perform as we go along. Adding 1 to ensure that the table 286 * size definitely increases. 287 * Internal method, status assumed to be a success. 288 * @param strsrch string search data 289 * @param status output error if any, caller to check status before calling 290 * method, status assumed to be success when passed in. 291 * @return total number of expansions 292 */ 293 static 294 inline uint16_t initializePatternCETable(UStringSearch *strsrch, 295 UErrorCode *status) 296 { 297 UPattern *pattern = &(strsrch->pattern); 298 uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 299 int32_t *cetable = pattern->CEBuffer; 300 uint32_t patternlength = pattern->textLength; 301 UCollationElements *coleiter = strsrch->utilIter; 302 303 if (coleiter == NULL) { 304 coleiter = ucol_openElements(strsrch->collator, pattern->text, 305 patternlength, status); 306 // status will be checked in ucol_next(..) later and if it is an 307 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 308 // returned. 309 strsrch->utilIter = coleiter; 310 } 311 else { 312 uprv_init_collIterate(strsrch->collator, pattern->text, 313 pattern->textLength, 314 &coleiter->iteratordata_); 315 } 316 317 if (pattern->CE != cetable && pattern->CE) { 318 uprv_free(pattern->CE); 319 } 320 321 uint16_t offset = 0; 322 uint16_t result = 0; 323 int32_t ce; 324 325 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 326 U_SUCCESS(*status)) { 327 uint32_t newce = getCE(strsrch, ce); 328 if (newce) { 329 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 330 newce, 331 patternlength - ucol_getOffset(coleiter) + 1, 332 status); 333 if (U_FAILURE(*status)) { 334 return 0; 335 } 336 offset ++; 337 if (cetable != temp && cetable != pattern->CEBuffer) { 338 uprv_free(cetable); 339 } 340 cetable = temp; 341 } 342 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 343 } 344 345 cetable[offset] = 0; 346 pattern->CE = cetable; 347 pattern->CELength = offset; 348 349 return result; 350 } 351 352 /** 353 * Initializing the pce table for a pattern. 354 * Stores non-ignorable collation keys. 355 * Table size will be estimated by the size of the pattern text. Table 356 * expansion will be perform as we go along. Adding 1 to ensure that the table 357 * size definitely increases. 358 * Internal method, status assumed to be a success. 359 * @param strsrch string search data 360 * @param status output error if any, caller to check status before calling 361 * method, status assumed to be success when passed in. 362 * @return total number of expansions 363 */ 364 static 365 inline uint16_t initializePatternPCETable(UStringSearch *strsrch, 366 UErrorCode *status) 367 { 368 UPattern *pattern = &(strsrch->pattern); 369 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 370 int64_t *pcetable = pattern->PCEBuffer; 371 uint32_t patternlength = pattern->textLength; 372 UCollationElements *coleiter = strsrch->utilIter; 373 374 if (coleiter == NULL) { 375 coleiter = ucol_openElements(strsrch->collator, pattern->text, 376 patternlength, status); 377 // status will be checked in ucol_next(..) later and if it is an 378 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 379 // returned. 380 strsrch->utilIter = coleiter; 381 } else { 382 uprv_init_collIterate(strsrch->collator, pattern->text, 383 pattern->textLength, 384 &coleiter->iteratordata_); 385 } 386 387 if (pattern->PCE != pcetable && pattern->PCE != NULL) { 388 uprv_free(pattern->PCE); 389 } 390 391 uint16_t offset = 0; 392 uint16_t result = 0; 393 int64_t pce; 394 395 uprv_init_pce(coleiter); 396 397 // ** Should processed CEs be signed or unsigned? 398 // ** (the rest of the code in this file seems to play fast-and-loose with 399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 400 while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 401 U_SUCCESS(*status)) { 402 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 403 pce, 404 patternlength - ucol_getOffset(coleiter) + 1, 405 status); 406 407 if (U_FAILURE(*status)) { 408 return 0; 409 } 410 411 offset += 1; 412 413 if (pcetable != temp && pcetable != pattern->PCEBuffer) { 414 uprv_free(pcetable); 415 } 416 417 pcetable = temp; 418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 419 } 420 421 pcetable[offset] = 0; 422 pattern->PCE = pcetable; 423 pattern->PCELength = offset; 424 425 return result; 426 } 427 428 /** 429 * Initializes the pattern struct. 430 * Internal method, status assumed to be success. 431 * @param strsrch UStringSearch data storage 432 * @param status output error if any, caller to check status before calling 433 * method, status assumed to be success when passed in. 434 * @return expansionsize the total expansion size of the pattern 435 */ 436 static 437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 438 { 439 UPattern *pattern = &(strsrch->pattern); 440 const UChar *patterntext = pattern->text; 441 int32_t length = pattern->textLength; 442 int32_t index = 0; 443 444 // Since the strength is primary, accents are ignored in the pattern. 445 if (strsrch->strength == UCOL_PRIMARY) { 446 pattern->hasPrefixAccents = 0; 447 pattern->hasSuffixAccents = 0; 448 } else { 449 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 450 SECOND_LAST_BYTE_SHIFT_; 451 index = length; 452 UTF_BACK_1(patterntext, 0, index); 453 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 454 LAST_BYTE_MASK_; 455 } 456 457 // ** HACK ** 458 if (strsrch->pattern.PCE != NULL) { 459 if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 460 uprv_free(strsrch->pattern.PCE); 461 } 462 463 strsrch->pattern.PCE = NULL; 464 } 465 466 // since intializePattern is an internal method status is a success. 467 return initializePatternCETable(strsrch, status); 468 } 469 470 /** 471 * Initializing shift tables, with the default values. 472 * If a corresponding default value is 0, the shift table is not set. 473 * @param shift table for forwards shift 474 * @param backshift table for backwards shift 475 * @param cetable table containing pattern ce 476 * @param cesize size of the pattern ces 477 * @param expansionsize total size of the expansions 478 * @param defaultforward the default forward value 479 * @param defaultbackward the default backward value 480 */ 481 static 482 inline void setShiftTable(int16_t shift[], int16_t backshift[], 483 int32_t *cetable, int32_t cesize, 484 int16_t expansionsize, 485 int16_t defaultforward, 486 int16_t defaultbackward) 487 { 488 // estimate the value to shift. to do that we estimate the smallest 489 // number of characters to give the relevant ces, ie approximately 490 // the number of ces minus their expansion, since expansions can come 491 // from a character. 492 int32_t count; 493 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 494 shift[count] = defaultforward; 495 } 496 cesize --; // down to the last index 497 for (count = 0; count < cesize; count ++) { 498 // number of ces from right of array to the count 499 int temp = defaultforward - count - 1; 500 shift[hash(cetable[count])] = temp > 1 ? temp : 1; 501 } 502 shift[hash(cetable[cesize])] = 1; 503 // for ignorables we just shift by one. see test examples. 504 shift[hash(0)] = 1; 505 506 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 507 backshift[count] = defaultbackward; 508 } 509 for (count = cesize; count > 0; count --) { 510 // the original value count does not seem to work 511 backshift[hash(cetable[count])] = count > expansionsize ? 512 (int16_t)(count - expansionsize) : 1; 513 } 514 backshift[hash(cetable[0])] = 1; 515 backshift[hash(0)] = 1; 516 } 517 518 /** 519 * Building of the pattern collation element list and the boyer moore strsrch 520 * table. 521 * The canonical match will only be performed after the default match fails. 522 * For both cases we need to remember the size of the composed and decomposed 523 * versions of the string. Since the Boyer-Moore shift calculations shifts by 524 * a number of characters in the text and tries to match the pattern from that 525 * offset, the shift value can not be too large in case we miss some 526 * characters. To choose a right shift size, we estimate the NFC form of the 527 * and use its size as a shift guide. The NFC form should be the small 528 * possible representation of the pattern. Anyways, we'll err on the smaller 529 * shift size. Hence the calculation for minlength. 530 * Canonical match will be performed slightly differently. We'll split the 531 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by 532 * the first and last base character (MS), the ending accents (EA). Matches 533 * will be done on MS first, and only when we match MS then some processing 534 * will be required for the prefix and end accents in order to determine if 535 * they match PA and EA. Hence the default shift values 536 * for the canonical match will take the size of either end's accent into 537 * consideration. Forwards search will take the end accents into consideration 538 * for the default shift values and the backwards search will take the prefix 539 * accents into consideration. 540 * If pattern has no non-ignorable ce, we return a illegal argument error. 541 * Internal method, status assumed to be success. 542 * @param strsrch UStringSearch data storage 543 * @param status for output errors if it occurs, status is assumed to be a 544 * success when it is passed in. 545 */ 546 static 547 inline void initialize(UStringSearch *strsrch, UErrorCode *status) 548 { 549 int16_t expandlength = initializePattern(strsrch, status); 550 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { 551 UPattern *pattern = &strsrch->pattern; 552 int32_t cesize = pattern->CELength; 553 554 int16_t minlength = cesize > expandlength 555 ? (int16_t)cesize - expandlength : 1; 556 pattern->defaultShiftSize = minlength; 557 setShiftTable(pattern->shift, pattern->backShift, pattern->CE, 558 cesize, expandlength, minlength, minlength); 559 return; 560 } 561 strsrch->pattern.defaultShiftSize = 0; 562 } 563 564 #if BOYER_MOORE 565 /** 566 * Check to make sure that the match length is at the end of the character by 567 * using the breakiterator. 568 * @param strsrch string search data 569 * @param start target text start offset 570 * @param end target text end offset 571 */ 572 static 573 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 574 int32_t *end) 575 { 576 #if !UCONFIG_NO_BREAK_ITERATION 577 UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 578 if (breakiterator) { 579 int32_t matchend = *end; 580 //int32_t matchstart = *start; 581 582 if (!ubrk_isBoundary(breakiterator, matchend)) { 583 *end = ubrk_following(breakiterator, matchend); 584 } 585 586 /* Check the start of the matched text to make sure it doesn't have any accents 587 * before it. This code may not be necessary and so it is commented out */ 588 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 589 *start = ubrk_preceding(breakiterator, matchstart); 590 }*/ 591 } 592 #endif 593 } 594 595 /** 596 * Determine whether the target text in UStringSearch bounded by the offset 597 * start and end is one or more whole units of text as 598 * determined by the breakiterator in UStringSearch. 599 * @param strsrch string search data 600 * @param start target text start offset 601 * @param end target text end offset 602 */ 603 static 604 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 605 int32_t end) 606 { 607 #if !UCONFIG_NO_BREAK_ITERATION 608 UBreakIterator *breakiterator = strsrch->search->breakIter; 609 //TODO: Add here. 610 if (breakiterator) { 611 int32_t startindex = ubrk_first(breakiterator); 612 int32_t endindex = ubrk_last(breakiterator); 613 614 // out-of-range indexes are never boundary positions 615 if (start < startindex || start > endindex || 616 end < startindex || end > endindex) { 617 return FALSE; 618 } 619 // otherwise, we can use following() on the position before the 620 // specified one and return true of the position we get back is the 621 // one the user specified 622 UBool result = (start == startindex || 623 ubrk_following(breakiterator, start - 1) == start) && 624 (end == endindex || 625 ubrk_following(breakiterator, end - 1) == end); 626 if (result) { 627 // iterates the individual ces 628 UCollationElements *coleiter = strsrch->utilIter; 629 const UChar *text = strsrch->search->text + 630 start; 631 UErrorCode status = U_ZERO_ERROR; 632 ucol_setText(coleiter, text, end - start, &status); 633 for (int32_t count = 0; count < strsrch->pattern.CELength; 634 count ++) { 635 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 636 if (ce == UCOL_IGNORABLE) { 637 count --; 638 continue; 639 } 640 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { 641 return FALSE; 642 } 643 } 644 int32_t nextce = ucol_next(coleiter, &status); 645 while (ucol_getOffset(coleiter) == (end - start) 646 && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 647 nextce = ucol_next(coleiter, &status); 648 } 649 if (ucol_getOffset(coleiter) == (end - start) 650 && nextce != UCOL_NULLORDER) { 651 // extra collation elements at the end of the match 652 return FALSE; 653 } 654 } 655 return result; 656 } 657 #endif 658 return TRUE; 659 } 660 661 /** 662 * Getting the next base character offset if current offset is an accent, 663 * or the current offset if the current character contains a base character. 664 * accents the following base character will be returned 665 * @param text string 666 * @param textoffset current offset 667 * @param textlength length of text string 668 * @return the next base character or the current offset 669 * if the current character is contains a base character. 670 */ 671 static 672 inline int32_t getNextBaseOffset(const UChar *text, 673 int32_t textoffset, 674 int32_t textlength) 675 { 676 if (textoffset < textlength) { 677 int32_t temp = textoffset; 678 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 679 while (temp < textlength) { 680 int32_t result = temp; 681 if ((getFCD(text, &temp, textlength) >> 682 SECOND_LAST_BYTE_SHIFT_) == 0) { 683 return result; 684 } 685 } 686 return textlength; 687 } 688 } 689 return textoffset; 690 } 691 692 /** 693 * Gets the next base character offset depending on the string search pattern 694 * data 695 * @param strsrch string search data 696 * @param textoffset current offset, one offset away from the last character 697 * to search for. 698 * @return start index of the next base character or the current offset 699 * if the current character is contains a base character. 700 */ 701 static 702 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 703 int32_t textoffset) 704 { 705 int32_t textlength = strsrch->search->textLength; 706 if (strsrch->pattern.hasSuffixAccents && 707 textoffset < textlength) { 708 int32_t temp = textoffset; 709 const UChar *text = strsrch->search->text; 710 UTF_BACK_1(text, 0, temp); 711 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 712 return getNextBaseOffset(text, textoffset, textlength); 713 } 714 } 715 return textoffset; 716 } 717 718 /** 719 * Shifting the collation element iterator position forward to prepare for 720 * a following match. If the last character is a unsafe character, we'll only 721 * shift by 1 to capture contractions, normalization etc. 722 * Internal method, status assumed to be success. 723 * @param text strsrch string search data 724 * @param textoffset start text position to do search 725 * @param ce the text ce which failed the match. 726 * @param patternceindex index of the ce within the pattern ce buffer which 727 * failed the match 728 * @return final offset 729 */ 730 static 731 inline int32_t shiftForward(UStringSearch *strsrch, 732 int32_t textoffset, 733 int32_t ce, 734 int32_t patternceindex) 735 { 736 UPattern *pattern = &(strsrch->pattern); 737 if (ce != UCOL_NULLORDER) { 738 int32_t shift = pattern->shift[hash(ce)]; 739 // this is to adjust for characters in the middle of the 740 // substring for matching that failed. 741 int32_t adjust = pattern->CELength - patternceindex; 742 if (adjust > 1 && shift >= adjust) { 743 shift -= adjust - 1; 744 } 745 textoffset += shift; 746 } 747 else { 748 textoffset += pattern->defaultShiftSize; 749 } 750 751 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 752 // check for unsafe characters 753 // * if it is the start or middle of a contraction: to be done after 754 // a initial match is found 755 // * thai or lao base consonant character: similar to contraction 756 // * high surrogate character: similar to contraction 757 // * next character is a accent: shift to the next base character 758 return textoffset; 759 } 760 #endif // #if BOYER_MOORE 761 762 /** 763 * sets match not found 764 * @param strsrch string search data 765 */ 766 static 767 inline void setMatchNotFound(UStringSearch *strsrch) 768 { 769 // this method resets the match result regardless of the error status. 770 strsrch->search->matchedIndex = USEARCH_DONE; 771 strsrch->search->matchedLength = 0; 772 if (strsrch->search->isForwardSearching) { 773 setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 774 } 775 else { 776 setColEIterOffset(strsrch->textIter, 0); 777 } 778 } 779 780 #if BOYER_MOORE 781 /** 782 * Gets the offset to the next safe point in text. 783 * ie. not the middle of a contraction, swappable characters or supplementary 784 * characters. 785 * @param collator collation sata 786 * @param text string to work with 787 * @param textoffset offset in string 788 * @param textlength length of text string 789 * @return offset to the next safe character 790 */ 791 static 792 inline int32_t getNextSafeOffset(const UCollator *collator, 793 const UChar *text, 794 int32_t textoffset, 795 int32_t textlength) 796 { 797 int32_t result = textoffset; // first contraction character 798 while (result != textlength && ucol_unsafeCP(text[result], collator)) { 799 result ++; 800 } 801 return result; 802 } 803 804 /** 805 * This checks for accents in the potential match started with a . 806 * composite character. 807 * This is really painful... we have to check that composite character do not 808 * have any extra accents. We have to normalize the potential match and find 809 * the immediate decomposed character before the match. 810 * The first composite character would have been taken care of by the fcd 811 * checks in checkForwardExactMatch. 812 * This is the slow path after the fcd of the first character and 813 * the last character has been checked by checkForwardExactMatch and we 814 * determine that the potential match has extra non-ignorable preceding 815 * ces. 816 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 817 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 818 * Note here that accents checking are slow and cautioned in the API docs. 819 * Internal method, status assumed to be a success, caller should check status 820 * before calling this method 821 * @param strsrch string search data 822 * @param start index of the potential unfriendly composite character 823 * @param end index of the potential unfriendly composite character 824 * @param status output error status if any. 825 * @return TRUE if there is non-ignorable accents before at the beginning 826 * of the match, FALSE otherwise. 827 */ 828 829 static 830 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 831 int32_t end, 832 UErrorCode *status) 833 { 834 UBool result = FALSE; 835 if (strsrch->pattern.hasPrefixAccents) { 836 int32_t length = end - start; 837 int32_t offset = 0; 838 const UChar *text = strsrch->search->text + start; 839 840 UTF_FWD_1(text, offset, length); 841 // we are only concerned with the first composite character 842 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 843 int32_t safeoffset = getNextSafeOffset(strsrch->collator, 844 text, 0, length); 845 if (safeoffset != length) { 846 safeoffset ++; 847 } 848 UChar *norm = NULL; 849 UChar buffer[INITIAL_ARRAY_SIZE_]; 850 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 851 buffer, INITIAL_ARRAY_SIZE_, 852 status); 853 if (U_FAILURE(*status)) { 854 return FALSE; 855 } 856 if (size >= INITIAL_ARRAY_SIZE_) { 857 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 858 status); 859 // if allocation failed, status will be set to 860 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 861 // checks for it. 862 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 863 size, status); 864 if (U_FAILURE(*status) && norm != NULL) { 865 uprv_free(norm); 866 return FALSE; 867 } 868 } 869 else { 870 norm = buffer; 871 } 872 873 UCollationElements *coleiter = strsrch->utilIter; 874 ucol_setText(coleiter, norm, size, status); 875 uint32_t firstce = strsrch->pattern.CE[0]; 876 UBool ignorable = TRUE; 877 uint32_t ce = UCOL_IGNORABLE; 878 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 879 offset = ucol_getOffset(coleiter); 880 if (ce != firstce && ce != UCOL_IGNORABLE) { 881 ignorable = FALSE; 882 } 883 ce = ucol_next(coleiter, status); 884 } 885 UChar32 codepoint; 886 UTF_PREV_CHAR(norm, 0, offset, codepoint); 887 result = !ignorable && (u_getCombiningClass(codepoint) != 0); 888 889 if (norm != buffer) { 890 uprv_free(norm); 891 } 892 } 893 } 894 895 return result; 896 } 897 898 /** 899 * Used by exact matches, checks if there are accents before the match. 900 * This is really painful... we have to check that composite characters at 901 * the start of the matches have to not have any extra accents. 902 * We check the FCD of the character first, if it starts with an accent and 903 * the first pattern ce does not match the first ce of the character, we bail. 904 * Otherwise we try normalizing the first composite 905 * character and find the immediate decomposed character before the match to 906 * see if it is an non-ignorable accent. 907 * Now normalizing the first composite character is enough because we ensure 908 * that when the match is passed in here with extra beginning ces, the 909 * first or last ce that match has to occur within the first character. 910 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 911 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 912 * Note here that accents checking are slow and cautioned in the API docs. 913 * @param strsrch string search data 914 * @param start offset 915 * @param end offset 916 * @return TRUE if there are accents on either side of the match, 917 * FALSE otherwise 918 */ 919 static 920 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 921 int32_t end) 922 { 923 if (strsrch->pattern.hasPrefixAccents) { 924 UCollationElements *coleiter = strsrch->textIter; 925 UErrorCode status = U_ZERO_ERROR; 926 // we have been iterating forwards previously 927 uint32_t ignorable = TRUE; 928 int32_t firstce = strsrch->pattern.CE[0]; 929 930 setColEIterOffset(coleiter, start); 931 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 932 if (U_FAILURE(status)) { 933 return TRUE; 934 } 935 while (ce != firstce) { 936 if (ce != UCOL_IGNORABLE) { 937 ignorable = FALSE; 938 } 939 ce = getCE(strsrch, ucol_next(coleiter, &status)); 940 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 941 return TRUE; 942 } 943 } 944 if (!ignorable && inNormBuf(coleiter)) { 945 // within normalization buffer, discontiguous handled here 946 return TRUE; 947 } 948 949 // within text 950 int32_t temp = start; 951 // original code 952 // accent = (getFCD(strsrch->search->text, &temp, 953 // strsrch->search->textLength) 954 // >> SECOND_LAST_BYTE_SHIFT_); 955 // however this code does not work well with VC7 .net in release mode. 956 // maybe the inlines for getFCD combined with shifting has bugs in 957 // VC7. anyways this is a work around. 958 UBool accent = getFCD(strsrch->search->text, &temp, 959 strsrch->search->textLength) > 0xFF; 960 if (!accent) { 961 return checkExtraMatchAccents(strsrch, start, end, &status); 962 } 963 if (!ignorable) { 964 return TRUE; 965 } 966 if (start > 0) { 967 temp = start; 968 UTF_BACK_1(strsrch->search->text, 0, temp); 969 if (getFCD(strsrch->search->text, &temp, 970 strsrch->search->textLength) & LAST_BYTE_MASK_) { 971 setColEIterOffset(coleiter, start); 972 ce = ucol_previous(coleiter, &status); 973 if (U_FAILURE(status) || 974 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 975 return TRUE; 976 } 977 } 978 } 979 } 980 981 return FALSE; 982 } 983 984 /** 985 * Used by exact matches, checks if there are accents bounding the match. 986 * Note this is the initial boundary check. If the potential match 987 * starts or ends with composite characters, the accents in those 988 * characters will be determined later. 989 * Not doing backwards iteration here, since discontiguos contraction for 990 * backwards collation element iterator, use up too many characters. 991 * E.g. looking for \u030A ring in \u01FA A ring above and acute, 992 * should fail since there is a acute at the end of \u01FA 993 * Note here that accents checking are slow and cautioned in the API docs. 994 * @param strsrch string search data 995 * @param start offset of match 996 * @param end end offset of the match 997 * @return TRUE if there are accents on either side of the match, 998 * FALSE otherwise 999 */ 1000 static 1001 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 1002 int32_t end) 1003 { 1004 if (strsrch->pattern.hasSuffixAccents) { 1005 const UChar *text = strsrch->search->text; 1006 int32_t temp = end; 1007 int32_t textlength = strsrch->search->textLength; 1008 UTF_BACK_1(text, 0, temp); 1009 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1010 int32_t firstce = strsrch->pattern.CE[0]; 1011 UCollationElements *coleiter = strsrch->textIter; 1012 UErrorCode status = U_ZERO_ERROR; 1013 int32_t ce; 1014 setColEIterOffset(coleiter, start); 1015 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1016 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1017 return TRUE; 1018 } 1019 } 1020 int32_t count = 1; 1021 while (count < strsrch->pattern.CELength) { 1022 if (getCE(strsrch, ucol_next(coleiter, &status)) 1023 == UCOL_IGNORABLE) { 1024 // Thai can give an ignorable here. 1025 count --; 1026 } 1027 if (U_FAILURE(status)) { 1028 return TRUE; 1029 } 1030 count ++; 1031 } 1032 1033 ce = ucol_next(coleiter, &status); 1034 if (U_FAILURE(status)) { 1035 return TRUE; 1036 } 1037 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1038 ce = getCE(strsrch, ce); 1039 } 1040 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1041 if (ucol_getOffset(coleiter) <= end) { 1042 return TRUE; 1043 } 1044 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1045 return TRUE; 1046 } 1047 } 1048 } 1049 } 1050 return FALSE; 1051 } 1052 #endif // #if BOYER_MOORE 1053 1054 /** 1055 * Checks if the offset runs out of the text string 1056 * @param offset 1057 * @param textlength of the text string 1058 * @return TRUE if offset is out of bounds, FALSE otherwise 1059 */ 1060 static 1061 inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1062 { 1063 return offset < 0 || offset > textlength; 1064 } 1065 1066 /** 1067 * Checks for identical match 1068 * @param strsrch string search data 1069 * @param start offset of possible match 1070 * @param end offset of possible match 1071 * @return TRUE if identical match is found 1072 */ 1073 static 1074 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1075 int32_t end) 1076 { 1077 UChar t2[32], p2[32]; 1078 int32_t length = end - start; 1079 if (strsrch->strength != UCOL_IDENTICAL) { 1080 return TRUE; 1081 } 1082 1083 UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR; 1084 int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2), 1085 strsrch->search->text + start, length, 1086 FALSE, 0, &status); 1087 // use separate status2 in case of buffer overflow 1088 if (decomplength != unorm_decompose(p2, LENGTHOF(p2), 1089 strsrch->pattern.text, 1090 strsrch->pattern.textLength, 1091 FALSE, 0, &status2)) { 1092 return FALSE; // lengths are different 1093 } 1094 1095 // compare contents 1096 UChar *text, *pattern; 1097 if(U_SUCCESS(status)) { 1098 text = t2; 1099 pattern = p2; 1100 } else if(status==U_BUFFER_OVERFLOW_ERROR) { 1101 status = U_ZERO_ERROR; 1102 // allocate one buffer for both decompositions 1103 text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR); 1104 // Check for allocation failure. 1105 if (text == NULL) { 1106 return FALSE; 1107 } 1108 pattern = text + decomplength; 1109 unorm_decompose(text, decomplength, strsrch->search->text + start, 1110 length, FALSE, 0, &status); 1111 unorm_decompose(pattern, decomplength, strsrch->pattern.text, 1112 strsrch->pattern.textLength, FALSE, 0, &status); 1113 } else { 1114 // NFD failed, make sure that u_memcmp() does not overrun t2 & p2 1115 // and that we don't uprv_free() an undefined text pointer 1116 text = pattern = t2; 1117 decomplength = 0; 1118 } 1119 UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0); 1120 if(text != t2) { 1121 uprv_free(text); 1122 } 1123 // return FALSE if NFD failed 1124 return U_SUCCESS(status) && result; 1125 } 1126 1127 #if BOYER_MOORE 1128 /** 1129 * Checks to see if the match is repeated 1130 * @param strsrch string search data 1131 * @param start new match start index 1132 * @param end new match end index 1133 * @return TRUE if the the match is repeated, FALSE otherwise 1134 */ 1135 static 1136 inline UBool checkRepeatedMatch(UStringSearch *strsrch, 1137 int32_t start, 1138 int32_t end) 1139 { 1140 int32_t lastmatchindex = strsrch->search->matchedIndex; 1141 UBool result; 1142 if (lastmatchindex == USEARCH_DONE) { 1143 return FALSE; 1144 } 1145 if (strsrch->search->isForwardSearching) { 1146 result = start <= lastmatchindex; 1147 } 1148 else { 1149 result = start >= lastmatchindex; 1150 } 1151 if (!result && !strsrch->search->isOverlap) { 1152 if (strsrch->search->isForwardSearching) { 1153 result = start < lastmatchindex + strsrch->search->matchedLength; 1154 } 1155 else { 1156 result = end > lastmatchindex; 1157 } 1158 } 1159 return result; 1160 } 1161 1162 /** 1163 * Gets the collation element iterator's current offset. 1164 * @param coleiter collation element iterator 1165 * @param forwards flag TRUE if we are moving in th forwards direction 1166 * @return current offset 1167 */ 1168 static 1169 inline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1170 UBool forwards) 1171 { 1172 int32_t result = ucol_getOffset(coleiter); 1173 // intricacies of the the backwards collation element iterator 1174 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1175 result ++; 1176 } 1177 return result; 1178 } 1179 1180 /** 1181 * Checks match for contraction. 1182 * If the match ends with a partial contraction we fail. 1183 * If the match starts too far off (because of backwards iteration) we try to 1184 * chip off the extra characters depending on whether a breakiterator has 1185 * been used. 1186 * Internal method, error assumed to be success, caller has to check status 1187 * before calling this method. 1188 * @param strsrch string search data 1189 * @param start offset of potential match, to be modified if necessary 1190 * @param end offset of potential match, to be modified if necessary 1191 * @param status output error status if any 1192 * @return TRUE if match passes the contraction test, FALSE otherwise 1193 */ 1194 1195 static 1196 UBool checkNextExactContractionMatch(UStringSearch *strsrch, 1197 int32_t *start, 1198 int32_t *end, UErrorCode *status) 1199 { 1200 UCollationElements *coleiter = strsrch->textIter; 1201 int32_t textlength = strsrch->search->textLength; 1202 int32_t temp = *start; 1203 const UCollator *collator = strsrch->collator; 1204 const UChar *text = strsrch->search->text; 1205 // This part checks if either ends of the match contains potential 1206 // contraction. If so we'll have to iterate through them 1207 // The start contraction needs to be checked since ucol_previous dumps 1208 // all characters till the first safe character into the buffer. 1209 // *start + 1 is used to test for the unsafe characters instead of *start 1210 // because ucol_prev takes all unsafe characters till the first safe 1211 // character ie *start. so by testing *start + 1, we can estimate if 1212 // excess prefix characters has been included in the potential search 1213 // results. 1214 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1215 (*start + 1 < textlength 1216 && ucol_unsafeCP(text[*start + 1], collator))) { 1217 int32_t expansion = getExpansionPrefix(coleiter); 1218 UBool expandflag = expansion > 0; 1219 setColEIterOffset(coleiter, *start); 1220 while (expansion > 0) { 1221 // getting rid of the redundant ce, caused by setOffset. 1222 // since backward contraction/expansion may have extra ces if we 1223 // are in the normalization buffer, hasAccentsBeforeMatch would 1224 // have taken care of it. 1225 // E.g. the character \u01FA will have an expansion of 3, but if 1226 // we are only looking for acute and ring \u030A and \u0301, we'll 1227 // have to skip the first ce in the expansion buffer. 1228 ucol_next(coleiter, status); 1229 if (U_FAILURE(*status)) { 1230 return FALSE; 1231 } 1232 if (ucol_getOffset(coleiter) != temp) { 1233 *start = temp; 1234 temp = ucol_getOffset(coleiter); 1235 } 1236 expansion --; 1237 } 1238 1239 int32_t *patternce = strsrch->pattern.CE; 1240 int32_t patterncelength = strsrch->pattern.CELength; 1241 int32_t count = 0; 1242 while (count < patterncelength) { 1243 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1244 if (ce == UCOL_IGNORABLE) { 1245 continue; 1246 } 1247 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1248 *start = temp; 1249 temp = ucol_getOffset(coleiter); 1250 } 1251 if (U_FAILURE(*status) || ce != patternce[count]) { 1252 (*end) ++; 1253 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1254 return FALSE; 1255 } 1256 count ++; 1257 } 1258 } 1259 return TRUE; 1260 } 1261 1262 /** 1263 * Checks and sets the match information if found. 1264 * Checks 1265 * <ul> 1266 * <li> the potential match does not repeat the previous match 1267 * <li> boundaries are correct 1268 * <li> exact matches has no extra accents 1269 * <li> identical matchesb 1270 * <li> potential match does not end in the middle of a contraction 1271 * <\ul> 1272 * Otherwise the offset will be shifted to the next character. 1273 * Internal method, status assumed to be success, caller has to check status 1274 * before calling this method. 1275 * @param strsrch string search data 1276 * @param textoffset offset in the collation element text. the returned value 1277 * will be the truncated end offset of the match or the new start 1278 * search offset. 1279 * @param status output error status if any 1280 * @return TRUE if the match is valid, FALSE otherwise 1281 */ 1282 static 1283 inline UBool checkNextExactMatch(UStringSearch *strsrch, 1284 int32_t *textoffset, UErrorCode *status) 1285 { 1286 UCollationElements *coleiter = strsrch->textIter; 1287 int32_t start = getColElemIterOffset(coleiter, FALSE); 1288 1289 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1290 return FALSE; 1291 } 1292 1293 // this totally matches, however we need to check if it is repeating 1294 if (!isBreakUnit(strsrch, start, *textoffset) || 1295 checkRepeatedMatch(strsrch, start, *textoffset) || 1296 hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1297 !checkIdentical(strsrch, start, *textoffset) || 1298 hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1299 1300 (*textoffset) ++; 1301 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1302 return FALSE; 1303 } 1304 1305 //Add breakiterator boundary check for primary strength search. 1306 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1307 checkBreakBoundary(strsrch, &start, textoffset); 1308 } 1309 1310 // totally match, we will get rid of the ending ignorables. 1311 strsrch->search->matchedIndex = start; 1312 strsrch->search->matchedLength = *textoffset - start; 1313 return TRUE; 1314 } 1315 1316 /** 1317 * Getting the previous base character offset, or the current offset if the 1318 * current character is a base character 1319 * @param text string 1320 * @param textoffset one offset after the current character 1321 * @return the offset of the next character after the base character or the first 1322 * composed character with accents 1323 */ 1324 static 1325 inline int32_t getPreviousBaseOffset(const UChar *text, 1326 int32_t textoffset) 1327 { 1328 if (textoffset > 0) { 1329 for (;;) { 1330 int32_t result = textoffset; 1331 UTF_BACK_1(text, 0, textoffset); 1332 int32_t temp = textoffset; 1333 uint16_t fcd = getFCD(text, &temp, result); 1334 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1335 if (fcd & LAST_BYTE_MASK_) { 1336 return textoffset; 1337 } 1338 return result; 1339 } 1340 if (textoffset == 0) { 1341 return 0; 1342 } 1343 } 1344 } 1345 return textoffset; 1346 } 1347 1348 /** 1349 * Getting the indexes of the accents that are not blocked in the argument 1350 * accent array 1351 * @param accents array of accents in nfd terminated by a 0. 1352 * @param accentsindex array of indexes of the accents that are not blocked 1353 */ 1354 static 1355 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1356 { 1357 int32_t index = 0; 1358 int32_t length = u_strlen(accents); 1359 UChar32 codepoint = 0; 1360 int cclass = 0; 1361 int result = 0; 1362 int32_t temp; 1363 while (index < length) { 1364 temp = index; 1365 UTF_NEXT_CHAR(accents, index, length, codepoint); 1366 if (u_getCombiningClass(codepoint) != cclass) { 1367 cclass = u_getCombiningClass(codepoint); 1368 accentsindex[result] = temp; 1369 result ++; 1370 } 1371 } 1372 accentsindex[result] = length; 1373 return result; 1374 } 1375 1376 /** 1377 * Appends 3 UChar arrays to a destination array. 1378 * Creates a new array if we run out of space. The caller will have to 1379 * manually deallocate the newly allocated array. 1380 * Internal method, status assumed to be success, caller has to check status 1381 * before calling this method. destination not to be NULL and has at least 1382 * size destinationlength. 1383 * @param destination target array 1384 * @param destinationlength target array size, returning the appended length 1385 * @param source1 null-terminated first array 1386 * @param source2 second array 1387 * @param source2length length of seond array 1388 * @param source3 null-terminated third array 1389 * @param status error status if any 1390 * @return new destination array, destination if there was no new allocation 1391 */ 1392 static 1393 inline UChar * addToUCharArray( UChar *destination, 1394 int32_t *destinationlength, 1395 const UChar *source1, 1396 const UChar *source2, 1397 int32_t source2length, 1398 const UChar *source3, 1399 UErrorCode *status) 1400 { 1401 int32_t source1length = source1 ? u_strlen(source1) : 0; 1402 int32_t source3length = source3 ? u_strlen(source3) : 0; 1403 if (*destinationlength < source1length + source2length + source3length + 1404 1) 1405 { 1406 destination = (UChar *)allocateMemory( 1407 (source1length + source2length + source3length + 1) * sizeof(UChar), 1408 status); 1409 // if error allocating memory, status will be 1410 // U_MEMORY_ALLOCATION_ERROR 1411 if (U_FAILURE(*status)) { 1412 *destinationlength = 0; 1413 return NULL; 1414 } 1415 } 1416 if (source1length != 0) { 1417 uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1418 } 1419 if (source2length != 0) { 1420 uprv_memcpy(destination + source1length, source2, 1421 sizeof(UChar) * source2length); 1422 } 1423 if (source3length != 0) { 1424 uprv_memcpy(destination + source1length + source2length, source3, 1425 sizeof(UChar) * source3length); 1426 } 1427 *destinationlength = source1length + source2length + source3length; 1428 return destination; 1429 } 1430 1431 /** 1432 * Running through a collation element iterator to see if the contents matches 1433 * pattern in string search data 1434 * @param strsrch string search data 1435 * @param coleiter collation element iterator 1436 * @return TRUE if a match if found, FALSE otherwise 1437 */ 1438 static 1439 inline UBool checkCollationMatch(const UStringSearch *strsrch, 1440 UCollationElements *coleiter) 1441 { 1442 int patternceindex = strsrch->pattern.CELength; 1443 int32_t *patternce = strsrch->pattern.CE; 1444 UErrorCode status = U_ZERO_ERROR; 1445 while (patternceindex > 0) { 1446 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1447 if (ce == UCOL_IGNORABLE) { 1448 continue; 1449 } 1450 if (U_FAILURE(status) || ce != *patternce) { 1451 return FALSE; 1452 } 1453 patternce ++; 1454 patternceindex --; 1455 } 1456 return TRUE; 1457 } 1458 1459 /** 1460 * Rearranges the front accents to try matching. 1461 * Prefix accents in the text will be grouped according to their combining 1462 * class and the groups will be mixed and matched to try find the perfect 1463 * match with the pattern. 1464 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1465 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1466 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1467 * "\u0301\u0325". 1468 * step 2: check if any of the generated substrings matches the pattern. 1469 * Internal method, status is assumed to be success, caller has to check status 1470 * before calling this method. 1471 * @param strsrch string search match 1472 * @param start first offset of the accents to start searching 1473 * @param end start of the last accent set 1474 * @param status output error status if any 1475 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1476 * offset of the match. Note this start includes all preceding accents. 1477 */ 1478 static 1479 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1480 int32_t start, 1481 int32_t end, 1482 UErrorCode *status) 1483 { 1484 const UChar *text = strsrch->search->text; 1485 int32_t textlength = strsrch->search->textLength; 1486 int32_t tempstart = start; 1487 1488 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1489 // die... failed at a base character 1490 return USEARCH_DONE; 1491 } 1492 1493 int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1494 start = getPreviousBaseOffset(text, tempstart); 1495 1496 UChar accents[INITIAL_ARRAY_SIZE_]; 1497 // normalizing the offensive string 1498 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1499 INITIAL_ARRAY_SIZE_, status); 1500 if (U_FAILURE(*status)) { 1501 return USEARCH_DONE; 1502 } 1503 1504 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1505 int32_t accentsize = getUnblockedAccentIndex(accents, 1506 accentsindex); 1507 int32_t count = (2 << (accentsize - 1)) - 1; 1508 UChar buffer[INITIAL_ARRAY_SIZE_]; 1509 UCollationElements *coleiter = strsrch->utilIter; 1510 while (U_SUCCESS(*status) && count > 0) { 1511 UChar *rearrange = strsrch->canonicalPrefixAccents; 1512 // copy the base characters 1513 for (int k = 0; k < accentsindex[0]; k ++) { 1514 *rearrange ++ = accents[k]; 1515 } 1516 // forming all possible canonical rearrangement by dropping 1517 // sets of accents 1518 for (int i = 0; i <= accentsize - 1; i ++) { 1519 int32_t mask = 1 << (accentsize - i - 1); 1520 if (count & mask) { 1521 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1522 *rearrange ++ = accents[j]; 1523 } 1524 } 1525 } 1526 *rearrange = 0; 1527 int32_t matchsize = INITIAL_ARRAY_SIZE_; 1528 UChar *match = addToUCharArray(buffer, &matchsize, 1529 strsrch->canonicalPrefixAccents, 1530 strsrch->search->text + offset, 1531 end - offset, 1532 strsrch->canonicalSuffixAccents, 1533 status); 1534 1535 // if status is a failure, ucol_setText does nothing. 1536 // run the collator iterator through this match 1537 ucol_setText(coleiter, match, matchsize, status); 1538 if (U_SUCCESS(*status)) { 1539 if (checkCollationMatch(strsrch, coleiter)) { 1540 if (match != buffer) { 1541 uprv_free(match); 1542 } 1543 return start; 1544 } 1545 } 1546 count --; 1547 } 1548 return USEARCH_DONE; 1549 } 1550 1551 /** 1552 * Gets the offset to the safe point in text before textoffset. 1553 * ie. not the middle of a contraction, swappable characters or supplementary 1554 * characters. 1555 * @param collator collation sata 1556 * @param text string to work with 1557 * @param textoffset offset in string 1558 * @param textlength length of text string 1559 * @return offset to the previous safe character 1560 */ 1561 static 1562 inline uint32_t getPreviousSafeOffset(const UCollator *collator, 1563 const UChar *text, 1564 int32_t textoffset) 1565 { 1566 int32_t result = textoffset; // first contraction character 1567 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1568 result --; 1569 } 1570 if (result != 0) { 1571 // the first contraction character is consider unsafe here 1572 result --; 1573 } 1574 return result; 1575 } 1576 1577 /** 1578 * Cleaning up after we passed the safe zone 1579 * @param strsrch string search data 1580 * @param safetext safe text array 1581 * @param safebuffer safe text buffer 1582 * @param coleiter collation element iterator for safe text 1583 */ 1584 static 1585 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1586 UChar *safebuffer) 1587 { 1588 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1589 { 1590 uprv_free(safetext); 1591 } 1592 } 1593 1594 /** 1595 * Take the rearranged end accents and tries matching. If match failed at 1596 * a seperate preceding set of accents (seperated from the rearranged on by 1597 * at least a base character) then we rearrange the preceding accents and 1598 * tries matching again. 1599 * We allow skipping of the ends of the accent set if the ces do not match. 1600 * However if the failure is found before the accent set, it fails. 1601 * Internal method, status assumed to be success, caller has to check status 1602 * before calling this method. 1603 * @param strsrch string search data 1604 * @param textoffset of the start of the rearranged accent 1605 * @param status output error status if any 1606 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1607 * offset of the match. Note this start includes all preceding accents. 1608 */ 1609 static 1610 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1611 int32_t textoffset, 1612 UErrorCode *status) 1613 { 1614 const UChar *text = strsrch->search->text; 1615 const UCollator *collator = strsrch->collator; 1616 int32_t safelength = 0; 1617 UChar *safetext; 1618 int32_t safetextlength; 1619 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1620 UCollationElements *coleiter = strsrch->utilIter; 1621 int32_t safeoffset = textoffset; 1622 1623 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1624 collator)) { 1625 safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1626 safelength = textoffset - safeoffset; 1627 safetextlength = INITIAL_ARRAY_SIZE_; 1628 safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1629 text + safeoffset, safelength, 1630 strsrch->canonicalSuffixAccents, 1631 status); 1632 } 1633 else { 1634 safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1635 safetext = strsrch->canonicalSuffixAccents; 1636 } 1637 1638 // if status is a failure, ucol_setText does nothing 1639 ucol_setText(coleiter, safetext, safetextlength, status); 1640 // status checked in loop below 1641 1642 int32_t *ce = strsrch->pattern.CE; 1643 int32_t celength = strsrch->pattern.CELength; 1644 int ceindex = celength - 1; 1645 UBool isSafe = TRUE; // indication flag for position in safe zone 1646 1647 while (ceindex >= 0) { 1648 int32_t textce = ucol_previous(coleiter, status); 1649 if (U_FAILURE(*status)) { 1650 if (isSafe) { 1651 cleanUpSafeText(strsrch, safetext, safebuffer); 1652 } 1653 return USEARCH_DONE; 1654 } 1655 if (textce == UCOL_NULLORDER) { 1656 // check if we have passed the safe buffer 1657 if (coleiter == strsrch->textIter) { 1658 cleanUpSafeText(strsrch, safetext, safebuffer); 1659 return USEARCH_DONE; 1660 } 1661 cleanUpSafeText(strsrch, safetext, safebuffer); 1662 safetext = safebuffer; 1663 coleiter = strsrch->textIter; 1664 setColEIterOffset(coleiter, safeoffset); 1665 // status checked at the start of the loop 1666 isSafe = FALSE; 1667 continue; 1668 } 1669 textce = getCE(strsrch, textce); 1670 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1671 // do the beginning stuff 1672 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1673 if (isSafe && failedoffset >= safelength) { 1674 // alas... no hope. failed at rearranged accent set 1675 cleanUpSafeText(strsrch, safetext, safebuffer); 1676 return USEARCH_DONE; 1677 } 1678 else { 1679 if (isSafe) { 1680 failedoffset += safeoffset; 1681 cleanUpSafeText(strsrch, safetext, safebuffer); 1682 } 1683 1684 // try rearranging the front accents 1685 int32_t result = doNextCanonicalPrefixMatch(strsrch, 1686 failedoffset, textoffset, status); 1687 if (result != USEARCH_DONE) { 1688 // if status is a failure, ucol_setOffset does nothing 1689 setColEIterOffset(strsrch->textIter, result); 1690 } 1691 if (U_FAILURE(*status)) { 1692 return USEARCH_DONE; 1693 } 1694 return result; 1695 } 1696 } 1697 if (textce == ce[ceindex]) { 1698 ceindex --; 1699 } 1700 } 1701 // set offset here 1702 if (isSafe) { 1703 int32_t result = getColElemIterOffset(coleiter, FALSE); 1704 // sets the text iterator here with the correct expansion and offset 1705 int32_t leftoverces = getExpansionPrefix(coleiter); 1706 cleanUpSafeText(strsrch, safetext, safebuffer); 1707 if (result >= safelength) { 1708 result = textoffset; 1709 } 1710 else { 1711 result += safeoffset; 1712 } 1713 setColEIterOffset(strsrch->textIter, result); 1714 strsrch->textIter->iteratordata_.toReturn = 1715 setExpansionPrefix(strsrch->textIter, leftoverces); 1716 return result; 1717 } 1718 1719 return ucol_getOffset(coleiter); 1720 } 1721 1722 /** 1723 * Trying out the substring and sees if it can be a canonical match. 1724 * This will try normalizing the end accents and arranging them into canonical 1725 * equivalents and check their corresponding ces with the pattern ce. 1726 * Suffix accents in the text will be grouped according to their combining 1727 * class and the groups will be mixed and matched to try find the perfect 1728 * match with the pattern. 1729 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1730 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1731 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1732 * "\u0301\u0325". 1733 * step 2: check if any of the generated substrings matches the pattern. 1734 * Internal method, status assumed to be success, caller has to check status 1735 * before calling this method. 1736 * @param strsrch string search data 1737 * @param textoffset end offset in the collation element text that ends with 1738 * the accents to be rearranged 1739 * @param status error status if any 1740 * @return TRUE if the match is valid, FALSE otherwise 1741 */ 1742 static 1743 UBool doNextCanonicalMatch(UStringSearch *strsrch, 1744 int32_t textoffset, 1745 UErrorCode *status) 1746 { 1747 const UChar *text = strsrch->search->text; 1748 int32_t temp = textoffset; 1749 UTF_BACK_1(text, 0, temp); 1750 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1751 UCollationElements *coleiter = strsrch->textIter; 1752 int32_t offset = getColElemIterOffset(coleiter, FALSE); 1753 if (strsrch->pattern.hasPrefixAccents) { 1754 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1755 status); 1756 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1757 setColEIterOffset(coleiter, offset); 1758 return TRUE; 1759 } 1760 } 1761 return FALSE; 1762 } 1763 1764 if (!strsrch->pattern.hasSuffixAccents) { 1765 return FALSE; 1766 } 1767 1768 UChar accents[INITIAL_ARRAY_SIZE_]; 1769 // offset to the last base character in substring to search 1770 int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1771 // normalizing the offensive string 1772 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1773 0, accents, INITIAL_ARRAY_SIZE_, status); 1774 // status checked in loop below 1775 1776 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1777 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1778 1779 // 2 power n - 1 plus the full set of accents 1780 int32_t count = (2 << (size - 1)) - 1; 1781 while (U_SUCCESS(*status) && count > 0) { 1782 UChar *rearrange = strsrch->canonicalSuffixAccents; 1783 // copy the base characters 1784 for (int k = 0; k < accentsindex[0]; k ++) { 1785 *rearrange ++ = accents[k]; 1786 } 1787 // forming all possible canonical rearrangement by dropping 1788 // sets of accents 1789 for (int i = 0; i <= size - 1; i ++) { 1790 int32_t mask = 1 << (size - i - 1); 1791 if (count & mask) { 1792 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1793 *rearrange ++ = accents[j]; 1794 } 1795 } 1796 } 1797 *rearrange = 0; 1798 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1799 status); 1800 if (offset != USEARCH_DONE) { 1801 return TRUE; // match found 1802 } 1803 count --; 1804 } 1805 return FALSE; 1806 } 1807 1808 /** 1809 * Gets the previous base character offset depending on the string search 1810 * pattern data 1811 * @param strsrch string search data 1812 * @param textoffset current offset, current character 1813 * @return the offset of the next character after this base character or itself 1814 * if it is a composed character with accents 1815 */ 1816 static 1817 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1818 int32_t textoffset) 1819 { 1820 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1821 const UChar *text = strsrch->search->text; 1822 int32_t offset = textoffset; 1823 if (getFCD(text, &offset, strsrch->search->textLength) >> 1824 SECOND_LAST_BYTE_SHIFT_) { 1825 return getPreviousBaseOffset(text, textoffset); 1826 } 1827 } 1828 return textoffset; 1829 } 1830 1831 /** 1832 * Checks match for contraction. 1833 * If the match ends with a partial contraction we fail. 1834 * If the match starts too far off (because of backwards iteration) we try to 1835 * chip off the extra characters 1836 * Internal method, status assumed to be success, caller has to check status 1837 * before calling this method. 1838 * @param strsrch string search data 1839 * @param start offset of potential match, to be modified if necessary 1840 * @param end offset of potential match, to be modified if necessary 1841 * @param status output error status if any 1842 * @return TRUE if match passes the contraction test, FALSE otherwise 1843 */ 1844 static 1845 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1846 int32_t *start, 1847 int32_t *end, 1848 UErrorCode *status) 1849 { 1850 UCollationElements *coleiter = strsrch->textIter; 1851 int32_t textlength = strsrch->search->textLength; 1852 int32_t temp = *start; 1853 const UCollator *collator = strsrch->collator; 1854 const UChar *text = strsrch->search->text; 1855 // This part checks if either ends of the match contains potential 1856 // contraction. If so we'll have to iterate through them 1857 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1858 (*start + 1 < textlength 1859 && ucol_unsafeCP(text[*start + 1], collator))) { 1860 int32_t expansion = getExpansionPrefix(coleiter); 1861 UBool expandflag = expansion > 0; 1862 setColEIterOffset(coleiter, *start); 1863 while (expansion > 0) { 1864 // getting rid of the redundant ce, caused by setOffset. 1865 // since backward contraction/expansion may have extra ces if we 1866 // are in the normalization buffer, hasAccentsBeforeMatch would 1867 // have taken care of it. 1868 // E.g. the character \u01FA will have an expansion of 3, but if 1869 // we are only looking for acute and ring \u030A and \u0301, we'll 1870 // have to skip the first ce in the expansion buffer. 1871 ucol_next(coleiter, status); 1872 if (U_FAILURE(*status)) { 1873 return FALSE; 1874 } 1875 if (ucol_getOffset(coleiter) != temp) { 1876 *start = temp; 1877 temp = ucol_getOffset(coleiter); 1878 } 1879 expansion --; 1880 } 1881 1882 int32_t *patternce = strsrch->pattern.CE; 1883 int32_t patterncelength = strsrch->pattern.CELength; 1884 int32_t count = 0; 1885 int32_t textlength = strsrch->search->textLength; 1886 while (count < patterncelength) { 1887 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1888 // status checked below, note that if status is a failure 1889 // ucol_next returns UCOL_NULLORDER 1890 if (ce == UCOL_IGNORABLE) { 1891 continue; 1892 } 1893 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1894 *start = temp; 1895 temp = ucol_getOffset(coleiter); 1896 } 1897 1898 if (count == 0 && ce != patternce[0]) { 1899 // accents may have extra starting ces, this occurs when a 1900 // pure accent pattern is matched without rearrangement 1901 // text \u0325\u0300 and looking for \u0300 1902 int32_t expected = patternce[0]; 1903 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1904 ce = getCE(strsrch, ucol_next(coleiter, status)); 1905 while (U_SUCCESS(*status) && ce != expected && 1906 ce != UCOL_NULLORDER && 1907 ucol_getOffset(coleiter) <= *end) { 1908 ce = getCE(strsrch, ucol_next(coleiter, status)); 1909 } 1910 } 1911 } 1912 if (U_FAILURE(*status) || ce != patternce[count]) { 1913 (*end) ++; 1914 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1915 return FALSE; 1916 } 1917 count ++; 1918 } 1919 } 1920 return TRUE; 1921 } 1922 1923 /** 1924 * Checks and sets the match information if found. 1925 * Checks 1926 * <ul> 1927 * <li> the potential match does not repeat the previous match 1928 * <li> boundaries are correct 1929 * <li> potential match does not end in the middle of a contraction 1930 * <li> identical matches 1931 * <\ul> 1932 * Otherwise the offset will be shifted to the next character. 1933 * Internal method, status assumed to be success, caller has to check the 1934 * status before calling this method. 1935 * @param strsrch string search data 1936 * @param textoffset offset in the collation element text. the returned value 1937 * will be the truncated end offset of the match or the new start 1938 * search offset. 1939 * @param status output error status if any 1940 * @return TRUE if the match is valid, FALSE otherwise 1941 */ 1942 static 1943 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1944 int32_t *textoffset, 1945 UErrorCode *status) 1946 { 1947 // to ensure that the start and ends are not composite characters 1948 UCollationElements *coleiter = strsrch->textIter; 1949 // if we have a canonical accent match 1950 if ((strsrch->pattern.hasSuffixAccents && 1951 strsrch->canonicalSuffixAccents[0]) || 1952 (strsrch->pattern.hasPrefixAccents && 1953 strsrch->canonicalPrefixAccents[0])) { 1954 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1955 strsrch, 1956 ucol_getOffset(coleiter)); 1957 strsrch->search->matchedLength = *textoffset - 1958 strsrch->search->matchedIndex; 1959 return TRUE; 1960 } 1961 1962 int32_t start = getColElemIterOffset(coleiter, FALSE); 1963 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1964 status) || U_FAILURE(*status)) { 1965 return FALSE; 1966 } 1967 1968 start = getPreviousUStringSearchBaseOffset(strsrch, start); 1969 // this totally matches, however we need to check if it is repeating 1970 if (checkRepeatedMatch(strsrch, start, *textoffset) || 1971 !isBreakUnit(strsrch, start, *textoffset) || 1972 !checkIdentical(strsrch, start, *textoffset)) { 1973 (*textoffset) ++; 1974 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1975 strsrch->search->textLength); 1976 return FALSE; 1977 } 1978 1979 strsrch->search->matchedIndex = start; 1980 strsrch->search->matchedLength = *textoffset - start; 1981 return TRUE; 1982 } 1983 1984 /** 1985 * Shifting the collation element iterator position forward to prepare for 1986 * a preceding match. If the first character is a unsafe character, we'll only 1987 * shift by 1 to capture contractions, normalization etc. 1988 * Internal method, status assumed to be success, caller has to check status 1989 * before calling this method. 1990 * @param text strsrch string search data 1991 * @param textoffset start text position to do search 1992 * @param ce the text ce which failed the match. 1993 * @param patternceindex index of the ce within the pattern ce buffer which 1994 * failed the match 1995 * @return final offset 1996 */ 1997 static 1998 inline int32_t reverseShift(UStringSearch *strsrch, 1999 int32_t textoffset, 2000 int32_t ce, 2001 int32_t patternceindex) 2002 { 2003 if (strsrch->search->isOverlap) { 2004 if (textoffset != strsrch->search->textLength) { 2005 textoffset --; 2006 } 2007 else { 2008 textoffset -= strsrch->pattern.defaultShiftSize; 2009 } 2010 } 2011 else { 2012 if (ce != UCOL_NULLORDER) { 2013 int32_t shift = strsrch->pattern.backShift[hash(ce)]; 2014 2015 // this is to adjust for characters in the middle of the substring 2016 // for matching that failed. 2017 int32_t adjust = patternceindex; 2018 if (adjust > 1 && shift > adjust) { 2019 shift -= adjust - 1; 2020 } 2021 textoffset -= shift; 2022 } 2023 else { 2024 textoffset -= strsrch->pattern.defaultShiftSize; 2025 } 2026 } 2027 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 2028 return textoffset; 2029 } 2030 2031 /** 2032 * Checks match for contraction. 2033 * If the match starts with a partial contraction we fail. 2034 * Internal method, status assumed to be success, caller has to check status 2035 * before calling this method. 2036 * @param strsrch string search data 2037 * @param start offset of potential match, to be modified if necessary 2038 * @param end offset of potential match, to be modified if necessary 2039 * @param status output error status if any 2040 * @return TRUE if match passes the contraction test, FALSE otherwise 2041 */ 2042 static 2043 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2044 int32_t *start, 2045 int32_t *end, UErrorCode *status) 2046 { 2047 UCollationElements *coleiter = strsrch->textIter; 2048 int32_t textlength = strsrch->search->textLength; 2049 int32_t temp = *end; 2050 const UCollator *collator = strsrch->collator; 2051 const UChar *text = strsrch->search->text; 2052 // This part checks if either if the start of the match contains potential 2053 // contraction. If so we'll have to iterate through them 2054 // Since we used ucol_next while previously looking for the potential 2055 // match, this guarantees that our end will not be a partial contraction, 2056 // or a partial supplementary character. 2057 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2058 int32_t expansion = getExpansionSuffix(coleiter); 2059 UBool expandflag = expansion > 0; 2060 setColEIterOffset(coleiter, *end); 2061 while (U_SUCCESS(*status) && expansion > 0) { 2062 // getting rid of the redundant ce 2063 // since forward contraction/expansion may have extra ces 2064 // if we are in the normalization buffer, hasAccentsBeforeMatch 2065 // would have taken care of it. 2066 // E.g. the character \u01FA will have an expansion of 3, but if 2067 // we are only looking for A ring A\u030A, we'll have to skip the 2068 // last ce in the expansion buffer 2069 ucol_previous(coleiter, status); 2070 if (U_FAILURE(*status)) { 2071 return FALSE; 2072 } 2073 if (ucol_getOffset(coleiter) != temp) { 2074 *end = temp; 2075 temp = ucol_getOffset(coleiter); 2076 } 2077 expansion --; 2078 } 2079 2080 int32_t *patternce = strsrch->pattern.CE; 2081 int32_t patterncelength = strsrch->pattern.CELength; 2082 int32_t count = patterncelength; 2083 while (count > 0) { 2084 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2085 // status checked below, note that if status is a failure 2086 // ucol_previous returns UCOL_NULLORDER 2087 if (ce == UCOL_IGNORABLE) { 2088 continue; 2089 } 2090 if (expandflag && count == 0 && 2091 getColElemIterOffset(coleiter, FALSE) != temp) { 2092 *end = temp; 2093 temp = ucol_getOffset(coleiter); 2094 } 2095 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2096 (*start) --; 2097 *start = getPreviousBaseOffset(text, *start); 2098 return FALSE; 2099 } 2100 count --; 2101 } 2102 } 2103 return TRUE; 2104 } 2105 2106 /** 2107 * Checks and sets the match information if found. 2108 * Checks 2109 * <ul> 2110 * <li> the current match does not repeat the last match 2111 * <li> boundaries are correct 2112 * <li> exact matches has no extra accents 2113 * <li> identical matches 2114 * <\ul> 2115 * Otherwise the offset will be shifted to the preceding character. 2116 * Internal method, status assumed to be success, caller has to check status 2117 * before calling this method. 2118 * @param strsrch string search data 2119 * @param collator 2120 * @param coleiter collation element iterator 2121 * @param text string 2122 * @param textoffset offset in the collation element text. the returned value 2123 * will be the truncated start offset of the match or the new start 2124 * search offset. 2125 * @param status output error status if any 2126 * @return TRUE if the match is valid, FALSE otherwise 2127 */ 2128 static 2129 inline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2130 int32_t *textoffset, 2131 UErrorCode *status) 2132 { 2133 // to ensure that the start and ends are not composite characters 2134 int32_t end = ucol_getOffset(strsrch->textIter); 2135 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2136 || U_FAILURE(*status)) { 2137 return FALSE; 2138 } 2139 2140 // this totally matches, however we need to check if it is repeating 2141 // the old match 2142 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2143 !isBreakUnit(strsrch, *textoffset, end) || 2144 hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2145 !checkIdentical(strsrch, *textoffset, end) || 2146 hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2147 (*textoffset) --; 2148 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2149 *textoffset); 2150 return FALSE; 2151 } 2152 2153 //Add breakiterator boundary check for primary strength search. 2154 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2155 checkBreakBoundary(strsrch, textoffset, &end); 2156 } 2157 2158 strsrch->search->matchedIndex = *textoffset; 2159 strsrch->search->matchedLength = end - *textoffset; 2160 return TRUE; 2161 } 2162 2163 /** 2164 * Rearranges the end accents to try matching. 2165 * Suffix accents in the text will be grouped according to their combining 2166 * class and the groups will be mixed and matched to try find the perfect 2167 * match with the pattern. 2168 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2169 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2170 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2171 * "\u0301\u0325". 2172 * step 2: check if any of the generated substrings matches the pattern. 2173 * Internal method, status assumed to be success, user has to check status 2174 * before calling this method. 2175 * @param strsrch string search match 2176 * @param start offset of the first base character 2177 * @param end start of the last accent set 2178 * @param status only error status if any 2179 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2180 * offset of the match. Note this start includes all following accents. 2181 */ 2182 static 2183 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2184 int32_t start, 2185 int32_t end, 2186 UErrorCode *status) 2187 { 2188 const UChar *text = strsrch->search->text; 2189 int32_t tempend = end; 2190 2191 UTF_BACK_1(text, 0, tempend); 2192 if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2193 LAST_BYTE_MASK_)) { 2194 // die... failed at a base character 2195 return USEARCH_DONE; 2196 } 2197 end = getNextBaseOffset(text, end, strsrch->search->textLength); 2198 2199 if (U_SUCCESS(*status)) { 2200 UChar accents[INITIAL_ARRAY_SIZE_]; 2201 int32_t offset = getPreviousBaseOffset(text, end); 2202 // normalizing the offensive string 2203 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2204 INITIAL_ARRAY_SIZE_, status); 2205 2206 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2207 int32_t accentsize = getUnblockedAccentIndex(accents, 2208 accentsindex); 2209 int32_t count = (2 << (accentsize - 1)) - 1; 2210 UChar buffer[INITIAL_ARRAY_SIZE_]; 2211 UCollationElements *coleiter = strsrch->utilIter; 2212 while (U_SUCCESS(*status) && count > 0) { 2213 UChar *rearrange = strsrch->canonicalSuffixAccents; 2214 // copy the base characters 2215 for (int k = 0; k < accentsindex[0]; k ++) { 2216 *rearrange ++ = accents[k]; 2217 } 2218 // forming all possible canonical rearrangement by dropping 2219 // sets of accents 2220 for (int i = 0; i <= accentsize - 1; i ++) { 2221 int32_t mask = 1 << (accentsize - i - 1); 2222 if (count & mask) { 2223 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2224 *rearrange ++ = accents[j]; 2225 } 2226 } 2227 } 2228 *rearrange = 0; 2229 int32_t matchsize = INITIAL_ARRAY_SIZE_; 2230 UChar *match = addToUCharArray(buffer, &matchsize, 2231 strsrch->canonicalPrefixAccents, 2232 strsrch->search->text + start, 2233 offset - start, 2234 strsrch->canonicalSuffixAccents, 2235 status); 2236 2237 // run the collator iterator through this match 2238 // if status is a failure ucol_setText does nothing 2239 ucol_setText(coleiter, match, matchsize, status); 2240 if (U_SUCCESS(*status)) { 2241 if (checkCollationMatch(strsrch, coleiter)) { 2242 if (match != buffer) { 2243 uprv_free(match); 2244 } 2245 return end; 2246 } 2247 } 2248 count --; 2249 } 2250 } 2251 return USEARCH_DONE; 2252 } 2253 2254 /** 2255 * Take the rearranged start accents and tries matching. If match failed at 2256 * a seperate following set of accents (seperated from the rearranged on by 2257 * at least a base character) then we rearrange the preceding accents and 2258 * tries matching again. 2259 * We allow skipping of the ends of the accent set if the ces do not match. 2260 * However if the failure is found before the accent set, it fails. 2261 * Internal method, status assumed to be success, caller has to check status 2262 * before calling this method. 2263 * @param strsrch string search data 2264 * @param textoffset of the ends of the rearranged accent 2265 * @param status output error status if any 2266 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2267 * offset of the match. Note this start includes all following accents. 2268 */ 2269 static 2270 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2271 int32_t textoffset, 2272 UErrorCode *status) 2273 { 2274 const UChar *text = strsrch->search->text; 2275 const UCollator *collator = strsrch->collator; 2276 int32_t safelength = 0; 2277 UChar *safetext; 2278 int32_t safetextlength; 2279 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2280 int32_t safeoffset = textoffset; 2281 2282 if (textoffset && 2283 ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2284 u_strlen(strsrch->canonicalPrefixAccents) - 1 2285 ], collator)) { 2286 safeoffset = getNextSafeOffset(collator, text, textoffset, 2287 strsrch->search->textLength); 2288 safelength = safeoffset - textoffset; 2289 safetextlength = INITIAL_ARRAY_SIZE_; 2290 safetext = addToUCharArray(safebuffer, &safetextlength, 2291 strsrch->canonicalPrefixAccents, 2292 text + textoffset, safelength, 2293 NULL, status); 2294 } 2295 else { 2296 safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2297 safetext = strsrch->canonicalPrefixAccents; 2298 } 2299 2300 UCollationElements *coleiter = strsrch->utilIter; 2301 // if status is a failure, ucol_setText does nothing 2302 ucol_setText(coleiter, safetext, safetextlength, status); 2303 // status checked in loop below 2304 2305 int32_t *ce = strsrch->pattern.CE; 2306 int32_t celength = strsrch->pattern.CELength; 2307 int ceindex = 0; 2308 UBool isSafe = TRUE; // safe zone indication flag for position 2309 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2310 2311 while (ceindex < celength) { 2312 int32_t textce = ucol_next(coleiter, status); 2313 if (U_FAILURE(*status)) { 2314 if (isSafe) { 2315 cleanUpSafeText(strsrch, safetext, safebuffer); 2316 } 2317 return USEARCH_DONE; 2318 } 2319 if (textce == UCOL_NULLORDER) { 2320 // check if we have passed the safe buffer 2321 if (coleiter == strsrch->textIter) { 2322 cleanUpSafeText(strsrch, safetext, safebuffer); 2323 return USEARCH_DONE; 2324 } 2325 cleanUpSafeText(strsrch, safetext, safebuffer); 2326 safetext = safebuffer; 2327 coleiter = strsrch->textIter; 2328 setColEIterOffset(coleiter, safeoffset); 2329 // status checked at the start of the loop 2330 isSafe = FALSE; 2331 continue; 2332 } 2333 textce = getCE(strsrch, textce); 2334 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2335 // do the beginning stuff 2336 int32_t failedoffset = ucol_getOffset(coleiter); 2337 if (isSafe && failedoffset <= prefixlength) { 2338 // alas... no hope. failed at rearranged accent set 2339 cleanUpSafeText(strsrch, safetext, safebuffer); 2340 return USEARCH_DONE; 2341 } 2342 else { 2343 if (isSafe) { 2344 failedoffset = safeoffset - failedoffset; 2345 cleanUpSafeText(strsrch, safetext, safebuffer); 2346 } 2347 2348 // try rearranging the end accents 2349 int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2350 textoffset, failedoffset, status); 2351 if (result != USEARCH_DONE) { 2352 // if status is a failure, ucol_setOffset does nothing 2353 setColEIterOffset(strsrch->textIter, result); 2354 } 2355 if (U_FAILURE(*status)) { 2356 return USEARCH_DONE; 2357 } 2358 return result; 2359 } 2360 } 2361 if (textce == ce[ceindex]) { 2362 ceindex ++; 2363 } 2364 } 2365 // set offset here 2366 if (isSafe) { 2367 int32_t result = ucol_getOffset(coleiter); 2368 // sets the text iterator here with the correct expansion and offset 2369 int32_t leftoverces = getExpansionSuffix(coleiter); 2370 cleanUpSafeText(strsrch, safetext, safebuffer); 2371 if (result <= prefixlength) { 2372 result = textoffset; 2373 } 2374 else { 2375 result = textoffset + (safeoffset - result); 2376 } 2377 setColEIterOffset(strsrch->textIter, result); 2378 setExpansionSuffix(strsrch->textIter, leftoverces); 2379 return result; 2380 } 2381 2382 return ucol_getOffset(coleiter); 2383 } 2384 2385 /** 2386 * Trying out the substring and sees if it can be a canonical match. 2387 * This will try normalizing the starting accents and arranging them into 2388 * canonical equivalents and check their corresponding ces with the pattern ce. 2389 * Prefix accents in the text will be grouped according to their combining 2390 * class and the groups will be mixed and matched to try find the perfect 2391 * match with the pattern. 2392 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2393 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2394 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2395 * "\u0301\u0325". 2396 * step 2: check if any of the generated substrings matches the pattern. 2397 * Internal method, status assumed to be success, caller has to check status 2398 * before calling this method. 2399 * @param strsrch string search data 2400 * @param textoffset start offset in the collation element text that starts 2401 * with the accents to be rearranged 2402 * @param status output error status if any 2403 * @return TRUE if the match is valid, FALSE otherwise 2404 */ 2405 static 2406 UBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2407 int32_t textoffset, 2408 UErrorCode *status) 2409 { 2410 const UChar *text = strsrch->search->text; 2411 int32_t temp = textoffset; 2412 int32_t textlength = strsrch->search->textLength; 2413 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2414 UCollationElements *coleiter = strsrch->textIter; 2415 int32_t offset = ucol_getOffset(coleiter); 2416 if (strsrch->pattern.hasSuffixAccents) { 2417 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2418 offset, status); 2419 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2420 setColEIterOffset(coleiter, offset); 2421 return TRUE; 2422 } 2423 } 2424 return FALSE; 2425 } 2426 2427 if (!strsrch->pattern.hasPrefixAccents) { 2428 return FALSE; 2429 } 2430 2431 UChar accents[INITIAL_ARRAY_SIZE_]; 2432 // offset to the last base character in substring to search 2433 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2434 // normalizing the offensive string 2435 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2436 0, accents, INITIAL_ARRAY_SIZE_, status); 2437 // status checked in loop 2438 2439 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2440 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2441 2442 // 2 power n - 1 plus the full set of accents 2443 int32_t count = (2 << (size - 1)) - 1; 2444 while (U_SUCCESS(*status) && count > 0) { 2445 UChar *rearrange = strsrch->canonicalPrefixAccents; 2446 // copy the base characters 2447 for (int k = 0; k < accentsindex[0]; k ++) { 2448 *rearrange ++ = accents[k]; 2449 } 2450 // forming all possible canonical rearrangement by dropping 2451 // sets of accents 2452 for (int i = 0; i <= size - 1; i ++) { 2453 int32_t mask = 1 << (size - i - 1); 2454 if (count & mask) { 2455 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2456 *rearrange ++ = accents[j]; 2457 } 2458 } 2459 } 2460 *rearrange = 0; 2461 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2462 baseoffset, status); 2463 if (offset != USEARCH_DONE) { 2464 return TRUE; // match found 2465 } 2466 count --; 2467 } 2468 return FALSE; 2469 } 2470 2471 /** 2472 * Checks match for contraction. 2473 * If the match starts with a partial contraction we fail. 2474 * Internal method, status assumed to be success, caller has to check status 2475 * before calling this method. 2476 * @param strsrch string search data 2477 * @param start offset of potential match, to be modified if necessary 2478 * @param end offset of potential match, to be modified if necessary 2479 * @param status only error status if any 2480 * @return TRUE if match passes the contraction test, FALSE otherwise 2481 */ 2482 static 2483 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2484 int32_t *start, 2485 int32_t *end, UErrorCode *status) 2486 { 2487 UCollationElements *coleiter = strsrch->textIter; 2488 int32_t textlength = strsrch->search->textLength; 2489 int32_t temp = *end; 2490 const UCollator *collator = strsrch->collator; 2491 const UChar *text = strsrch->search->text; 2492 // This part checks if either if the start of the match contains potential 2493 // contraction. If so we'll have to iterate through them 2494 // Since we used ucol_next while previously looking for the potential 2495 // match, this guarantees that our end will not be a partial contraction, 2496 // or a partial supplementary character. 2497 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2498 int32_t expansion = getExpansionSuffix(coleiter); 2499 UBool expandflag = expansion > 0; 2500 setColEIterOffset(coleiter, *end); 2501 while (expansion > 0) { 2502 // getting rid of the redundant ce 2503 // since forward contraction/expansion may have extra ces 2504 // if we are in the normalization buffer, hasAccentsBeforeMatch 2505 // would have taken care of it. 2506 // E.g. the character \u01FA will have an expansion of 3, but if 2507 // we are only looking for A ring A\u030A, we'll have to skip the 2508 // last ce in the expansion buffer 2509 ucol_previous(coleiter, status); 2510 if (U_FAILURE(*status)) { 2511 return FALSE; 2512 } 2513 if (ucol_getOffset(coleiter) != temp) { 2514 *end = temp; 2515 temp = ucol_getOffset(coleiter); 2516 } 2517 expansion --; 2518 } 2519 2520 int32_t *patternce = strsrch->pattern.CE; 2521 int32_t patterncelength = strsrch->pattern.CELength; 2522 int32_t count = patterncelength; 2523 while (count > 0) { 2524 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2525 // status checked below, note that if status is a failure 2526 // ucol_previous returns UCOL_NULLORDER 2527 if (ce == UCOL_IGNORABLE) { 2528 continue; 2529 } 2530 if (expandflag && count == 0 && 2531 getColElemIterOffset(coleiter, FALSE) != temp) { 2532 *end = temp; 2533 temp = ucol_getOffset(coleiter); 2534 } 2535 if (count == patterncelength && 2536 ce != patternce[patterncelength - 1]) { 2537 // accents may have extra starting ces, this occurs when a 2538 // pure accent pattern is matched without rearrangement 2539 int32_t expected = patternce[patterncelength - 1]; 2540 UTF_BACK_1(text, 0, *end); 2541 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2542 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2543 while (U_SUCCESS(*status) && ce != expected && 2544 ce != UCOL_NULLORDER && 2545 ucol_getOffset(coleiter) <= *start) { 2546 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2547 } 2548 } 2549 } 2550 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2551 (*start) --; 2552 *start = getPreviousBaseOffset(text, *start); 2553 return FALSE; 2554 } 2555 count --; 2556 } 2557 } 2558 return TRUE; 2559 } 2560 2561 /** 2562 * Checks and sets the match information if found. 2563 * Checks 2564 * <ul> 2565 * <li> the potential match does not repeat the previous match 2566 * <li> boundaries are correct 2567 * <li> potential match does not end in the middle of a contraction 2568 * <li> identical matches 2569 * <\ul> 2570 * Otherwise the offset will be shifted to the next character. 2571 * Internal method, status assumed to be success, caller has to check status 2572 * before calling this method. 2573 * @param strsrch string search data 2574 * @param textoffset offset in the collation element text. the returned value 2575 * will be the truncated start offset of the match or the new start 2576 * search offset. 2577 * @param status only error status if any 2578 * @return TRUE if the match is valid, FALSE otherwise 2579 */ 2580 static 2581 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2582 int32_t *textoffset, 2583 UErrorCode *status) 2584 { 2585 // to ensure that the start and ends are not composite characters 2586 UCollationElements *coleiter = strsrch->textIter; 2587 // if we have a canonical accent match 2588 if ((strsrch->pattern.hasSuffixAccents && 2589 strsrch->canonicalSuffixAccents[0]) || 2590 (strsrch->pattern.hasPrefixAccents && 2591 strsrch->canonicalPrefixAccents[0])) { 2592 strsrch->search->matchedIndex = *textoffset; 2593 strsrch->search->matchedLength = 2594 getNextUStringSearchBaseOffset(strsrch, 2595 getColElemIterOffset(coleiter, FALSE)) 2596 - *textoffset; 2597 return TRUE; 2598 } 2599 2600 int32_t end = ucol_getOffset(coleiter); 2601 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2602 status) || 2603 U_FAILURE(*status)) { 2604 return FALSE; 2605 } 2606 2607 end = getNextUStringSearchBaseOffset(strsrch, end); 2608 // this totally matches, however we need to check if it is repeating 2609 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2610 !isBreakUnit(strsrch, *textoffset, end) || 2611 !checkIdentical(strsrch, *textoffset, end)) { 2612 (*textoffset) --; 2613 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2614 *textoffset); 2615 return FALSE; 2616 } 2617 2618 strsrch->search->matchedIndex = *textoffset; 2619 strsrch->search->matchedLength = end - *textoffset; 2620 return TRUE; 2621 } 2622 #endif // #if BOYER_MOORE 2623 2624 // constructors and destructor ------------------------------------------- 2625 2626 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2627 int32_t patternlength, 2628 const UChar *text, 2629 int32_t textlength, 2630 const char *locale, 2631 UBreakIterator *breakiter, 2632 UErrorCode *status) 2633 { 2634 if (U_FAILURE(*status)) { 2635 return NULL; 2636 } 2637 #if UCONFIG_NO_BREAK_ITERATION 2638 if (breakiter != NULL) { 2639 *status = U_UNSUPPORTED_ERROR; 2640 return NULL; 2641 } 2642 #endif 2643 if (locale) { 2644 // ucol_open internally checks for status 2645 UCollator *collator = ucol_open(locale, status); 2646 // pattern, text checks are done in usearch_openFromCollator 2647 UStringSearch *result = usearch_openFromCollator(pattern, 2648 patternlength, text, textlength, 2649 collator, breakiter, status); 2650 2651 if (result == NULL || U_FAILURE(*status)) { 2652 if (collator) { 2653 ucol_close(collator); 2654 } 2655 return NULL; 2656 } 2657 else { 2658 result->ownCollator = TRUE; 2659 } 2660 return result; 2661 } 2662 *status = U_ILLEGAL_ARGUMENT_ERROR; 2663 return NULL; 2664 } 2665 2666 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2667 const UChar *pattern, 2668 int32_t patternlength, 2669 const UChar *text, 2670 int32_t textlength, 2671 const UCollator *collator, 2672 UBreakIterator *breakiter, 2673 UErrorCode *status) 2674 { 2675 if (U_FAILURE(*status)) { 2676 return NULL; 2677 } 2678 #if UCONFIG_NO_BREAK_ITERATION 2679 if (breakiter != NULL) { 2680 *status = U_UNSUPPORTED_ERROR; 2681 return NULL; 2682 } 2683 #endif 2684 if (pattern == NULL || text == NULL || collator == NULL) { 2685 *status = U_ILLEGAL_ARGUMENT_ERROR; 2686 return NULL; 2687 } 2688 2689 // string search does not really work when numeric collation is turned on 2690 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2691 *status = U_UNSUPPORTED_ERROR; 2692 return NULL; 2693 } 2694 2695 if (U_SUCCESS(*status)) { 2696 initializeFCD(status); 2697 if (U_FAILURE(*status)) { 2698 return NULL; 2699 } 2700 2701 UStringSearch *result; 2702 if (textlength == -1) { 2703 textlength = u_strlen(text); 2704 } 2705 if (patternlength == -1) { 2706 patternlength = u_strlen(pattern); 2707 } 2708 if (textlength <= 0 || patternlength <= 0) { 2709 *status = U_ILLEGAL_ARGUMENT_ERROR; 2710 return NULL; 2711 } 2712 2713 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2714 if (result == NULL) { 2715 *status = U_MEMORY_ALLOCATION_ERROR; 2716 return NULL; 2717 } 2718 2719 result->collator = collator; 2720 result->strength = ucol_getStrength(collator); 2721 result->ceMask = getMask(result->strength); 2722 result->toShift = 2723 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2724 UCOL_SHIFTED; 2725 result->variableTop = ucol_getVariableTop(collator, status); 2726 2727 if (U_FAILURE(*status)) { 2728 uprv_free(result); 2729 return NULL; 2730 } 2731 2732 result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2733 if (result->search == NULL) { 2734 *status = U_MEMORY_ALLOCATION_ERROR; 2735 uprv_free(result); 2736 return NULL; 2737 } 2738 2739 result->search->text = text; 2740 result->search->textLength = textlength; 2741 2742 result->pattern.text = pattern; 2743 result->pattern.textLength = patternlength; 2744 result->pattern.CE = NULL; 2745 result->pattern.PCE = NULL; 2746 2747 result->search->breakIter = breakiter; 2748 #if !UCONFIG_NO_BREAK_ITERATION 2749 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2750 if (breakiter) { 2751 ubrk_setText(breakiter, text, textlength, status); 2752 } 2753 #endif 2754 2755 result->ownCollator = FALSE; 2756 result->search->matchedLength = 0; 2757 result->search->matchedIndex = USEARCH_DONE; 2758 result->utilIter = NULL; 2759 result->textIter = ucol_openElements(collator, text, 2760 textlength, status); 2761 if (U_FAILURE(*status)) { 2762 usearch_close(result); 2763 return NULL; 2764 } 2765 2766 result->search->isOverlap = FALSE; 2767 result->search->isCanonicalMatch = FALSE; 2768 result->search->isForwardSearching = TRUE; 2769 result->search->reset = TRUE; 2770 2771 initialize(result, status); 2772 2773 if (U_FAILURE(*status)) { 2774 usearch_close(result); 2775 return NULL; 2776 } 2777 2778 return result; 2779 } 2780 return NULL; 2781 } 2782 2783 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2784 { 2785 if (strsrch) { 2786 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer && 2787 strsrch->pattern.CE) { 2788 uprv_free(strsrch->pattern.CE); 2789 } 2790 2791 if (strsrch->pattern.PCE != NULL && 2792 strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 2793 uprv_free(strsrch->pattern.PCE); 2794 } 2795 2796 ucol_closeElements(strsrch->textIter); 2797 ucol_closeElements(strsrch->utilIter); 2798 2799 if (strsrch->ownCollator && strsrch->collator) { 2800 ucol_close((UCollator *)strsrch->collator); 2801 } 2802 2803 #if !UCONFIG_NO_BREAK_ITERATION 2804 if (strsrch->search->internalBreakIter) { 2805 ubrk_close(strsrch->search->internalBreakIter); 2806 } 2807 #endif 2808 2809 uprv_free(strsrch->search); 2810 uprv_free(strsrch); 2811 } 2812 } 2813 2814 // set and get methods -------------------------------------------------- 2815 2816 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2817 int32_t position, 2818 UErrorCode *status) 2819 { 2820 if (U_SUCCESS(*status) && strsrch) { 2821 if (isOutOfBounds(strsrch->search->textLength, position)) { 2822 *status = U_INDEX_OUTOFBOUNDS_ERROR; 2823 } 2824 else { 2825 setColEIterOffset(strsrch->textIter, position); 2826 } 2827 strsrch->search->matchedIndex = USEARCH_DONE; 2828 strsrch->search->matchedLength = 0; 2829 strsrch->search->reset = FALSE; 2830 } 2831 } 2832 2833 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2834 { 2835 if (strsrch) { 2836 int32_t result = ucol_getOffset(strsrch->textIter); 2837 if (isOutOfBounds(strsrch->search->textLength, result)) { 2838 return USEARCH_DONE; 2839 } 2840 return result; 2841 } 2842 return USEARCH_DONE; 2843 } 2844 2845 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2846 USearchAttribute attribute, 2847 USearchAttributeValue value, 2848 UErrorCode *status) 2849 { 2850 if (U_SUCCESS(*status) && strsrch) { 2851 switch (attribute) 2852 { 2853 case USEARCH_OVERLAP : 2854 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2855 break; 2856 case USEARCH_CANONICAL_MATCH : 2857 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2858 FALSE); 2859 break; 2860 case USEARCH_ATTRIBUTE_COUNT : 2861 default: 2862 *status = U_ILLEGAL_ARGUMENT_ERROR; 2863 } 2864 } 2865 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2866 *status = U_ILLEGAL_ARGUMENT_ERROR; 2867 } 2868 } 2869 2870 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2871 const UStringSearch *strsrch, 2872 USearchAttribute attribute) 2873 { 2874 if (strsrch) { 2875 switch (attribute) { 2876 case USEARCH_OVERLAP : 2877 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2878 USEARCH_OFF); 2879 case USEARCH_CANONICAL_MATCH : 2880 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2881 USEARCH_OFF); 2882 case USEARCH_ATTRIBUTE_COUNT : 2883 return USEARCH_DEFAULT; 2884 } 2885 } 2886 return USEARCH_DEFAULT; 2887 } 2888 2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2890 const UStringSearch *strsrch) 2891 { 2892 if (strsrch == NULL) { 2893 return USEARCH_DONE; 2894 } 2895 return strsrch->search->matchedIndex; 2896 } 2897 2898 2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2900 UChar *result, 2901 int32_t resultCapacity, 2902 UErrorCode *status) 2903 { 2904 if (U_FAILURE(*status)) { 2905 return USEARCH_DONE; 2906 } 2907 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2908 result == NULL)) { 2909 *status = U_ILLEGAL_ARGUMENT_ERROR; 2910 return USEARCH_DONE; 2911 } 2912 2913 int32_t copylength = strsrch->search->matchedLength; 2914 int32_t copyindex = strsrch->search->matchedIndex; 2915 if (copyindex == USEARCH_DONE) { 2916 u_terminateUChars(result, resultCapacity, 0, status); 2917 return USEARCH_DONE; 2918 } 2919 2920 if (resultCapacity < copylength) { 2921 copylength = resultCapacity; 2922 } 2923 if (copylength > 0) { 2924 uprv_memcpy(result, strsrch->search->text + copyindex, 2925 copylength * sizeof(UChar)); 2926 } 2927 return u_terminateUChars(result, resultCapacity, 2928 strsrch->search->matchedLength, status); 2929 } 2930 2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2932 const UStringSearch *strsrch) 2933 { 2934 if (strsrch) { 2935 return strsrch->search->matchedLength; 2936 } 2937 return USEARCH_DONE; 2938 } 2939 2940 #if !UCONFIG_NO_BREAK_ITERATION 2941 2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2943 UBreakIterator *breakiter, 2944 UErrorCode *status) 2945 { 2946 if (U_SUCCESS(*status) && strsrch) { 2947 strsrch->search->breakIter = breakiter; 2948 if (breakiter) { 2949 ubrk_setText(breakiter, strsrch->search->text, 2950 strsrch->search->textLength, status); 2951 } 2952 } 2953 } 2954 2955 U_CAPI const UBreakIterator* U_EXPORT2 2956 usearch_getBreakIterator(const UStringSearch *strsrch) 2957 { 2958 if (strsrch) { 2959 return strsrch->search->breakIter; 2960 } 2961 return NULL; 2962 } 2963 2964 #endif 2965 2966 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2967 const UChar *text, 2968 int32_t textlength, 2969 UErrorCode *status) 2970 { 2971 if (U_SUCCESS(*status)) { 2972 if (strsrch == NULL || text == NULL || textlength < -1 || 2973 textlength == 0) { 2974 *status = U_ILLEGAL_ARGUMENT_ERROR; 2975 } 2976 else { 2977 if (textlength == -1) { 2978 textlength = u_strlen(text); 2979 } 2980 strsrch->search->text = text; 2981 strsrch->search->textLength = textlength; 2982 ucol_setText(strsrch->textIter, text, textlength, status); 2983 strsrch->search->matchedIndex = USEARCH_DONE; 2984 strsrch->search->matchedLength = 0; 2985 strsrch->search->reset = TRUE; 2986 #if !UCONFIG_NO_BREAK_ITERATION 2987 if (strsrch->search->breakIter != NULL) { 2988 ubrk_setText(strsrch->search->breakIter, text, 2989 textlength, status); 2990 } 2991 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2992 #endif 2993 } 2994 } 2995 } 2996 2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 2998 int32_t *length) 2999 { 3000 if (strsrch) { 3001 *length = strsrch->search->textLength; 3002 return strsrch->search->text; 3003 } 3004 return NULL; 3005 } 3006 3007 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3008 const UCollator *collator, 3009 UErrorCode *status) 3010 { 3011 if (U_SUCCESS(*status)) { 3012 if (collator == NULL) { 3013 *status = U_ILLEGAL_ARGUMENT_ERROR; 3014 return; 3015 } 3016 3017 if (strsrch) { 3018 if (strsrch->ownCollator && (strsrch->collator != collator)) { 3019 ucol_close((UCollator *)strsrch->collator); 3020 strsrch->ownCollator = FALSE; 3021 } 3022 strsrch->collator = collator; 3023 strsrch->strength = ucol_getStrength(collator); 3024 strsrch->ceMask = getMask(strsrch->strength); 3025 #if !UCONFIG_NO_BREAK_ITERATION 3026 ubrk_close(strsrch->search->internalBreakIter); 3027 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3028 strsrch->search->text, strsrch->search->textLength, status); 3029 #endif 3030 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3031 strsrch->toShift = 3032 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3033 UCOL_SHIFTED; 3034 // if status is a failure, ucol_getVariableTop returns 0 3035 strsrch->variableTop = ucol_getVariableTop(collator, status); 3036 if (U_SUCCESS(*status)) { 3037 initialize(strsrch, status); 3038 if (U_SUCCESS(*status)) { 3039 /* free offset buffer to avoid memory leak before initializing. */ 3040 ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); 3041 uprv_init_collIterate(collator, strsrch->search->text, 3042 strsrch->search->textLength, 3043 &(strsrch->textIter->iteratordata_)); 3044 strsrch->utilIter->iteratordata_.coll = collator; 3045 } 3046 } 3047 } 3048 3049 // **** are these calls needed? 3050 // **** we call uprv_init_pce in initializePatternPCETable 3051 // **** and the CEBuffer constructor... 3052 #if 0 3053 uprv_init_pce(strsrch->textIter); 3054 uprv_init_pce(strsrch->utilIter); 3055 #endif 3056 } 3057 } 3058 3059 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3060 { 3061 if (strsrch) { 3062 return (UCollator *)strsrch->collator; 3063 } 3064 return NULL; 3065 } 3066 3067 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3068 const UChar *pattern, 3069 int32_t patternlength, 3070 UErrorCode *status) 3071 { 3072 if (U_SUCCESS(*status)) { 3073 if (strsrch == NULL || pattern == NULL) { 3074 *status = U_ILLEGAL_ARGUMENT_ERROR; 3075 } 3076 else { 3077 if (patternlength == -1) { 3078 patternlength = u_strlen(pattern); 3079 } 3080 if (patternlength == 0) { 3081 *status = U_ILLEGAL_ARGUMENT_ERROR; 3082 return; 3083 } 3084 strsrch->pattern.text = pattern; 3085 strsrch->pattern.textLength = patternlength; 3086 initialize(strsrch, status); 3087 } 3088 } 3089 } 3090 3091 U_CAPI const UChar* U_EXPORT2 3092 usearch_getPattern(const UStringSearch *strsrch, 3093 int32_t *length) 3094 { 3095 if (strsrch) { 3096 *length = strsrch->pattern.textLength; 3097 return strsrch->pattern.text; 3098 } 3099 return NULL; 3100 } 3101 3102 // miscellanous methods -------------------------------------------------- 3103 3104 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3105 UErrorCode *status) 3106 { 3107 if (strsrch && U_SUCCESS(*status)) { 3108 strsrch->search->isForwardSearching = TRUE; 3109 usearch_setOffset(strsrch, 0, status); 3110 if (U_SUCCESS(*status)) { 3111 return usearch_next(strsrch, status); 3112 } 3113 } 3114 return USEARCH_DONE; 3115 } 3116 3117 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3118 int32_t position, 3119 UErrorCode *status) 3120 { 3121 if (strsrch && U_SUCCESS(*status)) { 3122 strsrch->search->isForwardSearching = TRUE; 3123 // position checked in usearch_setOffset 3124 usearch_setOffset(strsrch, position, status); 3125 if (U_SUCCESS(*status)) { 3126 return usearch_next(strsrch, status); 3127 } 3128 } 3129 return USEARCH_DONE; 3130 } 3131 3132 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3133 UErrorCode *status) 3134 { 3135 if (strsrch && U_SUCCESS(*status)) { 3136 strsrch->search->isForwardSearching = FALSE; 3137 usearch_setOffset(strsrch, strsrch->search->textLength, status); 3138 if (U_SUCCESS(*status)) { 3139 return usearch_previous(strsrch, status); 3140 } 3141 } 3142 return USEARCH_DONE; 3143 } 3144 3145 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3146 int32_t position, 3147 UErrorCode *status) 3148 { 3149 if (strsrch && U_SUCCESS(*status)) { 3150 strsrch->search->isForwardSearching = FALSE; 3151 // position checked in usearch_setOffset 3152 usearch_setOffset(strsrch, position, status); 3153 if (U_SUCCESS(*status)) { 3154 return usearch_previous(strsrch, status); 3155 } 3156 } 3157 return USEARCH_DONE; 3158 } 3159 3160 /** 3161 * If a direction switch is required, we'll count the number of ces till the 3162 * beginning of the collation element iterator and iterate forwards that 3163 * number of times. This is so that we get to the correct point within the 3164 * string to continue the search in. Imagine when we are in the middle of the 3165 * normalization buffer when the change in direction is request. arrrgghh.... 3166 * After searching the offset within the collation element iterator will be 3167 * shifted to the start of the match. If a match is not found, the offset would 3168 * have been set to the end of the text string in the collation element 3169 * iterator. 3170 * Okay, here's my take on normalization buffer. The only time when there can 3171 * be 2 matches within the same normalization is when the pattern is consists 3172 * of all accents. But since the offset returned is from the text string, we 3173 * should not confuse the caller by returning the second match within the 3174 * same normalization buffer. If we do, the 2 results will have the same match 3175 * offsets, and that'll be confusing. I'll return the next match that doesn't 3176 * fall within the same normalization buffer. Note this does not affect the 3177 * results of matches spanning the text and the normalization buffer. 3178 * The position to start searching is taken from the collation element 3179 * iterator. Callers of this API would have to set the offset in the collation 3180 * element iterator before using this method. 3181 */ 3182 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3183 UErrorCode *status) 3184 { 3185 if (U_SUCCESS(*status) && strsrch) { 3186 // note offset is either equivalent to the start of the previous match 3187 // or is set by the user 3188 int32_t offset = usearch_getOffset(strsrch); 3189 USearch *search = strsrch->search; 3190 search->reset = FALSE; 3191 int32_t textlength = search->textLength; 3192 if (search->isForwardSearching) { 3193 #if BOYER_MOORE 3194 if (offset == textlength 3195 || (!search->isOverlap && 3196 (offset + strsrch->pattern.defaultShiftSize > textlength || 3197 (search->matchedIndex != USEARCH_DONE && 3198 offset + search->matchedLength >= textlength)))) { 3199 // not enough characters to match 3200 setMatchNotFound(strsrch); 3201 return USEARCH_DONE; 3202 } 3203 #else 3204 if (offset == textlength || 3205 (! search->isOverlap && 3206 (search->matchedIndex != USEARCH_DONE && 3207 offset + search->matchedLength > textlength))) { 3208 // not enough characters to match 3209 setMatchNotFound(strsrch); 3210 return USEARCH_DONE; 3211 } 3212 #endif 3213 } 3214 else { 3215 // switching direction. 3216 // if matchedIndex == USEARCH_DONE, it means that either a 3217 // setOffset has been called or that previous ran off the text 3218 // string. the iterator would have been set to offset 0 if a 3219 // match is not found. 3220 search->isForwardSearching = TRUE; 3221 if (search->matchedIndex != USEARCH_DONE) { 3222 // there's no need to set the collation element iterator 3223 // the next call to next will set the offset. 3224 return search->matchedIndex; 3225 } 3226 } 3227 3228 if (U_SUCCESS(*status)) { 3229 if (strsrch->pattern.CELength == 0) { 3230 if (search->matchedIndex == USEARCH_DONE) { 3231 search->matchedIndex = offset; 3232 } 3233 else { // moves by codepoints 3234 UTF_FWD_1(search->text, search->matchedIndex, textlength); 3235 } 3236 3237 search->matchedLength = 0; 3238 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3239 // status checked below 3240 if (search->matchedIndex == textlength) { 3241 search->matchedIndex = USEARCH_DONE; 3242 } 3243 } 3244 else { 3245 if (search->matchedLength > 0) { 3246 // if matchlength is 0 we are at the start of the iteration 3247 if (search->isOverlap) { 3248 ucol_setOffset(strsrch->textIter, offset + 1, status); 3249 } 3250 else { 3251 ucol_setOffset(strsrch->textIter, 3252 offset + search->matchedLength, status); 3253 } 3254 } 3255 else { 3256 // for boundary check purposes. this will ensure that the 3257 // next match will not preceed the current offset 3258 // note search->matchedIndex will always be set to something 3259 // in the code 3260 search->matchedIndex = offset - 1; 3261 } 3262 3263 if (search->isCanonicalMatch) { 3264 // can't use exact here since extra accents are allowed. 3265 usearch_handleNextCanonical(strsrch, status); 3266 } 3267 else { 3268 usearch_handleNextExact(strsrch, status); 3269 } 3270 } 3271 3272 if (U_FAILURE(*status)) { 3273 return USEARCH_DONE; 3274 } 3275 3276 #if !BOYER_MOORE 3277 if (search->matchedIndex == USEARCH_DONE) { 3278 ucol_setOffset(strsrch->textIter, search->textLength, status); 3279 } else { 3280 ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3281 } 3282 #endif 3283 3284 return search->matchedIndex; 3285 } 3286 } 3287 return USEARCH_DONE; 3288 } 3289 3290 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3291 UErrorCode *status) 3292 { 3293 if (U_SUCCESS(*status) && strsrch) { 3294 int32_t offset; 3295 USearch *search = strsrch->search; 3296 if (search->reset) { 3297 offset = search->textLength; 3298 search->isForwardSearching = FALSE; 3299 search->reset = FALSE; 3300 setColEIterOffset(strsrch->textIter, offset); 3301 } 3302 else { 3303 offset = usearch_getOffset(strsrch); 3304 } 3305 3306 int32_t matchedindex = search->matchedIndex; 3307 if (search->isForwardSearching == TRUE) { 3308 // switching direction. 3309 // if matchedIndex == USEARCH_DONE, it means that either a 3310 // setOffset has been called or that next ran off the text 3311 // string. the iterator would have been set to offset textLength if 3312 // a match is not found. 3313 search->isForwardSearching = FALSE; 3314 if (matchedindex != USEARCH_DONE) { 3315 return matchedindex; 3316 } 3317 } 3318 else { 3319 #if BOYER_MOORE 3320 if (offset == 0 || matchedindex == 0 || 3321 (!search->isOverlap && 3322 (offset < strsrch->pattern.defaultShiftSize || 3323 (matchedindex != USEARCH_DONE && 3324 matchedindex < strsrch->pattern.defaultShiftSize)))) { 3325 // not enough characters to match 3326 setMatchNotFound(strsrch); 3327 return USEARCH_DONE; 3328 } 3329 #else 3330 // Could check pattern length, but the 3331 // linear search will do the right thing 3332 if (offset == 0 || matchedindex == 0) { 3333 setMatchNotFound(strsrch); 3334 return USEARCH_DONE; 3335 } 3336 #endif 3337 } 3338 3339 if (U_SUCCESS(*status)) { 3340 if (strsrch->pattern.CELength == 0) { 3341 search->matchedIndex = 3342 (matchedindex == USEARCH_DONE ? offset : matchedindex); 3343 if (search->matchedIndex == 0) { 3344 setMatchNotFound(strsrch); 3345 // status checked below 3346 } 3347 else { // move by codepoints 3348 UTF_BACK_1(search->text, 0, search->matchedIndex); 3349 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3350 // status checked below 3351 search->matchedLength = 0; 3352 } 3353 } 3354 else { 3355 #if !BOYER_MOORE 3356 if (search->matchedIndex != USEARCH_DONE) { 3357 if (search->isOverlap) { 3358 ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status); 3359 } 3360 } 3361 #endif 3362 3363 if (strsrch->search->isCanonicalMatch) { 3364 // can't use exact here since extra accents are allowed. 3365 usearch_handlePreviousCanonical(strsrch, status); 3366 // status checked below 3367 } 3368 else { 3369 usearch_handlePreviousExact(strsrch, status); 3370 // status checked below 3371 } 3372 } 3373 3374 if (U_FAILURE(*status)) { 3375 return USEARCH_DONE; 3376 } 3377 3378 return search->matchedIndex; 3379 } 3380 } 3381 return USEARCH_DONE; 3382 } 3383 3384 3385 3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3387 { 3388 /* 3389 reset is setting the attributes that are already in 3390 string search, hence all attributes in the collator should 3391 be retrieved without any problems 3392 */ 3393 if (strsrch) { 3394 UErrorCode status = U_ZERO_ERROR; 3395 UBool sameCollAttribute = TRUE; 3396 uint32_t ceMask; 3397 UBool shift; 3398 uint32_t varTop; 3399 3400 // **** hack to deal w/ how processed CEs encode quaternary **** 3401 UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3402 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3403 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3404 sameCollAttribute = FALSE; 3405 } 3406 3407 strsrch->strength = ucol_getStrength(strsrch->collator); 3408 ceMask = getMask(strsrch->strength); 3409 if (strsrch->ceMask != ceMask) { 3410 strsrch->ceMask = ceMask; 3411 sameCollAttribute = FALSE; 3412 } 3413 3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3415 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3416 &status) == UCOL_SHIFTED; 3417 if (strsrch->toShift != shift) { 3418 strsrch->toShift = shift; 3419 sameCollAttribute = FALSE; 3420 } 3421 3422 // if status is a failure, ucol_getVariableTop returns 0 3423 varTop = ucol_getVariableTop(strsrch->collator, &status); 3424 if (strsrch->variableTop != varTop) { 3425 strsrch->variableTop = varTop; 3426 sameCollAttribute = FALSE; 3427 } 3428 if (!sameCollAttribute) { 3429 initialize(strsrch, &status); 3430 } 3431 /* free offset buffer to avoid memory leak before initializing. */ 3432 ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); 3433 uprv_init_collIterate(strsrch->collator, strsrch->search->text, 3434 strsrch->search->textLength, 3435 &(strsrch->textIter->iteratordata_)); 3436 strsrch->search->matchedLength = 0; 3437 strsrch->search->matchedIndex = USEARCH_DONE; 3438 strsrch->search->isOverlap = FALSE; 3439 strsrch->search->isCanonicalMatch = FALSE; 3440 strsrch->search->isForwardSearching = TRUE; 3441 strsrch->search->reset = TRUE; 3442 } 3443 } 3444 3445 // 3446 // CEI Collation Element + source text index. 3447 // These structs are kept in the circular buffer. 3448 // 3449 struct CEI { 3450 int64_t ce; 3451 int32_t lowIndex; 3452 int32_t highIndex; 3453 }; 3454 3455 U_NAMESPACE_BEGIN 3456 3457 3458 // 3459 // CEBuffer A circular buffer of CEs from the text being searched. 3460 // 3461 #define DEFAULT_CEBUFFER_SIZE 50 3462 struct CEBuffer { 3463 CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3464 CEI *buf; 3465 int32_t bufSize; 3466 int32_t firstIx; 3467 int32_t limitIx; 3468 UCollationElements *ceIter; 3469 UStringSearch *strSearch; 3470 3471 3472 3473 CEBuffer(UStringSearch *ss, UErrorCode *status); 3474 ~CEBuffer(); 3475 const CEI *get(int32_t index); 3476 const CEI *getPrevious(int32_t index); 3477 }; 3478 3479 3480 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { 3481 buf = defBuf; 3482 strSearch = ss; 3483 bufSize = ss->pattern.CELength+10; 3484 ceIter = ss->textIter; 3485 firstIx = 0; 3486 limitIx = 0; 3487 3488 uprv_init_pce(ceIter); 3489 3490 if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3491 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3492 if (buf == NULL) { 3493 *status = U_MEMORY_ALLOCATION_ERROR; 3494 } 3495 } 3496 } 3497 3498 // TODO: add a reset or init function so that allocated 3499 // buffers can be retained & reused. 3500 3501 CEBuffer::~CEBuffer() { 3502 if (buf != defBuf) { 3503 uprv_free(buf); 3504 } 3505 } 3506 3507 3508 // Get the CE with the specified index. 3509 // Index must be in the range 3510 // n-history_size < index < n+1 3511 // where n is the largest index to have been fetched by some previous call to this function. 3512 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3513 // 3514 const CEI *CEBuffer::get(int32_t index) { 3515 int i = index % bufSize; 3516 3517 if (index>=firstIx && index<limitIx) { 3518 // The request was for an entry already in our buffer. 3519 // Just return it. 3520 return &buf[i]; 3521 } 3522 3523 // Caller is requesting a new, never accessed before, CE. 3524 // Verify that it is the next one in sequence, which is all 3525 // that is allowed. 3526 if (index != limitIx) { 3527 U_ASSERT(FALSE); 3528 3529 return NULL; 3530 } 3531 3532 // Manage the circular CE buffer indexing 3533 limitIx++; 3534 3535 if (limitIx - firstIx >= bufSize) { 3536 // The buffer is full, knock out the lowest-indexed entry. 3537 firstIx++; 3538 } 3539 3540 UErrorCode status = U_ZERO_ERROR; 3541 3542 buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); 3543 3544 return &buf[i]; 3545 } 3546 3547 // Get the CE with the specified index. 3548 // Index must be in the range 3549 // n-history_size < index < n+1 3550 // where n is the largest index to have been fetched by some previous call to this function. 3551 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3552 // 3553 const CEI *CEBuffer::getPrevious(int32_t index) { 3554 int i = index % bufSize; 3555 3556 if (index>=firstIx && index<limitIx) { 3557 // The request was for an entry already in our buffer. 3558 // Just return it. 3559 return &buf[i]; 3560 } 3561 3562 // Caller is requesting a new, never accessed before, CE. 3563 // Verify that it is the next one in sequence, which is all 3564 // that is allowed. 3565 if (index != limitIx) { 3566 U_ASSERT(FALSE); 3567 3568 return NULL; 3569 } 3570 3571 // Manage the circular CE buffer indexing 3572 limitIx++; 3573 3574 if (limitIx - firstIx >= bufSize) { 3575 // The buffer is full, knock out the lowest-indexed entry. 3576 firstIx++; 3577 } 3578 3579 UErrorCode status = U_ZERO_ERROR; 3580 3581 buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); 3582 3583 return &buf[i]; 3584 } 3585 3586 U_NAMESPACE_END 3587 3588 3589 // #define USEARCH_DEBUG 3590 3591 #ifdef USEARCH_DEBUG 3592 #include <stdio.h> 3593 #include <stdlib.h> 3594 #endif 3595 3596 /* 3597 * Find the next break boundary after startIndex. If the UStringSearch object 3598 * has an external break iterator, use that. Otherwise use the internal character 3599 * break iterator. 3600 */ 3601 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3602 #if 0 3603 const UChar *text = strsrch->search->text; 3604 int32_t textLen = strsrch->search->textLength; 3605 3606 U_ASSERT(startIndex>=0); 3607 U_ASSERT(startIndex<=textLen); 3608 3609 if (startIndex >= textLen) { 3610 return startIndex; 3611 } 3612 3613 UChar32 c; 3614 int32_t i = startIndex; 3615 U16_NEXT(text, i, textLen, c); 3616 3617 // If we are on a control character, stop without looking for combining marks. 3618 // Control characters do not combine. 3619 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3620 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3621 return i; 3622 } 3623 3624 // The initial character was not a control, and can thus accept trailing 3625 // combining characters. Advance over however many of them there are. 3626 int32_t indexOfLastCharChecked; 3627 for (;;) { 3628 indexOfLastCharChecked = i; 3629 if (i>=textLen) { 3630 break; 3631 } 3632 U16_NEXT(text, i, textLen, c); 3633 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3634 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3635 break; 3636 } 3637 } 3638 return indexOfLastCharChecked; 3639 #elif !UCONFIG_NO_BREAK_ITERATION 3640 UBreakIterator *breakiterator = strsrch->search->breakIter; 3641 3642 if (breakiterator == NULL) { 3643 breakiterator = strsrch->search->internalBreakIter; 3644 } 3645 3646 if (breakiterator != NULL) { 3647 return ubrk_following(breakiterator, startIndex); 3648 } 3649 3650 return startIndex; 3651 #else 3652 // **** or should we use the original code? **** 3653 return startIndex; 3654 #endif 3655 3656 } 3657 3658 /* 3659 * Returns TRUE if index is on a break boundary. If the UStringSearch 3660 * has an external break iterator, test using that, otherwise test 3661 * using the internal character break iterator. 3662 */ 3663 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3664 #if 0 3665 const UChar *text = strsrch->search->text; 3666 int32_t textLen = strsrch->search->textLength; 3667 3668 U_ASSERT(index>=0); 3669 U_ASSERT(index<=textLen); 3670 3671 if (index>=textLen || index<=0) { 3672 return FALSE; 3673 } 3674 3675 // If the character at the current index is not a GRAPHEME_EXTEND 3676 // then we can not be within a combining sequence. 3677 UChar32 c; 3678 U16_GET(text, 0, index, textLen, c); 3679 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3680 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3681 return FALSE; 3682 } 3683 3684 // We are at a combining mark. If the preceding character is anything 3685 // except a CONTROL, CR or LF, we are in a combining sequence. 3686 U16_PREV(text, 0, index, c); 3687 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3688 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3689 return combining; 3690 #elif !UCONFIG_NO_BREAK_ITERATION 3691 UBreakIterator *breakiterator = strsrch->search->breakIter; 3692 3693 if (breakiterator == NULL) { 3694 breakiterator = strsrch->search->internalBreakIter; 3695 } 3696 3697 return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index)); 3698 #else 3699 // **** or use the original code? **** 3700 return FALSE; 3701 #endif 3702 } 3703 3704 #if 0 3705 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3706 { 3707 #if !UCONFIG_NO_BREAK_ITERATION 3708 UBreakIterator *breakiterator = strsrch->search->breakIter; 3709 3710 if (breakiterator != NULL) { 3711 int32_t startindex = ubrk_first(breakiterator); 3712 int32_t endindex = ubrk_last(breakiterator); 3713 3714 // out-of-range indexes are never boundary positions 3715 if (start < startindex || start > endindex || 3716 end < startindex || end > endindex) { 3717 return FALSE; 3718 } 3719 3720 return ubrk_isBoundary(breakiterator, start) && 3721 ubrk_isBoundary(breakiterator, end); 3722 } 3723 #endif 3724 3725 return TRUE; 3726 } 3727 #endif 3728 3729 3730 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3731 int32_t startIdx, 3732 int32_t *matchStart, 3733 int32_t *matchLimit, 3734 UErrorCode *status) 3735 { 3736 if (U_FAILURE(*status)) { 3737 return FALSE; 3738 } 3739 3740 // TODO: reject search patterns beginning with a combining char. 3741 3742 #ifdef USEARCH_DEBUG 3743 if (getenv("USEARCH_DEBUG") != NULL) { 3744 printf("Pattern CEs\n"); 3745 for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 3746 printf(" %8x", strsrch->pattern.CE[ii]); 3747 } 3748 printf("\n"); 3749 } 3750 3751 #endif 3752 // Input parameter sanity check. 3753 // TODO: should input indicies clip to the text length 3754 // in the same way that UText does. 3755 if(strsrch->pattern.CELength == 0 || 3756 startIdx < 0 || 3757 startIdx > strsrch->search->textLength || 3758 strsrch->pattern.CE == NULL) { 3759 *status = U_ILLEGAL_ARGUMENT_ERROR; 3760 return FALSE; 3761 } 3762 3763 if (strsrch->pattern.PCE == NULL) { 3764 initializePatternPCETable(strsrch, status); 3765 } 3766 3767 ucol_setOffset(strsrch->textIter, startIdx, status); 3768 CEBuffer ceb(strsrch, status); 3769 3770 3771 int32_t targetIx = 0; 3772 const CEI *targetCEI; 3773 int32_t patIx; 3774 UBool found; 3775 3776 int32_t mStart = -1; 3777 int32_t mLimit = -1; 3778 int32_t minLimit; 3779 int32_t maxLimit; 3780 3781 3782 3783 // Outer loop moves over match starting positions in the 3784 // target CE space. 3785 for(targetIx=0; ; targetIx++) 3786 { 3787 found = TRUE; 3788 // Inner loop checks for a match beginning at each 3789 // position from the outer loop. 3790 for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) { 3791 int64_t patCE = strsrch->pattern.PCE[patIx]; 3792 targetCEI = ceb.get(targetIx+patIx); 3793 // Compare CE from target string with CE from the pattern. 3794 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 3795 // which will fail the compare, below. 3796 if (targetCEI->ce != patCE) { 3797 found = FALSE; 3798 break; 3799 } 3800 } 3801 3802 if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) { 3803 // No match at this targetIx. Try again at the next. 3804 continue; 3805 } 3806 3807 if (!found) { 3808 // No match at all, we have run off the end of the target text. 3809 break; 3810 } 3811 3812 3813 // We have found a match in CE space. 3814 // Now determine the bounds in string index space. 3815 // There still is a chance of match failure if the CE range not correspond to 3816 // an acceptable character range. 3817 // 3818 const CEI *firstCEI = ceb.get(targetIx); 3819 const CEI *lastCEI = ceb.get(targetIx + strsrch->pattern.PCELength - 1); 3820 const CEI *nextCEI = ceb.get(targetIx + strsrch->pattern.PCELength); 3821 3822 mStart = firstCEI->lowIndex; 3823 minLimit = lastCEI->lowIndex; 3824 maxLimit = nextCEI->lowIndex; 3825 3826 // Look at the CE following the match. If it is UCOL_NULLORDER the match 3827 // extended to the end of input, and the match is good. 3828 3829 // Look at the high and low indices of the CE following the match. If 3830 // they are the same it means one of two things: 3831 // 1. The match extended to the last CE from the target text, which is OK, or 3832 // 2. The last CE that was part of the match is in an expansion that extends 3833 // to the first CE after the match. In this case, we reject the match. 3834 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 3835 found = FALSE; 3836 } 3837 3838 3839 // Check for the start of the match being within a combining sequence. 3840 // This can happen if the pattern itself begins with a combining char, and 3841 // the match found combining marks in the target text that were attached 3842 // to something else. 3843 // This type of match should be rejected for not completely consuming a 3844 // combining sequence. 3845 if (isBreakBoundary(strsrch, mStart)) { 3846 found = FALSE; 3847 } 3848 3849 // Check for the start of the match being within an Collation Element Expansion, 3850 // meaning that the first char of the match is only partially matched. 3851 // With exapnsions, the first CE will report the index of the source 3852 // character, and all subsequent (expansions) CEs will report the source index of the 3853 // _following_ character. 3854 int32_t secondIx = firstCEI->highIndex; 3855 if (mStart == secondIx) { 3856 found = FALSE; 3857 } 3858 3859 // Advance the match end position to the first acceptable match boundary. 3860 // This advances the index over any combining charcters. 3861 mLimit = maxLimit; 3862 if (minLimit < maxLimit) { 3863 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 3864 3865 if (nba >= lastCEI->highIndex) { 3866 mLimit = nba; 3867 } 3868 } 3869 3870 #ifdef USEARCH_DEBUG 3871 if (getenv("USEARCH_DEBUG") != NULL) { 3872 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 3873 } 3874 #endif 3875 3876 // If advancing to the end of a combining sequence in character indexing space 3877 // advanced us beyond the end of the match in CE space, reject this match. 3878 if (mLimit > maxLimit) { 3879 found = FALSE; 3880 } 3881 3882 if (isBreakBoundary(strsrch, mLimit)) { 3883 found = FALSE; 3884 } 3885 3886 if (! checkIdentical(strsrch, mStart, mLimit)) { 3887 found = FALSE; 3888 } 3889 3890 if (found) { 3891 break; 3892 } 3893 } 3894 3895 #ifdef USEARCH_DEBUG 3896 if (getenv("USEARCH_DEBUG") != NULL) { 3897 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 3898 int32_t lastToPrint = ceb.limitIx+2; 3899 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 3900 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 3901 } 3902 printf("\n%s\n", found? "match found" : "no match"); 3903 } 3904 #endif 3905 3906 // All Done. Store back the match bounds to the caller. 3907 // 3908 if (found==FALSE) { 3909 mLimit = -1; 3910 mStart = -1; 3911 } 3912 3913 if (matchStart != NULL) { 3914 *matchStart= mStart; 3915 } 3916 3917 if (matchLimit != NULL) { 3918 *matchLimit = mLimit; 3919 } 3920 3921 return found; 3922 } 3923 3924 3925 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 3926 int32_t startIdx, 3927 int32_t *matchStart, 3928 int32_t *matchLimit, 3929 UErrorCode *status) 3930 { 3931 if (U_FAILURE(*status)) { 3932 return FALSE; 3933 } 3934 3935 // TODO: reject search patterns beginning with a combining char. 3936 3937 #ifdef USEARCH_DEBUG 3938 if (getenv("USEARCH_DEBUG") != NULL) { 3939 printf("Pattern CEs\n"); 3940 for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 3941 printf(" %8x", strsrch->pattern.CE[ii]); 3942 } 3943 printf("\n"); 3944 } 3945 3946 #endif 3947 // Input parameter sanity check. 3948 // TODO: should input indicies clip to the text length 3949 // in the same way that UText does. 3950 if(strsrch->pattern.CELength == 0 || 3951 startIdx < 0 || 3952 startIdx > strsrch->search->textLength || 3953 strsrch->pattern.CE == NULL) { 3954 *status = U_ILLEGAL_ARGUMENT_ERROR; 3955 return FALSE; 3956 } 3957 3958 if (strsrch->pattern.PCE == NULL) { 3959 initializePatternPCETable(strsrch, status); 3960 } 3961 3962 CEBuffer ceb(strsrch, status); 3963 int32_t targetIx = 0; 3964 3965 /* 3966 * Pre-load the buffer with the CE's for the grapheme 3967 * after our starting position so that we're sure that 3968 * we can look at the CE following the match when we 3969 * check the match boundaries. 3970 * 3971 * This will also pre-fetch the first CE that we'll 3972 * consider for the match. 3973 */ 3974 if (startIdx < strsrch->search->textLength) { 3975 UBreakIterator *bi = strsrch->search->internalBreakIter; 3976 int32_t next = ubrk_following(bi, startIdx); 3977 3978 ucol_setOffset(strsrch->textIter, next, status); 3979 3980 for (targetIx = 0; ; targetIx += 1) { 3981 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 3982 break; 3983 } 3984 } 3985 } else { 3986 ucol_setOffset(strsrch->textIter, startIdx, status); 3987 } 3988 3989 3990 const CEI *targetCEI; 3991 int32_t patIx; 3992 UBool found; 3993 3994 int32_t limitIx = targetIx; 3995 int32_t mStart = -1; 3996 int32_t mLimit = -1; 3997 int32_t minLimit; 3998 int32_t maxLimit; 3999 4000 4001 4002 // Outer loop moves over match starting positions in the 4003 // target CE space. 4004 for(targetIx = limitIx; ; targetIx += 1) 4005 { 4006 found = TRUE; 4007 // Inner loop checks for a match beginning at each 4008 // position from the outer loop. 4009 for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { 4010 int64_t patCE = strsrch->pattern.PCE[patIx]; 4011 4012 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx); 4013 // Compare CE from target string with CE from the pattern. 4014 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4015 // which will fail the compare, below. 4016 if (targetCEI->ce != patCE) { 4017 found = FALSE; 4018 break; 4019 } 4020 } 4021 4022 if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) { 4023 // No match at this targetIx. Try again at the next. 4024 continue; 4025 } 4026 4027 if (!found) { 4028 // No match at all, we have run off the end of the target text. 4029 break; 4030 } 4031 4032 4033 // We have found a match in CE space. 4034 // Now determine the bounds in string index space. 4035 // There still is a chance of match failure if the CE range not correspond to 4036 // an acceptable character range. 4037 // 4038 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1); 4039 const CEI *lastCEI = ceb.getPrevious(targetIx); 4040 const CEI *nextCEI = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL; 4041 4042 mStart = firstCEI->lowIndex; 4043 minLimit = lastCEI->lowIndex; 4044 maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex; 4045 4046 // Look at the CE following the match. If it is UCOL_NULLORDER the match 4047 // extended to the end of input, and the match is good. 4048 4049 // Look at the high and low indices of the CE following the match. If 4050 // they are the same it means one of two things: 4051 // 1. The match extended to the last CE from the target text, which is OK, or 4052 // 2. The last CE that was part of the match is in an expansion that extends 4053 // to the first CE after the match. In this case, we reject the match. 4054 if (targetIx >= 1) { 4055 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4056 found = FALSE; 4057 } 4058 } 4059 4060 4061 // Check for the start of the match being within a combining sequence. 4062 // This can happen if the pattern itself begins with a combining char, and 4063 // the match found combining marks in the target text that were attached 4064 // to something else. 4065 // This type of match should be rejected for not completely consuming a 4066 // combining sequence. 4067 if (isBreakBoundary(strsrch, mStart)) { 4068 found = FALSE; 4069 } 4070 4071 // Look at the high index of the first CE in the match. If it's the same as the 4072 // low index, the first CE in the match is in the middle of an expansion. 4073 if (mStart == firstCEI->highIndex) { 4074 found = FALSE; 4075 } 4076 4077 // Advance the match end position to the first acceptable match boundary. 4078 // This advances the index over any combining charcters. 4079 mLimit = maxLimit; 4080 if (/*targetIx > 0 &&*/ minLimit < maxLimit) { 4081 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4082 4083 if (nba >= lastCEI->highIndex) { 4084 mLimit = nba; 4085 } 4086 } 4087 4088 #ifdef USEARCH_DEBUG 4089 if (getenv("USEARCH_DEBUG") != NULL) { 4090 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4091 } 4092 #endif 4093 4094 // If advancing to the end of a combining sequence in character indexing space 4095 // advanced us beyond the end of the match in CE space, reject this match. 4096 if (mLimit > maxLimit) { 4097 found = FALSE; 4098 } 4099 4100 // Make sure the end of the match is on a break boundary 4101 if (isBreakBoundary(strsrch, mLimit)) { 4102 found = FALSE; 4103 } 4104 4105 if (! checkIdentical(strsrch, mStart, mLimit)) { 4106 found = FALSE; 4107 } 4108 4109 if (found) { 4110 break; 4111 } 4112 } 4113 4114 #ifdef USEARCH_DEBUG 4115 if (getenv("USEARCH_DEBUG") != NULL) { 4116 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4117 int32_t lastToPrint = ceb.limitIx+2; 4118 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4119 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4120 } 4121 printf("\n%s\n", found? "match found" : "no match"); 4122 } 4123 #endif 4124 4125 // All Done. Store back the match bounds to the caller. 4126 // 4127 if (found==FALSE) { 4128 mLimit = -1; 4129 mStart = -1; 4130 } 4131 4132 if (matchStart != NULL) { 4133 *matchStart= mStart; 4134 } 4135 4136 if (matchLimit != NULL) { 4137 *matchLimit = mLimit; 4138 } 4139 4140 return found; 4141 } 4142 4143 4144 4145 4146 // internal use methods declared in usrchimp.h ----------------------------- 4147 4148 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4149 { 4150 if (U_FAILURE(*status)) { 4151 setMatchNotFound(strsrch); 4152 return FALSE; 4153 } 4154 4155 #if BOYER_MOORE 4156 UCollationElements *coleiter = strsrch->textIter; 4157 int32_t textlength = strsrch->search->textLength; 4158 int32_t *patternce = strsrch->pattern.CE; 4159 int32_t patterncelength = strsrch->pattern.CELength; 4160 int32_t textoffset = ucol_getOffset(coleiter); 4161 4162 // status used in setting coleiter offset, since offset is checked in 4163 // shiftForward before setting the coleiter offset, status never 4164 // a failure 4165 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4166 patterncelength); 4167 while (textoffset <= textlength) 4168 { 4169 uint32_t patternceindex = patterncelength - 1; 4170 int32_t targetce; 4171 UBool found = FALSE; 4172 int32_t lastce = UCOL_NULLORDER; 4173 4174 setColEIterOffset(coleiter, textoffset); 4175 4176 for (;;) { 4177 // finding the last pattern ce match, imagine composite characters 4178 // for example: search for pattern A in text \u00C0 4179 // we'll have to skip \u0300 the grave first before we get to A 4180 targetce = ucol_previous(coleiter, status); 4181 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4182 found = FALSE; 4183 break; 4184 } 4185 targetce = getCE(strsrch, targetce); 4186 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4187 // this is for the text \u0315\u0300 that requires 4188 // normalization and pattern \u0300, where \u0315 is ignorable 4189 continue; 4190 } 4191 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4192 lastce = targetce; 4193 } 4194 if (targetce == patternce[patternceindex]) { 4195 // the first ce can be a contraction 4196 found = TRUE; 4197 break; 4198 } 4199 if (!hasExpansion(coleiter)) { 4200 found = FALSE; 4201 break; 4202 } 4203 } 4204 4205 //targetce = lastce; 4206 4207 while (found && patternceindex > 0) { 4208 lastce = targetce; 4209 targetce = ucol_previous(coleiter, status); 4210 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4211 found = FALSE; 4212 break; 4213 } 4214 targetce = getCE(strsrch, targetce); 4215 if (targetce == UCOL_IGNORABLE) { 4216 continue; 4217 } 4218 4219 patternceindex --; 4220 found = found && targetce == patternce[patternceindex]; 4221 } 4222 4223 targetce = lastce; 4224 4225 if (!found) { 4226 if (U_FAILURE(*status)) { 4227 break; 4228 } 4229 textoffset = shiftForward(strsrch, textoffset, lastce, 4230 patternceindex); 4231 // status checked at loop. 4232 patternceindex = patterncelength; 4233 continue; 4234 } 4235 4236 if (checkNextExactMatch(strsrch, &textoffset, status)) { 4237 // status checked in ucol_setOffset 4238 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4239 return TRUE; 4240 } 4241 } 4242 setMatchNotFound(strsrch); 4243 return FALSE; 4244 #else 4245 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4246 int32_t start = -1; 4247 int32_t end = -1; 4248 4249 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4250 strsrch->search->matchedIndex = start; 4251 strsrch->search->matchedLength = end - start; 4252 return TRUE; 4253 } else { 4254 setMatchNotFound(strsrch); 4255 return FALSE; 4256 } 4257 #endif 4258 } 4259 4260 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4261 { 4262 if (U_FAILURE(*status)) { 4263 setMatchNotFound(strsrch); 4264 return FALSE; 4265 } 4266 4267 #if BOYER_MOORE 4268 UCollationElements *coleiter = strsrch->textIter; 4269 int32_t textlength = strsrch->search->textLength; 4270 int32_t *patternce = strsrch->pattern.CE; 4271 int32_t patterncelength = strsrch->pattern.CELength; 4272 int32_t textoffset = ucol_getOffset(coleiter); 4273 UBool hasPatternAccents = 4274 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4275 4276 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4277 patterncelength); 4278 strsrch->canonicalPrefixAccents[0] = 0; 4279 strsrch->canonicalSuffixAccents[0] = 0; 4280 4281 while (textoffset <= textlength) 4282 { 4283 int32_t patternceindex = patterncelength - 1; 4284 int32_t targetce; 4285 UBool found = FALSE; 4286 int32_t lastce = UCOL_NULLORDER; 4287 4288 setColEIterOffset(coleiter, textoffset); 4289 4290 for (;;) { 4291 // finding the last pattern ce match, imagine composite characters 4292 // for example: search for pattern A in text \u00C0 4293 // we'll have to skip \u0300 the grave first before we get to A 4294 targetce = ucol_previous(coleiter, status); 4295 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4296 found = FALSE; 4297 break; 4298 } 4299 targetce = getCE(strsrch, targetce); 4300 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4301 lastce = targetce; 4302 } 4303 if (targetce == patternce[patternceindex]) { 4304 // the first ce can be a contraction 4305 found = TRUE; 4306 break; 4307 } 4308 if (!hasExpansion(coleiter)) { 4309 found = FALSE; 4310 break; 4311 } 4312 } 4313 4314 while (found && patternceindex > 0) { 4315 targetce = ucol_previous(coleiter, status); 4316 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4317 found = FALSE; 4318 break; 4319 } 4320 targetce = getCE(strsrch, targetce); 4321 if (targetce == UCOL_IGNORABLE) { 4322 continue; 4323 } 4324 4325 patternceindex --; 4326 found = found && targetce == patternce[patternceindex]; 4327 } 4328 4329 // initializing the rearranged accent array 4330 if (hasPatternAccents && !found) { 4331 strsrch->canonicalPrefixAccents[0] = 0; 4332 strsrch->canonicalSuffixAccents[0] = 0; 4333 if (U_FAILURE(*status)) { 4334 break; 4335 } 4336 found = doNextCanonicalMatch(strsrch, textoffset, status); 4337 } 4338 4339 if (!found) { 4340 if (U_FAILURE(*status)) { 4341 break; 4342 } 4343 textoffset = shiftForward(strsrch, textoffset, lastce, 4344 patternceindex); 4345 // status checked at loop 4346 patternceindex = patterncelength; 4347 continue; 4348 } 4349 4350 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4351 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4352 return TRUE; 4353 } 4354 } 4355 setMatchNotFound(strsrch); 4356 return FALSE; 4357 #else 4358 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4359 int32_t start = -1; 4360 int32_t end = -1; 4361 4362 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4363 strsrch->search->matchedIndex = start; 4364 strsrch->search->matchedLength = end - start; 4365 return TRUE; 4366 } else { 4367 setMatchNotFound(strsrch); 4368 return FALSE; 4369 } 4370 #endif 4371 } 4372 4373 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4374 { 4375 if (U_FAILURE(*status)) { 4376 setMatchNotFound(strsrch); 4377 return FALSE; 4378 } 4379 4380 #if BOYER_MOORE 4381 UCollationElements *coleiter = strsrch->textIter; 4382 int32_t *patternce = strsrch->pattern.CE; 4383 int32_t patterncelength = strsrch->pattern.CELength; 4384 int32_t textoffset = ucol_getOffset(coleiter); 4385 4386 // shifting it check for setting offset 4387 // if setOffset is called previously or there was no previous match, we 4388 // leave the offset as it is. 4389 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4390 textoffset = strsrch->search->matchedIndex; 4391 } 4392 4393 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4394 patterncelength); 4395 4396 while (textoffset >= 0) 4397 { 4398 int32_t patternceindex = 1; 4399 int32_t targetce; 4400 UBool found = FALSE; 4401 int32_t firstce = UCOL_NULLORDER; 4402 4403 // if status is a failure, ucol_setOffset does nothing 4404 setColEIterOffset(coleiter, textoffset); 4405 4406 for (;;) { 4407 // finding the first pattern ce match, imagine composite 4408 // characters. for example: search for pattern \u0300 in text 4409 // \u00C0, we'll have to skip A first before we get to 4410 // \u0300 the grave accent 4411 targetce = ucol_next(coleiter, status); 4412 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4413 found = FALSE; 4414 break; 4415 } 4416 targetce = getCE(strsrch, targetce); 4417 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4418 firstce = targetce; 4419 } 4420 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4421 continue; 4422 } 4423 if (targetce == patternce[0]) { 4424 found = TRUE; 4425 break; 4426 } 4427 if (!hasExpansion(coleiter)) { 4428 // checking for accents in composite character 4429 found = FALSE; 4430 break; 4431 } 4432 } 4433 4434 //targetce = firstce; 4435 4436 while (found && (patternceindex < patterncelength)) { 4437 firstce = targetce; 4438 targetce = ucol_next(coleiter, status); 4439 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4440 found = FALSE; 4441 break; 4442 } 4443 targetce = getCE(strsrch, targetce); 4444 if (targetce == UCOL_IGNORABLE) { 4445 continue; 4446 } 4447 4448 found = found && targetce == patternce[patternceindex]; 4449 patternceindex ++; 4450 } 4451 4452 targetce = firstce; 4453 4454 if (!found) { 4455 if (U_FAILURE(*status)) { 4456 break; 4457 } 4458 4459 textoffset = reverseShift(strsrch, textoffset, targetce, 4460 patternceindex); 4461 patternceindex = 0; 4462 continue; 4463 } 4464 4465 if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4466 setColEIterOffset(coleiter, textoffset); 4467 return TRUE; 4468 } 4469 } 4470 setMatchNotFound(strsrch); 4471 return FALSE; 4472 #else 4473 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4474 int32_t start = -1; 4475 int32_t end = -1; 4476 4477 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4478 strsrch->search->matchedIndex = start; 4479 strsrch->search->matchedLength = end - start; 4480 return TRUE; 4481 } else { 4482 setMatchNotFound(strsrch); 4483 return FALSE; 4484 } 4485 #endif 4486 } 4487 4488 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4489 UErrorCode *status) 4490 { 4491 if (U_FAILURE(*status)) { 4492 setMatchNotFound(strsrch); 4493 return FALSE; 4494 } 4495 4496 #if BOYER_MOORE 4497 UCollationElements *coleiter = strsrch->textIter; 4498 int32_t *patternce = strsrch->pattern.CE; 4499 int32_t patterncelength = strsrch->pattern.CELength; 4500 int32_t textoffset = ucol_getOffset(coleiter); 4501 UBool hasPatternAccents = 4502 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4503 4504 // shifting it check for setting offset 4505 // if setOffset is called previously or there was no previous match, we 4506 // leave the offset as it is. 4507 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4508 textoffset = strsrch->search->matchedIndex; 4509 } 4510 4511 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4512 patterncelength); 4513 strsrch->canonicalPrefixAccents[0] = 0; 4514 strsrch->canonicalSuffixAccents[0] = 0; 4515 4516 while (textoffset >= 0) 4517 { 4518 int32_t patternceindex = 1; 4519 int32_t targetce; 4520 UBool found = FALSE; 4521 int32_t firstce = UCOL_NULLORDER; 4522 4523 setColEIterOffset(coleiter, textoffset); 4524 for (;;) { 4525 // finding the first pattern ce match, imagine composite 4526 // characters. for example: search for pattern \u0300 in text 4527 // \u00C0, we'll have to skip A first before we get to 4528 // \u0300 the grave accent 4529 targetce = ucol_next(coleiter, status); 4530 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4531 found = FALSE; 4532 break; 4533 } 4534 targetce = getCE(strsrch, targetce); 4535 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4536 firstce = targetce; 4537 } 4538 4539 if (targetce == patternce[0]) { 4540 // the first ce can be a contraction 4541 found = TRUE; 4542 break; 4543 } 4544 if (!hasExpansion(coleiter)) { 4545 // checking for accents in composite character 4546 found = FALSE; 4547 break; 4548 } 4549 } 4550 4551 targetce = firstce; 4552 4553 while (found && patternceindex < patterncelength) { 4554 targetce = ucol_next(coleiter, status); 4555 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4556 found = FALSE; 4557 break; 4558 } 4559 targetce = getCE(strsrch, targetce); 4560 if (targetce == UCOL_IGNORABLE) { 4561 continue; 4562 } 4563 4564 found = found && targetce == patternce[patternceindex]; 4565 patternceindex ++; 4566 } 4567 4568 // initializing the rearranged accent array 4569 if (hasPatternAccents && !found) { 4570 strsrch->canonicalPrefixAccents[0] = 0; 4571 strsrch->canonicalSuffixAccents[0] = 0; 4572 if (U_FAILURE(*status)) { 4573 break; 4574 } 4575 found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4576 } 4577 4578 if (!found) { 4579 if (U_FAILURE(*status)) { 4580 break; 4581 } 4582 textoffset = reverseShift(strsrch, textoffset, targetce, 4583 patternceindex); 4584 patternceindex = 0; 4585 continue; 4586 } 4587 4588 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4589 setColEIterOffset(coleiter, textoffset); 4590 return TRUE; 4591 } 4592 } 4593 setMatchNotFound(strsrch); 4594 return FALSE; 4595 #else 4596 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4597 int32_t start = -1; 4598 int32_t end = -1; 4599 4600 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4601 strsrch->search->matchedIndex = start; 4602 strsrch->search->matchedLength = end - start; 4603 return TRUE; 4604 } else { 4605 setMatchNotFound(strsrch); 4606 return FALSE; 4607 } 4608 #endif 4609 } 4610 4611 #endif /* #if !UCONFIG_NO_COLLATION */ 4612