1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2001-2015 IBM and others. All rights reserved. 6 ********************************************************************** 7 * Date Name Description 8 * 07/02/2001 synwee Creation. 9 ********************************************************************** 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 15 16 #include "unicode/usearch.h" 17 #include "unicode/ustring.h" 18 #include "unicode/uchar.h" 19 #include "unicode/utf16.h" 20 #include "normalizer2impl.h" 21 #include "usrchimp.h" 22 #include "cmemory.h" 23 #include "ucln_in.h" 24 #include "uassert.h" 25 #include "ustr_imp.h" 26 27 U_NAMESPACE_USE 28 29 // don't use Boyer-Moore 30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 31 #define BOYER_MOORE 0 32 33 // internal definition --------------------------------------------------- 34 35 #define LAST_BYTE_MASK_ 0xFF 36 #define SECOND_LAST_BYTE_SHIFT_ 8 37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 38 39 static const Normalizer2Impl *g_nfcImpl = NULL; 40 41 // internal methods ------------------------------------------------- 42 43 /** 44 * Fast collation element iterator setOffset. 45 * This function does not check for bounds. 46 * @param coleiter collation element iterator 47 * @param offset to set 48 */ 49 static 50 inline void setColEIterOffset(UCollationElements *elems, 51 int32_t offset) 52 { 53 // Note: Not "fast" any more after the 2013 collation rewrite. 54 // We do not want to expose more internals than necessary. 55 UErrorCode status = U_ZERO_ERROR; 56 ucol_setOffset(elems, offset, &status); 57 } 58 59 /** 60 * Getting the mask for collation strength 61 * @param strength collation strength 62 * @return collation element mask 63 */ 64 static 65 inline uint32_t getMask(UCollationStrength strength) 66 { 67 switch (strength) 68 { 69 case UCOL_PRIMARY: 70 return UCOL_PRIMARYORDERMASK; 71 case UCOL_SECONDARY: 72 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 73 default: 74 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 75 UCOL_PRIMARYORDERMASK; 76 } 77 } 78 79 /** 80 * @param ce 32-bit collation element 81 * @return hash code 82 */ 83 static 84 inline int hashFromCE32(uint32_t ce) 85 { 86 int hc = (int)( 87 ((((((ce >> 24) * 37) + 88 (ce >> 16)) * 37) + 89 (ce >> 8)) * 37) + 90 ce); 91 hc %= MAX_TABLE_SIZE_; 92 if (hc < 0) { 93 hc += MAX_TABLE_SIZE_; 94 } 95 return hc; 96 } 97 98 U_CDECL_BEGIN 99 static UBool U_CALLCONV 100 usearch_cleanup(void) { 101 g_nfcImpl = NULL; 102 return TRUE; 103 } 104 U_CDECL_END 105 106 /** 107 * Initializing the fcd tables. 108 * Internal method, status assumed to be a success. 109 * @param status output error if any, caller to check status before calling 110 * method, status assumed to be success when passed in. 111 */ 112 static 113 inline void initializeFCD(UErrorCode *status) 114 { 115 if (g_nfcImpl == NULL) { 116 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 117 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 118 } 119 } 120 121 /** 122 * Gets the fcd value for a character at the argument index. 123 * This method takes into accounts of the supplementary characters. 124 * @param str UTF16 string where character for fcd retrieval resides 125 * @param offset position of the character whose fcd is to be retrieved, to be 126 * overwritten with the next character position, taking 127 * surrogate characters into consideration. 128 * @param strlength length of the argument string 129 * @return fcd value 130 */ 131 static 132 uint16_t getFCD(const UChar *str, int32_t *offset, 133 int32_t strlength) 134 { 135 const UChar *temp = str + *offset; 136 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 137 *offset = (int32_t)(temp - str); 138 return result; 139 } 140 141 /** 142 * Getting the modified collation elements taking into account the collation 143 * attributes 144 * @param strsrch string search data 145 * @param sourcece 146 * @return the modified collation element 147 */ 148 static 149 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 150 { 151 // note for tertiary we can't use the collator->tertiaryMask, that 152 // is a preprocessed mask that takes into account case options. since 153 // we are only concerned with exact matches, we don't need that. 154 sourcece &= strsrch->ceMask; 155 156 if (strsrch->toShift) { 157 // alternate handling here, since only the 16 most significant digits 158 // is only used, we can safely do a compare without masking 159 // if the ce is a variable, we mask and get only the primary values 160 // no shifting to quartenary is required since all primary values 161 // less than variabletop will need to be masked off anyway. 162 if (strsrch->variableTop > sourcece) { 163 if (strsrch->strength >= UCOL_QUATERNARY) { 164 sourcece &= UCOL_PRIMARYORDERMASK; 165 } 166 else { 167 sourcece = UCOL_IGNORABLE; 168 } 169 } 170 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 171 sourcece = 0xFFFF; 172 } 173 174 return sourcece; 175 } 176 177 /** 178 * Allocate a memory and returns NULL if it failed. 179 * Internal method, status assumed to be a success. 180 * @param size to allocate 181 * @param status output error if any, caller to check status before calling 182 * method, status assumed to be success when passed in. 183 * @return newly allocated array, NULL otherwise 184 */ 185 static 186 inline void * allocateMemory(uint32_t size, UErrorCode *status) 187 { 188 uint32_t *result = (uint32_t *)uprv_malloc(size); 189 if (result == NULL) { 190 *status = U_MEMORY_ALLOCATION_ERROR; 191 } 192 return result; 193 } 194 195 /** 196 * Adds a uint32_t value to a destination array. 197 * Creates a new array if we run out of space. The caller will have to 198 * manually deallocate the newly allocated array. 199 * Internal method, status assumed to be success, caller has to check status 200 * before calling this method. destination not to be NULL and has at least 201 * size destinationlength. 202 * @param destination target array 203 * @param offset destination offset to add value 204 * @param destinationlength target array size, return value for the new size 205 * @param value to be added 206 * @param increments incremental size expected 207 * @param status output error if any, caller to check status before calling 208 * method, status assumed to be success when passed in. 209 * @return new destination array, destination if there was no new allocation 210 */ 211 static 212 inline int32_t * addTouint32_tArray(int32_t *destination, 213 uint32_t offset, 214 uint32_t *destinationlength, 215 uint32_t value, 216 uint32_t increments, 217 UErrorCode *status) 218 { 219 uint32_t newlength = *destinationlength; 220 if (offset + 1 == newlength) { 221 newlength += increments; 222 int32_t *temp = (int32_t *)allocateMemory( 223 sizeof(int32_t) * newlength, status); 224 if (U_FAILURE(*status)) { 225 return NULL; 226 } 227 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset); 228 *destinationlength = newlength; 229 destination = temp; 230 } 231 destination[offset] = value; 232 return destination; 233 } 234 235 /** 236 * Adds a uint64_t value to a destination array. 237 * Creates a new array if we run out of space. The caller will have to 238 * manually deallocate the newly allocated array. 239 * Internal method, status assumed to be success, caller has to check status 240 * before calling this method. destination not to be NULL and has at least 241 * size destinationlength. 242 * @param destination target array 243 * @param offset destination offset to add value 244 * @param destinationlength target array size, return value for the new size 245 * @param value to be added 246 * @param increments incremental size expected 247 * @param status output error if any, caller to check status before calling 248 * method, status assumed to be success when passed in. 249 * @return new destination array, destination if there was no new allocation 250 */ 251 static 252 inline int64_t * addTouint64_tArray(int64_t *destination, 253 uint32_t offset, 254 uint32_t *destinationlength, 255 uint64_t value, 256 uint32_t increments, 257 UErrorCode *status) 258 { 259 uint32_t newlength = *destinationlength; 260 if (offset + 1 == newlength) { 261 newlength += increments; 262 int64_t *temp = (int64_t *)allocateMemory( 263 sizeof(int64_t) * newlength, status); 264 265 if (U_FAILURE(*status)) { 266 return NULL; 267 } 268 269 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset); 270 *destinationlength = newlength; 271 destination = temp; 272 } 273 274 destination[offset] = value; 275 276 return destination; 277 } 278 279 /** 280 * Initializing the ce table for a pattern. 281 * Stores non-ignorable collation keys. 282 * Table size will be estimated by the size of the pattern text. Table 283 * expansion will be perform as we go along. Adding 1 to ensure that the table 284 * size definitely increases. 285 * Internal method, status assumed to be a success. 286 * @param strsrch string search data 287 * @param status output error if any, caller to check status before calling 288 * method, status assumed to be success when passed in. 289 * @return total number of expansions 290 */ 291 static 292 inline uint16_t initializePatternCETable(UStringSearch *strsrch, 293 UErrorCode *status) 294 { 295 UPattern *pattern = &(strsrch->pattern); 296 uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 297 int32_t *cetable = pattern->cesBuffer; 298 uint32_t patternlength = pattern->textLength; 299 UCollationElements *coleiter = strsrch->utilIter; 300 301 if (coleiter == NULL) { 302 coleiter = ucol_openElements(strsrch->collator, pattern->text, 303 patternlength, status); 304 // status will be checked in ucol_next(..) later and if it is an 305 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 306 // returned. 307 strsrch->utilIter = coleiter; 308 } 309 else { 310 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 311 } 312 if(U_FAILURE(*status)) { 313 return 0; 314 } 315 316 if (pattern->ces != cetable && pattern->ces) { 317 uprv_free(pattern->ces); 318 } 319 320 uint16_t offset = 0; 321 uint16_t result = 0; 322 int32_t ce; 323 324 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 325 U_SUCCESS(*status)) { 326 uint32_t newce = getCE(strsrch, ce); 327 if (newce) { 328 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 329 newce, 330 patternlength - ucol_getOffset(coleiter) + 1, 331 status); 332 if (U_FAILURE(*status)) { 333 return 0; 334 } 335 offset ++; 336 if (cetable != temp && cetable != pattern->cesBuffer) { 337 uprv_free(cetable); 338 } 339 cetable = temp; 340 } 341 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 342 } 343 344 cetable[offset] = 0; 345 pattern->ces = cetable; 346 pattern->cesLength = offset; 347 348 return result; 349 } 350 351 /** 352 * Initializing the pce table for a pattern. 353 * Stores non-ignorable collation keys. 354 * Table size will be estimated by the size of the pattern text. Table 355 * expansion will be perform as we go along. Adding 1 to ensure that the table 356 * size definitely increases. 357 * Internal method, status assumed to be a success. 358 * @param strsrch string search data 359 * @param status output error if any, caller to check status before calling 360 * method, status assumed to be success when passed in. 361 * @return total number of expansions 362 */ 363 static 364 inline uint16_t initializePatternPCETable(UStringSearch *strsrch, 365 UErrorCode *status) 366 { 367 UPattern *pattern = &(strsrch->pattern); 368 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 369 int64_t *pcetable = pattern->pcesBuffer; 370 uint32_t patternlength = pattern->textLength; 371 UCollationElements *coleiter = strsrch->utilIter; 372 373 if (coleiter == NULL) { 374 coleiter = ucol_openElements(strsrch->collator, pattern->text, 375 patternlength, status); 376 // status will be checked in ucol_next(..) later and if it is an 377 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 378 // returned. 379 strsrch->utilIter = coleiter; 380 } else { 381 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 382 } 383 if(U_FAILURE(*status)) { 384 return 0; 385 } 386 387 if (pattern->pces != pcetable && pattern->pces != NULL) { 388 uprv_free(pattern->pces); 389 } 390 391 uint16_t offset = 0; 392 uint16_t result = 0; 393 int64_t pce; 394 395 icu::UCollationPCE iter(coleiter); 396 397 // ** Should processed CEs be signed or unsigned? 398 // ** (the rest of the code in this file seems to play fast-and-loose with 399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 400 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 401 U_SUCCESS(*status)) { 402 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 403 pce, 404 patternlength - ucol_getOffset(coleiter) + 1, 405 status); 406 407 if (U_FAILURE(*status)) { 408 return 0; 409 } 410 411 offset += 1; 412 413 if (pcetable != temp && pcetable != pattern->pcesBuffer) { 414 uprv_free(pcetable); 415 } 416 417 pcetable = temp; 418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 419 } 420 421 pcetable[offset] = 0; 422 pattern->pces = pcetable; 423 pattern->pcesLength = offset; 424 425 return result; 426 } 427 428 /** 429 * Initializes the pattern struct. 430 * Internal method, status assumed to be success. 431 * @param strsrch UStringSearch data storage 432 * @param status output error if any, caller to check status before calling 433 * method, status assumed to be success when passed in. 434 * @return expansionsize the total expansion size of the pattern 435 */ 436 static 437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 438 { 439 if (U_FAILURE(*status)) { return 0; } 440 UPattern *pattern = &(strsrch->pattern); 441 const UChar *patterntext = pattern->text; 442 int32_t length = pattern->textLength; 443 int32_t index = 0; 444 445 // Since the strength is primary, accents are ignored in the pattern. 446 if (strsrch->strength == UCOL_PRIMARY) { 447 pattern->hasPrefixAccents = 0; 448 pattern->hasSuffixAccents = 0; 449 } else { 450 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 451 SECOND_LAST_BYTE_SHIFT_; 452 index = length; 453 U16_BACK_1(patterntext, 0, index); 454 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 455 LAST_BYTE_MASK_; 456 } 457 458 // ** HACK ** 459 if (strsrch->pattern.pces != NULL) { 460 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 461 uprv_free(strsrch->pattern.pces); 462 } 463 464 strsrch->pattern.pces = NULL; 465 } 466 467 // since intializePattern is an internal method status is a success. 468 return initializePatternCETable(strsrch, status); 469 } 470 471 /** 472 * Initializing shift tables, with the default values. 473 * If a corresponding default value is 0, the shift table is not set. 474 * @param shift table for forwards shift 475 * @param backshift table for backwards shift 476 * @param cetable table containing pattern ce 477 * @param cesize size of the pattern ces 478 * @param expansionsize total size of the expansions 479 * @param defaultforward the default forward value 480 * @param defaultbackward the default backward value 481 */ 482 static 483 inline void setShiftTable(int16_t shift[], int16_t backshift[], 484 int32_t *cetable, int32_t cesize, 485 int16_t expansionsize, 486 int16_t defaultforward, 487 int16_t defaultbackward) 488 { 489 // estimate the value to shift. to do that we estimate the smallest 490 // number of characters to give the relevant ces, ie approximately 491 // the number of ces minus their expansion, since expansions can come 492 // from a character. 493 int32_t count; 494 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 495 shift[count] = defaultforward; 496 } 497 cesize --; // down to the last index 498 for (count = 0; count < cesize; count ++) { 499 // number of ces from right of array to the count 500 int temp = defaultforward - count - 1; 501 shift[hashFromCE32(cetable[count])] = temp > 1 ? static_cast<int16_t>(temp) : 1; 502 } 503 shift[hashFromCE32(cetable[cesize])] = 1; 504 // for ignorables we just shift by one. see test examples. 505 shift[hashFromCE32(0)] = 1; 506 507 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 508 backshift[count] = defaultbackward; 509 } 510 for (count = cesize; count > 0; count --) { 511 // the original value count does not seem to work 512 backshift[hashFromCE32(cetable[count])] = count > expansionsize ? 513 (int16_t)(count - expansionsize) : 1; 514 } 515 backshift[hashFromCE32(cetable[0])] = 1; 516 backshift[hashFromCE32(0)] = 1; 517 } 518 519 /** 520 * Building of the pattern collation element list and the boyer moore strsrch 521 * table. 522 * The canonical match will only be performed after the default match fails. 523 * For both cases we need to remember the size of the composed and decomposed 524 * versions of the string. Since the Boyer-Moore shift calculations shifts by 525 * a number of characters in the text and tries to match the pattern from that 526 * offset, the shift value can not be too large in case we miss some 527 * characters. To choose a right shift size, we estimate the NFC form of the 528 * and use its size as a shift guide. The NFC form should be the small 529 * possible representation of the pattern. Anyways, we'll err on the smaller 530 * shift size. Hence the calculation for minlength. 531 * Canonical match will be performed slightly differently. We'll split the 532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by 533 * the first and last base character (MS), the ending accents (EA). Matches 534 * will be done on MS first, and only when we match MS then some processing 535 * will be required for the prefix and end accents in order to determine if 536 * they match PA and EA. Hence the default shift values 537 * for the canonical match will take the size of either end's accent into 538 * consideration. Forwards search will take the end accents into consideration 539 * for the default shift values and the backwards search will take the prefix 540 * accents into consideration. 541 * If pattern has no non-ignorable ce, we return a illegal argument error. 542 * Internal method, status assumed to be success. 543 * @param strsrch UStringSearch data storage 544 * @param status for output errors if it occurs, status is assumed to be a 545 * success when it is passed in. 546 */ 547 static 548 inline void initialize(UStringSearch *strsrch, UErrorCode *status) 549 { 550 int16_t expandlength = initializePattern(strsrch, status); 551 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) { 552 UPattern *pattern = &strsrch->pattern; 553 int32_t cesize = pattern->cesLength; 554 555 int16_t minlength = cesize > expandlength 556 ? (int16_t)cesize - expandlength : 1; 557 pattern->defaultShiftSize = minlength; 558 setShiftTable(pattern->shift, pattern->backShift, pattern->ces, 559 cesize, expandlength, minlength, minlength); 560 return; 561 } 562 strsrch->pattern.defaultShiftSize = 0; 563 } 564 565 #if BOYER_MOORE 566 /** 567 * Check to make sure that the match length is at the end of the character by 568 * using the breakiterator. 569 * @param strsrch string search data 570 * @param start target text start offset 571 * @param end target text end offset 572 */ 573 static 574 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 575 int32_t *end) 576 { 577 #if !UCONFIG_NO_BREAK_ITERATION 578 UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 579 if (breakiterator) { 580 int32_t matchend = *end; 581 //int32_t matchstart = *start; 582 583 if (!ubrk_isBoundary(breakiterator, matchend)) { 584 *end = ubrk_following(breakiterator, matchend); 585 } 586 587 /* Check the start of the matched text to make sure it doesn't have any accents 588 * before it. This code may not be necessary and so it is commented out */ 589 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 590 *start = ubrk_preceding(breakiterator, matchstart); 591 }*/ 592 } 593 #endif 594 } 595 596 /** 597 * Determine whether the target text in UStringSearch bounded by the offset 598 * start and end is one or more whole units of text as 599 * determined by the breakiterator in UStringSearch. 600 * @param strsrch string search data 601 * @param start target text start offset 602 * @param end target text end offset 603 */ 604 static 605 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 606 int32_t end) 607 { 608 #if !UCONFIG_NO_BREAK_ITERATION 609 UBreakIterator *breakiterator = strsrch->search->breakIter; 610 //TODO: Add here. 611 if (breakiterator) { 612 int32_t startindex = ubrk_first(breakiterator); 613 int32_t endindex = ubrk_last(breakiterator); 614 615 // out-of-range indexes are never boundary positions 616 if (start < startindex || start > endindex || 617 end < startindex || end > endindex) { 618 return FALSE; 619 } 620 // otherwise, we can use following() on the position before the 621 // specified one and return true of the position we get back is the 622 // one the user specified 623 UBool result = (start == startindex || 624 ubrk_following(breakiterator, start - 1) == start) && 625 (end == endindex || 626 ubrk_following(breakiterator, end - 1) == end); 627 if (result) { 628 // iterates the individual ces 629 UCollationElements *coleiter = strsrch->utilIter; 630 const UChar *text = strsrch->search->text + 631 start; 632 UErrorCode status = U_ZERO_ERROR; 633 ucol_setText(coleiter, text, end - start, &status); 634 for (int32_t count = 0; count < strsrch->pattern.cesLength; 635 count ++) { 636 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 637 if (ce == UCOL_IGNORABLE) { 638 count --; 639 continue; 640 } 641 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) { 642 return FALSE; 643 } 644 } 645 int32_t nextce = ucol_next(coleiter, &status); 646 while (ucol_getOffset(coleiter) == (end - start) 647 && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 648 nextce = ucol_next(coleiter, &status); 649 } 650 if (ucol_getOffset(coleiter) == (end - start) 651 && nextce != UCOL_NULLORDER) { 652 // extra collation elements at the end of the match 653 return FALSE; 654 } 655 } 656 return result; 657 } 658 #endif 659 return TRUE; 660 } 661 662 /** 663 * Getting the next base character offset if current offset is an accent, 664 * or the current offset if the current character contains a base character. 665 * accents the following base character will be returned 666 * @param text string 667 * @param textoffset current offset 668 * @param textlength length of text string 669 * @return the next base character or the current offset 670 * if the current character is contains a base character. 671 */ 672 static 673 inline int32_t getNextBaseOffset(const UChar *text, 674 int32_t textoffset, 675 int32_t textlength) 676 { 677 if (textoffset < textlength) { 678 int32_t temp = textoffset; 679 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 680 while (temp < textlength) { 681 int32_t result = temp; 682 if ((getFCD(text, &temp, textlength) >> 683 SECOND_LAST_BYTE_SHIFT_) == 0) { 684 return result; 685 } 686 } 687 return textlength; 688 } 689 } 690 return textoffset; 691 } 692 693 /** 694 * Gets the next base character offset depending on the string search pattern 695 * data 696 * @param strsrch string search data 697 * @param textoffset current offset, one offset away from the last character 698 * to search for. 699 * @return start index of the next base character or the current offset 700 * if the current character is contains a base character. 701 */ 702 static 703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 704 int32_t textoffset) 705 { 706 int32_t textlength = strsrch->search->textLength; 707 if (strsrch->pattern.hasSuffixAccents && 708 textoffset < textlength) { 709 int32_t temp = textoffset; 710 const UChar *text = strsrch->search->text; 711 U16_BACK_1(text, 0, temp); 712 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 713 return getNextBaseOffset(text, textoffset, textlength); 714 } 715 } 716 return textoffset; 717 } 718 719 /** 720 * Shifting the collation element iterator position forward to prepare for 721 * a following match. If the last character is a unsafe character, we'll only 722 * shift by 1 to capture contractions, normalization etc. 723 * Internal method, status assumed to be success. 724 * @param text strsrch string search data 725 * @param textoffset start text position to do search 726 * @param ce the text ce which failed the match. 727 * @param patternceindex index of the ce within the pattern ce buffer which 728 * failed the match 729 * @return final offset 730 */ 731 static 732 inline int32_t shiftForward(UStringSearch *strsrch, 733 int32_t textoffset, 734 int32_t ce, 735 int32_t patternceindex) 736 { 737 UPattern *pattern = &(strsrch->pattern); 738 if (ce != UCOL_NULLORDER) { 739 int32_t shift = pattern->shift[hashFromCE32(ce)]; 740 // this is to adjust for characters in the middle of the 741 // substring for matching that failed. 742 int32_t adjust = pattern->cesLength - patternceindex; 743 if (adjust > 1 && shift >= adjust) { 744 shift -= adjust - 1; 745 } 746 textoffset += shift; 747 } 748 else { 749 textoffset += pattern->defaultShiftSize; 750 } 751 752 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 753 // check for unsafe characters 754 // * if it is the start or middle of a contraction: to be done after 755 // a initial match is found 756 // * thai or lao base consonant character: similar to contraction 757 // * high surrogate character: similar to contraction 758 // * next character is a accent: shift to the next base character 759 return textoffset; 760 } 761 #endif // #if BOYER_MOORE 762 763 /** 764 * sets match not found 765 * @param strsrch string search data 766 */ 767 static 768 inline void setMatchNotFound(UStringSearch *strsrch) 769 { 770 // this method resets the match result regardless of the error status. 771 strsrch->search->matchedIndex = USEARCH_DONE; 772 strsrch->search->matchedLength = 0; 773 if (strsrch->search->isForwardSearching) { 774 setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 775 } 776 else { 777 setColEIterOffset(strsrch->textIter, 0); 778 } 779 } 780 781 #if BOYER_MOORE 782 /** 783 * Gets the offset to the next safe point in text. 784 * ie. not the middle of a contraction, swappable characters or supplementary 785 * characters. 786 * @param collator collation sata 787 * @param text string to work with 788 * @param textoffset offset in string 789 * @param textlength length of text string 790 * @return offset to the next safe character 791 */ 792 static 793 inline int32_t getNextSafeOffset(const UCollator *collator, 794 const UChar *text, 795 int32_t textoffset, 796 int32_t textlength) 797 { 798 int32_t result = textoffset; // first contraction character 799 while (result != textlength && ucol_unsafeCP(text[result], collator)) { 800 result ++; 801 } 802 return result; 803 } 804 805 /** 806 * This checks for accents in the potential match started with a . 807 * composite character. 808 * This is really painful... we have to check that composite character do not 809 * have any extra accents. We have to normalize the potential match and find 810 * the immediate decomposed character before the match. 811 * The first composite character would have been taken care of by the fcd 812 * checks in checkForwardExactMatch. 813 * This is the slow path after the fcd of the first character and 814 * the last character has been checked by checkForwardExactMatch and we 815 * determine that the potential match has extra non-ignorable preceding 816 * ces. 817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 819 * Note here that accents checking are slow and cautioned in the API docs. 820 * Internal method, status assumed to be a success, caller should check status 821 * before calling this method 822 * @param strsrch string search data 823 * @param start index of the potential unfriendly composite character 824 * @param end index of the potential unfriendly composite character 825 * @param status output error status if any. 826 * @return TRUE if there is non-ignorable accents before at the beginning 827 * of the match, FALSE otherwise. 828 */ 829 830 static 831 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 832 int32_t end, 833 UErrorCode *status) 834 { 835 UBool result = FALSE; 836 if (strsrch->pattern.hasPrefixAccents) { 837 int32_t length = end - start; 838 int32_t offset = 0; 839 const UChar *text = strsrch->search->text + start; 840 841 U16_FWD_1(text, offset, length); 842 // we are only concerned with the first composite character 843 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 844 int32_t safeoffset = getNextSafeOffset(strsrch->collator, 845 text, 0, length); 846 if (safeoffset != length) { 847 safeoffset ++; 848 } 849 UChar *norm = NULL; 850 UChar buffer[INITIAL_ARRAY_SIZE_]; 851 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 852 buffer, INITIAL_ARRAY_SIZE_, 853 status); 854 if (U_FAILURE(*status)) { 855 return FALSE; 856 } 857 if (size >= INITIAL_ARRAY_SIZE_) { 858 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 859 status); 860 // if allocation failed, status will be set to 861 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 862 // checks for it. 863 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 864 size, status); 865 if (U_FAILURE(*status) && norm != NULL) { 866 uprv_free(norm); 867 return FALSE; 868 } 869 } 870 else { 871 norm = buffer; 872 } 873 874 UCollationElements *coleiter = strsrch->utilIter; 875 ucol_setText(coleiter, norm, size, status); 876 uint32_t firstce = strsrch->pattern.ces[0]; 877 UBool ignorable = TRUE; 878 uint32_t ce = UCOL_IGNORABLE; 879 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 880 offset = ucol_getOffset(coleiter); 881 if (ce != firstce && ce != UCOL_IGNORABLE) { 882 ignorable = FALSE; 883 } 884 ce = ucol_next(coleiter, status); 885 } 886 UChar32 codepoint; 887 U16_PREV(norm, 0, offset, codepoint); 888 result = !ignorable && (u_getCombiningClass(codepoint) != 0); 889 890 if (norm != buffer) { 891 uprv_free(norm); 892 } 893 } 894 } 895 896 return result; 897 } 898 899 /** 900 * Used by exact matches, checks if there are accents before the match. 901 * This is really painful... we have to check that composite characters at 902 * the start of the matches have to not have any extra accents. 903 * We check the FCD of the character first, if it starts with an accent and 904 * the first pattern ce does not match the first ce of the character, we bail. 905 * Otherwise we try normalizing the first composite 906 * character and find the immediate decomposed character before the match to 907 * see if it is an non-ignorable accent. 908 * Now normalizing the first composite character is enough because we ensure 909 * that when the match is passed in here with extra beginning ces, the 910 * first or last ce that match has to occur within the first character. 911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 913 * Note here that accents checking are slow and cautioned in the API docs. 914 * @param strsrch string search data 915 * @param start offset 916 * @param end offset 917 * @return TRUE if there are accents on either side of the match, 918 * FALSE otherwise 919 */ 920 static 921 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 922 int32_t end) 923 { 924 if (strsrch->pattern.hasPrefixAccents) { 925 UCollationElements *coleiter = strsrch->textIter; 926 UErrorCode status = U_ZERO_ERROR; 927 // we have been iterating forwards previously 928 uint32_t ignorable = TRUE; 929 int32_t firstce = strsrch->pattern.ces[0]; 930 931 setColEIterOffset(coleiter, start); 932 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 933 if (U_FAILURE(status)) { 934 return TRUE; 935 } 936 while (ce != firstce) { 937 if (ce != UCOL_IGNORABLE) { 938 ignorable = FALSE; 939 } 940 ce = getCE(strsrch, ucol_next(coleiter, &status)); 941 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 942 return TRUE; 943 } 944 } 945 if (!ignorable && inNormBuf(coleiter)) { 946 // within normalization buffer, discontiguous handled here 947 return TRUE; 948 } 949 950 // within text 951 int32_t temp = start; 952 // original code 953 // accent = (getFCD(strsrch->search->text, &temp, 954 // strsrch->search->textLength) 955 // >> SECOND_LAST_BYTE_SHIFT_); 956 // however this code does not work well with VC7 .net in release mode. 957 // maybe the inlines for getFCD combined with shifting has bugs in 958 // VC7. anyways this is a work around. 959 UBool accent = getFCD(strsrch->search->text, &temp, 960 strsrch->search->textLength) > 0xFF; 961 if (!accent) { 962 return checkExtraMatchAccents(strsrch, start, end, &status); 963 } 964 if (!ignorable) { 965 return TRUE; 966 } 967 if (start > 0) { 968 temp = start; 969 U16_BACK_1(strsrch->search->text, 0, temp); 970 if (getFCD(strsrch->search->text, &temp, 971 strsrch->search->textLength) & LAST_BYTE_MASK_) { 972 setColEIterOffset(coleiter, start); 973 ce = ucol_previous(coleiter, &status); 974 if (U_FAILURE(status) || 975 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 976 return TRUE; 977 } 978 } 979 } 980 } 981 982 return FALSE; 983 } 984 985 /** 986 * Used by exact matches, checks if there are accents bounding the match. 987 * Note this is the initial boundary check. If the potential match 988 * starts or ends with composite characters, the accents in those 989 * characters will be determined later. 990 * Not doing backwards iteration here, since discontiguos contraction for 991 * backwards collation element iterator, use up too many characters. 992 * E.g. looking for \u030A ring in \u01FA A ring above and acute, 993 * should fail since there is a acute at the end of \u01FA 994 * Note here that accents checking are slow and cautioned in the API docs. 995 * @param strsrch string search data 996 * @param start offset of match 997 * @param end end offset of the match 998 * @return TRUE if there are accents on either side of the match, 999 * FALSE otherwise 1000 */ 1001 static 1002 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 1003 int32_t end) 1004 { 1005 if (strsrch->pattern.hasSuffixAccents) { 1006 const UChar *text = strsrch->search->text; 1007 int32_t temp = end; 1008 int32_t textlength = strsrch->search->textLength; 1009 U16_BACK_1(text, 0, temp); 1010 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1011 int32_t firstce = strsrch->pattern.ces[0]; 1012 UCollationElements *coleiter = strsrch->textIter; 1013 UErrorCode status = U_ZERO_ERROR; 1014 int32_t ce; 1015 setColEIterOffset(coleiter, start); 1016 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1017 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1018 return TRUE; 1019 } 1020 } 1021 int32_t count = 1; 1022 while (count < strsrch->pattern.cesLength) { 1023 if (getCE(strsrch, ucol_next(coleiter, &status)) 1024 == UCOL_IGNORABLE) { 1025 // Thai can give an ignorable here. 1026 count --; 1027 } 1028 if (U_FAILURE(status)) { 1029 return TRUE; 1030 } 1031 count ++; 1032 } 1033 1034 ce = ucol_next(coleiter, &status); 1035 if (U_FAILURE(status)) { 1036 return TRUE; 1037 } 1038 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1039 ce = getCE(strsrch, ce); 1040 } 1041 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1042 if (ucol_getOffset(coleiter) <= end) { 1043 return TRUE; 1044 } 1045 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1046 return TRUE; 1047 } 1048 } 1049 } 1050 } 1051 return FALSE; 1052 } 1053 #endif // #if BOYER_MOORE 1054 1055 /** 1056 * Checks if the offset runs out of the text string 1057 * @param offset 1058 * @param textlength of the text string 1059 * @return TRUE if offset is out of bounds, FALSE otherwise 1060 */ 1061 static 1062 inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1063 { 1064 return offset < 0 || offset > textlength; 1065 } 1066 1067 /** 1068 * Checks for identical match 1069 * @param strsrch string search data 1070 * @param start offset of possible match 1071 * @param end offset of possible match 1072 * @return TRUE if identical match is found 1073 */ 1074 static 1075 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1076 int32_t end) 1077 { 1078 if (strsrch->strength != UCOL_IDENTICAL) { 1079 return TRUE; 1080 } 1081 1082 // Note: We could use Normalizer::compare() or similar, but for short strings 1083 // which may not be in FCD it might be faster to just NFD them. 1084 UErrorCode status = U_ZERO_ERROR; 1085 UnicodeString t2, p2; 1086 strsrch->nfd->normalize( 1087 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 1088 strsrch->nfd->normalize( 1089 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1090 // return FALSE if NFD failed 1091 return U_SUCCESS(status) && t2 == p2; 1092 } 1093 1094 #if BOYER_MOORE 1095 /** 1096 * Checks to see if the match is repeated 1097 * @param strsrch string search data 1098 * @param start new match start index 1099 * @param end new match end index 1100 * @return TRUE if the the match is repeated, FALSE otherwise 1101 */ 1102 static 1103 inline UBool checkRepeatedMatch(UStringSearch *strsrch, 1104 int32_t start, 1105 int32_t end) 1106 { 1107 int32_t lastmatchindex = strsrch->search->matchedIndex; 1108 UBool result; 1109 if (lastmatchindex == USEARCH_DONE) { 1110 return FALSE; 1111 } 1112 if (strsrch->search->isForwardSearching) { 1113 result = start <= lastmatchindex; 1114 } 1115 else { 1116 result = start >= lastmatchindex; 1117 } 1118 if (!result && !strsrch->search->isOverlap) { 1119 if (strsrch->search->isForwardSearching) { 1120 result = start < lastmatchindex + strsrch->search->matchedLength; 1121 } 1122 else { 1123 result = end > lastmatchindex; 1124 } 1125 } 1126 return result; 1127 } 1128 1129 /** 1130 * Gets the collation element iterator's current offset. 1131 * @param coleiter collation element iterator 1132 * @param forwards flag TRUE if we are moving in th forwards direction 1133 * @return current offset 1134 */ 1135 static 1136 inline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1137 UBool forwards) 1138 { 1139 int32_t result = ucol_getOffset(coleiter); 1140 // intricacies of the the backwards collation element iterator 1141 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1142 result ++; 1143 } 1144 return result; 1145 } 1146 1147 /** 1148 * Checks match for contraction. 1149 * If the match ends with a partial contraction we fail. 1150 * If the match starts too far off (because of backwards iteration) we try to 1151 * chip off the extra characters depending on whether a breakiterator has 1152 * been used. 1153 * Internal method, error assumed to be success, caller has to check status 1154 * before calling this method. 1155 * @param strsrch string search data 1156 * @param start offset of potential match, to be modified if necessary 1157 * @param end offset of potential match, to be modified if necessary 1158 * @param status output error status if any 1159 * @return TRUE if match passes the contraction test, FALSE otherwise 1160 */ 1161 1162 static 1163 UBool checkNextExactContractionMatch(UStringSearch *strsrch, 1164 int32_t *start, 1165 int32_t *end, UErrorCode *status) 1166 { 1167 UCollationElements *coleiter = strsrch->textIter; 1168 int32_t textlength = strsrch->search->textLength; 1169 int32_t temp = *start; 1170 const UCollator *collator = strsrch->collator; 1171 const UChar *text = strsrch->search->text; 1172 // This part checks if either ends of the match contains potential 1173 // contraction. If so we'll have to iterate through them 1174 // The start contraction needs to be checked since ucol_previous dumps 1175 // all characters till the first safe character into the buffer. 1176 // *start + 1 is used to test for the unsafe characters instead of *start 1177 // because ucol_prev takes all unsafe characters till the first safe 1178 // character ie *start. so by testing *start + 1, we can estimate if 1179 // excess prefix characters has been included in the potential search 1180 // results. 1181 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1182 (*start + 1 < textlength 1183 && ucol_unsafeCP(text[*start + 1], collator))) { 1184 int32_t expansion = getExpansionPrefix(coleiter); 1185 UBool expandflag = expansion > 0; 1186 setColEIterOffset(coleiter, *start); 1187 while (expansion > 0) { 1188 // getting rid of the redundant ce, caused by setOffset. 1189 // since backward contraction/expansion may have extra ces if we 1190 // are in the normalization buffer, hasAccentsBeforeMatch would 1191 // have taken care of it. 1192 // E.g. the character \u01FA will have an expansion of 3, but if 1193 // we are only looking for acute and ring \u030A and \u0301, we'll 1194 // have to skip the first ce in the expansion buffer. 1195 ucol_next(coleiter, status); 1196 if (U_FAILURE(*status)) { 1197 return FALSE; 1198 } 1199 if (ucol_getOffset(coleiter) != temp) { 1200 *start = temp; 1201 temp = ucol_getOffset(coleiter); 1202 } 1203 expansion --; 1204 } 1205 1206 int32_t *patternce = strsrch->pattern.ces; 1207 int32_t patterncelength = strsrch->pattern.cesLength; 1208 int32_t count = 0; 1209 while (count < patterncelength) { 1210 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1211 if (ce == UCOL_IGNORABLE) { 1212 continue; 1213 } 1214 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1215 *start = temp; 1216 temp = ucol_getOffset(coleiter); 1217 } 1218 if (U_FAILURE(*status) || ce != patternce[count]) { 1219 (*end) ++; 1220 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1221 return FALSE; 1222 } 1223 count ++; 1224 } 1225 } 1226 return TRUE; 1227 } 1228 1229 /** 1230 * Checks and sets the match information if found. 1231 * Checks 1232 * <ul> 1233 * <li> the potential match does not repeat the previous match 1234 * <li> boundaries are correct 1235 * <li> exact matches has no extra accents 1236 * <li> identical matchesb 1237 * <li> potential match does not end in the middle of a contraction 1238 * <\ul> 1239 * Otherwise the offset will be shifted to the next character. 1240 * Internal method, status assumed to be success, caller has to check status 1241 * before calling this method. 1242 * @param strsrch string search data 1243 * @param textoffset offset in the collation element text. the returned value 1244 * will be the truncated end offset of the match or the new start 1245 * search offset. 1246 * @param status output error status if any 1247 * @return TRUE if the match is valid, FALSE otherwise 1248 */ 1249 static 1250 inline UBool checkNextExactMatch(UStringSearch *strsrch, 1251 int32_t *textoffset, UErrorCode *status) 1252 { 1253 UCollationElements *coleiter = strsrch->textIter; 1254 int32_t start = getColElemIterOffset(coleiter, FALSE); 1255 1256 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1257 return FALSE; 1258 } 1259 1260 // this totally matches, however we need to check if it is repeating 1261 if (!isBreakUnit(strsrch, start, *textoffset) || 1262 checkRepeatedMatch(strsrch, start, *textoffset) || 1263 hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1264 !checkIdentical(strsrch, start, *textoffset) || 1265 hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1266 1267 (*textoffset) ++; 1268 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1269 return FALSE; 1270 } 1271 1272 //Add breakiterator boundary check for primary strength search. 1273 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1274 checkBreakBoundary(strsrch, &start, textoffset); 1275 } 1276 1277 // totally match, we will get rid of the ending ignorables. 1278 strsrch->search->matchedIndex = start; 1279 strsrch->search->matchedLength = *textoffset - start; 1280 return TRUE; 1281 } 1282 1283 /** 1284 * Getting the previous base character offset, or the current offset if the 1285 * current character is a base character 1286 * @param text string 1287 * @param textoffset one offset after the current character 1288 * @return the offset of the next character after the base character or the first 1289 * composed character with accents 1290 */ 1291 static 1292 inline int32_t getPreviousBaseOffset(const UChar *text, 1293 int32_t textoffset) 1294 { 1295 if (textoffset > 0) { 1296 for (;;) { 1297 int32_t result = textoffset; 1298 U16_BACK_1(text, 0, textoffset); 1299 int32_t temp = textoffset; 1300 uint16_t fcd = getFCD(text, &temp, result); 1301 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1302 if (fcd & LAST_BYTE_MASK_) { 1303 return textoffset; 1304 } 1305 return result; 1306 } 1307 if (textoffset == 0) { 1308 return 0; 1309 } 1310 } 1311 } 1312 return textoffset; 1313 } 1314 1315 /** 1316 * Getting the indexes of the accents that are not blocked in the argument 1317 * accent array 1318 * @param accents array of accents in nfd terminated by a 0. 1319 * @param accentsindex array of indexes of the accents that are not blocked 1320 */ 1321 static 1322 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1323 { 1324 int32_t index = 0; 1325 int32_t length = u_strlen(accents); 1326 UChar32 codepoint = 0; 1327 int cclass = 0; 1328 int result = 0; 1329 int32_t temp; 1330 while (index < length) { 1331 temp = index; 1332 U16_NEXT(accents, index, length, codepoint); 1333 if (u_getCombiningClass(codepoint) != cclass) { 1334 cclass = u_getCombiningClass(codepoint); 1335 accentsindex[result] = temp; 1336 result ++; 1337 } 1338 } 1339 accentsindex[result] = length; 1340 return result; 1341 } 1342 1343 /** 1344 * Appends 3 UChar arrays to a destination array. 1345 * Creates a new array if we run out of space. The caller will have to 1346 * manually deallocate the newly allocated array. 1347 * Internal method, status assumed to be success, caller has to check status 1348 * before calling this method. destination not to be NULL and has at least 1349 * size destinationlength. 1350 * @param destination target array 1351 * @param destinationlength target array size, returning the appended length 1352 * @param source1 null-terminated first array 1353 * @param source2 second array 1354 * @param source2length length of seond array 1355 * @param source3 null-terminated third array 1356 * @param status error status if any 1357 * @return new destination array, destination if there was no new allocation 1358 */ 1359 static 1360 inline UChar * addToUCharArray( UChar *destination, 1361 int32_t *destinationlength, 1362 const UChar *source1, 1363 const UChar *source2, 1364 int32_t source2length, 1365 const UChar *source3, 1366 UErrorCode *status) 1367 { 1368 int32_t source1length = source1 ? u_strlen(source1) : 0; 1369 int32_t source3length = source3 ? u_strlen(source3) : 0; 1370 if (*destinationlength < source1length + source2length + source3length + 1371 1) 1372 { 1373 destination = (UChar *)allocateMemory( 1374 (source1length + source2length + source3length + 1) * sizeof(UChar), 1375 status); 1376 // if error allocating memory, status will be 1377 // U_MEMORY_ALLOCATION_ERROR 1378 if (U_FAILURE(*status)) { 1379 *destinationlength = 0; 1380 return NULL; 1381 } 1382 } 1383 if (source1length != 0) { 1384 u_memcpy(destination, source1, source1length); 1385 } 1386 if (source2length != 0) { 1387 uprv_memcpy(destination + source1length, source2, 1388 sizeof(UChar) * source2length); 1389 } 1390 if (source3length != 0) { 1391 uprv_memcpy(destination + source1length + source2length, source3, 1392 sizeof(UChar) * source3length); 1393 } 1394 *destinationlength = source1length + source2length + source3length; 1395 return destination; 1396 } 1397 1398 /** 1399 * Running through a collation element iterator to see if the contents matches 1400 * pattern in string search data 1401 * @param strsrch string search data 1402 * @param coleiter collation element iterator 1403 * @return TRUE if a match if found, FALSE otherwise 1404 */ 1405 static 1406 inline UBool checkCollationMatch(const UStringSearch *strsrch, 1407 UCollationElements *coleiter) 1408 { 1409 int patternceindex = strsrch->pattern.cesLength; 1410 int32_t *patternce = strsrch->pattern.ces; 1411 UErrorCode status = U_ZERO_ERROR; 1412 while (patternceindex > 0) { 1413 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1414 if (ce == UCOL_IGNORABLE) { 1415 continue; 1416 } 1417 if (U_FAILURE(status) || ce != *patternce) { 1418 return FALSE; 1419 } 1420 patternce ++; 1421 patternceindex --; 1422 } 1423 return TRUE; 1424 } 1425 1426 /** 1427 * Rearranges the front accents to try matching. 1428 * Prefix accents in the text will be grouped according to their combining 1429 * class and the groups will be mixed and matched to try find the perfect 1430 * match with the pattern. 1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1433 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1434 * "\u0301\u0325". 1435 * step 2: check if any of the generated substrings matches the pattern. 1436 * Internal method, status is assumed to be success, caller has to check status 1437 * before calling this method. 1438 * @param strsrch string search match 1439 * @param start first offset of the accents to start searching 1440 * @param end start of the last accent set 1441 * @param status output error status if any 1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1443 * offset of the match. Note this start includes all preceding accents. 1444 */ 1445 static 1446 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1447 int32_t start, 1448 int32_t end, 1449 UErrorCode *status) 1450 { 1451 const UChar *text = strsrch->search->text; 1452 int32_t textlength = strsrch->search->textLength; 1453 int32_t tempstart = start; 1454 1455 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1456 // die... failed at a base character 1457 return USEARCH_DONE; 1458 } 1459 1460 int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1461 start = getPreviousBaseOffset(text, tempstart); 1462 1463 UChar accents[INITIAL_ARRAY_SIZE_]; 1464 // normalizing the offensive string 1465 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1466 INITIAL_ARRAY_SIZE_, status); 1467 if (U_FAILURE(*status)) { 1468 return USEARCH_DONE; 1469 } 1470 1471 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1472 int32_t accentsize = getUnblockedAccentIndex(accents, 1473 accentsindex); 1474 int32_t count = (2 << (accentsize - 1)) - 1; 1475 UChar buffer[INITIAL_ARRAY_SIZE_]; 1476 UCollationElements *coleiter = strsrch->utilIter; 1477 while (U_SUCCESS(*status) && count > 0) { 1478 UChar *rearrange = strsrch->canonicalPrefixAccents; 1479 // copy the base characters 1480 for (int k = 0; k < accentsindex[0]; k ++) { 1481 *rearrange ++ = accents[k]; 1482 } 1483 // forming all possible canonical rearrangement by dropping 1484 // sets of accents 1485 for (int i = 0; i <= accentsize - 1; i ++) { 1486 int32_t mask = 1 << (accentsize - i - 1); 1487 if (count & mask) { 1488 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1489 *rearrange ++ = accents[j]; 1490 } 1491 } 1492 } 1493 *rearrange = 0; 1494 int32_t matchsize = INITIAL_ARRAY_SIZE_; 1495 UChar *match = addToUCharArray(buffer, &matchsize, 1496 strsrch->canonicalPrefixAccents, 1497 strsrch->search->text + offset, 1498 end - offset, 1499 strsrch->canonicalSuffixAccents, 1500 status); 1501 1502 // if status is a failure, ucol_setText does nothing. 1503 // run the collator iterator through this match 1504 ucol_setText(coleiter, match, matchsize, status); 1505 if (U_SUCCESS(*status)) { 1506 if (checkCollationMatch(strsrch, coleiter)) { 1507 if (match != buffer) { 1508 uprv_free(match); 1509 } 1510 return start; 1511 } 1512 } 1513 count --; 1514 } 1515 return USEARCH_DONE; 1516 } 1517 1518 /** 1519 * Gets the offset to the safe point in text before textoffset. 1520 * ie. not the middle of a contraction, swappable characters or supplementary 1521 * characters. 1522 * @param collator collation sata 1523 * @param text string to work with 1524 * @param textoffset offset in string 1525 * @param textlength length of text string 1526 * @return offset to the previous safe character 1527 */ 1528 static 1529 inline uint32_t getPreviousSafeOffset(const UCollator *collator, 1530 const UChar *text, 1531 int32_t textoffset) 1532 { 1533 int32_t result = textoffset; // first contraction character 1534 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1535 result --; 1536 } 1537 if (result != 0) { 1538 // the first contraction character is consider unsafe here 1539 result --; 1540 } 1541 return result; 1542 } 1543 1544 /** 1545 * Cleaning up after we passed the safe zone 1546 * @param strsrch string search data 1547 * @param safetext safe text array 1548 * @param safebuffer safe text buffer 1549 * @param coleiter collation element iterator for safe text 1550 */ 1551 static 1552 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1553 UChar *safebuffer) 1554 { 1555 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1556 { 1557 uprv_free(safetext); 1558 } 1559 } 1560 1561 /** 1562 * Take the rearranged end accents and tries matching. If match failed at 1563 * a seperate preceding set of accents (seperated from the rearranged on by 1564 * at least a base character) then we rearrange the preceding accents and 1565 * tries matching again. 1566 * We allow skipping of the ends of the accent set if the ces do not match. 1567 * However if the failure is found before the accent set, it fails. 1568 * Internal method, status assumed to be success, caller has to check status 1569 * before calling this method. 1570 * @param strsrch string search data 1571 * @param textoffset of the start of the rearranged accent 1572 * @param status output error status if any 1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1574 * offset of the match. Note this start includes all preceding accents. 1575 */ 1576 static 1577 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1578 int32_t textoffset, 1579 UErrorCode *status) 1580 { 1581 const UChar *text = strsrch->search->text; 1582 const UCollator *collator = strsrch->collator; 1583 int32_t safelength = 0; 1584 UChar *safetext; 1585 int32_t safetextlength; 1586 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1587 UCollationElements *coleiter = strsrch->utilIter; 1588 int32_t safeoffset = textoffset; 1589 1590 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1591 collator)) { 1592 safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1593 safelength = textoffset - safeoffset; 1594 safetextlength = INITIAL_ARRAY_SIZE_; 1595 safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1596 text + safeoffset, safelength, 1597 strsrch->canonicalSuffixAccents, 1598 status); 1599 } 1600 else { 1601 safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1602 safetext = strsrch->canonicalSuffixAccents; 1603 } 1604 1605 // if status is a failure, ucol_setText does nothing 1606 ucol_setText(coleiter, safetext, safetextlength, status); 1607 // status checked in loop below 1608 1609 int32_t *ce = strsrch->pattern.ces; 1610 int32_t celength = strsrch->pattern.cesLength; 1611 int ceindex = celength - 1; 1612 UBool isSafe = TRUE; // indication flag for position in safe zone 1613 1614 while (ceindex >= 0) { 1615 int32_t textce = ucol_previous(coleiter, status); 1616 if (U_FAILURE(*status)) { 1617 if (isSafe) { 1618 cleanUpSafeText(strsrch, safetext, safebuffer); 1619 } 1620 return USEARCH_DONE; 1621 } 1622 if (textce == UCOL_NULLORDER) { 1623 // check if we have passed the safe buffer 1624 if (coleiter == strsrch->textIter) { 1625 cleanUpSafeText(strsrch, safetext, safebuffer); 1626 return USEARCH_DONE; 1627 } 1628 cleanUpSafeText(strsrch, safetext, safebuffer); 1629 safetext = safebuffer; 1630 coleiter = strsrch->textIter; 1631 setColEIterOffset(coleiter, safeoffset); 1632 // status checked at the start of the loop 1633 isSafe = FALSE; 1634 continue; 1635 } 1636 textce = getCE(strsrch, textce); 1637 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1638 // do the beginning stuff 1639 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1640 if (isSafe && failedoffset >= safelength) { 1641 // alas... no hope. failed at rearranged accent set 1642 cleanUpSafeText(strsrch, safetext, safebuffer); 1643 return USEARCH_DONE; 1644 } 1645 else { 1646 if (isSafe) { 1647 failedoffset += safeoffset; 1648 cleanUpSafeText(strsrch, safetext, safebuffer); 1649 } 1650 1651 // try rearranging the front accents 1652 int32_t result = doNextCanonicalPrefixMatch(strsrch, 1653 failedoffset, textoffset, status); 1654 if (result != USEARCH_DONE) { 1655 // if status is a failure, ucol_setOffset does nothing 1656 setColEIterOffset(strsrch->textIter, result); 1657 } 1658 if (U_FAILURE(*status)) { 1659 return USEARCH_DONE; 1660 } 1661 return result; 1662 } 1663 } 1664 if (textce == ce[ceindex]) { 1665 ceindex --; 1666 } 1667 } 1668 // set offset here 1669 if (isSafe) { 1670 int32_t result = getColElemIterOffset(coleiter, FALSE); 1671 // sets the text iterator here with the correct expansion and offset 1672 int32_t leftoverces = getExpansionPrefix(coleiter); 1673 cleanUpSafeText(strsrch, safetext, safebuffer); 1674 if (result >= safelength) { 1675 result = textoffset; 1676 } 1677 else { 1678 result += safeoffset; 1679 } 1680 setColEIterOffset(strsrch->textIter, result); 1681 strsrch->textIter->iteratordata_.toReturn = 1682 setExpansionPrefix(strsrch->textIter, leftoverces); 1683 return result; 1684 } 1685 1686 return ucol_getOffset(coleiter); 1687 } 1688 1689 /** 1690 * Trying out the substring and sees if it can be a canonical match. 1691 * This will try normalizing the end accents and arranging them into canonical 1692 * equivalents and check their corresponding ces with the pattern ce. 1693 * Suffix accents in the text will be grouped according to their combining 1694 * class and the groups will be mixed and matched to try find the perfect 1695 * match with the pattern. 1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1698 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1699 * "\u0301\u0325". 1700 * step 2: check if any of the generated substrings matches the pattern. 1701 * Internal method, status assumed to be success, caller has to check status 1702 * before calling this method. 1703 * @param strsrch string search data 1704 * @param textoffset end offset in the collation element text that ends with 1705 * the accents to be rearranged 1706 * @param status error status if any 1707 * @return TRUE if the match is valid, FALSE otherwise 1708 */ 1709 static 1710 UBool doNextCanonicalMatch(UStringSearch *strsrch, 1711 int32_t textoffset, 1712 UErrorCode *status) 1713 { 1714 const UChar *text = strsrch->search->text; 1715 int32_t temp = textoffset; 1716 U16_BACK_1(text, 0, temp); 1717 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1718 UCollationElements *coleiter = strsrch->textIter; 1719 int32_t offset = getColElemIterOffset(coleiter, FALSE); 1720 if (strsrch->pattern.hasPrefixAccents) { 1721 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1722 status); 1723 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1724 setColEIterOffset(coleiter, offset); 1725 return TRUE; 1726 } 1727 } 1728 return FALSE; 1729 } 1730 1731 if (!strsrch->pattern.hasSuffixAccents) { 1732 return FALSE; 1733 } 1734 1735 UChar accents[INITIAL_ARRAY_SIZE_]; 1736 // offset to the last base character in substring to search 1737 int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1738 // normalizing the offensive string 1739 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1740 0, accents, INITIAL_ARRAY_SIZE_, status); 1741 // status checked in loop below 1742 1743 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1744 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1745 1746 // 2 power n - 1 plus the full set of accents 1747 int32_t count = (2 << (size - 1)) - 1; 1748 while (U_SUCCESS(*status) && count > 0) { 1749 UChar *rearrange = strsrch->canonicalSuffixAccents; 1750 // copy the base characters 1751 for (int k = 0; k < accentsindex[0]; k ++) { 1752 *rearrange ++ = accents[k]; 1753 } 1754 // forming all possible canonical rearrangement by dropping 1755 // sets of accents 1756 for (int i = 0; i <= size - 1; i ++) { 1757 int32_t mask = 1 << (size - i - 1); 1758 if (count & mask) { 1759 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1760 *rearrange ++ = accents[j]; 1761 } 1762 } 1763 } 1764 *rearrange = 0; 1765 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1766 status); 1767 if (offset != USEARCH_DONE) { 1768 return TRUE; // match found 1769 } 1770 count --; 1771 } 1772 return FALSE; 1773 } 1774 1775 /** 1776 * Gets the previous base character offset depending on the string search 1777 * pattern data 1778 * @param strsrch string search data 1779 * @param textoffset current offset, current character 1780 * @return the offset of the next character after this base character or itself 1781 * if it is a composed character with accents 1782 */ 1783 static 1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1785 int32_t textoffset) 1786 { 1787 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1788 const UChar *text = strsrch->search->text; 1789 int32_t offset = textoffset; 1790 if (getFCD(text, &offset, strsrch->search->textLength) >> 1791 SECOND_LAST_BYTE_SHIFT_) { 1792 return getPreviousBaseOffset(text, textoffset); 1793 } 1794 } 1795 return textoffset; 1796 } 1797 1798 /** 1799 * Checks match for contraction. 1800 * If the match ends with a partial contraction we fail. 1801 * If the match starts too far off (because of backwards iteration) we try to 1802 * chip off the extra characters 1803 * Internal method, status assumed to be success, caller has to check status 1804 * before calling this method. 1805 * @param strsrch string search data 1806 * @param start offset of potential match, to be modified if necessary 1807 * @param end offset of potential match, to be modified if necessary 1808 * @param status output error status if any 1809 * @return TRUE if match passes the contraction test, FALSE otherwise 1810 */ 1811 static 1812 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1813 int32_t *start, 1814 int32_t *end, 1815 UErrorCode *status) 1816 { 1817 UCollationElements *coleiter = strsrch->textIter; 1818 int32_t textlength = strsrch->search->textLength; 1819 int32_t temp = *start; 1820 const UCollator *collator = strsrch->collator; 1821 const UChar *text = strsrch->search->text; 1822 // This part checks if either ends of the match contains potential 1823 // contraction. If so we'll have to iterate through them 1824 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1825 (*start + 1 < textlength 1826 && ucol_unsafeCP(text[*start + 1], collator))) { 1827 int32_t expansion = getExpansionPrefix(coleiter); 1828 UBool expandflag = expansion > 0; 1829 setColEIterOffset(coleiter, *start); 1830 while (expansion > 0) { 1831 // getting rid of the redundant ce, caused by setOffset. 1832 // since backward contraction/expansion may have extra ces if we 1833 // are in the normalization buffer, hasAccentsBeforeMatch would 1834 // have taken care of it. 1835 // E.g. the character \u01FA will have an expansion of 3, but if 1836 // we are only looking for acute and ring \u030A and \u0301, we'll 1837 // have to skip the first ce in the expansion buffer. 1838 ucol_next(coleiter, status); 1839 if (U_FAILURE(*status)) { 1840 return FALSE; 1841 } 1842 if (ucol_getOffset(coleiter) != temp) { 1843 *start = temp; 1844 temp = ucol_getOffset(coleiter); 1845 } 1846 expansion --; 1847 } 1848 1849 int32_t *patternce = strsrch->pattern.ces; 1850 int32_t patterncelength = strsrch->pattern.cesLength; 1851 int32_t count = 0; 1852 int32_t textlength = strsrch->search->textLength; 1853 while (count < patterncelength) { 1854 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1855 // status checked below, note that if status is a failure 1856 // ucol_next returns UCOL_NULLORDER 1857 if (ce == UCOL_IGNORABLE) { 1858 continue; 1859 } 1860 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1861 *start = temp; 1862 temp = ucol_getOffset(coleiter); 1863 } 1864 1865 if (count == 0 && ce != patternce[0]) { 1866 // accents may have extra starting ces, this occurs when a 1867 // pure accent pattern is matched without rearrangement 1868 // text \u0325\u0300 and looking for \u0300 1869 int32_t expected = patternce[0]; 1870 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1871 ce = getCE(strsrch, ucol_next(coleiter, status)); 1872 while (U_SUCCESS(*status) && ce != expected && 1873 ce != UCOL_NULLORDER && 1874 ucol_getOffset(coleiter) <= *end) { 1875 ce = getCE(strsrch, ucol_next(coleiter, status)); 1876 } 1877 } 1878 } 1879 if (U_FAILURE(*status) || ce != patternce[count]) { 1880 (*end) ++; 1881 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1882 return FALSE; 1883 } 1884 count ++; 1885 } 1886 } 1887 return TRUE; 1888 } 1889 1890 /** 1891 * Checks and sets the match information if found. 1892 * Checks 1893 * <ul> 1894 * <li> the potential match does not repeat the previous match 1895 * <li> boundaries are correct 1896 * <li> potential match does not end in the middle of a contraction 1897 * <li> identical matches 1898 * <\ul> 1899 * Otherwise the offset will be shifted to the next character. 1900 * Internal method, status assumed to be success, caller has to check the 1901 * status before calling this method. 1902 * @param strsrch string search data 1903 * @param textoffset offset in the collation element text. the returned value 1904 * will be the truncated end offset of the match or the new start 1905 * search offset. 1906 * @param status output error status if any 1907 * @return TRUE if the match is valid, FALSE otherwise 1908 */ 1909 static 1910 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1911 int32_t *textoffset, 1912 UErrorCode *status) 1913 { 1914 // to ensure that the start and ends are not composite characters 1915 UCollationElements *coleiter = strsrch->textIter; 1916 // if we have a canonical accent match 1917 if ((strsrch->pattern.hasSuffixAccents && 1918 strsrch->canonicalSuffixAccents[0]) || 1919 (strsrch->pattern.hasPrefixAccents && 1920 strsrch->canonicalPrefixAccents[0])) { 1921 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1922 strsrch, 1923 ucol_getOffset(coleiter)); 1924 strsrch->search->matchedLength = *textoffset - 1925 strsrch->search->matchedIndex; 1926 return TRUE; 1927 } 1928 1929 int32_t start = getColElemIterOffset(coleiter, FALSE); 1930 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1931 status) || U_FAILURE(*status)) { 1932 return FALSE; 1933 } 1934 1935 start = getPreviousUStringSearchBaseOffset(strsrch, start); 1936 // this totally matches, however we need to check if it is repeating 1937 if (checkRepeatedMatch(strsrch, start, *textoffset) || 1938 !isBreakUnit(strsrch, start, *textoffset) || 1939 !checkIdentical(strsrch, start, *textoffset)) { 1940 (*textoffset) ++; 1941 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1942 strsrch->search->textLength); 1943 return FALSE; 1944 } 1945 1946 strsrch->search->matchedIndex = start; 1947 strsrch->search->matchedLength = *textoffset - start; 1948 return TRUE; 1949 } 1950 1951 /** 1952 * Shifting the collation element iterator position forward to prepare for 1953 * a preceding match. If the first character is a unsafe character, we'll only 1954 * shift by 1 to capture contractions, normalization etc. 1955 * Internal method, status assumed to be success, caller has to check status 1956 * before calling this method. 1957 * @param text strsrch string search data 1958 * @param textoffset start text position to do search 1959 * @param ce the text ce which failed the match. 1960 * @param patternceindex index of the ce within the pattern ce buffer which 1961 * failed the match 1962 * @return final offset 1963 */ 1964 static 1965 inline int32_t reverseShift(UStringSearch *strsrch, 1966 int32_t textoffset, 1967 int32_t ce, 1968 int32_t patternceindex) 1969 { 1970 if (strsrch->search->isOverlap) { 1971 if (textoffset != strsrch->search->textLength) { 1972 textoffset --; 1973 } 1974 else { 1975 textoffset -= strsrch->pattern.defaultShiftSize; 1976 } 1977 } 1978 else { 1979 if (ce != UCOL_NULLORDER) { 1980 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)]; 1981 1982 // this is to adjust for characters in the middle of the substring 1983 // for matching that failed. 1984 int32_t adjust = patternceindex; 1985 if (adjust > 1 && shift > adjust) { 1986 shift -= adjust - 1; 1987 } 1988 textoffset -= shift; 1989 } 1990 else { 1991 textoffset -= strsrch->pattern.defaultShiftSize; 1992 } 1993 } 1994 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 1995 return textoffset; 1996 } 1997 1998 /** 1999 * Checks match for contraction. 2000 * If the match starts with a partial contraction we fail. 2001 * Internal method, status assumed to be success, caller has to check status 2002 * before calling this method. 2003 * @param strsrch string search data 2004 * @param start offset of potential match, to be modified if necessary 2005 * @param end offset of potential match, to be modified if necessary 2006 * @param status output error status if any 2007 * @return TRUE if match passes the contraction test, FALSE otherwise 2008 */ 2009 static 2010 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2011 int32_t *start, 2012 int32_t *end, UErrorCode *status) 2013 { 2014 UCollationElements *coleiter = strsrch->textIter; 2015 int32_t textlength = strsrch->search->textLength; 2016 int32_t temp = *end; 2017 const UCollator *collator = strsrch->collator; 2018 const UChar *text = strsrch->search->text; 2019 // This part checks if either if the start of the match contains potential 2020 // contraction. If so we'll have to iterate through them 2021 // Since we used ucol_next while previously looking for the potential 2022 // match, this guarantees that our end will not be a partial contraction, 2023 // or a partial supplementary character. 2024 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2025 int32_t expansion = getExpansionSuffix(coleiter); 2026 UBool expandflag = expansion > 0; 2027 setColEIterOffset(coleiter, *end); 2028 while (U_SUCCESS(*status) && expansion > 0) { 2029 // getting rid of the redundant ce 2030 // since forward contraction/expansion may have extra ces 2031 // if we are in the normalization buffer, hasAccentsBeforeMatch 2032 // would have taken care of it. 2033 // E.g. the character \u01FA will have an expansion of 3, but if 2034 // we are only looking for A ring A\u030A, we'll have to skip the 2035 // last ce in the expansion buffer 2036 ucol_previous(coleiter, status); 2037 if (U_FAILURE(*status)) { 2038 return FALSE; 2039 } 2040 if (ucol_getOffset(coleiter) != temp) { 2041 *end = temp; 2042 temp = ucol_getOffset(coleiter); 2043 } 2044 expansion --; 2045 } 2046 2047 int32_t *patternce = strsrch->pattern.ces; 2048 int32_t patterncelength = strsrch->pattern.cesLength; 2049 int32_t count = patterncelength; 2050 while (count > 0) { 2051 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2052 // status checked below, note that if status is a failure 2053 // ucol_previous returns UCOL_NULLORDER 2054 if (ce == UCOL_IGNORABLE) { 2055 continue; 2056 } 2057 if (expandflag && count == 0 && 2058 getColElemIterOffset(coleiter, FALSE) != temp) { 2059 *end = temp; 2060 temp = ucol_getOffset(coleiter); 2061 } 2062 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2063 (*start) --; 2064 *start = getPreviousBaseOffset(text, *start); 2065 return FALSE; 2066 } 2067 count --; 2068 } 2069 } 2070 return TRUE; 2071 } 2072 2073 /** 2074 * Checks and sets the match information if found. 2075 * Checks 2076 * <ul> 2077 * <li> the current match does not repeat the last match 2078 * <li> boundaries are correct 2079 * <li> exact matches has no extra accents 2080 * <li> identical matches 2081 * <\ul> 2082 * Otherwise the offset will be shifted to the preceding character. 2083 * Internal method, status assumed to be success, caller has to check status 2084 * before calling this method. 2085 * @param strsrch string search data 2086 * @param collator 2087 * @param coleiter collation element iterator 2088 * @param text string 2089 * @param textoffset offset in the collation element text. the returned value 2090 * will be the truncated start offset of the match or the new start 2091 * search offset. 2092 * @param status output error status if any 2093 * @return TRUE if the match is valid, FALSE otherwise 2094 */ 2095 static 2096 inline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2097 int32_t *textoffset, 2098 UErrorCode *status) 2099 { 2100 // to ensure that the start and ends are not composite characters 2101 int32_t end = ucol_getOffset(strsrch->textIter); 2102 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2103 || U_FAILURE(*status)) { 2104 return FALSE; 2105 } 2106 2107 // this totally matches, however we need to check if it is repeating 2108 // the old match 2109 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2110 !isBreakUnit(strsrch, *textoffset, end) || 2111 hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2112 !checkIdentical(strsrch, *textoffset, end) || 2113 hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2114 (*textoffset) --; 2115 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2116 *textoffset); 2117 return FALSE; 2118 } 2119 2120 //Add breakiterator boundary check for primary strength search. 2121 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2122 checkBreakBoundary(strsrch, textoffset, &end); 2123 } 2124 2125 strsrch->search->matchedIndex = *textoffset; 2126 strsrch->search->matchedLength = end - *textoffset; 2127 return TRUE; 2128 } 2129 2130 /** 2131 * Rearranges the end accents to try matching. 2132 * Suffix accents in the text will be grouped according to their combining 2133 * class and the groups will be mixed and matched to try find the perfect 2134 * match with the pattern. 2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2137 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2138 * "\u0301\u0325". 2139 * step 2: check if any of the generated substrings matches the pattern. 2140 * Internal method, status assumed to be success, user has to check status 2141 * before calling this method. 2142 * @param strsrch string search match 2143 * @param start offset of the first base character 2144 * @param end start of the last accent set 2145 * @param status only error status if any 2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2147 * offset of the match. Note this start includes all following accents. 2148 */ 2149 static 2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2151 int32_t start, 2152 int32_t end, 2153 UErrorCode *status) 2154 { 2155 const UChar *text = strsrch->search->text; 2156 int32_t tempend = end; 2157 2158 U16_BACK_1(text, 0, tempend); 2159 if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2160 LAST_BYTE_MASK_)) { 2161 // die... failed at a base character 2162 return USEARCH_DONE; 2163 } 2164 end = getNextBaseOffset(text, end, strsrch->search->textLength); 2165 2166 if (U_SUCCESS(*status)) { 2167 UChar accents[INITIAL_ARRAY_SIZE_]; 2168 int32_t offset = getPreviousBaseOffset(text, end); 2169 // normalizing the offensive string 2170 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2171 INITIAL_ARRAY_SIZE_, status); 2172 2173 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2174 int32_t accentsize = getUnblockedAccentIndex(accents, 2175 accentsindex); 2176 int32_t count = (2 << (accentsize - 1)) - 1; 2177 UChar buffer[INITIAL_ARRAY_SIZE_]; 2178 UCollationElements *coleiter = strsrch->utilIter; 2179 while (U_SUCCESS(*status) && count > 0) { 2180 UChar *rearrange = strsrch->canonicalSuffixAccents; 2181 // copy the base characters 2182 for (int k = 0; k < accentsindex[0]; k ++) { 2183 *rearrange ++ = accents[k]; 2184 } 2185 // forming all possible canonical rearrangement by dropping 2186 // sets of accents 2187 for (int i = 0; i <= accentsize - 1; i ++) { 2188 int32_t mask = 1 << (accentsize - i - 1); 2189 if (count & mask) { 2190 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2191 *rearrange ++ = accents[j]; 2192 } 2193 } 2194 } 2195 *rearrange = 0; 2196 int32_t matchsize = INITIAL_ARRAY_SIZE_; 2197 UChar *match = addToUCharArray(buffer, &matchsize, 2198 strsrch->canonicalPrefixAccents, 2199 strsrch->search->text + start, 2200 offset - start, 2201 strsrch->canonicalSuffixAccents, 2202 status); 2203 2204 // run the collator iterator through this match 2205 // if status is a failure ucol_setText does nothing 2206 ucol_setText(coleiter, match, matchsize, status); 2207 if (U_SUCCESS(*status)) { 2208 if (checkCollationMatch(strsrch, coleiter)) { 2209 if (match != buffer) { 2210 uprv_free(match); 2211 } 2212 return end; 2213 } 2214 } 2215 count --; 2216 } 2217 } 2218 return USEARCH_DONE; 2219 } 2220 2221 /** 2222 * Take the rearranged start accents and tries matching. If match failed at 2223 * a seperate following set of accents (seperated from the rearranged on by 2224 * at least a base character) then we rearrange the preceding accents and 2225 * tries matching again. 2226 * We allow skipping of the ends of the accent set if the ces do not match. 2227 * However if the failure is found before the accent set, it fails. 2228 * Internal method, status assumed to be success, caller has to check status 2229 * before calling this method. 2230 * @param strsrch string search data 2231 * @param textoffset of the ends of the rearranged accent 2232 * @param status output error status if any 2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2234 * offset of the match. Note this start includes all following accents. 2235 */ 2236 static 2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2238 int32_t textoffset, 2239 UErrorCode *status) 2240 { 2241 const UChar *text = strsrch->search->text; 2242 const UCollator *collator = strsrch->collator; 2243 int32_t safelength = 0; 2244 UChar *safetext; 2245 int32_t safetextlength; 2246 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2247 int32_t safeoffset = textoffset; 2248 2249 if (textoffset && 2250 ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2251 u_strlen(strsrch->canonicalPrefixAccents) - 1 2252 ], collator)) { 2253 safeoffset = getNextSafeOffset(collator, text, textoffset, 2254 strsrch->search->textLength); 2255 safelength = safeoffset - textoffset; 2256 safetextlength = INITIAL_ARRAY_SIZE_; 2257 safetext = addToUCharArray(safebuffer, &safetextlength, 2258 strsrch->canonicalPrefixAccents, 2259 text + textoffset, safelength, 2260 NULL, status); 2261 } 2262 else { 2263 safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2264 safetext = strsrch->canonicalPrefixAccents; 2265 } 2266 2267 UCollationElements *coleiter = strsrch->utilIter; 2268 // if status is a failure, ucol_setText does nothing 2269 ucol_setText(coleiter, safetext, safetextlength, status); 2270 // status checked in loop below 2271 2272 int32_t *ce = strsrch->pattern.ces; 2273 int32_t celength = strsrch->pattern.cesLength; 2274 int ceindex = 0; 2275 UBool isSafe = TRUE; // safe zone indication flag for position 2276 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2277 2278 while (ceindex < celength) { 2279 int32_t textce = ucol_next(coleiter, status); 2280 if (U_FAILURE(*status)) { 2281 if (isSafe) { 2282 cleanUpSafeText(strsrch, safetext, safebuffer); 2283 } 2284 return USEARCH_DONE; 2285 } 2286 if (textce == UCOL_NULLORDER) { 2287 // check if we have passed the safe buffer 2288 if (coleiter == strsrch->textIter) { 2289 cleanUpSafeText(strsrch, safetext, safebuffer); 2290 return USEARCH_DONE; 2291 } 2292 cleanUpSafeText(strsrch, safetext, safebuffer); 2293 safetext = safebuffer; 2294 coleiter = strsrch->textIter; 2295 setColEIterOffset(coleiter, safeoffset); 2296 // status checked at the start of the loop 2297 isSafe = FALSE; 2298 continue; 2299 } 2300 textce = getCE(strsrch, textce); 2301 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2302 // do the beginning stuff 2303 int32_t failedoffset = ucol_getOffset(coleiter); 2304 if (isSafe && failedoffset <= prefixlength) { 2305 // alas... no hope. failed at rearranged accent set 2306 cleanUpSafeText(strsrch, safetext, safebuffer); 2307 return USEARCH_DONE; 2308 } 2309 else { 2310 if (isSafe) { 2311 failedoffset = safeoffset - failedoffset; 2312 cleanUpSafeText(strsrch, safetext, safebuffer); 2313 } 2314 2315 // try rearranging the end accents 2316 int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2317 textoffset, failedoffset, status); 2318 if (result != USEARCH_DONE) { 2319 // if status is a failure, ucol_setOffset does nothing 2320 setColEIterOffset(strsrch->textIter, result); 2321 } 2322 if (U_FAILURE(*status)) { 2323 return USEARCH_DONE; 2324 } 2325 return result; 2326 } 2327 } 2328 if (textce == ce[ceindex]) { 2329 ceindex ++; 2330 } 2331 } 2332 // set offset here 2333 if (isSafe) { 2334 int32_t result = ucol_getOffset(coleiter); 2335 // sets the text iterator here with the correct expansion and offset 2336 int32_t leftoverces = getExpansionSuffix(coleiter); 2337 cleanUpSafeText(strsrch, safetext, safebuffer); 2338 if (result <= prefixlength) { 2339 result = textoffset; 2340 } 2341 else { 2342 result = textoffset + (safeoffset - result); 2343 } 2344 setColEIterOffset(strsrch->textIter, result); 2345 setExpansionSuffix(strsrch->textIter, leftoverces); 2346 return result; 2347 } 2348 2349 return ucol_getOffset(coleiter); 2350 } 2351 2352 /** 2353 * Trying out the substring and sees if it can be a canonical match. 2354 * This will try normalizing the starting accents and arranging them into 2355 * canonical equivalents and check their corresponding ces with the pattern ce. 2356 * Prefix accents in the text will be grouped according to their combining 2357 * class and the groups will be mixed and matched to try find the perfect 2358 * match with the pattern. 2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2361 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2362 * "\u0301\u0325". 2363 * step 2: check if any of the generated substrings matches the pattern. 2364 * Internal method, status assumed to be success, caller has to check status 2365 * before calling this method. 2366 * @param strsrch string search data 2367 * @param textoffset start offset in the collation element text that starts 2368 * with the accents to be rearranged 2369 * @param status output error status if any 2370 * @return TRUE if the match is valid, FALSE otherwise 2371 */ 2372 static 2373 UBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2374 int32_t textoffset, 2375 UErrorCode *status) 2376 { 2377 const UChar *text = strsrch->search->text; 2378 int32_t temp = textoffset; 2379 int32_t textlength = strsrch->search->textLength; 2380 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2381 UCollationElements *coleiter = strsrch->textIter; 2382 int32_t offset = ucol_getOffset(coleiter); 2383 if (strsrch->pattern.hasSuffixAccents) { 2384 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2385 offset, status); 2386 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2387 setColEIterOffset(coleiter, offset); 2388 return TRUE; 2389 } 2390 } 2391 return FALSE; 2392 } 2393 2394 if (!strsrch->pattern.hasPrefixAccents) { 2395 return FALSE; 2396 } 2397 2398 UChar accents[INITIAL_ARRAY_SIZE_]; 2399 // offset to the last base character in substring to search 2400 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2401 // normalizing the offensive string 2402 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2403 0, accents, INITIAL_ARRAY_SIZE_, status); 2404 // status checked in loop 2405 2406 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2407 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2408 2409 // 2 power n - 1 plus the full set of accents 2410 int32_t count = (2 << (size - 1)) - 1; 2411 while (U_SUCCESS(*status) && count > 0) { 2412 UChar *rearrange = strsrch->canonicalPrefixAccents; 2413 // copy the base characters 2414 for (int k = 0; k < accentsindex[0]; k ++) { 2415 *rearrange ++ = accents[k]; 2416 } 2417 // forming all possible canonical rearrangement by dropping 2418 // sets of accents 2419 for (int i = 0; i <= size - 1; i ++) { 2420 int32_t mask = 1 << (size - i - 1); 2421 if (count & mask) { 2422 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2423 *rearrange ++ = accents[j]; 2424 } 2425 } 2426 } 2427 *rearrange = 0; 2428 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2429 baseoffset, status); 2430 if (offset != USEARCH_DONE) { 2431 return TRUE; // match found 2432 } 2433 count --; 2434 } 2435 return FALSE; 2436 } 2437 2438 /** 2439 * Checks match for contraction. 2440 * If the match starts with a partial contraction we fail. 2441 * Internal method, status assumed to be success, caller has to check status 2442 * before calling this method. 2443 * @param strsrch string search data 2444 * @param start offset of potential match, to be modified if necessary 2445 * @param end offset of potential match, to be modified if necessary 2446 * @param status only error status if any 2447 * @return TRUE if match passes the contraction test, FALSE otherwise 2448 */ 2449 static 2450 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2451 int32_t *start, 2452 int32_t *end, UErrorCode *status) 2453 { 2454 UCollationElements *coleiter = strsrch->textIter; 2455 int32_t textlength = strsrch->search->textLength; 2456 int32_t temp = *end; 2457 const UCollator *collator = strsrch->collator; 2458 const UChar *text = strsrch->search->text; 2459 // This part checks if either if the start of the match contains potential 2460 // contraction. If so we'll have to iterate through them 2461 // Since we used ucol_next while previously looking for the potential 2462 // match, this guarantees that our end will not be a partial contraction, 2463 // or a partial supplementary character. 2464 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2465 int32_t expansion = getExpansionSuffix(coleiter); 2466 UBool expandflag = expansion > 0; 2467 setColEIterOffset(coleiter, *end); 2468 while (expansion > 0) { 2469 // getting rid of the redundant ce 2470 // since forward contraction/expansion may have extra ces 2471 // if we are in the normalization buffer, hasAccentsBeforeMatch 2472 // would have taken care of it. 2473 // E.g. the character \u01FA will have an expansion of 3, but if 2474 // we are only looking for A ring A\u030A, we'll have to skip the 2475 // last ce in the expansion buffer 2476 ucol_previous(coleiter, status); 2477 if (U_FAILURE(*status)) { 2478 return FALSE; 2479 } 2480 if (ucol_getOffset(coleiter) != temp) { 2481 *end = temp; 2482 temp = ucol_getOffset(coleiter); 2483 } 2484 expansion --; 2485 } 2486 2487 int32_t *patternce = strsrch->pattern.ces; 2488 int32_t patterncelength = strsrch->pattern.cesLength; 2489 int32_t count = patterncelength; 2490 while (count > 0) { 2491 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2492 // status checked below, note that if status is a failure 2493 // ucol_previous returns UCOL_NULLORDER 2494 if (ce == UCOL_IGNORABLE) { 2495 continue; 2496 } 2497 if (expandflag && count == 0 && 2498 getColElemIterOffset(coleiter, FALSE) != temp) { 2499 *end = temp; 2500 temp = ucol_getOffset(coleiter); 2501 } 2502 if (count == patterncelength && 2503 ce != patternce[patterncelength - 1]) { 2504 // accents may have extra starting ces, this occurs when a 2505 // pure accent pattern is matched without rearrangement 2506 int32_t expected = patternce[patterncelength - 1]; 2507 U16_BACK_1(text, 0, *end); 2508 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2509 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2510 while (U_SUCCESS(*status) && ce != expected && 2511 ce != UCOL_NULLORDER && 2512 ucol_getOffset(coleiter) <= *start) { 2513 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2514 } 2515 } 2516 } 2517 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2518 (*start) --; 2519 *start = getPreviousBaseOffset(text, *start); 2520 return FALSE; 2521 } 2522 count --; 2523 } 2524 } 2525 return TRUE; 2526 } 2527 2528 /** 2529 * Checks and sets the match information if found. 2530 * Checks 2531 * <ul> 2532 * <li> the potential match does not repeat the previous match 2533 * <li> boundaries are correct 2534 * <li> potential match does not end in the middle of a contraction 2535 * <li> identical matches 2536 * <\ul> 2537 * Otherwise the offset will be shifted to the next character. 2538 * Internal method, status assumed to be success, caller has to check status 2539 * before calling this method. 2540 * @param strsrch string search data 2541 * @param textoffset offset in the collation element text. the returned value 2542 * will be the truncated start offset of the match or the new start 2543 * search offset. 2544 * @param status only error status if any 2545 * @return TRUE if the match is valid, FALSE otherwise 2546 */ 2547 static 2548 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2549 int32_t *textoffset, 2550 UErrorCode *status) 2551 { 2552 // to ensure that the start and ends are not composite characters 2553 UCollationElements *coleiter = strsrch->textIter; 2554 // if we have a canonical accent match 2555 if ((strsrch->pattern.hasSuffixAccents && 2556 strsrch->canonicalSuffixAccents[0]) || 2557 (strsrch->pattern.hasPrefixAccents && 2558 strsrch->canonicalPrefixAccents[0])) { 2559 strsrch->search->matchedIndex = *textoffset; 2560 strsrch->search->matchedLength = 2561 getNextUStringSearchBaseOffset(strsrch, 2562 getColElemIterOffset(coleiter, FALSE)) 2563 - *textoffset; 2564 return TRUE; 2565 } 2566 2567 int32_t end = ucol_getOffset(coleiter); 2568 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2569 status) || 2570 U_FAILURE(*status)) { 2571 return FALSE; 2572 } 2573 2574 end = getNextUStringSearchBaseOffset(strsrch, end); 2575 // this totally matches, however we need to check if it is repeating 2576 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2577 !isBreakUnit(strsrch, *textoffset, end) || 2578 !checkIdentical(strsrch, *textoffset, end)) { 2579 (*textoffset) --; 2580 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2581 *textoffset); 2582 return FALSE; 2583 } 2584 2585 strsrch->search->matchedIndex = *textoffset; 2586 strsrch->search->matchedLength = end - *textoffset; 2587 return TRUE; 2588 } 2589 #endif // #if BOYER_MOORE 2590 2591 // constructors and destructor ------------------------------------------- 2592 2593 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2594 int32_t patternlength, 2595 const UChar *text, 2596 int32_t textlength, 2597 const char *locale, 2598 UBreakIterator *breakiter, 2599 UErrorCode *status) 2600 { 2601 if (U_FAILURE(*status)) { 2602 return NULL; 2603 } 2604 #if UCONFIG_NO_BREAK_ITERATION 2605 if (breakiter != NULL) { 2606 *status = U_UNSUPPORTED_ERROR; 2607 return NULL; 2608 } 2609 #endif 2610 if (locale) { 2611 // ucol_open internally checks for status 2612 UCollator *collator = ucol_open(locale, status); 2613 // pattern, text checks are done in usearch_openFromCollator 2614 UStringSearch *result = usearch_openFromCollator(pattern, 2615 patternlength, text, textlength, 2616 collator, breakiter, status); 2617 2618 if (result == NULL || U_FAILURE(*status)) { 2619 if (collator) { 2620 ucol_close(collator); 2621 } 2622 return NULL; 2623 } 2624 else { 2625 result->ownCollator = TRUE; 2626 } 2627 return result; 2628 } 2629 *status = U_ILLEGAL_ARGUMENT_ERROR; 2630 return NULL; 2631 } 2632 2633 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2634 const UChar *pattern, 2635 int32_t patternlength, 2636 const UChar *text, 2637 int32_t textlength, 2638 const UCollator *collator, 2639 UBreakIterator *breakiter, 2640 UErrorCode *status) 2641 { 2642 if (U_FAILURE(*status)) { 2643 return NULL; 2644 } 2645 #if UCONFIG_NO_BREAK_ITERATION 2646 if (breakiter != NULL) { 2647 *status = U_UNSUPPORTED_ERROR; 2648 return NULL; 2649 } 2650 #endif 2651 if (pattern == NULL || text == NULL || collator == NULL) { 2652 *status = U_ILLEGAL_ARGUMENT_ERROR; 2653 return NULL; 2654 } 2655 2656 // string search does not really work when numeric collation is turned on 2657 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2658 *status = U_UNSUPPORTED_ERROR; 2659 return NULL; 2660 } 2661 2662 if (U_SUCCESS(*status)) { 2663 initializeFCD(status); 2664 if (U_FAILURE(*status)) { 2665 return NULL; 2666 } 2667 2668 UStringSearch *result; 2669 if (textlength == -1) { 2670 textlength = u_strlen(text); 2671 } 2672 if (patternlength == -1) { 2673 patternlength = u_strlen(pattern); 2674 } 2675 if (textlength <= 0 || patternlength <= 0) { 2676 *status = U_ILLEGAL_ARGUMENT_ERROR; 2677 return NULL; 2678 } 2679 2680 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2681 if (result == NULL) { 2682 *status = U_MEMORY_ALLOCATION_ERROR; 2683 return NULL; 2684 } 2685 2686 result->collator = collator; 2687 result->strength = ucol_getStrength(collator); 2688 result->ceMask = getMask(result->strength); 2689 result->toShift = 2690 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2691 UCOL_SHIFTED; 2692 result->variableTop = ucol_getVariableTop(collator, status); 2693 2694 result->nfd = Normalizer2::getNFDInstance(*status); 2695 2696 if (U_FAILURE(*status)) { 2697 uprv_free(result); 2698 return NULL; 2699 } 2700 2701 result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2702 if (result->search == NULL) { 2703 *status = U_MEMORY_ALLOCATION_ERROR; 2704 uprv_free(result); 2705 return NULL; 2706 } 2707 2708 result->search->text = text; 2709 result->search->textLength = textlength; 2710 2711 result->pattern.text = pattern; 2712 result->pattern.textLength = patternlength; 2713 result->pattern.ces = NULL; 2714 result->pattern.pces = NULL; 2715 2716 result->search->breakIter = breakiter; 2717 #if !UCONFIG_NO_BREAK_ITERATION 2718 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2719 if (breakiter) { 2720 ubrk_setText(breakiter, text, textlength, status); 2721 } 2722 #endif 2723 2724 result->ownCollator = FALSE; 2725 result->search->matchedLength = 0; 2726 result->search->matchedIndex = USEARCH_DONE; 2727 result->utilIter = NULL; 2728 result->textIter = ucol_openElements(collator, text, 2729 textlength, status); 2730 result->textProcessedIter = NULL; 2731 if (U_FAILURE(*status)) { 2732 usearch_close(result); 2733 return NULL; 2734 } 2735 2736 result->search->isOverlap = FALSE; 2737 result->search->isCanonicalMatch = FALSE; 2738 result->search->elementComparisonType = 0; 2739 result->search->isForwardSearching = TRUE; 2740 result->search->reset = TRUE; 2741 2742 initialize(result, status); 2743 2744 if (U_FAILURE(*status)) { 2745 usearch_close(result); 2746 return NULL; 2747 } 2748 2749 return result; 2750 } 2751 return NULL; 2752 } 2753 2754 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2755 { 2756 if (strsrch) { 2757 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer && 2758 strsrch->pattern.ces) { 2759 uprv_free(strsrch->pattern.ces); 2760 } 2761 2762 if (strsrch->pattern.pces != NULL && 2763 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 2764 uprv_free(strsrch->pattern.pces); 2765 } 2766 2767 delete strsrch->textProcessedIter; 2768 ucol_closeElements(strsrch->textIter); 2769 ucol_closeElements(strsrch->utilIter); 2770 2771 if (strsrch->ownCollator && strsrch->collator) { 2772 ucol_close((UCollator *)strsrch->collator); 2773 } 2774 2775 #if !UCONFIG_NO_BREAK_ITERATION 2776 if (strsrch->search->internalBreakIter) { 2777 ubrk_close(strsrch->search->internalBreakIter); 2778 } 2779 #endif 2780 2781 uprv_free(strsrch->search); 2782 uprv_free(strsrch); 2783 } 2784 } 2785 2786 namespace { 2787 2788 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { 2789 if (U_FAILURE(*status)) { return FALSE; } 2790 if (strsrch->textProcessedIter == NULL) { 2791 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); 2792 if (strsrch->textProcessedIter == NULL) { 2793 *status = U_MEMORY_ALLOCATION_ERROR; 2794 return FALSE; 2795 } 2796 } else { 2797 strsrch->textProcessedIter->init(strsrch->textIter); 2798 } 2799 return TRUE; 2800 } 2801 2802 } 2803 2804 // set and get methods -------------------------------------------------- 2805 2806 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2807 int32_t position, 2808 UErrorCode *status) 2809 { 2810 if (U_SUCCESS(*status) && strsrch) { 2811 if (isOutOfBounds(strsrch->search->textLength, position)) { 2812 *status = U_INDEX_OUTOFBOUNDS_ERROR; 2813 } 2814 else { 2815 setColEIterOffset(strsrch->textIter, position); 2816 } 2817 strsrch->search->matchedIndex = USEARCH_DONE; 2818 strsrch->search->matchedLength = 0; 2819 strsrch->search->reset = FALSE; 2820 } 2821 } 2822 2823 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2824 { 2825 if (strsrch) { 2826 int32_t result = ucol_getOffset(strsrch->textIter); 2827 if (isOutOfBounds(strsrch->search->textLength, result)) { 2828 return USEARCH_DONE; 2829 } 2830 return result; 2831 } 2832 return USEARCH_DONE; 2833 } 2834 2835 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2836 USearchAttribute attribute, 2837 USearchAttributeValue value, 2838 UErrorCode *status) 2839 { 2840 if (U_SUCCESS(*status) && strsrch) { 2841 switch (attribute) 2842 { 2843 case USEARCH_OVERLAP : 2844 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2845 break; 2846 case USEARCH_CANONICAL_MATCH : 2847 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2848 FALSE); 2849 break; 2850 case USEARCH_ELEMENT_COMPARISON : 2851 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2852 strsrch->search->elementComparisonType = (int16_t)value; 2853 } else { 2854 strsrch->search->elementComparisonType = 0; 2855 } 2856 break; 2857 case USEARCH_ATTRIBUTE_COUNT : 2858 default: 2859 *status = U_ILLEGAL_ARGUMENT_ERROR; 2860 } 2861 } 2862 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2863 *status = U_ILLEGAL_ARGUMENT_ERROR; 2864 } 2865 } 2866 2867 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2868 const UStringSearch *strsrch, 2869 USearchAttribute attribute) 2870 { 2871 if (strsrch) { 2872 switch (attribute) { 2873 case USEARCH_OVERLAP : 2874 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2875 USEARCH_OFF); 2876 case USEARCH_CANONICAL_MATCH : 2877 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2878 USEARCH_OFF); 2879 case USEARCH_ELEMENT_COMPARISON : 2880 { 2881 int16_t value = strsrch->search->elementComparisonType; 2882 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2883 return (USearchAttributeValue)value; 2884 } else { 2885 return USEARCH_STANDARD_ELEMENT_COMPARISON; 2886 } 2887 } 2888 case USEARCH_ATTRIBUTE_COUNT : 2889 return USEARCH_DEFAULT; 2890 } 2891 } 2892 return USEARCH_DEFAULT; 2893 } 2894 2895 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2896 const UStringSearch *strsrch) 2897 { 2898 if (strsrch == NULL) { 2899 return USEARCH_DONE; 2900 } 2901 return strsrch->search->matchedIndex; 2902 } 2903 2904 2905 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2906 UChar *result, 2907 int32_t resultCapacity, 2908 UErrorCode *status) 2909 { 2910 if (U_FAILURE(*status)) { 2911 return USEARCH_DONE; 2912 } 2913 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2914 result == NULL)) { 2915 *status = U_ILLEGAL_ARGUMENT_ERROR; 2916 return USEARCH_DONE; 2917 } 2918 2919 int32_t copylength = strsrch->search->matchedLength; 2920 int32_t copyindex = strsrch->search->matchedIndex; 2921 if (copyindex == USEARCH_DONE) { 2922 u_terminateUChars(result, resultCapacity, 0, status); 2923 return USEARCH_DONE; 2924 } 2925 2926 if (resultCapacity < copylength) { 2927 copylength = resultCapacity; 2928 } 2929 if (copylength > 0) { 2930 uprv_memcpy(result, strsrch->search->text + copyindex, 2931 copylength * sizeof(UChar)); 2932 } 2933 return u_terminateUChars(result, resultCapacity, 2934 strsrch->search->matchedLength, status); 2935 } 2936 2937 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2938 const UStringSearch *strsrch) 2939 { 2940 if (strsrch) { 2941 return strsrch->search->matchedLength; 2942 } 2943 return USEARCH_DONE; 2944 } 2945 2946 #if !UCONFIG_NO_BREAK_ITERATION 2947 2948 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2949 UBreakIterator *breakiter, 2950 UErrorCode *status) 2951 { 2952 if (U_SUCCESS(*status) && strsrch) { 2953 strsrch->search->breakIter = breakiter; 2954 if (breakiter) { 2955 ubrk_setText(breakiter, strsrch->search->text, 2956 strsrch->search->textLength, status); 2957 } 2958 } 2959 } 2960 2961 U_CAPI const UBreakIterator* U_EXPORT2 2962 usearch_getBreakIterator(const UStringSearch *strsrch) 2963 { 2964 if (strsrch) { 2965 return strsrch->search->breakIter; 2966 } 2967 return NULL; 2968 } 2969 2970 #endif 2971 2972 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2973 const UChar *text, 2974 int32_t textlength, 2975 UErrorCode *status) 2976 { 2977 if (U_SUCCESS(*status)) { 2978 if (strsrch == NULL || text == NULL || textlength < -1 || 2979 textlength == 0) { 2980 *status = U_ILLEGAL_ARGUMENT_ERROR; 2981 } 2982 else { 2983 if (textlength == -1) { 2984 textlength = u_strlen(text); 2985 } 2986 strsrch->search->text = text; 2987 strsrch->search->textLength = textlength; 2988 ucol_setText(strsrch->textIter, text, textlength, status); 2989 strsrch->search->matchedIndex = USEARCH_DONE; 2990 strsrch->search->matchedLength = 0; 2991 strsrch->search->reset = TRUE; 2992 #if !UCONFIG_NO_BREAK_ITERATION 2993 if (strsrch->search->breakIter != NULL) { 2994 ubrk_setText(strsrch->search->breakIter, text, 2995 textlength, status); 2996 } 2997 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2998 #endif 2999 } 3000 } 3001 } 3002 3003 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 3004 int32_t *length) 3005 { 3006 if (strsrch) { 3007 *length = strsrch->search->textLength; 3008 return strsrch->search->text; 3009 } 3010 return NULL; 3011 } 3012 3013 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3014 const UCollator *collator, 3015 UErrorCode *status) 3016 { 3017 if (U_SUCCESS(*status)) { 3018 if (collator == NULL) { 3019 *status = U_ILLEGAL_ARGUMENT_ERROR; 3020 return; 3021 } 3022 3023 if (strsrch) { 3024 delete strsrch->textProcessedIter; 3025 strsrch->textProcessedIter = NULL; 3026 ucol_closeElements(strsrch->textIter); 3027 ucol_closeElements(strsrch->utilIter); 3028 strsrch->textIter = strsrch->utilIter = NULL; 3029 if (strsrch->ownCollator && (strsrch->collator != collator)) { 3030 ucol_close((UCollator *)strsrch->collator); 3031 strsrch->ownCollator = FALSE; 3032 } 3033 strsrch->collator = collator; 3034 strsrch->strength = ucol_getStrength(collator); 3035 strsrch->ceMask = getMask(strsrch->strength); 3036 #if !UCONFIG_NO_BREAK_ITERATION 3037 ubrk_close(strsrch->search->internalBreakIter); 3038 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3039 strsrch->search->text, strsrch->search->textLength, status); 3040 #endif 3041 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3042 strsrch->toShift = 3043 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3044 UCOL_SHIFTED; 3045 // if status is a failure, ucol_getVariableTop returns 0 3046 strsrch->variableTop = ucol_getVariableTop(collator, status); 3047 strsrch->textIter = ucol_openElements(collator, 3048 strsrch->search->text, 3049 strsrch->search->textLength, 3050 status); 3051 strsrch->utilIter = ucol_openElements( 3052 collator, strsrch->pattern.text, strsrch->pattern.textLength, status); 3053 // initialize() _after_ setting the iterators for the new collator. 3054 initialize(strsrch, status); 3055 } 3056 3057 // **** are these calls needed? 3058 // **** we call uprv_init_pce in initializePatternPCETable 3059 // **** and the CEIBuffer constructor... 3060 #if 0 3061 uprv_init_pce(strsrch->textIter); 3062 uprv_init_pce(strsrch->utilIter); 3063 #endif 3064 } 3065 } 3066 3067 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3068 { 3069 if (strsrch) { 3070 return (UCollator *)strsrch->collator; 3071 } 3072 return NULL; 3073 } 3074 3075 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3076 const UChar *pattern, 3077 int32_t patternlength, 3078 UErrorCode *status) 3079 { 3080 if (U_SUCCESS(*status)) { 3081 if (strsrch == NULL || pattern == NULL) { 3082 *status = U_ILLEGAL_ARGUMENT_ERROR; 3083 } 3084 else { 3085 if (patternlength == -1) { 3086 patternlength = u_strlen(pattern); 3087 } 3088 if (patternlength == 0) { 3089 *status = U_ILLEGAL_ARGUMENT_ERROR; 3090 return; 3091 } 3092 strsrch->pattern.text = pattern; 3093 strsrch->pattern.textLength = patternlength; 3094 initialize(strsrch, status); 3095 } 3096 } 3097 } 3098 3099 U_CAPI const UChar* U_EXPORT2 3100 usearch_getPattern(const UStringSearch *strsrch, 3101 int32_t *length) 3102 { 3103 if (strsrch) { 3104 *length = strsrch->pattern.textLength; 3105 return strsrch->pattern.text; 3106 } 3107 return NULL; 3108 } 3109 3110 // miscellanous methods -------------------------------------------------- 3111 3112 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3113 UErrorCode *status) 3114 { 3115 if (strsrch && U_SUCCESS(*status)) { 3116 strsrch->search->isForwardSearching = TRUE; 3117 usearch_setOffset(strsrch, 0, status); 3118 if (U_SUCCESS(*status)) { 3119 return usearch_next(strsrch, status); 3120 } 3121 } 3122 return USEARCH_DONE; 3123 } 3124 3125 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3126 int32_t position, 3127 UErrorCode *status) 3128 { 3129 if (strsrch && U_SUCCESS(*status)) { 3130 strsrch->search->isForwardSearching = TRUE; 3131 // position checked in usearch_setOffset 3132 usearch_setOffset(strsrch, position, status); 3133 if (U_SUCCESS(*status)) { 3134 return usearch_next(strsrch, status); 3135 } 3136 } 3137 return USEARCH_DONE; 3138 } 3139 3140 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3141 UErrorCode *status) 3142 { 3143 if (strsrch && U_SUCCESS(*status)) { 3144 strsrch->search->isForwardSearching = FALSE; 3145 usearch_setOffset(strsrch, strsrch->search->textLength, status); 3146 if (U_SUCCESS(*status)) { 3147 return usearch_previous(strsrch, status); 3148 } 3149 } 3150 return USEARCH_DONE; 3151 } 3152 3153 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3154 int32_t position, 3155 UErrorCode *status) 3156 { 3157 if (strsrch && U_SUCCESS(*status)) { 3158 strsrch->search->isForwardSearching = FALSE; 3159 // position checked in usearch_setOffset 3160 usearch_setOffset(strsrch, position, status); 3161 if (U_SUCCESS(*status)) { 3162 return usearch_previous(strsrch, status); 3163 } 3164 } 3165 return USEARCH_DONE; 3166 } 3167 3168 /** 3169 * If a direction switch is required, we'll count the number of ces till the 3170 * beginning of the collation element iterator and iterate forwards that 3171 * number of times. This is so that we get to the correct point within the 3172 * string to continue the search in. Imagine when we are in the middle of the 3173 * normalization buffer when the change in direction is request. arrrgghh.... 3174 * After searching the offset within the collation element iterator will be 3175 * shifted to the start of the match. If a match is not found, the offset would 3176 * have been set to the end of the text string in the collation element 3177 * iterator. 3178 * Okay, here's my take on normalization buffer. The only time when there can 3179 * be 2 matches within the same normalization is when the pattern is consists 3180 * of all accents. But since the offset returned is from the text string, we 3181 * should not confuse the caller by returning the second match within the 3182 * same normalization buffer. If we do, the 2 results will have the same match 3183 * offsets, and that'll be confusing. I'll return the next match that doesn't 3184 * fall within the same normalization buffer. Note this does not affect the 3185 * results of matches spanning the text and the normalization buffer. 3186 * The position to start searching is taken from the collation element 3187 * iterator. Callers of this API would have to set the offset in the collation 3188 * element iterator before using this method. 3189 */ 3190 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3191 UErrorCode *status) 3192 { 3193 if (U_SUCCESS(*status) && strsrch) { 3194 // note offset is either equivalent to the start of the previous match 3195 // or is set by the user 3196 int32_t offset = usearch_getOffset(strsrch); 3197 USearch *search = strsrch->search; 3198 search->reset = FALSE; 3199 int32_t textlength = search->textLength; 3200 if (search->isForwardSearching) { 3201 #if BOYER_MOORE 3202 if (offset == textlength 3203 || (!search->isOverlap && 3204 (offset + strsrch->pattern.defaultShiftSize > textlength || 3205 (search->matchedIndex != USEARCH_DONE && 3206 offset + search->matchedLength >= textlength)))) { 3207 // not enough characters to match 3208 setMatchNotFound(strsrch); 3209 return USEARCH_DONE; 3210 } 3211 #else 3212 if (offset == textlength || 3213 (! search->isOverlap && 3214 (search->matchedIndex != USEARCH_DONE && 3215 offset + search->matchedLength > textlength))) { 3216 // not enough characters to match 3217 setMatchNotFound(strsrch); 3218 return USEARCH_DONE; 3219 } 3220 #endif 3221 } 3222 else { 3223 // switching direction. 3224 // if matchedIndex == USEARCH_DONE, it means that either a 3225 // setOffset has been called or that previous ran off the text 3226 // string. the iterator would have been set to offset 0 if a 3227 // match is not found. 3228 search->isForwardSearching = TRUE; 3229 if (search->matchedIndex != USEARCH_DONE) { 3230 // there's no need to set the collation element iterator 3231 // the next call to next will set the offset. 3232 return search->matchedIndex; 3233 } 3234 } 3235 3236 if (U_SUCCESS(*status)) { 3237 if (strsrch->pattern.cesLength == 0) { 3238 if (search->matchedIndex == USEARCH_DONE) { 3239 search->matchedIndex = offset; 3240 } 3241 else { // moves by codepoints 3242 U16_FWD_1(search->text, search->matchedIndex, textlength); 3243 } 3244 3245 search->matchedLength = 0; 3246 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3247 // status checked below 3248 if (search->matchedIndex == textlength) { 3249 search->matchedIndex = USEARCH_DONE; 3250 } 3251 } 3252 else { 3253 if (search->matchedLength > 0) { 3254 // if matchlength is 0 we are at the start of the iteration 3255 if (search->isOverlap) { 3256 ucol_setOffset(strsrch->textIter, offset + 1, status); 3257 } 3258 else { 3259 ucol_setOffset(strsrch->textIter, 3260 offset + search->matchedLength, status); 3261 } 3262 } 3263 else { 3264 // for boundary check purposes. this will ensure that the 3265 // next match will not preceed the current offset 3266 // note search->matchedIndex will always be set to something 3267 // in the code 3268 search->matchedIndex = offset - 1; 3269 } 3270 3271 if (search->isCanonicalMatch) { 3272 // can't use exact here since extra accents are allowed. 3273 usearch_handleNextCanonical(strsrch, status); 3274 } 3275 else { 3276 usearch_handleNextExact(strsrch, status); 3277 } 3278 } 3279 3280 if (U_FAILURE(*status)) { 3281 return USEARCH_DONE; 3282 } 3283 3284 #if !BOYER_MOORE 3285 if (search->matchedIndex == USEARCH_DONE) { 3286 ucol_setOffset(strsrch->textIter, search->textLength, status); 3287 } else { 3288 ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3289 } 3290 #endif 3291 3292 return search->matchedIndex; 3293 } 3294 } 3295 return USEARCH_DONE; 3296 } 3297 3298 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3299 UErrorCode *status) 3300 { 3301 if (U_SUCCESS(*status) && strsrch) { 3302 int32_t offset; 3303 USearch *search = strsrch->search; 3304 if (search->reset) { 3305 offset = search->textLength; 3306 search->isForwardSearching = FALSE; 3307 search->reset = FALSE; 3308 setColEIterOffset(strsrch->textIter, offset); 3309 } 3310 else { 3311 offset = usearch_getOffset(strsrch); 3312 } 3313 3314 int32_t matchedindex = search->matchedIndex; 3315 if (search->isForwardSearching == TRUE) { 3316 // switching direction. 3317 // if matchedIndex == USEARCH_DONE, it means that either a 3318 // setOffset has been called or that next ran off the text 3319 // string. the iterator would have been set to offset textLength if 3320 // a match is not found. 3321 search->isForwardSearching = FALSE; 3322 if (matchedindex != USEARCH_DONE) { 3323 return matchedindex; 3324 } 3325 } 3326 else { 3327 #if BOYER_MOORE 3328 if (offset == 0 || matchedindex == 0 || 3329 (!search->isOverlap && 3330 (offset < strsrch->pattern.defaultShiftSize || 3331 (matchedindex != USEARCH_DONE && 3332 matchedindex < strsrch->pattern.defaultShiftSize)))) { 3333 // not enough characters to match 3334 setMatchNotFound(strsrch); 3335 return USEARCH_DONE; 3336 } 3337 #else 3338 // Could check pattern length, but the 3339 // linear search will do the right thing 3340 if (offset == 0 || matchedindex == 0) { 3341 setMatchNotFound(strsrch); 3342 return USEARCH_DONE; 3343 } 3344 #endif 3345 } 3346 3347 if (U_SUCCESS(*status)) { 3348 if (strsrch->pattern.cesLength == 0) { 3349 search->matchedIndex = 3350 (matchedindex == USEARCH_DONE ? offset : matchedindex); 3351 if (search->matchedIndex == 0) { 3352 setMatchNotFound(strsrch); 3353 // status checked below 3354 } 3355 else { // move by codepoints 3356 U16_BACK_1(search->text, 0, search->matchedIndex); 3357 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3358 // status checked below 3359 search->matchedLength = 0; 3360 } 3361 } 3362 else { 3363 if (strsrch->search->isCanonicalMatch) { 3364 // can't use exact here since extra accents are allowed. 3365 usearch_handlePreviousCanonical(strsrch, status); 3366 // status checked below 3367 } 3368 else { 3369 usearch_handlePreviousExact(strsrch, status); 3370 // status checked below 3371 } 3372 } 3373 3374 if (U_FAILURE(*status)) { 3375 return USEARCH_DONE; 3376 } 3377 3378 return search->matchedIndex; 3379 } 3380 } 3381 return USEARCH_DONE; 3382 } 3383 3384 3385 3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3387 { 3388 /* 3389 reset is setting the attributes that are already in 3390 string search, hence all attributes in the collator should 3391 be retrieved without any problems 3392 */ 3393 if (strsrch) { 3394 UErrorCode status = U_ZERO_ERROR; 3395 UBool sameCollAttribute = TRUE; 3396 uint32_t ceMask; 3397 UBool shift; 3398 uint32_t varTop; 3399 3400 // **** hack to deal w/ how processed CEs encode quaternary **** 3401 UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3402 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3403 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3404 sameCollAttribute = FALSE; 3405 } 3406 3407 strsrch->strength = ucol_getStrength(strsrch->collator); 3408 ceMask = getMask(strsrch->strength); 3409 if (strsrch->ceMask != ceMask) { 3410 strsrch->ceMask = ceMask; 3411 sameCollAttribute = FALSE; 3412 } 3413 3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3415 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3416 &status) == UCOL_SHIFTED; 3417 if (strsrch->toShift != shift) { 3418 strsrch->toShift = shift; 3419 sameCollAttribute = FALSE; 3420 } 3421 3422 // if status is a failure, ucol_getVariableTop returns 0 3423 varTop = ucol_getVariableTop(strsrch->collator, &status); 3424 if (strsrch->variableTop != varTop) { 3425 strsrch->variableTop = varTop; 3426 sameCollAttribute = FALSE; 3427 } 3428 if (!sameCollAttribute) { 3429 initialize(strsrch, &status); 3430 } 3431 ucol_setText(strsrch->textIter, strsrch->search->text, 3432 strsrch->search->textLength, 3433 &status); 3434 strsrch->search->matchedLength = 0; 3435 strsrch->search->matchedIndex = USEARCH_DONE; 3436 strsrch->search->isOverlap = FALSE; 3437 strsrch->search->isCanonicalMatch = FALSE; 3438 strsrch->search->elementComparisonType = 0; 3439 strsrch->search->isForwardSearching = TRUE; 3440 strsrch->search->reset = TRUE; 3441 } 3442 } 3443 3444 // 3445 // CEI Collation Element + source text index. 3446 // These structs are kept in the circular buffer. 3447 // 3448 struct CEI { 3449 int64_t ce; 3450 int32_t lowIndex; 3451 int32_t highIndex; 3452 }; 3453 3454 U_NAMESPACE_BEGIN 3455 3456 namespace { 3457 // 3458 // CEIBuffer A circular buffer of CEs-with-index from the text being searched. 3459 // 3460 #define DEFAULT_CEBUFFER_SIZE 96 3461 #define CEBUFFER_EXTRA 32 3462 // Some typical max values to make buffer size more reasonable for asymmetric search. 3463 // #8694 is for a better long-term solution to allocation of this buffer. 3464 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 3465 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 3466 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 3467 struct CEIBuffer { 3468 CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3469 CEI *buf; 3470 int32_t bufSize; 3471 int32_t firstIx; 3472 int32_t limitIx; 3473 UCollationElements *ceIter; 3474 UStringSearch *strSearch; 3475 3476 3477 3478 CEIBuffer(UStringSearch *ss, UErrorCode *status); 3479 ~CEIBuffer(); 3480 const CEI *get(int32_t index); 3481 const CEI *getPrevious(int32_t index); 3482 }; 3483 3484 3485 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { 3486 buf = defBuf; 3487 strSearch = ss; 3488 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA; 3489 if (ss->search->elementComparisonType != 0) { 3490 const UChar * patText = ss->pattern.text; 3491 if (patText) { 3492 const UChar * patTextLimit = patText + ss->pattern.textLength; 3493 while ( patText < patTextLimit ) { 3494 UChar c = *patText++; 3495 if (MIGHT_BE_JAMO_L(c)) { 3496 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 3497 } else { 3498 // No check for surrogates, we might allocate slightly more buffer than necessary. 3499 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 3500 } 3501 } 3502 } 3503 } 3504 ceIter = ss->textIter; 3505 firstIx = 0; 3506 limitIx = 0; 3507 3508 if (!initTextProcessedIter(ss, status)) { return; } 3509 3510 if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3511 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3512 if (buf == NULL) { 3513 *status = U_MEMORY_ALLOCATION_ERROR; 3514 } 3515 } 3516 } 3517 3518 // TODO: add a reset or init function so that allocated 3519 // buffers can be retained & reused. 3520 3521 CEIBuffer::~CEIBuffer() { 3522 if (buf != defBuf) { 3523 uprv_free(buf); 3524 } 3525 } 3526 3527 3528 // Get the CE with the specified index. 3529 // Index must be in the range 3530 // n-history_size < index < n+1 3531 // where n is the largest index to have been fetched by some previous call to this function. 3532 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3533 // 3534 const CEI *CEIBuffer::get(int32_t index) { 3535 int i = index % bufSize; 3536 3537 if (index>=firstIx && index<limitIx) { 3538 // The request was for an entry already in our buffer. 3539 // Just return it. 3540 return &buf[i]; 3541 } 3542 3543 // Caller is requesting a new, never accessed before, CE. 3544 // Verify that it is the next one in sequence, which is all 3545 // that is allowed. 3546 if (index != limitIx) { 3547 U_ASSERT(FALSE); 3548 3549 return NULL; 3550 } 3551 3552 // Manage the circular CE buffer indexing 3553 limitIx++; 3554 3555 if (limitIx - firstIx >= bufSize) { 3556 // The buffer is full, knock out the lowest-indexed entry. 3557 firstIx++; 3558 } 3559 3560 UErrorCode status = U_ZERO_ERROR; 3561 3562 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3563 3564 return &buf[i]; 3565 } 3566 3567 // Get the CE with the specified index. 3568 // Index must be in the range 3569 // n-history_size < index < n+1 3570 // where n is the largest index to have been fetched by some previous call to this function. 3571 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3572 // 3573 const CEI *CEIBuffer::getPrevious(int32_t index) { 3574 int i = index % bufSize; 3575 3576 if (index>=firstIx && index<limitIx) { 3577 // The request was for an entry already in our buffer. 3578 // Just return it. 3579 return &buf[i]; 3580 } 3581 3582 // Caller is requesting a new, never accessed before, CE. 3583 // Verify that it is the next one in sequence, which is all 3584 // that is allowed. 3585 if (index != limitIx) { 3586 U_ASSERT(FALSE); 3587 3588 return NULL; 3589 } 3590 3591 // Manage the circular CE buffer indexing 3592 limitIx++; 3593 3594 if (limitIx - firstIx >= bufSize) { 3595 // The buffer is full, knock out the lowest-indexed entry. 3596 firstIx++; 3597 } 3598 3599 UErrorCode status = U_ZERO_ERROR; 3600 3601 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3602 3603 return &buf[i]; 3604 } 3605 3606 } 3607 3608 U_NAMESPACE_END 3609 3610 3611 // #define USEARCH_DEBUG 3612 3613 #ifdef USEARCH_DEBUG 3614 #include <stdio.h> 3615 #include <stdlib.h> 3616 #endif 3617 3618 /* 3619 * Find the next break boundary after startIndex. If the UStringSearch object 3620 * has an external break iterator, use that. Otherwise use the internal character 3621 * break iterator. 3622 */ 3623 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3624 #if 0 3625 const UChar *text = strsrch->search->text; 3626 int32_t textLen = strsrch->search->textLength; 3627 3628 U_ASSERT(startIndex>=0); 3629 U_ASSERT(startIndex<=textLen); 3630 3631 if (startIndex >= textLen) { 3632 return startIndex; 3633 } 3634 3635 UChar32 c; 3636 int32_t i = startIndex; 3637 U16_NEXT(text, i, textLen, c); 3638 3639 // If we are on a control character, stop without looking for combining marks. 3640 // Control characters do not combine. 3641 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3642 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3643 return i; 3644 } 3645 3646 // The initial character was not a control, and can thus accept trailing 3647 // combining characters. Advance over however many of them there are. 3648 int32_t indexOfLastCharChecked; 3649 for (;;) { 3650 indexOfLastCharChecked = i; 3651 if (i>=textLen) { 3652 break; 3653 } 3654 U16_NEXT(text, i, textLen, c); 3655 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3656 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3657 break; 3658 } 3659 } 3660 return indexOfLastCharChecked; 3661 #elif !UCONFIG_NO_BREAK_ITERATION 3662 UBreakIterator *breakiterator = strsrch->search->breakIter; 3663 3664 if (breakiterator == NULL) { 3665 breakiterator = strsrch->search->internalBreakIter; 3666 } 3667 3668 if (breakiterator != NULL) { 3669 return ubrk_following(breakiterator, startIndex); 3670 } 3671 3672 return startIndex; 3673 #else 3674 // **** or should we use the original code? **** 3675 return startIndex; 3676 #endif 3677 3678 } 3679 3680 /* 3681 * Returns TRUE if index is on a break boundary. If the UStringSearch 3682 * has an external break iterator, test using that, otherwise test 3683 * using the internal character break iterator. 3684 */ 3685 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3686 #if 0 3687 const UChar *text = strsrch->search->text; 3688 int32_t textLen = strsrch->search->textLength; 3689 3690 U_ASSERT(index>=0); 3691 U_ASSERT(index<=textLen); 3692 3693 if (index>=textLen || index<=0) { 3694 return TRUE; 3695 } 3696 3697 // If the character at the current index is not a GRAPHEME_EXTEND 3698 // then we can not be within a combining sequence. 3699 UChar32 c; 3700 U16_GET(text, 0, index, textLen, c); 3701 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3702 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3703 return TRUE; 3704 } 3705 3706 // We are at a combining mark. If the preceding character is anything 3707 // except a CONTROL, CR or LF, we are in a combining sequence. 3708 U16_PREV(text, 0, index, c); 3709 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3710 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3711 return !combining; 3712 #elif !UCONFIG_NO_BREAK_ITERATION 3713 UBreakIterator *breakiterator = strsrch->search->breakIter; 3714 3715 if (breakiterator == NULL) { 3716 breakiterator = strsrch->search->internalBreakIter; 3717 } 3718 3719 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 3720 #else 3721 // **** or use the original code? **** 3722 return TRUE; 3723 #endif 3724 } 3725 3726 #if 0 3727 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3728 { 3729 #if !UCONFIG_NO_BREAK_ITERATION 3730 UBreakIterator *breakiterator = strsrch->search->breakIter; 3731 3732 if (breakiterator != NULL) { 3733 int32_t startindex = ubrk_first(breakiterator); 3734 int32_t endindex = ubrk_last(breakiterator); 3735 3736 // out-of-range indexes are never boundary positions 3737 if (start < startindex || start > endindex || 3738 end < startindex || end > endindex) { 3739 return FALSE; 3740 } 3741 3742 return ubrk_isBoundary(breakiterator, start) && 3743 ubrk_isBoundary(breakiterator, end); 3744 } 3745 #endif 3746 3747 return TRUE; 3748 } 3749 #endif 3750 3751 typedef enum { 3752 U_CE_MATCH = -1, 3753 U_CE_NO_MATCH = 0, 3754 U_CE_SKIP_TARG, 3755 U_CE_SKIP_PATN 3756 } UCompareCEsResult; 3757 #define U_CE_LEVEL2_BASE 0x00000005 3758 #define U_CE_LEVEL3_BASE 0x00050000 3759 3760 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 3761 if (targCE == patCE) { 3762 return U_CE_MATCH; 3763 } 3764 if (compareType == 0) { 3765 return U_CE_NO_MATCH; 3766 } 3767 3768 int64_t targCEshifted = targCE >> 32; 3769 int64_t patCEshifted = patCE >> 32; 3770 int64_t mask; 3771 3772 mask = 0xFFFF0000; 3773 int32_t targLev1 = (int32_t)(targCEshifted & mask); 3774 int32_t patLev1 = (int32_t)(patCEshifted & mask); 3775 if ( targLev1 != patLev1 ) { 3776 if ( targLev1 == 0 ) { 3777 return U_CE_SKIP_TARG; 3778 } 3779 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3780 return U_CE_SKIP_PATN; 3781 } 3782 return U_CE_NO_MATCH; 3783 } 3784 3785 mask = 0x0000FFFF; 3786 int32_t targLev2 = (int32_t)(targCEshifted & mask); 3787 int32_t patLev2 = (int32_t)(patCEshifted & mask); 3788 if ( targLev2 != patLev2 ) { 3789 if ( targLev2 == 0 ) { 3790 return U_CE_SKIP_TARG; 3791 } 3792 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3793 return U_CE_SKIP_PATN; 3794 } 3795 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 3796 U_CE_MATCH: U_CE_NO_MATCH; 3797 } 3798 3799 mask = 0xFFFF0000; 3800 int32_t targLev3 = (int32_t)(targCE & mask); 3801 int32_t patLev3 = (int32_t)(patCE & mask); 3802 if ( targLev3 != patLev3 ) { 3803 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 3804 U_CE_MATCH: U_CE_NO_MATCH; 3805 } 3806 3807 return U_CE_MATCH; 3808 } 3809 3810 #if BOYER_MOORE 3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 3812 #endif 3813 3814 namespace { 3815 3816 UChar32 codePointAt(const USearch &search, int32_t index) { 3817 if (index < search.textLength) { 3818 UChar32 c; 3819 U16_NEXT(search.text, index, search.textLength, c); 3820 return c; 3821 } 3822 return U_SENTINEL; 3823 } 3824 3825 UChar32 codePointBefore(const USearch &search, int32_t index) { 3826 if (0 < index) { 3827 UChar32 c; 3828 U16_PREV(search.text, 0, index, c); 3829 return c; 3830 } 3831 return U_SENTINEL; 3832 } 3833 3834 } // namespace 3835 3836 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3837 int32_t startIdx, 3838 int32_t *matchStart, 3839 int32_t *matchLimit, 3840 UErrorCode *status) 3841 { 3842 if (U_FAILURE(*status)) { 3843 return FALSE; 3844 } 3845 3846 // TODO: reject search patterns beginning with a combining char. 3847 3848 #ifdef USEARCH_DEBUG 3849 if (getenv("USEARCH_DEBUG") != NULL) { 3850 printf("Pattern CEs\n"); 3851 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 3852 printf(" %8x", strsrch->pattern.ces[ii]); 3853 } 3854 printf("\n"); 3855 } 3856 3857 #endif 3858 // Input parameter sanity check. 3859 // TODO: should input indicies clip to the text length 3860 // in the same way that UText does. 3861 if(strsrch->pattern.cesLength == 0 || 3862 startIdx < 0 || 3863 startIdx > strsrch->search->textLength || 3864 strsrch->pattern.ces == NULL) { 3865 *status = U_ILLEGAL_ARGUMENT_ERROR; 3866 return FALSE; 3867 } 3868 3869 if (strsrch->pattern.pces == NULL) { 3870 initializePatternPCETable(strsrch, status); 3871 } 3872 3873 ucol_setOffset(strsrch->textIter, startIdx, status); 3874 CEIBuffer ceb(strsrch, status); 3875 3876 3877 int32_t targetIx = 0; 3878 const CEI *targetCEI = NULL; 3879 int32_t patIx; 3880 UBool found; 3881 3882 int32_t mStart = -1; 3883 int32_t mLimit = -1; 3884 int32_t minLimit; 3885 int32_t maxLimit; 3886 3887 3888 3889 // Outer loop moves over match starting positions in the 3890 // target CE space. 3891 // Here we see the target as a sequence of collation elements, resulting from the following: 3892 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 3893 // (for example, digraphs such as IJ may be broken into two characters). 3894 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 3895 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 3896 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 3897 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 3898 // then the CE is deleted, so the following code sees only CEs that are relevant. 3899 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 3900 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 3901 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 3902 // 3903 for(targetIx=0; ; targetIx++) 3904 { 3905 found = TRUE; 3906 // Inner loop checks for a match beginning at each 3907 // position from the outer loop. 3908 int32_t targetIxOffset = 0; 3909 int64_t patCE = 0; 3910 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 3911 // (compared to the last CE fetched for the previous targetIx value) as we need to go 3912 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 3913 const CEI *firstCEI = ceb.get(targetIx); 3914 if (firstCEI == NULL) { 3915 *status = U_INTERNAL_PROGRAM_ERROR; 3916 found = FALSE; 3917 break; 3918 } 3919 3920 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) { 3921 patCE = strsrch->pattern.pces[patIx]; 3922 targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 3923 // Compare CE from target string with CE from the pattern. 3924 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 3925 // which will fail the compare, below. 3926 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 3927 if ( ceMatch == U_CE_NO_MATCH ) { 3928 found = FALSE; 3929 break; 3930 } else if ( ceMatch > U_CE_NO_MATCH ) { 3931 if ( ceMatch == U_CE_SKIP_TARG ) { 3932 // redo with same patCE, next targCE 3933 patIx--; 3934 targetIxOffset++; 3935 } else { // ceMatch == U_CE_SKIP_PATN 3936 // redo with same targCE, next patCE 3937 targetIxOffset--; 3938 } 3939 } 3940 } 3941 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far 3942 3943 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 3944 // No match at this targetIx. Try again at the next. 3945 continue; 3946 } 3947 3948 if (!found) { 3949 // No match at all, we have run off the end of the target text. 3950 break; 3951 } 3952 3953 3954 // We have found a match in CE space. 3955 // Now determine the bounds in string index space. 3956 // There still is a chance of match failure if the CE range not correspond to 3957 // an acceptable character range. 3958 // 3959 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 3960 3961 mStart = firstCEI->lowIndex; 3962 minLimit = lastCEI->lowIndex; 3963 3964 // Look at the CE following the match. If it is UCOL_NULLORDER the match 3965 // extended to the end of input, and the match is good. 3966 3967 // Look at the high and low indices of the CE following the match. If 3968 // they are the same it means one of two things: 3969 // 1. The match extended to the last CE from the target text, which is OK, or 3970 // 2. The last CE that was part of the match is in an expansion that extends 3971 // to the first CE after the match. In this case, we reject the match. 3972 const CEI *nextCEI = 0; 3973 if (strsrch->search->elementComparisonType == 0) { 3974 nextCEI = ceb.get(targetIx + targetIxOffset); 3975 maxLimit = nextCEI->lowIndex; 3976 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 3977 found = FALSE; 3978 } 3979 } else { 3980 for ( ; ; ++targetIxOffset ) { 3981 nextCEI = ceb.get(targetIx + targetIxOffset); 3982 maxLimit = nextCEI->lowIndex; 3983 // If we are at the end of the target too, match succeeds 3984 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 3985 break; 3986 } 3987 // As long as the next CE has primary weight of 0, 3988 // it is part of the last target element matched by the pattern; 3989 // make sure it can be part of a match with the last patCE 3990 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 3991 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 3992 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 3993 found = FALSE; 3994 break; 3995 } 3996 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 3997 // target element, but it has non-zero primary weight => match fails 3998 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 3999 found = false; 4000 break; 4001 // Else the target CE is not part of an expansion of the last matched element, match succeeds 4002 } else { 4003 break; 4004 } 4005 } 4006 } 4007 4008 4009 // Check for the start of the match being within a combining sequence. 4010 // This can happen if the pattern itself begins with a combining char, and 4011 // the match found combining marks in the target text that were attached 4012 // to something else. 4013 // This type of match should be rejected for not completely consuming a 4014 // combining sequence. 4015 if (!isBreakBoundary(strsrch, mStart)) { 4016 found = FALSE; 4017 } 4018 4019 // Check for the start of the match being within an Collation Element Expansion, 4020 // meaning that the first char of the match is only partially matched. 4021 // With exapnsions, the first CE will report the index of the source 4022 // character, and all subsequent (expansions) CEs will report the source index of the 4023 // _following_ character. 4024 int32_t secondIx = firstCEI->highIndex; 4025 if (mStart == secondIx) { 4026 found = FALSE; 4027 } 4028 4029 // Allow matches to end in the middle of a grapheme cluster if the following 4030 // conditions are met; this is needed to make prefix search work properly in 4031 // Indic, see #11750 4032 // * the default breakIter is being used 4033 // * the next collation element after this combining sequence 4034 // - has non-zero primary weight 4035 // - corresponds to a separate character following the one at end of the current match 4036 // (the second of these conditions, and perhaps both, may be redundant given the 4037 // subsequent check for normalization boundary; however they are likely much faster 4038 // tests in any case) 4039 // * the match limit is a normalization boundary 4040 UBool allowMidclusterMatch = FALSE; 4041 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { 4042 allowMidclusterMatch = 4043 strsrch->search->breakIter == NULL && 4044 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 4045 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 4046 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 4047 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 4048 } 4049 // If those conditions are met, then: 4050 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 4051 // the match limit may be backed off to a previous break boundary. This handles 4052 // cases in which mLimit includes target characters that are ignorable with current 4053 // settings (such as space) and which extend beyond the pattern match. 4054 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 4055 // * do NOT require that match limit be on a breakIter boundary 4056 4057 // Advance the match end position to the first acceptable match boundary. 4058 // This advances the index over any combining charcters. 4059 mLimit = maxLimit; 4060 if (minLimit < maxLimit) { 4061 // When the last CE's low index is same with its high index, the CE is likely 4062 // a part of expansion. In this case, the index is located just after the 4063 // character corresponding to the CEs compared above. If the index is right 4064 // at the break boundary, move the position to the next boundary will result 4065 // incorrect match length when there are ignorable characters exist between 4066 // the position and the next character produces CE(s). See ticket#8482. 4067 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 4068 mLimit = minLimit; 4069 } else { 4070 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4071 // Note that we can have nba < maxLimit && nba >= minLImit, in which 4072 // case we want to set mLimit to nba regardless of allowMidclusterMatch 4073 // (i.e. we back off mLimit to the previous breakIterator boundary). 4074 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 4075 mLimit = nba; 4076 } 4077 } 4078 } 4079 4080 #ifdef USEARCH_DEBUG 4081 if (getenv("USEARCH_DEBUG") != NULL) { 4082 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4083 } 4084 #endif 4085 4086 if (!allowMidclusterMatch) { 4087 // If advancing to the end of a combining sequence in character indexing space 4088 // advanced us beyond the end of the match in CE space, reject this match. 4089 if (mLimit > maxLimit) { 4090 found = FALSE; 4091 } 4092 4093 if (!isBreakBoundary(strsrch, mLimit)) { 4094 found = FALSE; 4095 } 4096 } 4097 4098 if (! checkIdentical(strsrch, mStart, mLimit)) { 4099 found = FALSE; 4100 } 4101 4102 if (found) { 4103 break; 4104 } 4105 } 4106 4107 #ifdef USEARCH_DEBUG 4108 if (getenv("USEARCH_DEBUG") != NULL) { 4109 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4110 int32_t lastToPrint = ceb.limitIx+2; 4111 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4112 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4113 } 4114 printf("\n%s\n", found? "match found" : "no match"); 4115 } 4116 #endif 4117 4118 // All Done. Store back the match bounds to the caller. 4119 // 4120 if (found==FALSE) { 4121 mLimit = -1; 4122 mStart = -1; 4123 } 4124 4125 if (matchStart != NULL) { 4126 *matchStart= mStart; 4127 } 4128 4129 if (matchLimit != NULL) { 4130 *matchLimit = mLimit; 4131 } 4132 4133 return found; 4134 } 4135 4136 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 4137 int32_t startIdx, 4138 int32_t *matchStart, 4139 int32_t *matchLimit, 4140 UErrorCode *status) 4141 { 4142 if (U_FAILURE(*status)) { 4143 return FALSE; 4144 } 4145 4146 // TODO: reject search patterns beginning with a combining char. 4147 4148 #ifdef USEARCH_DEBUG 4149 if (getenv("USEARCH_DEBUG") != NULL) { 4150 printf("Pattern CEs\n"); 4151 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 4152 printf(" %8x", strsrch->pattern.ces[ii]); 4153 } 4154 printf("\n"); 4155 } 4156 4157 #endif 4158 // Input parameter sanity check. 4159 // TODO: should input indicies clip to the text length 4160 // in the same way that UText does. 4161 if(strsrch->pattern.cesLength == 0 || 4162 startIdx < 0 || 4163 startIdx > strsrch->search->textLength || 4164 strsrch->pattern.ces == NULL) { 4165 *status = U_ILLEGAL_ARGUMENT_ERROR; 4166 return FALSE; 4167 } 4168 4169 if (strsrch->pattern.pces == NULL) { 4170 initializePatternPCETable(strsrch, status); 4171 } 4172 4173 CEIBuffer ceb(strsrch, status); 4174 int32_t targetIx = 0; 4175 4176 /* 4177 * Pre-load the buffer with the CE's for the grapheme 4178 * after our starting position so that we're sure that 4179 * we can look at the CE following the match when we 4180 * check the match boundaries. 4181 * 4182 * This will also pre-fetch the first CE that we'll 4183 * consider for the match. 4184 */ 4185 if (startIdx < strsrch->search->textLength) { 4186 UBreakIterator *bi = strsrch->search->internalBreakIter; 4187 int32_t next = ubrk_following(bi, startIdx); 4188 4189 ucol_setOffset(strsrch->textIter, next, status); 4190 4191 for (targetIx = 0; ; targetIx += 1) { 4192 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 4193 break; 4194 } 4195 } 4196 } else { 4197 ucol_setOffset(strsrch->textIter, startIdx, status); 4198 } 4199 4200 4201 const CEI *targetCEI = NULL; 4202 int32_t patIx; 4203 UBool found; 4204 4205 int32_t limitIx = targetIx; 4206 int32_t mStart = -1; 4207 int32_t mLimit = -1; 4208 int32_t minLimit; 4209 int32_t maxLimit; 4210 4211 4212 4213 // Outer loop moves over match starting positions in the 4214 // target CE space. 4215 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 4216 // But patIx is 0 at the beginning of the pattern and increases toward the end. 4217 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 4218 // and the beginning of the base text. 4219 for(targetIx = limitIx; ; targetIx += 1) 4220 { 4221 found = TRUE; 4222 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 4223 // (compared to the last CE fetched for the previous targetIx value) as we need to go 4224 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 4225 const CEI *lastCEI = ceb.getPrevious(targetIx); 4226 if (lastCEI == NULL) { 4227 *status = U_INTERNAL_PROGRAM_ERROR; 4228 found = FALSE; 4229 break; 4230 } 4231 // Inner loop checks for a match beginning at each 4232 // position from the outer loop. 4233 int32_t targetIxOffset = 0; 4234 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) { 4235 int64_t patCE = strsrch->pattern.pces[patIx]; 4236 4237 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset); 4238 // Compare CE from target string with CE from the pattern. 4239 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4240 // which will fail the compare, below. 4241 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 4242 if ( ceMatch == U_CE_NO_MATCH ) { 4243 found = FALSE; 4244 break; 4245 } else if ( ceMatch > U_CE_NO_MATCH ) { 4246 if ( ceMatch == U_CE_SKIP_TARG ) { 4247 // redo with same patCE, next targCE 4248 patIx++; 4249 targetIxOffset++; 4250 } else { // ceMatch == U_CE_SKIP_PATN 4251 // redo with same targCE, next patCE 4252 targetIxOffset--; 4253 } 4254 } 4255 } 4256 4257 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 4258 // No match at this targetIx. Try again at the next. 4259 continue; 4260 } 4261 4262 if (!found) { 4263 // No match at all, we have run off the end of the target text. 4264 break; 4265 } 4266 4267 4268 // We have found a match in CE space. 4269 // Now determine the bounds in string index space. 4270 // There still is a chance of match failure if the CE range not correspond to 4271 // an acceptable character range. 4272 // 4273 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset); 4274 mStart = firstCEI->lowIndex; 4275 4276 // Check for the start of the match being within a combining sequence. 4277 // This can happen if the pattern itself begins with a combining char, and 4278 // the match found combining marks in the target text that were attached 4279 // to something else. 4280 // This type of match should be rejected for not completely consuming a 4281 // combining sequence. 4282 if (!isBreakBoundary(strsrch, mStart)) { 4283 found = FALSE; 4284 } 4285 4286 // Look at the high index of the first CE in the match. If it's the same as the 4287 // low index, the first CE in the match is in the middle of an expansion. 4288 if (mStart == firstCEI->highIndex) { 4289 found = FALSE; 4290 } 4291 4292 4293 minLimit = lastCEI->lowIndex; 4294 4295 if (targetIx > 0) { 4296 // Look at the CE following the match. If it is UCOL_NULLORDER the match 4297 // extended to the end of input, and the match is good. 4298 4299 // Look at the high and low indices of the CE following the match. If 4300 // they are the same it means one of two things: 4301 // 1. The match extended to the last CE from the target text, which is OK, or 4302 // 2. The last CE that was part of the match is in an expansion that extends 4303 // to the first CE after the match. In this case, we reject the match. 4304 const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4305 4306 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4307 found = FALSE; 4308 } 4309 4310 mLimit = maxLimit = nextCEI->lowIndex; 4311 4312 // Allow matches to end in the middle of a grapheme cluster if the following 4313 // conditions are met; this is needed to make prefix search work properly in 4314 // Indic, see #11750 4315 // * the default breakIter is being used 4316 // * the next collation element after this combining sequence 4317 // - has non-zero primary weight 4318 // - corresponds to a separate character following the one at end of the current match 4319 // (the second of these conditions, and perhaps both, may be redundant given the 4320 // subsequent check for normalization boundary; however they are likely much faster 4321 // tests in any case) 4322 // * the match limit is a normalization boundary 4323 UBool allowMidclusterMatch = FALSE; 4324 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { 4325 allowMidclusterMatch = 4326 strsrch->search->breakIter == NULL && 4327 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 4328 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 4329 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 4330 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 4331 } 4332 // If those conditions are met, then: 4333 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 4334 // the match limit may be backed off to a previous break boundary. This handles 4335 // cases in which mLimit includes target characters that are ignorable with current 4336 // settings (such as space) and which extend beyond the pattern match. 4337 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 4338 // * do NOT require that match limit be on a breakIter boundary 4339 4340 // Advance the match end position to the first acceptable match boundary. 4341 // This advances the index over any combining characters. 4342 if (minLimit < maxLimit) { 4343 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4344 // Note that we can have nba < maxLimit && nba >= minLImit, in which 4345 // case we want to set mLimit to nba regardless of allowMidclusterMatch 4346 // (i.e. we back off mLimit to the previous breakIterator boundary). 4347 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 4348 mLimit = nba; 4349 } 4350 } 4351 4352 if (!allowMidclusterMatch) { 4353 // If advancing to the end of a combining sequence in character indexing space 4354 // advanced us beyond the end of the match in CE space, reject this match. 4355 if (mLimit > maxLimit) { 4356 found = FALSE; 4357 } 4358 4359 // Make sure the end of the match is on a break boundary 4360 if (!isBreakBoundary(strsrch, mLimit)) { 4361 found = FALSE; 4362 } 4363 } 4364 4365 } else { 4366 // No non-ignorable CEs after this point. 4367 // The maximum position is detected by boundary after 4368 // the last non-ignorable CE. Combining sequence 4369 // across the start index will be truncated. 4370 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4371 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4372 } 4373 4374 #ifdef USEARCH_DEBUG 4375 if (getenv("USEARCH_DEBUG") != NULL) { 4376 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4377 } 4378 #endif 4379 4380 4381 if (! checkIdentical(strsrch, mStart, mLimit)) { 4382 found = FALSE; 4383 } 4384 4385 if (found) { 4386 break; 4387 } 4388 } 4389 4390 #ifdef USEARCH_DEBUG 4391 if (getenv("USEARCH_DEBUG") != NULL) { 4392 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4393 int32_t lastToPrint = ceb.limitIx+2; 4394 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4395 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4396 } 4397 printf("\n%s\n", found? "match found" : "no match"); 4398 } 4399 #endif 4400 4401 // All Done. Store back the match bounds to the caller. 4402 // 4403 if (found==FALSE) { 4404 mLimit = -1; 4405 mStart = -1; 4406 } 4407 4408 if (matchStart != NULL) { 4409 *matchStart= mStart; 4410 } 4411 4412 if (matchLimit != NULL) { 4413 *matchLimit = mLimit; 4414 } 4415 4416 return found; 4417 } 4418 4419 // internal use methods declared in usrchimp.h ----------------------------- 4420 4421 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4422 { 4423 if (U_FAILURE(*status)) { 4424 setMatchNotFound(strsrch); 4425 return FALSE; 4426 } 4427 4428 #if BOYER_MOORE 4429 UCollationElements *coleiter = strsrch->textIter; 4430 int32_t textlength = strsrch->search->textLength; 4431 int32_t *patternce = strsrch->pattern.ces; 4432 int32_t patterncelength = strsrch->pattern.cesLength; 4433 int32_t textoffset = ucol_getOffset(coleiter); 4434 4435 // status used in setting coleiter offset, since offset is checked in 4436 // shiftForward before setting the coleiter offset, status never 4437 // a failure 4438 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4439 patterncelength); 4440 while (textoffset <= textlength) 4441 { 4442 uint32_t patternceindex = patterncelength - 1; 4443 int32_t targetce; 4444 UBool found = FALSE; 4445 int32_t lastce = UCOL_NULLORDER; 4446 4447 setColEIterOffset(coleiter, textoffset); 4448 4449 for (;;) { 4450 // finding the last pattern ce match, imagine composite characters 4451 // for example: search for pattern A in text \u00C0 4452 // we'll have to skip \u0300 the grave first before we get to A 4453 targetce = ucol_previous(coleiter, status); 4454 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4455 found = FALSE; 4456 break; 4457 } 4458 targetce = getCE(strsrch, targetce); 4459 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4460 // this is for the text \u0315\u0300 that requires 4461 // normalization and pattern \u0300, where \u0315 is ignorable 4462 continue; 4463 } 4464 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4465 lastce = targetce; 4466 } 4467 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4468 if (targetce == patternce[patternceindex]) { 4469 // the first ce can be a contraction 4470 found = TRUE; 4471 break; 4472 } 4473 if (!hasExpansion(coleiter)) { 4474 found = FALSE; 4475 break; 4476 } 4477 } 4478 4479 //targetce = lastce; 4480 4481 while (found && patternceindex > 0) { 4482 lastce = targetce; 4483 targetce = ucol_previous(coleiter, status); 4484 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4485 found = FALSE; 4486 break; 4487 } 4488 targetce = getCE(strsrch, targetce); 4489 if (targetce == UCOL_IGNORABLE) { 4490 continue; 4491 } 4492 4493 patternceindex --; 4494 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4495 found = found && targetce == patternce[patternceindex]; 4496 } 4497 4498 targetce = lastce; 4499 4500 if (!found) { 4501 if (U_FAILURE(*status)) { 4502 break; 4503 } 4504 textoffset = shiftForward(strsrch, textoffset, lastce, 4505 patternceindex); 4506 // status checked at loop. 4507 patternceindex = patterncelength; 4508 continue; 4509 } 4510 4511 if (checkNextExactMatch(strsrch, &textoffset, status)) { 4512 // status checked in ucol_setOffset 4513 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4514 return TRUE; 4515 } 4516 } 4517 setMatchNotFound(strsrch); 4518 return FALSE; 4519 #else 4520 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4521 int32_t start = -1; 4522 int32_t end = -1; 4523 4524 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4525 strsrch->search->matchedIndex = start; 4526 strsrch->search->matchedLength = end - start; 4527 return TRUE; 4528 } else { 4529 setMatchNotFound(strsrch); 4530 return FALSE; 4531 } 4532 #endif 4533 } 4534 4535 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4536 { 4537 if (U_FAILURE(*status)) { 4538 setMatchNotFound(strsrch); 4539 return FALSE; 4540 } 4541 4542 #if BOYER_MOORE 4543 UCollationElements *coleiter = strsrch->textIter; 4544 int32_t textlength = strsrch->search->textLength; 4545 int32_t *patternce = strsrch->pattern.ces; 4546 int32_t patterncelength = strsrch->pattern.cesLength; 4547 int32_t textoffset = ucol_getOffset(coleiter); 4548 UBool hasPatternAccents = 4549 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4550 4551 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4552 patterncelength); 4553 strsrch->canonicalPrefixAccents[0] = 0; 4554 strsrch->canonicalSuffixAccents[0] = 0; 4555 4556 while (textoffset <= textlength) 4557 { 4558 int32_t patternceindex = patterncelength - 1; 4559 int32_t targetce; 4560 UBool found = FALSE; 4561 int32_t lastce = UCOL_NULLORDER; 4562 4563 setColEIterOffset(coleiter, textoffset); 4564 4565 for (;;) { 4566 // finding the last pattern ce match, imagine composite characters 4567 // for example: search for pattern A in text \u00C0 4568 // we'll have to skip \u0300 the grave first before we get to A 4569 targetce = ucol_previous(coleiter, status); 4570 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4571 found = FALSE; 4572 break; 4573 } 4574 targetce = getCE(strsrch, targetce); 4575 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4576 lastce = targetce; 4577 } 4578 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4579 if (targetce == patternce[patternceindex]) { 4580 // the first ce can be a contraction 4581 found = TRUE; 4582 break; 4583 } 4584 if (!hasExpansion(coleiter)) { 4585 found = FALSE; 4586 break; 4587 } 4588 } 4589 4590 while (found && patternceindex > 0) { 4591 targetce = ucol_previous(coleiter, status); 4592 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4593 found = FALSE; 4594 break; 4595 } 4596 targetce = getCE(strsrch, targetce); 4597 if (targetce == UCOL_IGNORABLE) { 4598 continue; 4599 } 4600 4601 patternceindex --; 4602 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4603 found = found && targetce == patternce[patternceindex]; 4604 } 4605 4606 // initializing the rearranged accent array 4607 if (hasPatternAccents && !found) { 4608 strsrch->canonicalPrefixAccents[0] = 0; 4609 strsrch->canonicalSuffixAccents[0] = 0; 4610 if (U_FAILURE(*status)) { 4611 break; 4612 } 4613 found = doNextCanonicalMatch(strsrch, textoffset, status); 4614 } 4615 4616 if (!found) { 4617 if (U_FAILURE(*status)) { 4618 break; 4619 } 4620 textoffset = shiftForward(strsrch, textoffset, lastce, 4621 patternceindex); 4622 // status checked at loop 4623 patternceindex = patterncelength; 4624 continue; 4625 } 4626 4627 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4628 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4629 return TRUE; 4630 } 4631 } 4632 setMatchNotFound(strsrch); 4633 return FALSE; 4634 #else 4635 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4636 int32_t start = -1; 4637 int32_t end = -1; 4638 4639 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4640 strsrch->search->matchedIndex = start; 4641 strsrch->search->matchedLength = end - start; 4642 return TRUE; 4643 } else { 4644 setMatchNotFound(strsrch); 4645 return FALSE; 4646 } 4647 #endif 4648 } 4649 4650 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4651 { 4652 if (U_FAILURE(*status)) { 4653 setMatchNotFound(strsrch); 4654 return FALSE; 4655 } 4656 4657 #if BOYER_MOORE 4658 UCollationElements *coleiter = strsrch->textIter; 4659 int32_t *patternce = strsrch->pattern.ces; 4660 int32_t patterncelength = strsrch->pattern.cesLength; 4661 int32_t textoffset = ucol_getOffset(coleiter); 4662 4663 // shifting it check for setting offset 4664 // if setOffset is called previously or there was no previous match, we 4665 // leave the offset as it is. 4666 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4667 textoffset = strsrch->search->matchedIndex; 4668 } 4669 4670 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4671 patterncelength); 4672 4673 while (textoffset >= 0) 4674 { 4675 int32_t patternceindex = 1; 4676 int32_t targetce; 4677 UBool found = FALSE; 4678 int32_t firstce = UCOL_NULLORDER; 4679 4680 // if status is a failure, ucol_setOffset does nothing 4681 setColEIterOffset(coleiter, textoffset); 4682 4683 for (;;) { 4684 // finding the first pattern ce match, imagine composite 4685 // characters. for example: search for pattern \u0300 in text 4686 // \u00C0, we'll have to skip A first before we get to 4687 // \u0300 the grave accent 4688 targetce = ucol_next(coleiter, status); 4689 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4690 found = FALSE; 4691 break; 4692 } 4693 targetce = getCE(strsrch, targetce); 4694 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4695 firstce = targetce; 4696 } 4697 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4698 continue; 4699 } 4700 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4701 if (targetce == patternce[0]) { 4702 found = TRUE; 4703 break; 4704 } 4705 if (!hasExpansion(coleiter)) { 4706 // checking for accents in composite character 4707 found = FALSE; 4708 break; 4709 } 4710 } 4711 4712 //targetce = firstce; 4713 4714 while (found && (patternceindex < patterncelength)) { 4715 firstce = targetce; 4716 targetce = ucol_next(coleiter, status); 4717 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4718 found = FALSE; 4719 break; 4720 } 4721 targetce = getCE(strsrch, targetce); 4722 if (targetce == UCOL_IGNORABLE) { 4723 continue; 4724 } 4725 4726 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4727 found = found && targetce == patternce[patternceindex]; 4728 patternceindex ++; 4729 } 4730 4731 targetce = firstce; 4732 4733 if (!found) { 4734 if (U_FAILURE(*status)) { 4735 break; 4736 } 4737 4738 textoffset = reverseShift(strsrch, textoffset, targetce, 4739 patternceindex); 4740 patternceindex = 0; 4741 continue; 4742 } 4743 4744 if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4745 setColEIterOffset(coleiter, textoffset); 4746 return TRUE; 4747 } 4748 } 4749 setMatchNotFound(strsrch); 4750 return FALSE; 4751 #else 4752 int32_t textOffset; 4753 4754 if (strsrch->search->isOverlap) { 4755 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4756 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4757 } else { 4758 // move the start position at the end of possible match 4759 initializePatternPCETable(strsrch, status); 4760 if (!initTextProcessedIter(strsrch, status)) { 4761 setMatchNotFound(strsrch); 4762 return FALSE; 4763 } 4764 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4765 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4766 if (pce == UCOL_PROCESSED_NULLORDER) { 4767 // at the end of the text 4768 break; 4769 } 4770 } 4771 if (U_FAILURE(*status)) { 4772 setMatchNotFound(strsrch); 4773 return FALSE; 4774 } 4775 textOffset = ucol_getOffset(strsrch->textIter); 4776 } 4777 } else { 4778 textOffset = ucol_getOffset(strsrch->textIter); 4779 } 4780 4781 int32_t start = -1; 4782 int32_t end = -1; 4783 4784 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4785 strsrch->search->matchedIndex = start; 4786 strsrch->search->matchedLength = end - start; 4787 return TRUE; 4788 } else { 4789 setMatchNotFound(strsrch); 4790 return FALSE; 4791 } 4792 #endif 4793 } 4794 4795 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4796 UErrorCode *status) 4797 { 4798 if (U_FAILURE(*status)) { 4799 setMatchNotFound(strsrch); 4800 return FALSE; 4801 } 4802 4803 #if BOYER_MOORE 4804 UCollationElements *coleiter = strsrch->textIter; 4805 int32_t *patternce = strsrch->pattern.ces; 4806 int32_t patterncelength = strsrch->pattern.cesLength; 4807 int32_t textoffset = ucol_getOffset(coleiter); 4808 UBool hasPatternAccents = 4809 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4810 4811 // shifting it check for setting offset 4812 // if setOffset is called previously or there was no previous match, we 4813 // leave the offset as it is. 4814 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4815 textoffset = strsrch->search->matchedIndex; 4816 } 4817 4818 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4819 patterncelength); 4820 strsrch->canonicalPrefixAccents[0] = 0; 4821 strsrch->canonicalSuffixAccents[0] = 0; 4822 4823 while (textoffset >= 0) 4824 { 4825 int32_t patternceindex = 1; 4826 int32_t targetce; 4827 UBool found = FALSE; 4828 int32_t firstce = UCOL_NULLORDER; 4829 4830 setColEIterOffset(coleiter, textoffset); 4831 for (;;) { 4832 // finding the first pattern ce match, imagine composite 4833 // characters. for example: search for pattern \u0300 in text 4834 // \u00C0, we'll have to skip A first before we get to 4835 // \u0300 the grave accent 4836 targetce = ucol_next(coleiter, status); 4837 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4838 found = FALSE; 4839 break; 4840 } 4841 targetce = getCE(strsrch, targetce); 4842 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4843 firstce = targetce; 4844 } 4845 4846 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4847 if (targetce == patternce[0]) { 4848 // the first ce can be a contraction 4849 found = TRUE; 4850 break; 4851 } 4852 if (!hasExpansion(coleiter)) { 4853 // checking for accents in composite character 4854 found = FALSE; 4855 break; 4856 } 4857 } 4858 4859 targetce = firstce; 4860 4861 while (found && patternceindex < patterncelength) { 4862 targetce = ucol_next(coleiter, status); 4863 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4864 found = FALSE; 4865 break; 4866 } 4867 targetce = getCE(strsrch, targetce); 4868 if (targetce == UCOL_IGNORABLE) { 4869 continue; 4870 } 4871 4872 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4873 found = found && targetce == patternce[patternceindex]; 4874 patternceindex ++; 4875 } 4876 4877 // initializing the rearranged accent array 4878 if (hasPatternAccents && !found) { 4879 strsrch->canonicalPrefixAccents[0] = 0; 4880 strsrch->canonicalSuffixAccents[0] = 0; 4881 if (U_FAILURE(*status)) { 4882 break; 4883 } 4884 found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4885 } 4886 4887 if (!found) { 4888 if (U_FAILURE(*status)) { 4889 break; 4890 } 4891 textoffset = reverseShift(strsrch, textoffset, targetce, 4892 patternceindex); 4893 patternceindex = 0; 4894 continue; 4895 } 4896 4897 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4898 setColEIterOffset(coleiter, textoffset); 4899 return TRUE; 4900 } 4901 } 4902 setMatchNotFound(strsrch); 4903 return FALSE; 4904 #else 4905 int32_t textOffset; 4906 4907 if (strsrch->search->isOverlap) { 4908 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4909 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4910 } else { 4911 // move the start position at the end of possible match 4912 initializePatternPCETable(strsrch, status); 4913 if (!initTextProcessedIter(strsrch, status)) { 4914 setMatchNotFound(strsrch); 4915 return FALSE; 4916 } 4917 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4918 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4919 if (pce == UCOL_PROCESSED_NULLORDER) { 4920 // at the end of the text 4921 break; 4922 } 4923 } 4924 if (U_FAILURE(*status)) { 4925 setMatchNotFound(strsrch); 4926 return FALSE; 4927 } 4928 textOffset = ucol_getOffset(strsrch->textIter); 4929 } 4930 } else { 4931 textOffset = ucol_getOffset(strsrch->textIter); 4932 } 4933 4934 int32_t start = -1; 4935 int32_t end = -1; 4936 4937 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4938 strsrch->search->matchedIndex = start; 4939 strsrch->search->matchedLength = end - start; 4940 return TRUE; 4941 } else { 4942 setMatchNotFound(strsrch); 4943 return FALSE; 4944 } 4945 #endif 4946 } 4947 4948 #endif /* #if !UCONFIG_NO_COLLATION */ 4949