1 /* 2 ********************************************************************** 3 * Copyright (C) 2001-2015 IBM and others. All rights reserved. 4 ********************************************************************** 5 * Date Name Description 6 * 07/02/2001 synwee Creation. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/usearch.h" 15 #include "unicode/ustring.h" 16 #include "unicode/uchar.h" 17 #include "unicode/utf16.h" 18 #include "normalizer2impl.h" 19 #include "usrchimp.h" 20 #include "cmemory.h" 21 #include "ucln_in.h" 22 #include "uassert.h" 23 #include "ustr_imp.h" 24 25 U_NAMESPACE_USE 26 27 // don't use Boyer-Moore 28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 29 #define BOYER_MOORE 0 30 31 // internal definition --------------------------------------------------- 32 33 #define LAST_BYTE_MASK_ 0xFF 34 #define SECOND_LAST_BYTE_SHIFT_ 8 35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 36 37 static const Normalizer2Impl *g_nfcImpl = NULL; 38 39 // internal methods ------------------------------------------------- 40 41 /** 42 * Fast collation element iterator setOffset. 43 * This function does not check for bounds. 44 * @param coleiter collation element iterator 45 * @param offset to set 46 */ 47 static 48 inline void setColEIterOffset(UCollationElements *elems, 49 int32_t offset) 50 { 51 // Note: Not "fast" any more after the 2013 collation rewrite. 52 // We do not want to expose more internals than necessary. 53 UErrorCode status = U_ZERO_ERROR; 54 ucol_setOffset(elems, offset, &status); 55 } 56 57 /** 58 * Getting the mask for collation strength 59 * @param strength collation strength 60 * @return collation element mask 61 */ 62 static 63 inline uint32_t getMask(UCollationStrength strength) 64 { 65 switch (strength) 66 { 67 case UCOL_PRIMARY: 68 return UCOL_PRIMARYORDERMASK; 69 case UCOL_SECONDARY: 70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 71 default: 72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 73 UCOL_PRIMARYORDERMASK; 74 } 75 } 76 77 /** 78 * @param ce 32-bit collation element 79 * @return hash code 80 */ 81 static 82 inline int hashFromCE32(uint32_t ce) 83 { 84 int hc = (int)( 85 ((((((ce >> 24) * 37) + 86 (ce >> 16)) * 37) + 87 (ce >> 8)) * 37) + 88 ce); 89 hc %= MAX_TABLE_SIZE_; 90 if (hc < 0) { 91 hc += MAX_TABLE_SIZE_; 92 } 93 return hc; 94 } 95 96 U_CDECL_BEGIN 97 static UBool U_CALLCONV 98 usearch_cleanup(void) { 99 g_nfcImpl = NULL; 100 return TRUE; 101 } 102 U_CDECL_END 103 104 /** 105 * Initializing the fcd tables. 106 * Internal method, status assumed to be a success. 107 * @param status output error if any, caller to check status before calling 108 * method, status assumed to be success when passed in. 109 */ 110 static 111 inline void initializeFCD(UErrorCode *status) 112 { 113 if (g_nfcImpl == NULL) { 114 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 115 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 116 } 117 } 118 119 /** 120 * Gets the fcd value for a character at the argument index. 121 * This method takes into accounts of the supplementary characters. 122 * @param str UTF16 string where character for fcd retrieval resides 123 * @param offset position of the character whose fcd is to be retrieved, to be 124 * overwritten with the next character position, taking 125 * surrogate characters into consideration. 126 * @param strlength length of the argument string 127 * @return fcd value 128 */ 129 static 130 uint16_t getFCD(const UChar *str, int32_t *offset, 131 int32_t strlength) 132 { 133 const UChar *temp = str + *offset; 134 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 135 *offset = (int32_t)(temp - str); 136 return result; 137 } 138 139 /** 140 * Getting the modified collation elements taking into account the collation 141 * attributes 142 * @param strsrch string search data 143 * @param sourcece 144 * @return the modified collation element 145 */ 146 static 147 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 148 { 149 // note for tertiary we can't use the collator->tertiaryMask, that 150 // is a preprocessed mask that takes into account case options. since 151 // we are only concerned with exact matches, we don't need that. 152 sourcece &= strsrch->ceMask; 153 154 if (strsrch->toShift) { 155 // alternate handling here, since only the 16 most significant digits 156 // is only used, we can safely do a compare without masking 157 // if the ce is a variable, we mask and get only the primary values 158 // no shifting to quartenary is required since all primary values 159 // less than variabletop will need to be masked off anyway. 160 if (strsrch->variableTop > sourcece) { 161 if (strsrch->strength >= UCOL_QUATERNARY) { 162 sourcece &= UCOL_PRIMARYORDERMASK; 163 } 164 else { 165 sourcece = UCOL_IGNORABLE; 166 } 167 } 168 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 169 sourcece = 0xFFFF; 170 } 171 172 return sourcece; 173 } 174 175 /** 176 * Allocate a memory and returns NULL if it failed. 177 * Internal method, status assumed to be a success. 178 * @param size to allocate 179 * @param status output error if any, caller to check status before calling 180 * method, status assumed to be success when passed in. 181 * @return newly allocated array, NULL otherwise 182 */ 183 static 184 inline void * allocateMemory(uint32_t size, UErrorCode *status) 185 { 186 uint32_t *result = (uint32_t *)uprv_malloc(size); 187 if (result == NULL) { 188 *status = U_MEMORY_ALLOCATION_ERROR; 189 } 190 return result; 191 } 192 193 /** 194 * Adds a uint32_t value to a destination array. 195 * Creates a new array if we run out of space. The caller will have to 196 * manually deallocate the newly allocated array. 197 * Internal method, status assumed to be success, caller has to check status 198 * before calling this method. destination not to be NULL and has at least 199 * size destinationlength. 200 * @param destination target array 201 * @param offset destination offset to add value 202 * @param destinationlength target array size, return value for the new size 203 * @param value to be added 204 * @param increments incremental size expected 205 * @param status output error if any, caller to check status before calling 206 * method, status assumed to be success when passed in. 207 * @return new destination array, destination if there was no new allocation 208 */ 209 static 210 inline int32_t * addTouint32_tArray(int32_t *destination, 211 uint32_t offset, 212 uint32_t *destinationlength, 213 uint32_t value, 214 uint32_t increments, 215 UErrorCode *status) 216 { 217 uint32_t newlength = *destinationlength; 218 if (offset + 1 == newlength) { 219 newlength += increments; 220 int32_t *temp = (int32_t *)allocateMemory( 221 sizeof(int32_t) * newlength, status); 222 if (U_FAILURE(*status)) { 223 return NULL; 224 } 225 uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 226 *destinationlength = newlength; 227 destination = temp; 228 } 229 destination[offset] = value; 230 return destination; 231 } 232 233 /** 234 * Adds a uint64_t value to a destination array. 235 * Creates a new array if we run out of space. The caller will have to 236 * manually deallocate the newly allocated array. 237 * Internal method, status assumed to be success, caller has to check status 238 * before calling this method. destination not to be NULL and has at least 239 * size destinationlength. 240 * @param destination target array 241 * @param offset destination offset to add value 242 * @param destinationlength target array size, return value for the new size 243 * @param value to be added 244 * @param increments incremental size expected 245 * @param status output error if any, caller to check status before calling 246 * method, status assumed to be success when passed in. 247 * @return new destination array, destination if there was no new allocation 248 */ 249 static 250 inline int64_t * addTouint64_tArray(int64_t *destination, 251 uint32_t offset, 252 uint32_t *destinationlength, 253 uint64_t value, 254 uint32_t increments, 255 UErrorCode *status) 256 { 257 uint32_t newlength = *destinationlength; 258 if (offset + 1 == newlength) { 259 newlength += increments; 260 int64_t *temp = (int64_t *)allocateMemory( 261 sizeof(int64_t) * newlength, status); 262 263 if (U_FAILURE(*status)) { 264 return NULL; 265 } 266 267 uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 268 *destinationlength = newlength; 269 destination = temp; 270 } 271 272 destination[offset] = value; 273 274 return destination; 275 } 276 277 /** 278 * Initializing the ce table for a pattern. 279 * Stores non-ignorable collation keys. 280 * Table size will be estimated by the size of the pattern text. Table 281 * expansion will be perform as we go along. Adding 1 to ensure that the table 282 * size definitely increases. 283 * Internal method, status assumed to be a success. 284 * @param strsrch string search data 285 * @param status output error if any, caller to check status before calling 286 * method, status assumed to be success when passed in. 287 * @return total number of expansions 288 */ 289 static 290 inline uint16_t initializePatternCETable(UStringSearch *strsrch, 291 UErrorCode *status) 292 { 293 UPattern *pattern = &(strsrch->pattern); 294 uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 295 int32_t *cetable = pattern->cesBuffer; 296 uint32_t patternlength = pattern->textLength; 297 UCollationElements *coleiter = strsrch->utilIter; 298 299 if (coleiter == NULL) { 300 coleiter = ucol_openElements(strsrch->collator, pattern->text, 301 patternlength, status); 302 // status will be checked in ucol_next(..) later and if it is an 303 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 304 // returned. 305 strsrch->utilIter = coleiter; 306 } 307 else { 308 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 309 } 310 if(U_FAILURE(*status)) { 311 return 0; 312 } 313 314 if (pattern->ces != cetable && pattern->ces) { 315 uprv_free(pattern->ces); 316 } 317 318 uint16_t offset = 0; 319 uint16_t result = 0; 320 int32_t ce; 321 322 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 323 U_SUCCESS(*status)) { 324 uint32_t newce = getCE(strsrch, ce); 325 if (newce) { 326 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 327 newce, 328 patternlength - ucol_getOffset(coleiter) + 1, 329 status); 330 if (U_FAILURE(*status)) { 331 return 0; 332 } 333 offset ++; 334 if (cetable != temp && cetable != pattern->cesBuffer) { 335 uprv_free(cetable); 336 } 337 cetable = temp; 338 } 339 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 340 } 341 342 cetable[offset] = 0; 343 pattern->ces = cetable; 344 pattern->cesLength = offset; 345 346 return result; 347 } 348 349 /** 350 * Initializing the pce table for a pattern. 351 * Stores non-ignorable collation keys. 352 * Table size will be estimated by the size of the pattern text. Table 353 * expansion will be perform as we go along. Adding 1 to ensure that the table 354 * size definitely increases. 355 * Internal method, status assumed to be a success. 356 * @param strsrch string search data 357 * @param status output error if any, caller to check status before calling 358 * method, status assumed to be success when passed in. 359 * @return total number of expansions 360 */ 361 static 362 inline uint16_t initializePatternPCETable(UStringSearch *strsrch, 363 UErrorCode *status) 364 { 365 UPattern *pattern = &(strsrch->pattern); 366 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 367 int64_t *pcetable = pattern->pcesBuffer; 368 uint32_t patternlength = pattern->textLength; 369 UCollationElements *coleiter = strsrch->utilIter; 370 371 if (coleiter == NULL) { 372 coleiter = ucol_openElements(strsrch->collator, pattern->text, 373 patternlength, status); 374 // status will be checked in ucol_next(..) later and if it is an 375 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 376 // returned. 377 strsrch->utilIter = coleiter; 378 } else { 379 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 380 } 381 if(U_FAILURE(*status)) { 382 return 0; 383 } 384 385 if (pattern->pces != pcetable && pattern->pces != NULL) { 386 uprv_free(pattern->pces); 387 } 388 389 uint16_t offset = 0; 390 uint16_t result = 0; 391 int64_t pce; 392 393 icu::UCollationPCE iter(coleiter); 394 395 // ** Should processed CEs be signed or unsigned? 396 // ** (the rest of the code in this file seems to play fast-and-loose with 397 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 398 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 399 U_SUCCESS(*status)) { 400 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 401 pce, 402 patternlength - ucol_getOffset(coleiter) + 1, 403 status); 404 405 if (U_FAILURE(*status)) { 406 return 0; 407 } 408 409 offset += 1; 410 411 if (pcetable != temp && pcetable != pattern->pcesBuffer) { 412 uprv_free(pcetable); 413 } 414 415 pcetable = temp; 416 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 417 } 418 419 pcetable[offset] = 0; 420 pattern->pces = pcetable; 421 pattern->pcesLength = offset; 422 423 return result; 424 } 425 426 /** 427 * Initializes the pattern struct. 428 * Internal method, status assumed to be success. 429 * @param strsrch UStringSearch data storage 430 * @param status output error if any, caller to check status before calling 431 * method, status assumed to be success when passed in. 432 * @return expansionsize the total expansion size of the pattern 433 */ 434 static 435 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 436 { 437 if (U_FAILURE(*status)) { return 0; } 438 UPattern *pattern = &(strsrch->pattern); 439 const UChar *patterntext = pattern->text; 440 int32_t length = pattern->textLength; 441 int32_t index = 0; 442 443 // Since the strength is primary, accents are ignored in the pattern. 444 if (strsrch->strength == UCOL_PRIMARY) { 445 pattern->hasPrefixAccents = 0; 446 pattern->hasSuffixAccents = 0; 447 } else { 448 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 449 SECOND_LAST_BYTE_SHIFT_; 450 index = length; 451 U16_BACK_1(patterntext, 0, index); 452 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 453 LAST_BYTE_MASK_; 454 } 455 456 // ** HACK ** 457 if (strsrch->pattern.pces != NULL) { 458 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 459 uprv_free(strsrch->pattern.pces); 460 } 461 462 strsrch->pattern.pces = NULL; 463 } 464 465 // since intializePattern is an internal method status is a success. 466 return initializePatternCETable(strsrch, status); 467 } 468 469 /** 470 * Initializing shift tables, with the default values. 471 * If a corresponding default value is 0, the shift table is not set. 472 * @param shift table for forwards shift 473 * @param backshift table for backwards shift 474 * @param cetable table containing pattern ce 475 * @param cesize size of the pattern ces 476 * @param expansionsize total size of the expansions 477 * @param defaultforward the default forward value 478 * @param defaultbackward the default backward value 479 */ 480 static 481 inline void setShiftTable(int16_t shift[], int16_t backshift[], 482 int32_t *cetable, int32_t cesize, 483 int16_t expansionsize, 484 int16_t defaultforward, 485 int16_t defaultbackward) 486 { 487 // estimate the value to shift. to do that we estimate the smallest 488 // number of characters to give the relevant ces, ie approximately 489 // the number of ces minus their expansion, since expansions can come 490 // from a character. 491 int32_t count; 492 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 493 shift[count] = defaultforward; 494 } 495 cesize --; // down to the last index 496 for (count = 0; count < cesize; count ++) { 497 // number of ces from right of array to the count 498 int temp = defaultforward - count - 1; 499 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1; 500 } 501 shift[hashFromCE32(cetable[cesize])] = 1; 502 // for ignorables we just shift by one. see test examples. 503 shift[hashFromCE32(0)] = 1; 504 505 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 506 backshift[count] = defaultbackward; 507 } 508 for (count = cesize; count > 0; count --) { 509 // the original value count does not seem to work 510 backshift[hashFromCE32(cetable[count])] = count > expansionsize ? 511 (int16_t)(count - expansionsize) : 1; 512 } 513 backshift[hashFromCE32(cetable[0])] = 1; 514 backshift[hashFromCE32(0)] = 1; 515 } 516 517 /** 518 * Building of the pattern collation element list and the boyer moore strsrch 519 * table. 520 * The canonical match will only be performed after the default match fails. 521 * For both cases we need to remember the size of the composed and decomposed 522 * versions of the string. Since the Boyer-Moore shift calculations shifts by 523 * a number of characters in the text and tries to match the pattern from that 524 * offset, the shift value can not be too large in case we miss some 525 * characters. To choose a right shift size, we estimate the NFC form of the 526 * and use its size as a shift guide. The NFC form should be the small 527 * possible representation of the pattern. Anyways, we'll err on the smaller 528 * shift size. Hence the calculation for minlength. 529 * Canonical match will be performed slightly differently. We'll split the 530 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by 531 * the first and last base character (MS), the ending accents (EA). Matches 532 * will be done on MS first, and only when we match MS then some processing 533 * will be required for the prefix and end accents in order to determine if 534 * they match PA and EA. Hence the default shift values 535 * for the canonical match will take the size of either end's accent into 536 * consideration. Forwards search will take the end accents into consideration 537 * for the default shift values and the backwards search will take the prefix 538 * accents into consideration. 539 * If pattern has no non-ignorable ce, we return a illegal argument error. 540 * Internal method, status assumed to be success. 541 * @param strsrch UStringSearch data storage 542 * @param status for output errors if it occurs, status is assumed to be a 543 * success when it is passed in. 544 */ 545 static 546 inline void initialize(UStringSearch *strsrch, UErrorCode *status) 547 { 548 int16_t expandlength = initializePattern(strsrch, status); 549 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) { 550 UPattern *pattern = &strsrch->pattern; 551 int32_t cesize = pattern->cesLength; 552 553 int16_t minlength = cesize > expandlength 554 ? (int16_t)cesize - expandlength : 1; 555 pattern->defaultShiftSize = minlength; 556 setShiftTable(pattern->shift, pattern->backShift, pattern->ces, 557 cesize, expandlength, minlength, minlength); 558 return; 559 } 560 strsrch->pattern.defaultShiftSize = 0; 561 } 562 563 #if BOYER_MOORE 564 /** 565 * Check to make sure that the match length is at the end of the character by 566 * using the breakiterator. 567 * @param strsrch string search data 568 * @param start target text start offset 569 * @param end target text end offset 570 */ 571 static 572 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 573 int32_t *end) 574 { 575 #if !UCONFIG_NO_BREAK_ITERATION 576 UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 577 if (breakiterator) { 578 int32_t matchend = *end; 579 //int32_t matchstart = *start; 580 581 if (!ubrk_isBoundary(breakiterator, matchend)) { 582 *end = ubrk_following(breakiterator, matchend); 583 } 584 585 /* Check the start of the matched text to make sure it doesn't have any accents 586 * before it. This code may not be necessary and so it is commented out */ 587 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 588 *start = ubrk_preceding(breakiterator, matchstart); 589 }*/ 590 } 591 #endif 592 } 593 594 /** 595 * Determine whether the target text in UStringSearch bounded by the offset 596 * start and end is one or more whole units of text as 597 * determined by the breakiterator in UStringSearch. 598 * @param strsrch string search data 599 * @param start target text start offset 600 * @param end target text end offset 601 */ 602 static 603 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 604 int32_t end) 605 { 606 #if !UCONFIG_NO_BREAK_ITERATION 607 UBreakIterator *breakiterator = strsrch->search->breakIter; 608 //TODO: Add here. 609 if (breakiterator) { 610 int32_t startindex = ubrk_first(breakiterator); 611 int32_t endindex = ubrk_last(breakiterator); 612 613 // out-of-range indexes are never boundary positions 614 if (start < startindex || start > endindex || 615 end < startindex || end > endindex) { 616 return FALSE; 617 } 618 // otherwise, we can use following() on the position before the 619 // specified one and return true of the position we get back is the 620 // one the user specified 621 UBool result = (start == startindex || 622 ubrk_following(breakiterator, start - 1) == start) && 623 (end == endindex || 624 ubrk_following(breakiterator, end - 1) == end); 625 if (result) { 626 // iterates the individual ces 627 UCollationElements *coleiter = strsrch->utilIter; 628 const UChar *text = strsrch->search->text + 629 start; 630 UErrorCode status = U_ZERO_ERROR; 631 ucol_setText(coleiter, text, end - start, &status); 632 for (int32_t count = 0; count < strsrch->pattern.cesLength; 633 count ++) { 634 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 635 if (ce == UCOL_IGNORABLE) { 636 count --; 637 continue; 638 } 639 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) { 640 return FALSE; 641 } 642 } 643 int32_t nextce = ucol_next(coleiter, &status); 644 while (ucol_getOffset(coleiter) == (end - start) 645 && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 646 nextce = ucol_next(coleiter, &status); 647 } 648 if (ucol_getOffset(coleiter) == (end - start) 649 && nextce != UCOL_NULLORDER) { 650 // extra collation elements at the end of the match 651 return FALSE; 652 } 653 } 654 return result; 655 } 656 #endif 657 return TRUE; 658 } 659 660 /** 661 * Getting the next base character offset if current offset is an accent, 662 * or the current offset if the current character contains a base character. 663 * accents the following base character will be returned 664 * @param text string 665 * @param textoffset current offset 666 * @param textlength length of text string 667 * @return the next base character or the current offset 668 * if the current character is contains a base character. 669 */ 670 static 671 inline int32_t getNextBaseOffset(const UChar *text, 672 int32_t textoffset, 673 int32_t textlength) 674 { 675 if (textoffset < textlength) { 676 int32_t temp = textoffset; 677 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 678 while (temp < textlength) { 679 int32_t result = temp; 680 if ((getFCD(text, &temp, textlength) >> 681 SECOND_LAST_BYTE_SHIFT_) == 0) { 682 return result; 683 } 684 } 685 return textlength; 686 } 687 } 688 return textoffset; 689 } 690 691 /** 692 * Gets the next base character offset depending on the string search pattern 693 * data 694 * @param strsrch string search data 695 * @param textoffset current offset, one offset away from the last character 696 * to search for. 697 * @return start index of the next base character or the current offset 698 * if the current character is contains a base character. 699 */ 700 static 701 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 702 int32_t textoffset) 703 { 704 int32_t textlength = strsrch->search->textLength; 705 if (strsrch->pattern.hasSuffixAccents && 706 textoffset < textlength) { 707 int32_t temp = textoffset; 708 const UChar *text = strsrch->search->text; 709 U16_BACK_1(text, 0, temp); 710 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 711 return getNextBaseOffset(text, textoffset, textlength); 712 } 713 } 714 return textoffset; 715 } 716 717 /** 718 * Shifting the collation element iterator position forward to prepare for 719 * a following match. If the last character is a unsafe character, we'll only 720 * shift by 1 to capture contractions, normalization etc. 721 * Internal method, status assumed to be success. 722 * @param text strsrch string search data 723 * @param textoffset start text position to do search 724 * @param ce the text ce which failed the match. 725 * @param patternceindex index of the ce within the pattern ce buffer which 726 * failed the match 727 * @return final offset 728 */ 729 static 730 inline int32_t shiftForward(UStringSearch *strsrch, 731 int32_t textoffset, 732 int32_t ce, 733 int32_t patternceindex) 734 { 735 UPattern *pattern = &(strsrch->pattern); 736 if (ce != UCOL_NULLORDER) { 737 int32_t shift = pattern->shift[hashFromCE32(ce)]; 738 // this is to adjust for characters in the middle of the 739 // substring for matching that failed. 740 int32_t adjust = pattern->cesLength - patternceindex; 741 if (adjust > 1 && shift >= adjust) { 742 shift -= adjust - 1; 743 } 744 textoffset += shift; 745 } 746 else { 747 textoffset += pattern->defaultShiftSize; 748 } 749 750 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 751 // check for unsafe characters 752 // * if it is the start or middle of a contraction: to be done after 753 // a initial match is found 754 // * thai or lao base consonant character: similar to contraction 755 // * high surrogate character: similar to contraction 756 // * next character is a accent: shift to the next base character 757 return textoffset; 758 } 759 #endif // #if BOYER_MOORE 760 761 /** 762 * sets match not found 763 * @param strsrch string search data 764 */ 765 static 766 inline void setMatchNotFound(UStringSearch *strsrch) 767 { 768 // this method resets the match result regardless of the error status. 769 strsrch->search->matchedIndex = USEARCH_DONE; 770 strsrch->search->matchedLength = 0; 771 if (strsrch->search->isForwardSearching) { 772 setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 773 } 774 else { 775 setColEIterOffset(strsrch->textIter, 0); 776 } 777 } 778 779 #if BOYER_MOORE 780 /** 781 * Gets the offset to the next safe point in text. 782 * ie. not the middle of a contraction, swappable characters or supplementary 783 * characters. 784 * @param collator collation sata 785 * @param text string to work with 786 * @param textoffset offset in string 787 * @param textlength length of text string 788 * @return offset to the next safe character 789 */ 790 static 791 inline int32_t getNextSafeOffset(const UCollator *collator, 792 const UChar *text, 793 int32_t textoffset, 794 int32_t textlength) 795 { 796 int32_t result = textoffset; // first contraction character 797 while (result != textlength && ucol_unsafeCP(text[result], collator)) { 798 result ++; 799 } 800 return result; 801 } 802 803 /** 804 * This checks for accents in the potential match started with a . 805 * composite character. 806 * This is really painful... we have to check that composite character do not 807 * have any extra accents. We have to normalize the potential match and find 808 * the immediate decomposed character before the match. 809 * The first composite character would have been taken care of by the fcd 810 * checks in checkForwardExactMatch. 811 * This is the slow path after the fcd of the first character and 812 * the last character has been checked by checkForwardExactMatch and we 813 * determine that the potential match has extra non-ignorable preceding 814 * ces. 815 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 816 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 817 * Note here that accents checking are slow and cautioned in the API docs. 818 * Internal method, status assumed to be a success, caller should check status 819 * before calling this method 820 * @param strsrch string search data 821 * @param start index of the potential unfriendly composite character 822 * @param end index of the potential unfriendly composite character 823 * @param status output error status if any. 824 * @return TRUE if there is non-ignorable accents before at the beginning 825 * of the match, FALSE otherwise. 826 */ 827 828 static 829 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 830 int32_t end, 831 UErrorCode *status) 832 { 833 UBool result = FALSE; 834 if (strsrch->pattern.hasPrefixAccents) { 835 int32_t length = end - start; 836 int32_t offset = 0; 837 const UChar *text = strsrch->search->text + start; 838 839 U16_FWD_1(text, offset, length); 840 // we are only concerned with the first composite character 841 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 842 int32_t safeoffset = getNextSafeOffset(strsrch->collator, 843 text, 0, length); 844 if (safeoffset != length) { 845 safeoffset ++; 846 } 847 UChar *norm = NULL; 848 UChar buffer[INITIAL_ARRAY_SIZE_]; 849 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 850 buffer, INITIAL_ARRAY_SIZE_, 851 status); 852 if (U_FAILURE(*status)) { 853 return FALSE; 854 } 855 if (size >= INITIAL_ARRAY_SIZE_) { 856 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 857 status); 858 // if allocation failed, status will be set to 859 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 860 // checks for it. 861 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 862 size, status); 863 if (U_FAILURE(*status) && norm != NULL) { 864 uprv_free(norm); 865 return FALSE; 866 } 867 } 868 else { 869 norm = buffer; 870 } 871 872 UCollationElements *coleiter = strsrch->utilIter; 873 ucol_setText(coleiter, norm, size, status); 874 uint32_t firstce = strsrch->pattern.ces[0]; 875 UBool ignorable = TRUE; 876 uint32_t ce = UCOL_IGNORABLE; 877 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 878 offset = ucol_getOffset(coleiter); 879 if (ce != firstce && ce != UCOL_IGNORABLE) { 880 ignorable = FALSE; 881 } 882 ce = ucol_next(coleiter, status); 883 } 884 UChar32 codepoint; 885 U16_PREV(norm, 0, offset, codepoint); 886 result = !ignorable && (u_getCombiningClass(codepoint) != 0); 887 888 if (norm != buffer) { 889 uprv_free(norm); 890 } 891 } 892 } 893 894 return result; 895 } 896 897 /** 898 * Used by exact matches, checks if there are accents before the match. 899 * This is really painful... we have to check that composite characters at 900 * the start of the matches have to not have any extra accents. 901 * We check the FCD of the character first, if it starts with an accent and 902 * the first pattern ce does not match the first ce of the character, we bail. 903 * Otherwise we try normalizing the first composite 904 * character and find the immediate decomposed character before the match to 905 * see if it is an non-ignorable accent. 906 * Now normalizing the first composite character is enough because we ensure 907 * that when the match is passed in here with extra beginning ces, the 908 * first or last ce that match has to occur within the first character. 909 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 910 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 911 * Note here that accents checking are slow and cautioned in the API docs. 912 * @param strsrch string search data 913 * @param start offset 914 * @param end offset 915 * @return TRUE if there are accents on either side of the match, 916 * FALSE otherwise 917 */ 918 static 919 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 920 int32_t end) 921 { 922 if (strsrch->pattern.hasPrefixAccents) { 923 UCollationElements *coleiter = strsrch->textIter; 924 UErrorCode status = U_ZERO_ERROR; 925 // we have been iterating forwards previously 926 uint32_t ignorable = TRUE; 927 int32_t firstce = strsrch->pattern.ces[0]; 928 929 setColEIterOffset(coleiter, start); 930 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 931 if (U_FAILURE(status)) { 932 return TRUE; 933 } 934 while (ce != firstce) { 935 if (ce != UCOL_IGNORABLE) { 936 ignorable = FALSE; 937 } 938 ce = getCE(strsrch, ucol_next(coleiter, &status)); 939 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 940 return TRUE; 941 } 942 } 943 if (!ignorable && inNormBuf(coleiter)) { 944 // within normalization buffer, discontiguous handled here 945 return TRUE; 946 } 947 948 // within text 949 int32_t temp = start; 950 // original code 951 // accent = (getFCD(strsrch->search->text, &temp, 952 // strsrch->search->textLength) 953 // >> SECOND_LAST_BYTE_SHIFT_); 954 // however this code does not work well with VC7 .net in release mode. 955 // maybe the inlines for getFCD combined with shifting has bugs in 956 // VC7. anyways this is a work around. 957 UBool accent = getFCD(strsrch->search->text, &temp, 958 strsrch->search->textLength) > 0xFF; 959 if (!accent) { 960 return checkExtraMatchAccents(strsrch, start, end, &status); 961 } 962 if (!ignorable) { 963 return TRUE; 964 } 965 if (start > 0) { 966 temp = start; 967 U16_BACK_1(strsrch->search->text, 0, temp); 968 if (getFCD(strsrch->search->text, &temp, 969 strsrch->search->textLength) & LAST_BYTE_MASK_) { 970 setColEIterOffset(coleiter, start); 971 ce = ucol_previous(coleiter, &status); 972 if (U_FAILURE(status) || 973 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 974 return TRUE; 975 } 976 } 977 } 978 } 979 980 return FALSE; 981 } 982 983 /** 984 * Used by exact matches, checks if there are accents bounding the match. 985 * Note this is the initial boundary check. If the potential match 986 * starts or ends with composite characters, the accents in those 987 * characters will be determined later. 988 * Not doing backwards iteration here, since discontiguos contraction for 989 * backwards collation element iterator, use up too many characters. 990 * E.g. looking for \u030A ring in \u01FA A ring above and acute, 991 * should fail since there is a acute at the end of \u01FA 992 * Note here that accents checking are slow and cautioned in the API docs. 993 * @param strsrch string search data 994 * @param start offset of match 995 * @param end end offset of the match 996 * @return TRUE if there are accents on either side of the match, 997 * FALSE otherwise 998 */ 999 static 1000 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 1001 int32_t end) 1002 { 1003 if (strsrch->pattern.hasSuffixAccents) { 1004 const UChar *text = strsrch->search->text; 1005 int32_t temp = end; 1006 int32_t textlength = strsrch->search->textLength; 1007 U16_BACK_1(text, 0, temp); 1008 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1009 int32_t firstce = strsrch->pattern.ces[0]; 1010 UCollationElements *coleiter = strsrch->textIter; 1011 UErrorCode status = U_ZERO_ERROR; 1012 int32_t ce; 1013 setColEIterOffset(coleiter, start); 1014 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1015 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1016 return TRUE; 1017 } 1018 } 1019 int32_t count = 1; 1020 while (count < strsrch->pattern.cesLength) { 1021 if (getCE(strsrch, ucol_next(coleiter, &status)) 1022 == UCOL_IGNORABLE) { 1023 // Thai can give an ignorable here. 1024 count --; 1025 } 1026 if (U_FAILURE(status)) { 1027 return TRUE; 1028 } 1029 count ++; 1030 } 1031 1032 ce = ucol_next(coleiter, &status); 1033 if (U_FAILURE(status)) { 1034 return TRUE; 1035 } 1036 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1037 ce = getCE(strsrch, ce); 1038 } 1039 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1040 if (ucol_getOffset(coleiter) <= end) { 1041 return TRUE; 1042 } 1043 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1044 return TRUE; 1045 } 1046 } 1047 } 1048 } 1049 return FALSE; 1050 } 1051 #endif // #if BOYER_MOORE 1052 1053 /** 1054 * Checks if the offset runs out of the text string 1055 * @param offset 1056 * @param textlength of the text string 1057 * @return TRUE if offset is out of bounds, FALSE otherwise 1058 */ 1059 static 1060 inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1061 { 1062 return offset < 0 || offset > textlength; 1063 } 1064 1065 /** 1066 * Checks for identical match 1067 * @param strsrch string search data 1068 * @param start offset of possible match 1069 * @param end offset of possible match 1070 * @return TRUE if identical match is found 1071 */ 1072 static 1073 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1074 int32_t end) 1075 { 1076 if (strsrch->strength != UCOL_IDENTICAL) { 1077 return TRUE; 1078 } 1079 1080 // Note: We could use Normalizer::compare() or similar, but for short strings 1081 // which may not be in FCD it might be faster to just NFD them. 1082 UErrorCode status = U_ZERO_ERROR; 1083 UnicodeString t2, p2; 1084 strsrch->nfd->normalize( 1085 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 1086 strsrch->nfd->normalize( 1087 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1088 // return FALSE if NFD failed 1089 return U_SUCCESS(status) && t2 == p2; 1090 } 1091 1092 #if BOYER_MOORE 1093 /** 1094 * Checks to see if the match is repeated 1095 * @param strsrch string search data 1096 * @param start new match start index 1097 * @param end new match end index 1098 * @return TRUE if the the match is repeated, FALSE otherwise 1099 */ 1100 static 1101 inline UBool checkRepeatedMatch(UStringSearch *strsrch, 1102 int32_t start, 1103 int32_t end) 1104 { 1105 int32_t lastmatchindex = strsrch->search->matchedIndex; 1106 UBool result; 1107 if (lastmatchindex == USEARCH_DONE) { 1108 return FALSE; 1109 } 1110 if (strsrch->search->isForwardSearching) { 1111 result = start <= lastmatchindex; 1112 } 1113 else { 1114 result = start >= lastmatchindex; 1115 } 1116 if (!result && !strsrch->search->isOverlap) { 1117 if (strsrch->search->isForwardSearching) { 1118 result = start < lastmatchindex + strsrch->search->matchedLength; 1119 } 1120 else { 1121 result = end > lastmatchindex; 1122 } 1123 } 1124 return result; 1125 } 1126 1127 /** 1128 * Gets the collation element iterator's current offset. 1129 * @param coleiter collation element iterator 1130 * @param forwards flag TRUE if we are moving in th forwards direction 1131 * @return current offset 1132 */ 1133 static 1134 inline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1135 UBool forwards) 1136 { 1137 int32_t result = ucol_getOffset(coleiter); 1138 // intricacies of the the backwards collation element iterator 1139 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1140 result ++; 1141 } 1142 return result; 1143 } 1144 1145 /** 1146 * Checks match for contraction. 1147 * If the match ends with a partial contraction we fail. 1148 * If the match starts too far off (because of backwards iteration) we try to 1149 * chip off the extra characters depending on whether a breakiterator has 1150 * been used. 1151 * Internal method, error assumed to be success, caller has to check status 1152 * before calling this method. 1153 * @param strsrch string search data 1154 * @param start offset of potential match, to be modified if necessary 1155 * @param end offset of potential match, to be modified if necessary 1156 * @param status output error status if any 1157 * @return TRUE if match passes the contraction test, FALSE otherwise 1158 */ 1159 1160 static 1161 UBool checkNextExactContractionMatch(UStringSearch *strsrch, 1162 int32_t *start, 1163 int32_t *end, UErrorCode *status) 1164 { 1165 UCollationElements *coleiter = strsrch->textIter; 1166 int32_t textlength = strsrch->search->textLength; 1167 int32_t temp = *start; 1168 const UCollator *collator = strsrch->collator; 1169 const UChar *text = strsrch->search->text; 1170 // This part checks if either ends of the match contains potential 1171 // contraction. If so we'll have to iterate through them 1172 // The start contraction needs to be checked since ucol_previous dumps 1173 // all characters till the first safe character into the buffer. 1174 // *start + 1 is used to test for the unsafe characters instead of *start 1175 // because ucol_prev takes all unsafe characters till the first safe 1176 // character ie *start. so by testing *start + 1, we can estimate if 1177 // excess prefix characters has been included in the potential search 1178 // results. 1179 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1180 (*start + 1 < textlength 1181 && ucol_unsafeCP(text[*start + 1], collator))) { 1182 int32_t expansion = getExpansionPrefix(coleiter); 1183 UBool expandflag = expansion > 0; 1184 setColEIterOffset(coleiter, *start); 1185 while (expansion > 0) { 1186 // getting rid of the redundant ce, caused by setOffset. 1187 // since backward contraction/expansion may have extra ces if we 1188 // are in the normalization buffer, hasAccentsBeforeMatch would 1189 // have taken care of it. 1190 // E.g. the character \u01FA will have an expansion of 3, but if 1191 // we are only looking for acute and ring \u030A and \u0301, we'll 1192 // have to skip the first ce in the expansion buffer. 1193 ucol_next(coleiter, status); 1194 if (U_FAILURE(*status)) { 1195 return FALSE; 1196 } 1197 if (ucol_getOffset(coleiter) != temp) { 1198 *start = temp; 1199 temp = ucol_getOffset(coleiter); 1200 } 1201 expansion --; 1202 } 1203 1204 int32_t *patternce = strsrch->pattern.ces; 1205 int32_t patterncelength = strsrch->pattern.cesLength; 1206 int32_t count = 0; 1207 while (count < patterncelength) { 1208 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1209 if (ce == UCOL_IGNORABLE) { 1210 continue; 1211 } 1212 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1213 *start = temp; 1214 temp = ucol_getOffset(coleiter); 1215 } 1216 if (U_FAILURE(*status) || ce != patternce[count]) { 1217 (*end) ++; 1218 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1219 return FALSE; 1220 } 1221 count ++; 1222 } 1223 } 1224 return TRUE; 1225 } 1226 1227 /** 1228 * Checks and sets the match information if found. 1229 * Checks 1230 * <ul> 1231 * <li> the potential match does not repeat the previous match 1232 * <li> boundaries are correct 1233 * <li> exact matches has no extra accents 1234 * <li> identical matchesb 1235 * <li> potential match does not end in the middle of a contraction 1236 * <\ul> 1237 * Otherwise the offset will be shifted to the next character. 1238 * Internal method, status assumed to be success, caller has to check status 1239 * before calling this method. 1240 * @param strsrch string search data 1241 * @param textoffset offset in the collation element text. the returned value 1242 * will be the truncated end offset of the match or the new start 1243 * search offset. 1244 * @param status output error status if any 1245 * @return TRUE if the match is valid, FALSE otherwise 1246 */ 1247 static 1248 inline UBool checkNextExactMatch(UStringSearch *strsrch, 1249 int32_t *textoffset, UErrorCode *status) 1250 { 1251 UCollationElements *coleiter = strsrch->textIter; 1252 int32_t start = getColElemIterOffset(coleiter, FALSE); 1253 1254 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1255 return FALSE; 1256 } 1257 1258 // this totally matches, however we need to check if it is repeating 1259 if (!isBreakUnit(strsrch, start, *textoffset) || 1260 checkRepeatedMatch(strsrch, start, *textoffset) || 1261 hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1262 !checkIdentical(strsrch, start, *textoffset) || 1263 hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1264 1265 (*textoffset) ++; 1266 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1267 return FALSE; 1268 } 1269 1270 //Add breakiterator boundary check for primary strength search. 1271 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1272 checkBreakBoundary(strsrch, &start, textoffset); 1273 } 1274 1275 // totally match, we will get rid of the ending ignorables. 1276 strsrch->search->matchedIndex = start; 1277 strsrch->search->matchedLength = *textoffset - start; 1278 return TRUE; 1279 } 1280 1281 /** 1282 * Getting the previous base character offset, or the current offset if the 1283 * current character is a base character 1284 * @param text string 1285 * @param textoffset one offset after the current character 1286 * @return the offset of the next character after the base character or the first 1287 * composed character with accents 1288 */ 1289 static 1290 inline int32_t getPreviousBaseOffset(const UChar *text, 1291 int32_t textoffset) 1292 { 1293 if (textoffset > 0) { 1294 for (;;) { 1295 int32_t result = textoffset; 1296 U16_BACK_1(text, 0, textoffset); 1297 int32_t temp = textoffset; 1298 uint16_t fcd = getFCD(text, &temp, result); 1299 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1300 if (fcd & LAST_BYTE_MASK_) { 1301 return textoffset; 1302 } 1303 return result; 1304 } 1305 if (textoffset == 0) { 1306 return 0; 1307 } 1308 } 1309 } 1310 return textoffset; 1311 } 1312 1313 /** 1314 * Getting the indexes of the accents that are not blocked in the argument 1315 * accent array 1316 * @param accents array of accents in nfd terminated by a 0. 1317 * @param accentsindex array of indexes of the accents that are not blocked 1318 */ 1319 static 1320 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1321 { 1322 int32_t index = 0; 1323 int32_t length = u_strlen(accents); 1324 UChar32 codepoint = 0; 1325 int cclass = 0; 1326 int result = 0; 1327 int32_t temp; 1328 while (index < length) { 1329 temp = index; 1330 U16_NEXT(accents, index, length, codepoint); 1331 if (u_getCombiningClass(codepoint) != cclass) { 1332 cclass = u_getCombiningClass(codepoint); 1333 accentsindex[result] = temp; 1334 result ++; 1335 } 1336 } 1337 accentsindex[result] = length; 1338 return result; 1339 } 1340 1341 /** 1342 * Appends 3 UChar arrays to a destination array. 1343 * Creates a new array if we run out of space. The caller will have to 1344 * manually deallocate the newly allocated array. 1345 * Internal method, status assumed to be success, caller has to check status 1346 * before calling this method. destination not to be NULL and has at least 1347 * size destinationlength. 1348 * @param destination target array 1349 * @param destinationlength target array size, returning the appended length 1350 * @param source1 null-terminated first array 1351 * @param source2 second array 1352 * @param source2length length of seond array 1353 * @param source3 null-terminated third array 1354 * @param status error status if any 1355 * @return new destination array, destination if there was no new allocation 1356 */ 1357 static 1358 inline UChar * addToUCharArray( UChar *destination, 1359 int32_t *destinationlength, 1360 const UChar *source1, 1361 const UChar *source2, 1362 int32_t source2length, 1363 const UChar *source3, 1364 UErrorCode *status) 1365 { 1366 int32_t source1length = source1 ? u_strlen(source1) : 0; 1367 int32_t source3length = source3 ? u_strlen(source3) : 0; 1368 if (*destinationlength < source1length + source2length + source3length + 1369 1) 1370 { 1371 destination = (UChar *)allocateMemory( 1372 (source1length + source2length + source3length + 1) * sizeof(UChar), 1373 status); 1374 // if error allocating memory, status will be 1375 // U_MEMORY_ALLOCATION_ERROR 1376 if (U_FAILURE(*status)) { 1377 *destinationlength = 0; 1378 return NULL; 1379 } 1380 } 1381 if (source1length != 0) { 1382 uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1383 } 1384 if (source2length != 0) { 1385 uprv_memcpy(destination + source1length, source2, 1386 sizeof(UChar) * source2length); 1387 } 1388 if (source3length != 0) { 1389 uprv_memcpy(destination + source1length + source2length, source3, 1390 sizeof(UChar) * source3length); 1391 } 1392 *destinationlength = source1length + source2length + source3length; 1393 return destination; 1394 } 1395 1396 /** 1397 * Running through a collation element iterator to see if the contents matches 1398 * pattern in string search data 1399 * @param strsrch string search data 1400 * @param coleiter collation element iterator 1401 * @return TRUE if a match if found, FALSE otherwise 1402 */ 1403 static 1404 inline UBool checkCollationMatch(const UStringSearch *strsrch, 1405 UCollationElements *coleiter) 1406 { 1407 int patternceindex = strsrch->pattern.cesLength; 1408 int32_t *patternce = strsrch->pattern.ces; 1409 UErrorCode status = U_ZERO_ERROR; 1410 while (patternceindex > 0) { 1411 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1412 if (ce == UCOL_IGNORABLE) { 1413 continue; 1414 } 1415 if (U_FAILURE(status) || ce != *patternce) { 1416 return FALSE; 1417 } 1418 patternce ++; 1419 patternceindex --; 1420 } 1421 return TRUE; 1422 } 1423 1424 /** 1425 * Rearranges the front accents to try matching. 1426 * Prefix accents in the text will be grouped according to their combining 1427 * class and the groups will be mixed and matched to try find the perfect 1428 * match with the pattern. 1429 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1430 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1431 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1432 * "\u0301\u0325". 1433 * step 2: check if any of the generated substrings matches the pattern. 1434 * Internal method, status is assumed to be success, caller has to check status 1435 * before calling this method. 1436 * @param strsrch string search match 1437 * @param start first offset of the accents to start searching 1438 * @param end start of the last accent set 1439 * @param status output error status if any 1440 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1441 * offset of the match. Note this start includes all preceding accents. 1442 */ 1443 static 1444 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1445 int32_t start, 1446 int32_t end, 1447 UErrorCode *status) 1448 { 1449 const UChar *text = strsrch->search->text; 1450 int32_t textlength = strsrch->search->textLength; 1451 int32_t tempstart = start; 1452 1453 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1454 // die... failed at a base character 1455 return USEARCH_DONE; 1456 } 1457 1458 int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1459 start = getPreviousBaseOffset(text, tempstart); 1460 1461 UChar accents[INITIAL_ARRAY_SIZE_]; 1462 // normalizing the offensive string 1463 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1464 INITIAL_ARRAY_SIZE_, status); 1465 if (U_FAILURE(*status)) { 1466 return USEARCH_DONE; 1467 } 1468 1469 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1470 int32_t accentsize = getUnblockedAccentIndex(accents, 1471 accentsindex); 1472 int32_t count = (2 << (accentsize - 1)) - 1; 1473 UChar buffer[INITIAL_ARRAY_SIZE_]; 1474 UCollationElements *coleiter = strsrch->utilIter; 1475 while (U_SUCCESS(*status) && count > 0) { 1476 UChar *rearrange = strsrch->canonicalPrefixAccents; 1477 // copy the base characters 1478 for (int k = 0; k < accentsindex[0]; k ++) { 1479 *rearrange ++ = accents[k]; 1480 } 1481 // forming all possible canonical rearrangement by dropping 1482 // sets of accents 1483 for (int i = 0; i <= accentsize - 1; i ++) { 1484 int32_t mask = 1 << (accentsize - i - 1); 1485 if (count & mask) { 1486 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1487 *rearrange ++ = accents[j]; 1488 } 1489 } 1490 } 1491 *rearrange = 0; 1492 int32_t matchsize = INITIAL_ARRAY_SIZE_; 1493 UChar *match = addToUCharArray(buffer, &matchsize, 1494 strsrch->canonicalPrefixAccents, 1495 strsrch->search->text + offset, 1496 end - offset, 1497 strsrch->canonicalSuffixAccents, 1498 status); 1499 1500 // if status is a failure, ucol_setText does nothing. 1501 // run the collator iterator through this match 1502 ucol_setText(coleiter, match, matchsize, status); 1503 if (U_SUCCESS(*status)) { 1504 if (checkCollationMatch(strsrch, coleiter)) { 1505 if (match != buffer) { 1506 uprv_free(match); 1507 } 1508 return start; 1509 } 1510 } 1511 count --; 1512 } 1513 return USEARCH_DONE; 1514 } 1515 1516 /** 1517 * Gets the offset to the safe point in text before textoffset. 1518 * ie. not the middle of a contraction, swappable characters or supplementary 1519 * characters. 1520 * @param collator collation sata 1521 * @param text string to work with 1522 * @param textoffset offset in string 1523 * @param textlength length of text string 1524 * @return offset to the previous safe character 1525 */ 1526 static 1527 inline uint32_t getPreviousSafeOffset(const UCollator *collator, 1528 const UChar *text, 1529 int32_t textoffset) 1530 { 1531 int32_t result = textoffset; // first contraction character 1532 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1533 result --; 1534 } 1535 if (result != 0) { 1536 // the first contraction character is consider unsafe here 1537 result --; 1538 } 1539 return result; 1540 } 1541 1542 /** 1543 * Cleaning up after we passed the safe zone 1544 * @param strsrch string search data 1545 * @param safetext safe text array 1546 * @param safebuffer safe text buffer 1547 * @param coleiter collation element iterator for safe text 1548 */ 1549 static 1550 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1551 UChar *safebuffer) 1552 { 1553 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1554 { 1555 uprv_free(safetext); 1556 } 1557 } 1558 1559 /** 1560 * Take the rearranged end accents and tries matching. If match failed at 1561 * a seperate preceding set of accents (seperated from the rearranged on by 1562 * at least a base character) then we rearrange the preceding accents and 1563 * tries matching again. 1564 * We allow skipping of the ends of the accent set if the ces do not match. 1565 * However if the failure is found before the accent set, it fails. 1566 * Internal method, status assumed to be success, caller has to check status 1567 * before calling this method. 1568 * @param strsrch string search data 1569 * @param textoffset of the start of the rearranged accent 1570 * @param status output error status if any 1571 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1572 * offset of the match. Note this start includes all preceding accents. 1573 */ 1574 static 1575 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1576 int32_t textoffset, 1577 UErrorCode *status) 1578 { 1579 const UChar *text = strsrch->search->text; 1580 const UCollator *collator = strsrch->collator; 1581 int32_t safelength = 0; 1582 UChar *safetext; 1583 int32_t safetextlength; 1584 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1585 UCollationElements *coleiter = strsrch->utilIter; 1586 int32_t safeoffset = textoffset; 1587 1588 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1589 collator)) { 1590 safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1591 safelength = textoffset - safeoffset; 1592 safetextlength = INITIAL_ARRAY_SIZE_; 1593 safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1594 text + safeoffset, safelength, 1595 strsrch->canonicalSuffixAccents, 1596 status); 1597 } 1598 else { 1599 safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1600 safetext = strsrch->canonicalSuffixAccents; 1601 } 1602 1603 // if status is a failure, ucol_setText does nothing 1604 ucol_setText(coleiter, safetext, safetextlength, status); 1605 // status checked in loop below 1606 1607 int32_t *ce = strsrch->pattern.ces; 1608 int32_t celength = strsrch->pattern.cesLength; 1609 int ceindex = celength - 1; 1610 UBool isSafe = TRUE; // indication flag for position in safe zone 1611 1612 while (ceindex >= 0) { 1613 int32_t textce = ucol_previous(coleiter, status); 1614 if (U_FAILURE(*status)) { 1615 if (isSafe) { 1616 cleanUpSafeText(strsrch, safetext, safebuffer); 1617 } 1618 return USEARCH_DONE; 1619 } 1620 if (textce == UCOL_NULLORDER) { 1621 // check if we have passed the safe buffer 1622 if (coleiter == strsrch->textIter) { 1623 cleanUpSafeText(strsrch, safetext, safebuffer); 1624 return USEARCH_DONE; 1625 } 1626 cleanUpSafeText(strsrch, safetext, safebuffer); 1627 safetext = safebuffer; 1628 coleiter = strsrch->textIter; 1629 setColEIterOffset(coleiter, safeoffset); 1630 // status checked at the start of the loop 1631 isSafe = FALSE; 1632 continue; 1633 } 1634 textce = getCE(strsrch, textce); 1635 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1636 // do the beginning stuff 1637 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1638 if (isSafe && failedoffset >= safelength) { 1639 // alas... no hope. failed at rearranged accent set 1640 cleanUpSafeText(strsrch, safetext, safebuffer); 1641 return USEARCH_DONE; 1642 } 1643 else { 1644 if (isSafe) { 1645 failedoffset += safeoffset; 1646 cleanUpSafeText(strsrch, safetext, safebuffer); 1647 } 1648 1649 // try rearranging the front accents 1650 int32_t result = doNextCanonicalPrefixMatch(strsrch, 1651 failedoffset, textoffset, status); 1652 if (result != USEARCH_DONE) { 1653 // if status is a failure, ucol_setOffset does nothing 1654 setColEIterOffset(strsrch->textIter, result); 1655 } 1656 if (U_FAILURE(*status)) { 1657 return USEARCH_DONE; 1658 } 1659 return result; 1660 } 1661 } 1662 if (textce == ce[ceindex]) { 1663 ceindex --; 1664 } 1665 } 1666 // set offset here 1667 if (isSafe) { 1668 int32_t result = getColElemIterOffset(coleiter, FALSE); 1669 // sets the text iterator here with the correct expansion and offset 1670 int32_t leftoverces = getExpansionPrefix(coleiter); 1671 cleanUpSafeText(strsrch, safetext, safebuffer); 1672 if (result >= safelength) { 1673 result = textoffset; 1674 } 1675 else { 1676 result += safeoffset; 1677 } 1678 setColEIterOffset(strsrch->textIter, result); 1679 strsrch->textIter->iteratordata_.toReturn = 1680 setExpansionPrefix(strsrch->textIter, leftoverces); 1681 return result; 1682 } 1683 1684 return ucol_getOffset(coleiter); 1685 } 1686 1687 /** 1688 * Trying out the substring and sees if it can be a canonical match. 1689 * This will try normalizing the end accents and arranging them into canonical 1690 * equivalents and check their corresponding ces with the pattern ce. 1691 * Suffix accents in the text will be grouped according to their combining 1692 * class and the groups will be mixed and matched to try find the perfect 1693 * match with the pattern. 1694 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1695 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1696 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1697 * "\u0301\u0325". 1698 * step 2: check if any of the generated substrings matches the pattern. 1699 * Internal method, status assumed to be success, caller has to check status 1700 * before calling this method. 1701 * @param strsrch string search data 1702 * @param textoffset end offset in the collation element text that ends with 1703 * the accents to be rearranged 1704 * @param status error status if any 1705 * @return TRUE if the match is valid, FALSE otherwise 1706 */ 1707 static 1708 UBool doNextCanonicalMatch(UStringSearch *strsrch, 1709 int32_t textoffset, 1710 UErrorCode *status) 1711 { 1712 const UChar *text = strsrch->search->text; 1713 int32_t temp = textoffset; 1714 U16_BACK_1(text, 0, temp); 1715 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1716 UCollationElements *coleiter = strsrch->textIter; 1717 int32_t offset = getColElemIterOffset(coleiter, FALSE); 1718 if (strsrch->pattern.hasPrefixAccents) { 1719 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1720 status); 1721 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1722 setColEIterOffset(coleiter, offset); 1723 return TRUE; 1724 } 1725 } 1726 return FALSE; 1727 } 1728 1729 if (!strsrch->pattern.hasSuffixAccents) { 1730 return FALSE; 1731 } 1732 1733 UChar accents[INITIAL_ARRAY_SIZE_]; 1734 // offset to the last base character in substring to search 1735 int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1736 // normalizing the offensive string 1737 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1738 0, accents, INITIAL_ARRAY_SIZE_, status); 1739 // status checked in loop below 1740 1741 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1742 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1743 1744 // 2 power n - 1 plus the full set of accents 1745 int32_t count = (2 << (size - 1)) - 1; 1746 while (U_SUCCESS(*status) && count > 0) { 1747 UChar *rearrange = strsrch->canonicalSuffixAccents; 1748 // copy the base characters 1749 for (int k = 0; k < accentsindex[0]; k ++) { 1750 *rearrange ++ = accents[k]; 1751 } 1752 // forming all possible canonical rearrangement by dropping 1753 // sets of accents 1754 for (int i = 0; i <= size - 1; i ++) { 1755 int32_t mask = 1 << (size - i - 1); 1756 if (count & mask) { 1757 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1758 *rearrange ++ = accents[j]; 1759 } 1760 } 1761 } 1762 *rearrange = 0; 1763 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1764 status); 1765 if (offset != USEARCH_DONE) { 1766 return TRUE; // match found 1767 } 1768 count --; 1769 } 1770 return FALSE; 1771 } 1772 1773 /** 1774 * Gets the previous base character offset depending on the string search 1775 * pattern data 1776 * @param strsrch string search data 1777 * @param textoffset current offset, current character 1778 * @return the offset of the next character after this base character or itself 1779 * if it is a composed character with accents 1780 */ 1781 static 1782 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1783 int32_t textoffset) 1784 { 1785 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1786 const UChar *text = strsrch->search->text; 1787 int32_t offset = textoffset; 1788 if (getFCD(text, &offset, strsrch->search->textLength) >> 1789 SECOND_LAST_BYTE_SHIFT_) { 1790 return getPreviousBaseOffset(text, textoffset); 1791 } 1792 } 1793 return textoffset; 1794 } 1795 1796 /** 1797 * Checks match for contraction. 1798 * If the match ends with a partial contraction we fail. 1799 * If the match starts too far off (because of backwards iteration) we try to 1800 * chip off the extra characters 1801 * Internal method, status assumed to be success, caller has to check status 1802 * before calling this method. 1803 * @param strsrch string search data 1804 * @param start offset of potential match, to be modified if necessary 1805 * @param end offset of potential match, to be modified if necessary 1806 * @param status output error status if any 1807 * @return TRUE if match passes the contraction test, FALSE otherwise 1808 */ 1809 static 1810 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1811 int32_t *start, 1812 int32_t *end, 1813 UErrorCode *status) 1814 { 1815 UCollationElements *coleiter = strsrch->textIter; 1816 int32_t textlength = strsrch->search->textLength; 1817 int32_t temp = *start; 1818 const UCollator *collator = strsrch->collator; 1819 const UChar *text = strsrch->search->text; 1820 // This part checks if either ends of the match contains potential 1821 // contraction. If so we'll have to iterate through them 1822 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1823 (*start + 1 < textlength 1824 && ucol_unsafeCP(text[*start + 1], collator))) { 1825 int32_t expansion = getExpansionPrefix(coleiter); 1826 UBool expandflag = expansion > 0; 1827 setColEIterOffset(coleiter, *start); 1828 while (expansion > 0) { 1829 // getting rid of the redundant ce, caused by setOffset. 1830 // since backward contraction/expansion may have extra ces if we 1831 // are in the normalization buffer, hasAccentsBeforeMatch would 1832 // have taken care of it. 1833 // E.g. the character \u01FA will have an expansion of 3, but if 1834 // we are only looking for acute and ring \u030A and \u0301, we'll 1835 // have to skip the first ce in the expansion buffer. 1836 ucol_next(coleiter, status); 1837 if (U_FAILURE(*status)) { 1838 return FALSE; 1839 } 1840 if (ucol_getOffset(coleiter) != temp) { 1841 *start = temp; 1842 temp = ucol_getOffset(coleiter); 1843 } 1844 expansion --; 1845 } 1846 1847 int32_t *patternce = strsrch->pattern.ces; 1848 int32_t patterncelength = strsrch->pattern.cesLength; 1849 int32_t count = 0; 1850 int32_t textlength = strsrch->search->textLength; 1851 while (count < patterncelength) { 1852 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1853 // status checked below, note that if status is a failure 1854 // ucol_next returns UCOL_NULLORDER 1855 if (ce == UCOL_IGNORABLE) { 1856 continue; 1857 } 1858 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1859 *start = temp; 1860 temp = ucol_getOffset(coleiter); 1861 } 1862 1863 if (count == 0 && ce != patternce[0]) { 1864 // accents may have extra starting ces, this occurs when a 1865 // pure accent pattern is matched without rearrangement 1866 // text \u0325\u0300 and looking for \u0300 1867 int32_t expected = patternce[0]; 1868 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1869 ce = getCE(strsrch, ucol_next(coleiter, status)); 1870 while (U_SUCCESS(*status) && ce != expected && 1871 ce != UCOL_NULLORDER && 1872 ucol_getOffset(coleiter) <= *end) { 1873 ce = getCE(strsrch, ucol_next(coleiter, status)); 1874 } 1875 } 1876 } 1877 if (U_FAILURE(*status) || ce != patternce[count]) { 1878 (*end) ++; 1879 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1880 return FALSE; 1881 } 1882 count ++; 1883 } 1884 } 1885 return TRUE; 1886 } 1887 1888 /** 1889 * Checks and sets the match information if found. 1890 * Checks 1891 * <ul> 1892 * <li> the potential match does not repeat the previous match 1893 * <li> boundaries are correct 1894 * <li> potential match does not end in the middle of a contraction 1895 * <li> identical matches 1896 * <\ul> 1897 * Otherwise the offset will be shifted to the next character. 1898 * Internal method, status assumed to be success, caller has to check the 1899 * status before calling this method. 1900 * @param strsrch string search data 1901 * @param textoffset offset in the collation element text. the returned value 1902 * will be the truncated end offset of the match or the new start 1903 * search offset. 1904 * @param status output error status if any 1905 * @return TRUE if the match is valid, FALSE otherwise 1906 */ 1907 static 1908 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1909 int32_t *textoffset, 1910 UErrorCode *status) 1911 { 1912 // to ensure that the start and ends are not composite characters 1913 UCollationElements *coleiter = strsrch->textIter; 1914 // if we have a canonical accent match 1915 if ((strsrch->pattern.hasSuffixAccents && 1916 strsrch->canonicalSuffixAccents[0]) || 1917 (strsrch->pattern.hasPrefixAccents && 1918 strsrch->canonicalPrefixAccents[0])) { 1919 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1920 strsrch, 1921 ucol_getOffset(coleiter)); 1922 strsrch->search->matchedLength = *textoffset - 1923 strsrch->search->matchedIndex; 1924 return TRUE; 1925 } 1926 1927 int32_t start = getColElemIterOffset(coleiter, FALSE); 1928 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1929 status) || U_FAILURE(*status)) { 1930 return FALSE; 1931 } 1932 1933 start = getPreviousUStringSearchBaseOffset(strsrch, start); 1934 // this totally matches, however we need to check if it is repeating 1935 if (checkRepeatedMatch(strsrch, start, *textoffset) || 1936 !isBreakUnit(strsrch, start, *textoffset) || 1937 !checkIdentical(strsrch, start, *textoffset)) { 1938 (*textoffset) ++; 1939 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1940 strsrch->search->textLength); 1941 return FALSE; 1942 } 1943 1944 strsrch->search->matchedIndex = start; 1945 strsrch->search->matchedLength = *textoffset - start; 1946 return TRUE; 1947 } 1948 1949 /** 1950 * Shifting the collation element iterator position forward to prepare for 1951 * a preceding match. If the first character is a unsafe character, we'll only 1952 * shift by 1 to capture contractions, normalization etc. 1953 * Internal method, status assumed to be success, caller has to check status 1954 * before calling this method. 1955 * @param text strsrch string search data 1956 * @param textoffset start text position to do search 1957 * @param ce the text ce which failed the match. 1958 * @param patternceindex index of the ce within the pattern ce buffer which 1959 * failed the match 1960 * @return final offset 1961 */ 1962 static 1963 inline int32_t reverseShift(UStringSearch *strsrch, 1964 int32_t textoffset, 1965 int32_t ce, 1966 int32_t patternceindex) 1967 { 1968 if (strsrch->search->isOverlap) { 1969 if (textoffset != strsrch->search->textLength) { 1970 textoffset --; 1971 } 1972 else { 1973 textoffset -= strsrch->pattern.defaultShiftSize; 1974 } 1975 } 1976 else { 1977 if (ce != UCOL_NULLORDER) { 1978 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)]; 1979 1980 // this is to adjust for characters in the middle of the substring 1981 // for matching that failed. 1982 int32_t adjust = patternceindex; 1983 if (adjust > 1 && shift > adjust) { 1984 shift -= adjust - 1; 1985 } 1986 textoffset -= shift; 1987 } 1988 else { 1989 textoffset -= strsrch->pattern.defaultShiftSize; 1990 } 1991 } 1992 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 1993 return textoffset; 1994 } 1995 1996 /** 1997 * Checks match for contraction. 1998 * If the match starts with a partial contraction we fail. 1999 * Internal method, status assumed to be success, caller has to check status 2000 * before calling this method. 2001 * @param strsrch string search data 2002 * @param start offset of potential match, to be modified if necessary 2003 * @param end offset of potential match, to be modified if necessary 2004 * @param status output error status if any 2005 * @return TRUE if match passes the contraction test, FALSE otherwise 2006 */ 2007 static 2008 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2009 int32_t *start, 2010 int32_t *end, UErrorCode *status) 2011 { 2012 UCollationElements *coleiter = strsrch->textIter; 2013 int32_t textlength = strsrch->search->textLength; 2014 int32_t temp = *end; 2015 const UCollator *collator = strsrch->collator; 2016 const UChar *text = strsrch->search->text; 2017 // This part checks if either if the start of the match contains potential 2018 // contraction. If so we'll have to iterate through them 2019 // Since we used ucol_next while previously looking for the potential 2020 // match, this guarantees that our end will not be a partial contraction, 2021 // or a partial supplementary character. 2022 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2023 int32_t expansion = getExpansionSuffix(coleiter); 2024 UBool expandflag = expansion > 0; 2025 setColEIterOffset(coleiter, *end); 2026 while (U_SUCCESS(*status) && expansion > 0) { 2027 // getting rid of the redundant ce 2028 // since forward contraction/expansion may have extra ces 2029 // if we are in the normalization buffer, hasAccentsBeforeMatch 2030 // would have taken care of it. 2031 // E.g. the character \u01FA will have an expansion of 3, but if 2032 // we are only looking for A ring A\u030A, we'll have to skip the 2033 // last ce in the expansion buffer 2034 ucol_previous(coleiter, status); 2035 if (U_FAILURE(*status)) { 2036 return FALSE; 2037 } 2038 if (ucol_getOffset(coleiter) != temp) { 2039 *end = temp; 2040 temp = ucol_getOffset(coleiter); 2041 } 2042 expansion --; 2043 } 2044 2045 int32_t *patternce = strsrch->pattern.ces; 2046 int32_t patterncelength = strsrch->pattern.cesLength; 2047 int32_t count = patterncelength; 2048 while (count > 0) { 2049 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2050 // status checked below, note that if status is a failure 2051 // ucol_previous returns UCOL_NULLORDER 2052 if (ce == UCOL_IGNORABLE) { 2053 continue; 2054 } 2055 if (expandflag && count == 0 && 2056 getColElemIterOffset(coleiter, FALSE) != temp) { 2057 *end = temp; 2058 temp = ucol_getOffset(coleiter); 2059 } 2060 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2061 (*start) --; 2062 *start = getPreviousBaseOffset(text, *start); 2063 return FALSE; 2064 } 2065 count --; 2066 } 2067 } 2068 return TRUE; 2069 } 2070 2071 /** 2072 * Checks and sets the match information if found. 2073 * Checks 2074 * <ul> 2075 * <li> the current match does not repeat the last match 2076 * <li> boundaries are correct 2077 * <li> exact matches has no extra accents 2078 * <li> identical matches 2079 * <\ul> 2080 * Otherwise the offset will be shifted to the preceding character. 2081 * Internal method, status assumed to be success, caller has to check status 2082 * before calling this method. 2083 * @param strsrch string search data 2084 * @param collator 2085 * @param coleiter collation element iterator 2086 * @param text string 2087 * @param textoffset offset in the collation element text. the returned value 2088 * will be the truncated start offset of the match or the new start 2089 * search offset. 2090 * @param status output error status if any 2091 * @return TRUE if the match is valid, FALSE otherwise 2092 */ 2093 static 2094 inline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2095 int32_t *textoffset, 2096 UErrorCode *status) 2097 { 2098 // to ensure that the start and ends are not composite characters 2099 int32_t end = ucol_getOffset(strsrch->textIter); 2100 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2101 || U_FAILURE(*status)) { 2102 return FALSE; 2103 } 2104 2105 // this totally matches, however we need to check if it is repeating 2106 // the old match 2107 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2108 !isBreakUnit(strsrch, *textoffset, end) || 2109 hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2110 !checkIdentical(strsrch, *textoffset, end) || 2111 hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2112 (*textoffset) --; 2113 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2114 *textoffset); 2115 return FALSE; 2116 } 2117 2118 //Add breakiterator boundary check for primary strength search. 2119 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2120 checkBreakBoundary(strsrch, textoffset, &end); 2121 } 2122 2123 strsrch->search->matchedIndex = *textoffset; 2124 strsrch->search->matchedLength = end - *textoffset; 2125 return TRUE; 2126 } 2127 2128 /** 2129 * Rearranges the end accents to try matching. 2130 * Suffix accents in the text will be grouped according to their combining 2131 * class and the groups will be mixed and matched to try find the perfect 2132 * match with the pattern. 2133 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2134 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2135 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2136 * "\u0301\u0325". 2137 * step 2: check if any of the generated substrings matches the pattern. 2138 * Internal method, status assumed to be success, user has to check status 2139 * before calling this method. 2140 * @param strsrch string search match 2141 * @param start offset of the first base character 2142 * @param end start of the last accent set 2143 * @param status only error status if any 2144 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2145 * offset of the match. Note this start includes all following accents. 2146 */ 2147 static 2148 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2149 int32_t start, 2150 int32_t end, 2151 UErrorCode *status) 2152 { 2153 const UChar *text = strsrch->search->text; 2154 int32_t tempend = end; 2155 2156 U16_BACK_1(text, 0, tempend); 2157 if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2158 LAST_BYTE_MASK_)) { 2159 // die... failed at a base character 2160 return USEARCH_DONE; 2161 } 2162 end = getNextBaseOffset(text, end, strsrch->search->textLength); 2163 2164 if (U_SUCCESS(*status)) { 2165 UChar accents[INITIAL_ARRAY_SIZE_]; 2166 int32_t offset = getPreviousBaseOffset(text, end); 2167 // normalizing the offensive string 2168 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2169 INITIAL_ARRAY_SIZE_, status); 2170 2171 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2172 int32_t accentsize = getUnblockedAccentIndex(accents, 2173 accentsindex); 2174 int32_t count = (2 << (accentsize - 1)) - 1; 2175 UChar buffer[INITIAL_ARRAY_SIZE_]; 2176 UCollationElements *coleiter = strsrch->utilIter; 2177 while (U_SUCCESS(*status) && count > 0) { 2178 UChar *rearrange = strsrch->canonicalSuffixAccents; 2179 // copy the base characters 2180 for (int k = 0; k < accentsindex[0]; k ++) { 2181 *rearrange ++ = accents[k]; 2182 } 2183 // forming all possible canonical rearrangement by dropping 2184 // sets of accents 2185 for (int i = 0; i <= accentsize - 1; i ++) { 2186 int32_t mask = 1 << (accentsize - i - 1); 2187 if (count & mask) { 2188 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2189 *rearrange ++ = accents[j]; 2190 } 2191 } 2192 } 2193 *rearrange = 0; 2194 int32_t matchsize = INITIAL_ARRAY_SIZE_; 2195 UChar *match = addToUCharArray(buffer, &matchsize, 2196 strsrch->canonicalPrefixAccents, 2197 strsrch->search->text + start, 2198 offset - start, 2199 strsrch->canonicalSuffixAccents, 2200 status); 2201 2202 // run the collator iterator through this match 2203 // if status is a failure ucol_setText does nothing 2204 ucol_setText(coleiter, match, matchsize, status); 2205 if (U_SUCCESS(*status)) { 2206 if (checkCollationMatch(strsrch, coleiter)) { 2207 if (match != buffer) { 2208 uprv_free(match); 2209 } 2210 return end; 2211 } 2212 } 2213 count --; 2214 } 2215 } 2216 return USEARCH_DONE; 2217 } 2218 2219 /** 2220 * Take the rearranged start accents and tries matching. If match failed at 2221 * a seperate following set of accents (seperated from the rearranged on by 2222 * at least a base character) then we rearrange the preceding accents and 2223 * tries matching again. 2224 * We allow skipping of the ends of the accent set if the ces do not match. 2225 * However if the failure is found before the accent set, it fails. 2226 * Internal method, status assumed to be success, caller has to check status 2227 * before calling this method. 2228 * @param strsrch string search data 2229 * @param textoffset of the ends of the rearranged accent 2230 * @param status output error status if any 2231 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2232 * offset of the match. Note this start includes all following accents. 2233 */ 2234 static 2235 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2236 int32_t textoffset, 2237 UErrorCode *status) 2238 { 2239 const UChar *text = strsrch->search->text; 2240 const UCollator *collator = strsrch->collator; 2241 int32_t safelength = 0; 2242 UChar *safetext; 2243 int32_t safetextlength; 2244 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2245 int32_t safeoffset = textoffset; 2246 2247 if (textoffset && 2248 ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2249 u_strlen(strsrch->canonicalPrefixAccents) - 1 2250 ], collator)) { 2251 safeoffset = getNextSafeOffset(collator, text, textoffset, 2252 strsrch->search->textLength); 2253 safelength = safeoffset - textoffset; 2254 safetextlength = INITIAL_ARRAY_SIZE_; 2255 safetext = addToUCharArray(safebuffer, &safetextlength, 2256 strsrch->canonicalPrefixAccents, 2257 text + textoffset, safelength, 2258 NULL, status); 2259 } 2260 else { 2261 safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2262 safetext = strsrch->canonicalPrefixAccents; 2263 } 2264 2265 UCollationElements *coleiter = strsrch->utilIter; 2266 // if status is a failure, ucol_setText does nothing 2267 ucol_setText(coleiter, safetext, safetextlength, status); 2268 // status checked in loop below 2269 2270 int32_t *ce = strsrch->pattern.ces; 2271 int32_t celength = strsrch->pattern.cesLength; 2272 int ceindex = 0; 2273 UBool isSafe = TRUE; // safe zone indication flag for position 2274 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2275 2276 while (ceindex < celength) { 2277 int32_t textce = ucol_next(coleiter, status); 2278 if (U_FAILURE(*status)) { 2279 if (isSafe) { 2280 cleanUpSafeText(strsrch, safetext, safebuffer); 2281 } 2282 return USEARCH_DONE; 2283 } 2284 if (textce == UCOL_NULLORDER) { 2285 // check if we have passed the safe buffer 2286 if (coleiter == strsrch->textIter) { 2287 cleanUpSafeText(strsrch, safetext, safebuffer); 2288 return USEARCH_DONE; 2289 } 2290 cleanUpSafeText(strsrch, safetext, safebuffer); 2291 safetext = safebuffer; 2292 coleiter = strsrch->textIter; 2293 setColEIterOffset(coleiter, safeoffset); 2294 // status checked at the start of the loop 2295 isSafe = FALSE; 2296 continue; 2297 } 2298 textce = getCE(strsrch, textce); 2299 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2300 // do the beginning stuff 2301 int32_t failedoffset = ucol_getOffset(coleiter); 2302 if (isSafe && failedoffset <= prefixlength) { 2303 // alas... no hope. failed at rearranged accent set 2304 cleanUpSafeText(strsrch, safetext, safebuffer); 2305 return USEARCH_DONE; 2306 } 2307 else { 2308 if (isSafe) { 2309 failedoffset = safeoffset - failedoffset; 2310 cleanUpSafeText(strsrch, safetext, safebuffer); 2311 } 2312 2313 // try rearranging the end accents 2314 int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2315 textoffset, failedoffset, status); 2316 if (result != USEARCH_DONE) { 2317 // if status is a failure, ucol_setOffset does nothing 2318 setColEIterOffset(strsrch->textIter, result); 2319 } 2320 if (U_FAILURE(*status)) { 2321 return USEARCH_DONE; 2322 } 2323 return result; 2324 } 2325 } 2326 if (textce == ce[ceindex]) { 2327 ceindex ++; 2328 } 2329 } 2330 // set offset here 2331 if (isSafe) { 2332 int32_t result = ucol_getOffset(coleiter); 2333 // sets the text iterator here with the correct expansion and offset 2334 int32_t leftoverces = getExpansionSuffix(coleiter); 2335 cleanUpSafeText(strsrch, safetext, safebuffer); 2336 if (result <= prefixlength) { 2337 result = textoffset; 2338 } 2339 else { 2340 result = textoffset + (safeoffset - result); 2341 } 2342 setColEIterOffset(strsrch->textIter, result); 2343 setExpansionSuffix(strsrch->textIter, leftoverces); 2344 return result; 2345 } 2346 2347 return ucol_getOffset(coleiter); 2348 } 2349 2350 /** 2351 * Trying out the substring and sees if it can be a canonical match. 2352 * This will try normalizing the starting accents and arranging them into 2353 * canonical equivalents and check their corresponding ces with the pattern ce. 2354 * Prefix accents in the text will be grouped according to their combining 2355 * class and the groups will be mixed and matched to try find the perfect 2356 * match with the pattern. 2357 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2358 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2359 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2360 * "\u0301\u0325". 2361 * step 2: check if any of the generated substrings matches the pattern. 2362 * Internal method, status assumed to be success, caller has to check status 2363 * before calling this method. 2364 * @param strsrch string search data 2365 * @param textoffset start offset in the collation element text that starts 2366 * with the accents to be rearranged 2367 * @param status output error status if any 2368 * @return TRUE if the match is valid, FALSE otherwise 2369 */ 2370 static 2371 UBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2372 int32_t textoffset, 2373 UErrorCode *status) 2374 { 2375 const UChar *text = strsrch->search->text; 2376 int32_t temp = textoffset; 2377 int32_t textlength = strsrch->search->textLength; 2378 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2379 UCollationElements *coleiter = strsrch->textIter; 2380 int32_t offset = ucol_getOffset(coleiter); 2381 if (strsrch->pattern.hasSuffixAccents) { 2382 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2383 offset, status); 2384 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2385 setColEIterOffset(coleiter, offset); 2386 return TRUE; 2387 } 2388 } 2389 return FALSE; 2390 } 2391 2392 if (!strsrch->pattern.hasPrefixAccents) { 2393 return FALSE; 2394 } 2395 2396 UChar accents[INITIAL_ARRAY_SIZE_]; 2397 // offset to the last base character in substring to search 2398 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2399 // normalizing the offensive string 2400 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2401 0, accents, INITIAL_ARRAY_SIZE_, status); 2402 // status checked in loop 2403 2404 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2405 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2406 2407 // 2 power n - 1 plus the full set of accents 2408 int32_t count = (2 << (size - 1)) - 1; 2409 while (U_SUCCESS(*status) && count > 0) { 2410 UChar *rearrange = strsrch->canonicalPrefixAccents; 2411 // copy the base characters 2412 for (int k = 0; k < accentsindex[0]; k ++) { 2413 *rearrange ++ = accents[k]; 2414 } 2415 // forming all possible canonical rearrangement by dropping 2416 // sets of accents 2417 for (int i = 0; i <= size - 1; i ++) { 2418 int32_t mask = 1 << (size - i - 1); 2419 if (count & mask) { 2420 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2421 *rearrange ++ = accents[j]; 2422 } 2423 } 2424 } 2425 *rearrange = 0; 2426 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2427 baseoffset, status); 2428 if (offset != USEARCH_DONE) { 2429 return TRUE; // match found 2430 } 2431 count --; 2432 } 2433 return FALSE; 2434 } 2435 2436 /** 2437 * Checks match for contraction. 2438 * If the match starts with a partial contraction we fail. 2439 * Internal method, status assumed to be success, caller has to check status 2440 * before calling this method. 2441 * @param strsrch string search data 2442 * @param start offset of potential match, to be modified if necessary 2443 * @param end offset of potential match, to be modified if necessary 2444 * @param status only error status if any 2445 * @return TRUE if match passes the contraction test, FALSE otherwise 2446 */ 2447 static 2448 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2449 int32_t *start, 2450 int32_t *end, UErrorCode *status) 2451 { 2452 UCollationElements *coleiter = strsrch->textIter; 2453 int32_t textlength = strsrch->search->textLength; 2454 int32_t temp = *end; 2455 const UCollator *collator = strsrch->collator; 2456 const UChar *text = strsrch->search->text; 2457 // This part checks if either if the start of the match contains potential 2458 // contraction. If so we'll have to iterate through them 2459 // Since we used ucol_next while previously looking for the potential 2460 // match, this guarantees that our end will not be a partial contraction, 2461 // or a partial supplementary character. 2462 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2463 int32_t expansion = getExpansionSuffix(coleiter); 2464 UBool expandflag = expansion > 0; 2465 setColEIterOffset(coleiter, *end); 2466 while (expansion > 0) { 2467 // getting rid of the redundant ce 2468 // since forward contraction/expansion may have extra ces 2469 // if we are in the normalization buffer, hasAccentsBeforeMatch 2470 // would have taken care of it. 2471 // E.g. the character \u01FA will have an expansion of 3, but if 2472 // we are only looking for A ring A\u030A, we'll have to skip the 2473 // last ce in the expansion buffer 2474 ucol_previous(coleiter, status); 2475 if (U_FAILURE(*status)) { 2476 return FALSE; 2477 } 2478 if (ucol_getOffset(coleiter) != temp) { 2479 *end = temp; 2480 temp = ucol_getOffset(coleiter); 2481 } 2482 expansion --; 2483 } 2484 2485 int32_t *patternce = strsrch->pattern.ces; 2486 int32_t patterncelength = strsrch->pattern.cesLength; 2487 int32_t count = patterncelength; 2488 while (count > 0) { 2489 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2490 // status checked below, note that if status is a failure 2491 // ucol_previous returns UCOL_NULLORDER 2492 if (ce == UCOL_IGNORABLE) { 2493 continue; 2494 } 2495 if (expandflag && count == 0 && 2496 getColElemIterOffset(coleiter, FALSE) != temp) { 2497 *end = temp; 2498 temp = ucol_getOffset(coleiter); 2499 } 2500 if (count == patterncelength && 2501 ce != patternce[patterncelength - 1]) { 2502 // accents may have extra starting ces, this occurs when a 2503 // pure accent pattern is matched without rearrangement 2504 int32_t expected = patternce[patterncelength - 1]; 2505 U16_BACK_1(text, 0, *end); 2506 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2507 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2508 while (U_SUCCESS(*status) && ce != expected && 2509 ce != UCOL_NULLORDER && 2510 ucol_getOffset(coleiter) <= *start) { 2511 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2512 } 2513 } 2514 } 2515 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2516 (*start) --; 2517 *start = getPreviousBaseOffset(text, *start); 2518 return FALSE; 2519 } 2520 count --; 2521 } 2522 } 2523 return TRUE; 2524 } 2525 2526 /** 2527 * Checks and sets the match information if found. 2528 * Checks 2529 * <ul> 2530 * <li> the potential match does not repeat the previous match 2531 * <li> boundaries are correct 2532 * <li> potential match does not end in the middle of a contraction 2533 * <li> identical matches 2534 * <\ul> 2535 * Otherwise the offset will be shifted to the next character. 2536 * Internal method, status assumed to be success, caller has to check status 2537 * before calling this method. 2538 * @param strsrch string search data 2539 * @param textoffset offset in the collation element text. the returned value 2540 * will be the truncated start offset of the match or the new start 2541 * search offset. 2542 * @param status only error status if any 2543 * @return TRUE if the match is valid, FALSE otherwise 2544 */ 2545 static 2546 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2547 int32_t *textoffset, 2548 UErrorCode *status) 2549 { 2550 // to ensure that the start and ends are not composite characters 2551 UCollationElements *coleiter = strsrch->textIter; 2552 // if we have a canonical accent match 2553 if ((strsrch->pattern.hasSuffixAccents && 2554 strsrch->canonicalSuffixAccents[0]) || 2555 (strsrch->pattern.hasPrefixAccents && 2556 strsrch->canonicalPrefixAccents[0])) { 2557 strsrch->search->matchedIndex = *textoffset; 2558 strsrch->search->matchedLength = 2559 getNextUStringSearchBaseOffset(strsrch, 2560 getColElemIterOffset(coleiter, FALSE)) 2561 - *textoffset; 2562 return TRUE; 2563 } 2564 2565 int32_t end = ucol_getOffset(coleiter); 2566 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2567 status) || 2568 U_FAILURE(*status)) { 2569 return FALSE; 2570 } 2571 2572 end = getNextUStringSearchBaseOffset(strsrch, end); 2573 // this totally matches, however we need to check if it is repeating 2574 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2575 !isBreakUnit(strsrch, *textoffset, end) || 2576 !checkIdentical(strsrch, *textoffset, end)) { 2577 (*textoffset) --; 2578 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2579 *textoffset); 2580 return FALSE; 2581 } 2582 2583 strsrch->search->matchedIndex = *textoffset; 2584 strsrch->search->matchedLength = end - *textoffset; 2585 return TRUE; 2586 } 2587 #endif // #if BOYER_MOORE 2588 2589 // constructors and destructor ------------------------------------------- 2590 2591 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2592 int32_t patternlength, 2593 const UChar *text, 2594 int32_t textlength, 2595 const char *locale, 2596 UBreakIterator *breakiter, 2597 UErrorCode *status) 2598 { 2599 if (U_FAILURE(*status)) { 2600 return NULL; 2601 } 2602 #if UCONFIG_NO_BREAK_ITERATION 2603 if (breakiter != NULL) { 2604 *status = U_UNSUPPORTED_ERROR; 2605 return NULL; 2606 } 2607 #endif 2608 if (locale) { 2609 // ucol_open internally checks for status 2610 UCollator *collator = ucol_open(locale, status); 2611 // pattern, text checks are done in usearch_openFromCollator 2612 UStringSearch *result = usearch_openFromCollator(pattern, 2613 patternlength, text, textlength, 2614 collator, breakiter, status); 2615 2616 if (result == NULL || U_FAILURE(*status)) { 2617 if (collator) { 2618 ucol_close(collator); 2619 } 2620 return NULL; 2621 } 2622 else { 2623 result->ownCollator = TRUE; 2624 } 2625 return result; 2626 } 2627 *status = U_ILLEGAL_ARGUMENT_ERROR; 2628 return NULL; 2629 } 2630 2631 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2632 const UChar *pattern, 2633 int32_t patternlength, 2634 const UChar *text, 2635 int32_t textlength, 2636 const UCollator *collator, 2637 UBreakIterator *breakiter, 2638 UErrorCode *status) 2639 { 2640 if (U_FAILURE(*status)) { 2641 return NULL; 2642 } 2643 #if UCONFIG_NO_BREAK_ITERATION 2644 if (breakiter != NULL) { 2645 *status = U_UNSUPPORTED_ERROR; 2646 return NULL; 2647 } 2648 #endif 2649 if (pattern == NULL || text == NULL || collator == NULL) { 2650 *status = U_ILLEGAL_ARGUMENT_ERROR; 2651 return NULL; 2652 } 2653 2654 // string search does not really work when numeric collation is turned on 2655 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2656 *status = U_UNSUPPORTED_ERROR; 2657 return NULL; 2658 } 2659 2660 if (U_SUCCESS(*status)) { 2661 initializeFCD(status); 2662 if (U_FAILURE(*status)) { 2663 return NULL; 2664 } 2665 2666 UStringSearch *result; 2667 if (textlength == -1) { 2668 textlength = u_strlen(text); 2669 } 2670 if (patternlength == -1) { 2671 patternlength = u_strlen(pattern); 2672 } 2673 if (textlength <= 0 || patternlength <= 0) { 2674 *status = U_ILLEGAL_ARGUMENT_ERROR; 2675 return NULL; 2676 } 2677 2678 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2679 if (result == NULL) { 2680 *status = U_MEMORY_ALLOCATION_ERROR; 2681 return NULL; 2682 } 2683 2684 result->collator = collator; 2685 result->strength = ucol_getStrength(collator); 2686 result->ceMask = getMask(result->strength); 2687 result->toShift = 2688 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2689 UCOL_SHIFTED; 2690 result->variableTop = ucol_getVariableTop(collator, status); 2691 2692 result->nfd = Normalizer2::getNFDInstance(*status); 2693 2694 if (U_FAILURE(*status)) { 2695 uprv_free(result); 2696 return NULL; 2697 } 2698 2699 result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2700 if (result->search == NULL) { 2701 *status = U_MEMORY_ALLOCATION_ERROR; 2702 uprv_free(result); 2703 return NULL; 2704 } 2705 2706 result->search->text = text; 2707 result->search->textLength = textlength; 2708 2709 result->pattern.text = pattern; 2710 result->pattern.textLength = patternlength; 2711 result->pattern.ces = NULL; 2712 result->pattern.pces = NULL; 2713 2714 result->search->breakIter = breakiter; 2715 #if !UCONFIG_NO_BREAK_ITERATION 2716 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2717 if (breakiter) { 2718 ubrk_setText(breakiter, text, textlength, status); 2719 } 2720 #endif 2721 2722 result->ownCollator = FALSE; 2723 result->search->matchedLength = 0; 2724 result->search->matchedIndex = USEARCH_DONE; 2725 result->utilIter = NULL; 2726 result->textIter = ucol_openElements(collator, text, 2727 textlength, status); 2728 result->textProcessedIter = NULL; 2729 if (U_FAILURE(*status)) { 2730 usearch_close(result); 2731 return NULL; 2732 } 2733 2734 result->search->isOverlap = FALSE; 2735 result->search->isCanonicalMatch = FALSE; 2736 result->search->elementComparisonType = 0; 2737 result->search->isForwardSearching = TRUE; 2738 result->search->reset = TRUE; 2739 2740 initialize(result, status); 2741 2742 if (U_FAILURE(*status)) { 2743 usearch_close(result); 2744 return NULL; 2745 } 2746 2747 return result; 2748 } 2749 return NULL; 2750 } 2751 2752 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2753 { 2754 if (strsrch) { 2755 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer && 2756 strsrch->pattern.ces) { 2757 uprv_free(strsrch->pattern.ces); 2758 } 2759 2760 if (strsrch->pattern.pces != NULL && 2761 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 2762 uprv_free(strsrch->pattern.pces); 2763 } 2764 2765 delete strsrch->textProcessedIter; 2766 ucol_closeElements(strsrch->textIter); 2767 ucol_closeElements(strsrch->utilIter); 2768 2769 if (strsrch->ownCollator && strsrch->collator) { 2770 ucol_close((UCollator *)strsrch->collator); 2771 } 2772 2773 #if !UCONFIG_NO_BREAK_ITERATION 2774 if (strsrch->search->internalBreakIter) { 2775 ubrk_close(strsrch->search->internalBreakIter); 2776 } 2777 #endif 2778 2779 uprv_free(strsrch->search); 2780 uprv_free(strsrch); 2781 } 2782 } 2783 2784 namespace { 2785 2786 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { 2787 if (U_FAILURE(*status)) { return FALSE; } 2788 if (strsrch->textProcessedIter == NULL) { 2789 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); 2790 if (strsrch->textProcessedIter == NULL) { 2791 *status = U_MEMORY_ALLOCATION_ERROR; 2792 return FALSE; 2793 } 2794 } else { 2795 strsrch->textProcessedIter->init(strsrch->textIter); 2796 } 2797 return TRUE; 2798 } 2799 2800 } 2801 2802 // set and get methods -------------------------------------------------- 2803 2804 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2805 int32_t position, 2806 UErrorCode *status) 2807 { 2808 if (U_SUCCESS(*status) && strsrch) { 2809 if (isOutOfBounds(strsrch->search->textLength, position)) { 2810 *status = U_INDEX_OUTOFBOUNDS_ERROR; 2811 } 2812 else { 2813 setColEIterOffset(strsrch->textIter, position); 2814 } 2815 strsrch->search->matchedIndex = USEARCH_DONE; 2816 strsrch->search->matchedLength = 0; 2817 strsrch->search->reset = FALSE; 2818 } 2819 } 2820 2821 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2822 { 2823 if (strsrch) { 2824 int32_t result = ucol_getOffset(strsrch->textIter); 2825 if (isOutOfBounds(strsrch->search->textLength, result)) { 2826 return USEARCH_DONE; 2827 } 2828 return result; 2829 } 2830 return USEARCH_DONE; 2831 } 2832 2833 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2834 USearchAttribute attribute, 2835 USearchAttributeValue value, 2836 UErrorCode *status) 2837 { 2838 if (U_SUCCESS(*status) && strsrch) { 2839 switch (attribute) 2840 { 2841 case USEARCH_OVERLAP : 2842 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2843 break; 2844 case USEARCH_CANONICAL_MATCH : 2845 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2846 FALSE); 2847 break; 2848 case USEARCH_ELEMENT_COMPARISON : 2849 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2850 strsrch->search->elementComparisonType = (int16_t)value; 2851 } else { 2852 strsrch->search->elementComparisonType = 0; 2853 } 2854 break; 2855 case USEARCH_ATTRIBUTE_COUNT : 2856 default: 2857 *status = U_ILLEGAL_ARGUMENT_ERROR; 2858 } 2859 } 2860 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2861 *status = U_ILLEGAL_ARGUMENT_ERROR; 2862 } 2863 } 2864 2865 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2866 const UStringSearch *strsrch, 2867 USearchAttribute attribute) 2868 { 2869 if (strsrch) { 2870 switch (attribute) { 2871 case USEARCH_OVERLAP : 2872 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2873 USEARCH_OFF); 2874 case USEARCH_CANONICAL_MATCH : 2875 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2876 USEARCH_OFF); 2877 case USEARCH_ELEMENT_COMPARISON : 2878 { 2879 int16_t value = strsrch->search->elementComparisonType; 2880 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2881 return (USearchAttributeValue)value; 2882 } else { 2883 return USEARCH_STANDARD_ELEMENT_COMPARISON; 2884 } 2885 } 2886 case USEARCH_ATTRIBUTE_COUNT : 2887 return USEARCH_DEFAULT; 2888 } 2889 } 2890 return USEARCH_DEFAULT; 2891 } 2892 2893 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2894 const UStringSearch *strsrch) 2895 { 2896 if (strsrch == NULL) { 2897 return USEARCH_DONE; 2898 } 2899 return strsrch->search->matchedIndex; 2900 } 2901 2902 2903 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2904 UChar *result, 2905 int32_t resultCapacity, 2906 UErrorCode *status) 2907 { 2908 if (U_FAILURE(*status)) { 2909 return USEARCH_DONE; 2910 } 2911 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2912 result == NULL)) { 2913 *status = U_ILLEGAL_ARGUMENT_ERROR; 2914 return USEARCH_DONE; 2915 } 2916 2917 int32_t copylength = strsrch->search->matchedLength; 2918 int32_t copyindex = strsrch->search->matchedIndex; 2919 if (copyindex == USEARCH_DONE) { 2920 u_terminateUChars(result, resultCapacity, 0, status); 2921 return USEARCH_DONE; 2922 } 2923 2924 if (resultCapacity < copylength) { 2925 copylength = resultCapacity; 2926 } 2927 if (copylength > 0) { 2928 uprv_memcpy(result, strsrch->search->text + copyindex, 2929 copylength * sizeof(UChar)); 2930 } 2931 return u_terminateUChars(result, resultCapacity, 2932 strsrch->search->matchedLength, status); 2933 } 2934 2935 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2936 const UStringSearch *strsrch) 2937 { 2938 if (strsrch) { 2939 return strsrch->search->matchedLength; 2940 } 2941 return USEARCH_DONE; 2942 } 2943 2944 #if !UCONFIG_NO_BREAK_ITERATION 2945 2946 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2947 UBreakIterator *breakiter, 2948 UErrorCode *status) 2949 { 2950 if (U_SUCCESS(*status) && strsrch) { 2951 strsrch->search->breakIter = breakiter; 2952 if (breakiter) { 2953 ubrk_setText(breakiter, strsrch->search->text, 2954 strsrch->search->textLength, status); 2955 } 2956 } 2957 } 2958 2959 U_CAPI const UBreakIterator* U_EXPORT2 2960 usearch_getBreakIterator(const UStringSearch *strsrch) 2961 { 2962 if (strsrch) { 2963 return strsrch->search->breakIter; 2964 } 2965 return NULL; 2966 } 2967 2968 #endif 2969 2970 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2971 const UChar *text, 2972 int32_t textlength, 2973 UErrorCode *status) 2974 { 2975 if (U_SUCCESS(*status)) { 2976 if (strsrch == NULL || text == NULL || textlength < -1 || 2977 textlength == 0) { 2978 *status = U_ILLEGAL_ARGUMENT_ERROR; 2979 } 2980 else { 2981 if (textlength == -1) { 2982 textlength = u_strlen(text); 2983 } 2984 strsrch->search->text = text; 2985 strsrch->search->textLength = textlength; 2986 ucol_setText(strsrch->textIter, text, textlength, status); 2987 strsrch->search->matchedIndex = USEARCH_DONE; 2988 strsrch->search->matchedLength = 0; 2989 strsrch->search->reset = TRUE; 2990 #if !UCONFIG_NO_BREAK_ITERATION 2991 if (strsrch->search->breakIter != NULL) { 2992 ubrk_setText(strsrch->search->breakIter, text, 2993 textlength, status); 2994 } 2995 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2996 #endif 2997 } 2998 } 2999 } 3000 3001 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 3002 int32_t *length) 3003 { 3004 if (strsrch) { 3005 *length = strsrch->search->textLength; 3006 return strsrch->search->text; 3007 } 3008 return NULL; 3009 } 3010 3011 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3012 const UCollator *collator, 3013 UErrorCode *status) 3014 { 3015 if (U_SUCCESS(*status)) { 3016 if (collator == NULL) { 3017 *status = U_ILLEGAL_ARGUMENT_ERROR; 3018 return; 3019 } 3020 3021 if (strsrch) { 3022 delete strsrch->textProcessedIter; 3023 strsrch->textProcessedIter = NULL; 3024 ucol_closeElements(strsrch->textIter); 3025 ucol_closeElements(strsrch->utilIter); 3026 strsrch->textIter = strsrch->utilIter = NULL; 3027 if (strsrch->ownCollator && (strsrch->collator != collator)) { 3028 ucol_close((UCollator *)strsrch->collator); 3029 strsrch->ownCollator = FALSE; 3030 } 3031 strsrch->collator = collator; 3032 strsrch->strength = ucol_getStrength(collator); 3033 strsrch->ceMask = getMask(strsrch->strength); 3034 #if !UCONFIG_NO_BREAK_ITERATION 3035 ubrk_close(strsrch->search->internalBreakIter); 3036 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3037 strsrch->search->text, strsrch->search->textLength, status); 3038 #endif 3039 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3040 strsrch->toShift = 3041 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3042 UCOL_SHIFTED; 3043 // if status is a failure, ucol_getVariableTop returns 0 3044 strsrch->variableTop = ucol_getVariableTop(collator, status); 3045 strsrch->textIter = ucol_openElements(collator, 3046 strsrch->search->text, 3047 strsrch->search->textLength, 3048 status); 3049 strsrch->utilIter = ucol_openElements( 3050 collator, strsrch->pattern.text, strsrch->pattern.textLength, status); 3051 // initialize() _after_ setting the iterators for the new collator. 3052 initialize(strsrch, status); 3053 } 3054 3055 // **** are these calls needed? 3056 // **** we call uprv_init_pce in initializePatternPCETable 3057 // **** and the CEIBuffer constructor... 3058 #if 0 3059 uprv_init_pce(strsrch->textIter); 3060 uprv_init_pce(strsrch->utilIter); 3061 #endif 3062 } 3063 } 3064 3065 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3066 { 3067 if (strsrch) { 3068 return (UCollator *)strsrch->collator; 3069 } 3070 return NULL; 3071 } 3072 3073 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3074 const UChar *pattern, 3075 int32_t patternlength, 3076 UErrorCode *status) 3077 { 3078 if (U_SUCCESS(*status)) { 3079 if (strsrch == NULL || pattern == NULL) { 3080 *status = U_ILLEGAL_ARGUMENT_ERROR; 3081 } 3082 else { 3083 if (patternlength == -1) { 3084 patternlength = u_strlen(pattern); 3085 } 3086 if (patternlength == 0) { 3087 *status = U_ILLEGAL_ARGUMENT_ERROR; 3088 return; 3089 } 3090 strsrch->pattern.text = pattern; 3091 strsrch->pattern.textLength = patternlength; 3092 initialize(strsrch, status); 3093 } 3094 } 3095 } 3096 3097 U_CAPI const UChar* U_EXPORT2 3098 usearch_getPattern(const UStringSearch *strsrch, 3099 int32_t *length) 3100 { 3101 if (strsrch) { 3102 *length = strsrch->pattern.textLength; 3103 return strsrch->pattern.text; 3104 } 3105 return NULL; 3106 } 3107 3108 // miscellanous methods -------------------------------------------------- 3109 3110 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3111 UErrorCode *status) 3112 { 3113 if (strsrch && U_SUCCESS(*status)) { 3114 strsrch->search->isForwardSearching = TRUE; 3115 usearch_setOffset(strsrch, 0, status); 3116 if (U_SUCCESS(*status)) { 3117 return usearch_next(strsrch, status); 3118 } 3119 } 3120 return USEARCH_DONE; 3121 } 3122 3123 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3124 int32_t position, 3125 UErrorCode *status) 3126 { 3127 if (strsrch && U_SUCCESS(*status)) { 3128 strsrch->search->isForwardSearching = TRUE; 3129 // position checked in usearch_setOffset 3130 usearch_setOffset(strsrch, position, status); 3131 if (U_SUCCESS(*status)) { 3132 return usearch_next(strsrch, status); 3133 } 3134 } 3135 return USEARCH_DONE; 3136 } 3137 3138 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3139 UErrorCode *status) 3140 { 3141 if (strsrch && U_SUCCESS(*status)) { 3142 strsrch->search->isForwardSearching = FALSE; 3143 usearch_setOffset(strsrch, strsrch->search->textLength, status); 3144 if (U_SUCCESS(*status)) { 3145 return usearch_previous(strsrch, status); 3146 } 3147 } 3148 return USEARCH_DONE; 3149 } 3150 3151 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3152 int32_t position, 3153 UErrorCode *status) 3154 { 3155 if (strsrch && U_SUCCESS(*status)) { 3156 strsrch->search->isForwardSearching = FALSE; 3157 // position checked in usearch_setOffset 3158 usearch_setOffset(strsrch, position, status); 3159 if (U_SUCCESS(*status)) { 3160 return usearch_previous(strsrch, status); 3161 } 3162 } 3163 return USEARCH_DONE; 3164 } 3165 3166 /** 3167 * If a direction switch is required, we'll count the number of ces till the 3168 * beginning of the collation element iterator and iterate forwards that 3169 * number of times. This is so that we get to the correct point within the 3170 * string to continue the search in. Imagine when we are in the middle of the 3171 * normalization buffer when the change in direction is request. arrrgghh.... 3172 * After searching the offset within the collation element iterator will be 3173 * shifted to the start of the match. If a match is not found, the offset would 3174 * have been set to the end of the text string in the collation element 3175 * iterator. 3176 * Okay, here's my take on normalization buffer. The only time when there can 3177 * be 2 matches within the same normalization is when the pattern is consists 3178 * of all accents. But since the offset returned is from the text string, we 3179 * should not confuse the caller by returning the second match within the 3180 * same normalization buffer. If we do, the 2 results will have the same match 3181 * offsets, and that'll be confusing. I'll return the next match that doesn't 3182 * fall within the same normalization buffer. Note this does not affect the 3183 * results of matches spanning the text and the normalization buffer. 3184 * The position to start searching is taken from the collation element 3185 * iterator. Callers of this API would have to set the offset in the collation 3186 * element iterator before using this method. 3187 */ 3188 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3189 UErrorCode *status) 3190 { 3191 if (U_SUCCESS(*status) && strsrch) { 3192 // note offset is either equivalent to the start of the previous match 3193 // or is set by the user 3194 int32_t offset = usearch_getOffset(strsrch); 3195 USearch *search = strsrch->search; 3196 search->reset = FALSE; 3197 int32_t textlength = search->textLength; 3198 if (search->isForwardSearching) { 3199 #if BOYER_MOORE 3200 if (offset == textlength 3201 || (!search->isOverlap && 3202 (offset + strsrch->pattern.defaultShiftSize > textlength || 3203 (search->matchedIndex != USEARCH_DONE && 3204 offset + search->matchedLength >= textlength)))) { 3205 // not enough characters to match 3206 setMatchNotFound(strsrch); 3207 return USEARCH_DONE; 3208 } 3209 #else 3210 if (offset == textlength || 3211 (! search->isOverlap && 3212 (search->matchedIndex != USEARCH_DONE && 3213 offset + search->matchedLength > textlength))) { 3214 // not enough characters to match 3215 setMatchNotFound(strsrch); 3216 return USEARCH_DONE; 3217 } 3218 #endif 3219 } 3220 else { 3221 // switching direction. 3222 // if matchedIndex == USEARCH_DONE, it means that either a 3223 // setOffset has been called or that previous ran off the text 3224 // string. the iterator would have been set to offset 0 if a 3225 // match is not found. 3226 search->isForwardSearching = TRUE; 3227 if (search->matchedIndex != USEARCH_DONE) { 3228 // there's no need to set the collation element iterator 3229 // the next call to next will set the offset. 3230 return search->matchedIndex; 3231 } 3232 } 3233 3234 if (U_SUCCESS(*status)) { 3235 if (strsrch->pattern.cesLength == 0) { 3236 if (search->matchedIndex == USEARCH_DONE) { 3237 search->matchedIndex = offset; 3238 } 3239 else { // moves by codepoints 3240 U16_FWD_1(search->text, search->matchedIndex, textlength); 3241 } 3242 3243 search->matchedLength = 0; 3244 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3245 // status checked below 3246 if (search->matchedIndex == textlength) { 3247 search->matchedIndex = USEARCH_DONE; 3248 } 3249 } 3250 else { 3251 if (search->matchedLength > 0) { 3252 // if matchlength is 0 we are at the start of the iteration 3253 if (search->isOverlap) { 3254 ucol_setOffset(strsrch->textIter, offset + 1, status); 3255 } 3256 else { 3257 ucol_setOffset(strsrch->textIter, 3258 offset + search->matchedLength, status); 3259 } 3260 } 3261 else { 3262 // for boundary check purposes. this will ensure that the 3263 // next match will not preceed the current offset 3264 // note search->matchedIndex will always be set to something 3265 // in the code 3266 search->matchedIndex = offset - 1; 3267 } 3268 3269 if (search->isCanonicalMatch) { 3270 // can't use exact here since extra accents are allowed. 3271 usearch_handleNextCanonical(strsrch, status); 3272 } 3273 else { 3274 usearch_handleNextExact(strsrch, status); 3275 } 3276 } 3277 3278 if (U_FAILURE(*status)) { 3279 return USEARCH_DONE; 3280 } 3281 3282 #if !BOYER_MOORE 3283 if (search->matchedIndex == USEARCH_DONE) { 3284 ucol_setOffset(strsrch->textIter, search->textLength, status); 3285 } else { 3286 ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3287 } 3288 #endif 3289 3290 return search->matchedIndex; 3291 } 3292 } 3293 return USEARCH_DONE; 3294 } 3295 3296 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3297 UErrorCode *status) 3298 { 3299 if (U_SUCCESS(*status) && strsrch) { 3300 int32_t offset; 3301 USearch *search = strsrch->search; 3302 if (search->reset) { 3303 offset = search->textLength; 3304 search->isForwardSearching = FALSE; 3305 search->reset = FALSE; 3306 setColEIterOffset(strsrch->textIter, offset); 3307 } 3308 else { 3309 offset = usearch_getOffset(strsrch); 3310 } 3311 3312 int32_t matchedindex = search->matchedIndex; 3313 if (search->isForwardSearching == TRUE) { 3314 // switching direction. 3315 // if matchedIndex == USEARCH_DONE, it means that either a 3316 // setOffset has been called or that next ran off the text 3317 // string. the iterator would have been set to offset textLength if 3318 // a match is not found. 3319 search->isForwardSearching = FALSE; 3320 if (matchedindex != USEARCH_DONE) { 3321 return matchedindex; 3322 } 3323 } 3324 else { 3325 #if BOYER_MOORE 3326 if (offset == 0 || matchedindex == 0 || 3327 (!search->isOverlap && 3328 (offset < strsrch->pattern.defaultShiftSize || 3329 (matchedindex != USEARCH_DONE && 3330 matchedindex < strsrch->pattern.defaultShiftSize)))) { 3331 // not enough characters to match 3332 setMatchNotFound(strsrch); 3333 return USEARCH_DONE; 3334 } 3335 #else 3336 // Could check pattern length, but the 3337 // linear search will do the right thing 3338 if (offset == 0 || matchedindex == 0) { 3339 setMatchNotFound(strsrch); 3340 return USEARCH_DONE; 3341 } 3342 #endif 3343 } 3344 3345 if (U_SUCCESS(*status)) { 3346 if (strsrch->pattern.cesLength == 0) { 3347 search->matchedIndex = 3348 (matchedindex == USEARCH_DONE ? offset : matchedindex); 3349 if (search->matchedIndex == 0) { 3350 setMatchNotFound(strsrch); 3351 // status checked below 3352 } 3353 else { // move by codepoints 3354 U16_BACK_1(search->text, 0, search->matchedIndex); 3355 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3356 // status checked below 3357 search->matchedLength = 0; 3358 } 3359 } 3360 else { 3361 if (strsrch->search->isCanonicalMatch) { 3362 // can't use exact here since extra accents are allowed. 3363 usearch_handlePreviousCanonical(strsrch, status); 3364 // status checked below 3365 } 3366 else { 3367 usearch_handlePreviousExact(strsrch, status); 3368 // status checked below 3369 } 3370 } 3371 3372 if (U_FAILURE(*status)) { 3373 return USEARCH_DONE; 3374 } 3375 3376 return search->matchedIndex; 3377 } 3378 } 3379 return USEARCH_DONE; 3380 } 3381 3382 3383 3384 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3385 { 3386 /* 3387 reset is setting the attributes that are already in 3388 string search, hence all attributes in the collator should 3389 be retrieved without any problems 3390 */ 3391 if (strsrch) { 3392 UErrorCode status = U_ZERO_ERROR; 3393 UBool sameCollAttribute = TRUE; 3394 uint32_t ceMask; 3395 UBool shift; 3396 uint32_t varTop; 3397 3398 // **** hack to deal w/ how processed CEs encode quaternary **** 3399 UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3400 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3401 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3402 sameCollAttribute = FALSE; 3403 } 3404 3405 strsrch->strength = ucol_getStrength(strsrch->collator); 3406 ceMask = getMask(strsrch->strength); 3407 if (strsrch->ceMask != ceMask) { 3408 strsrch->ceMask = ceMask; 3409 sameCollAttribute = FALSE; 3410 } 3411 3412 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3413 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3414 &status) == UCOL_SHIFTED; 3415 if (strsrch->toShift != shift) { 3416 strsrch->toShift = shift; 3417 sameCollAttribute = FALSE; 3418 } 3419 3420 // if status is a failure, ucol_getVariableTop returns 0 3421 varTop = ucol_getVariableTop(strsrch->collator, &status); 3422 if (strsrch->variableTop != varTop) { 3423 strsrch->variableTop = varTop; 3424 sameCollAttribute = FALSE; 3425 } 3426 if (!sameCollAttribute) { 3427 initialize(strsrch, &status); 3428 } 3429 ucol_setText(strsrch->textIter, strsrch->search->text, 3430 strsrch->search->textLength, 3431 &status); 3432 strsrch->search->matchedLength = 0; 3433 strsrch->search->matchedIndex = USEARCH_DONE; 3434 strsrch->search->isOverlap = FALSE; 3435 strsrch->search->isCanonicalMatch = FALSE; 3436 strsrch->search->elementComparisonType = 0; 3437 strsrch->search->isForwardSearching = TRUE; 3438 strsrch->search->reset = TRUE; 3439 } 3440 } 3441 3442 // 3443 // CEI Collation Element + source text index. 3444 // These structs are kept in the circular buffer. 3445 // 3446 struct CEI { 3447 int64_t ce; 3448 int32_t lowIndex; 3449 int32_t highIndex; 3450 }; 3451 3452 U_NAMESPACE_BEGIN 3453 3454 namespace { 3455 // 3456 // CEIBuffer A circular buffer of CEs-with-index from the text being searched. 3457 // 3458 #define DEFAULT_CEBUFFER_SIZE 96 3459 #define CEBUFFER_EXTRA 32 3460 // Some typical max values to make buffer size more reasonable for asymmetric search. 3461 // #8694 is for a better long-term solution to allocation of this buffer. 3462 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 3463 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 3464 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 3465 struct CEIBuffer { 3466 CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3467 CEI *buf; 3468 int32_t bufSize; 3469 int32_t firstIx; 3470 int32_t limitIx; 3471 UCollationElements *ceIter; 3472 UStringSearch *strSearch; 3473 3474 3475 3476 CEIBuffer(UStringSearch *ss, UErrorCode *status); 3477 ~CEIBuffer(); 3478 const CEI *get(int32_t index); 3479 const CEI *getPrevious(int32_t index); 3480 }; 3481 3482 3483 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { 3484 buf = defBuf; 3485 strSearch = ss; 3486 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA; 3487 if (ss->search->elementComparisonType != 0) { 3488 const UChar * patText = ss->pattern.text; 3489 if (patText) { 3490 const UChar * patTextLimit = patText + ss->pattern.textLength; 3491 while ( patText < patTextLimit ) { 3492 UChar c = *patText++; 3493 if (MIGHT_BE_JAMO_L(c)) { 3494 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 3495 } else { 3496 // No check for surrogates, we might allocate slightly more buffer than necessary. 3497 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 3498 } 3499 } 3500 } 3501 } 3502 ceIter = ss->textIter; 3503 firstIx = 0; 3504 limitIx = 0; 3505 3506 if (!initTextProcessedIter(ss, status)) { return; } 3507 3508 if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3509 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3510 if (buf == NULL) { 3511 *status = U_MEMORY_ALLOCATION_ERROR; 3512 } 3513 } 3514 } 3515 3516 // TODO: add a reset or init function so that allocated 3517 // buffers can be retained & reused. 3518 3519 CEIBuffer::~CEIBuffer() { 3520 if (buf != defBuf) { 3521 uprv_free(buf); 3522 } 3523 } 3524 3525 3526 // Get the CE with the specified index. 3527 // Index must be in the range 3528 // n-history_size < index < n+1 3529 // where n is the largest index to have been fetched by some previous call to this function. 3530 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3531 // 3532 const CEI *CEIBuffer::get(int32_t index) { 3533 int i = index % bufSize; 3534 3535 if (index>=firstIx && index<limitIx) { 3536 // The request was for an entry already in our buffer. 3537 // Just return it. 3538 return &buf[i]; 3539 } 3540 3541 // Caller is requesting a new, never accessed before, CE. 3542 // Verify that it is the next one in sequence, which is all 3543 // that is allowed. 3544 if (index != limitIx) { 3545 U_ASSERT(FALSE); 3546 3547 return NULL; 3548 } 3549 3550 // Manage the circular CE buffer indexing 3551 limitIx++; 3552 3553 if (limitIx - firstIx >= bufSize) { 3554 // The buffer is full, knock out the lowest-indexed entry. 3555 firstIx++; 3556 } 3557 3558 UErrorCode status = U_ZERO_ERROR; 3559 3560 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3561 3562 return &buf[i]; 3563 } 3564 3565 // Get the CE with the specified index. 3566 // Index must be in the range 3567 // n-history_size < index < n+1 3568 // where n is the largest index to have been fetched by some previous call to this function. 3569 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3570 // 3571 const CEI *CEIBuffer::getPrevious(int32_t index) { 3572 int i = index % bufSize; 3573 3574 if (index>=firstIx && index<limitIx) { 3575 // The request was for an entry already in our buffer. 3576 // Just return it. 3577 return &buf[i]; 3578 } 3579 3580 // Caller is requesting a new, never accessed before, CE. 3581 // Verify that it is the next one in sequence, which is all 3582 // that is allowed. 3583 if (index != limitIx) { 3584 U_ASSERT(FALSE); 3585 3586 return NULL; 3587 } 3588 3589 // Manage the circular CE buffer indexing 3590 limitIx++; 3591 3592 if (limitIx - firstIx >= bufSize) { 3593 // The buffer is full, knock out the lowest-indexed entry. 3594 firstIx++; 3595 } 3596 3597 UErrorCode status = U_ZERO_ERROR; 3598 3599 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3600 3601 return &buf[i]; 3602 } 3603 3604 } 3605 3606 U_NAMESPACE_END 3607 3608 3609 // #define USEARCH_DEBUG 3610 3611 #ifdef USEARCH_DEBUG 3612 #include <stdio.h> 3613 #include <stdlib.h> 3614 #endif 3615 3616 /* 3617 * Find the next break boundary after startIndex. If the UStringSearch object 3618 * has an external break iterator, use that. Otherwise use the internal character 3619 * break iterator. 3620 */ 3621 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3622 #if 0 3623 const UChar *text = strsrch->search->text; 3624 int32_t textLen = strsrch->search->textLength; 3625 3626 U_ASSERT(startIndex>=0); 3627 U_ASSERT(startIndex<=textLen); 3628 3629 if (startIndex >= textLen) { 3630 return startIndex; 3631 } 3632 3633 UChar32 c; 3634 int32_t i = startIndex; 3635 U16_NEXT(text, i, textLen, c); 3636 3637 // If we are on a control character, stop without looking for combining marks. 3638 // Control characters do not combine. 3639 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3640 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3641 return i; 3642 } 3643 3644 // The initial character was not a control, and can thus accept trailing 3645 // combining characters. Advance over however many of them there are. 3646 int32_t indexOfLastCharChecked; 3647 for (;;) { 3648 indexOfLastCharChecked = i; 3649 if (i>=textLen) { 3650 break; 3651 } 3652 U16_NEXT(text, i, textLen, c); 3653 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3654 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3655 break; 3656 } 3657 } 3658 return indexOfLastCharChecked; 3659 #elif !UCONFIG_NO_BREAK_ITERATION 3660 UBreakIterator *breakiterator = strsrch->search->breakIter; 3661 3662 if (breakiterator == NULL) { 3663 breakiterator = strsrch->search->internalBreakIter; 3664 } 3665 3666 if (breakiterator != NULL) { 3667 return ubrk_following(breakiterator, startIndex); 3668 } 3669 3670 return startIndex; 3671 #else 3672 // **** or should we use the original code? **** 3673 return startIndex; 3674 #endif 3675 3676 } 3677 3678 /* 3679 * Returns TRUE if index is on a break boundary. If the UStringSearch 3680 * has an external break iterator, test using that, otherwise test 3681 * using the internal character break iterator. 3682 */ 3683 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3684 #if 0 3685 const UChar *text = strsrch->search->text; 3686 int32_t textLen = strsrch->search->textLength; 3687 3688 U_ASSERT(index>=0); 3689 U_ASSERT(index<=textLen); 3690 3691 if (index>=textLen || index<=0) { 3692 return TRUE; 3693 } 3694 3695 // If the character at the current index is not a GRAPHEME_EXTEND 3696 // then we can not be within a combining sequence. 3697 UChar32 c; 3698 U16_GET(text, 0, index, textLen, c); 3699 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3700 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3701 return TRUE; 3702 } 3703 3704 // We are at a combining mark. If the preceding character is anything 3705 // except a CONTROL, CR or LF, we are in a combining sequence. 3706 U16_PREV(text, 0, index, c); 3707 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3708 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3709 return !combining; 3710 #elif !UCONFIG_NO_BREAK_ITERATION 3711 UBreakIterator *breakiterator = strsrch->search->breakIter; 3712 3713 if (breakiterator == NULL) { 3714 breakiterator = strsrch->search->internalBreakIter; 3715 } 3716 3717 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 3718 #else 3719 // **** or use the original code? **** 3720 return TRUE; 3721 #endif 3722 } 3723 3724 #if 0 3725 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3726 { 3727 #if !UCONFIG_NO_BREAK_ITERATION 3728 UBreakIterator *breakiterator = strsrch->search->breakIter; 3729 3730 if (breakiterator != NULL) { 3731 int32_t startindex = ubrk_first(breakiterator); 3732 int32_t endindex = ubrk_last(breakiterator); 3733 3734 // out-of-range indexes are never boundary positions 3735 if (start < startindex || start > endindex || 3736 end < startindex || end > endindex) { 3737 return FALSE; 3738 } 3739 3740 return ubrk_isBoundary(breakiterator, start) && 3741 ubrk_isBoundary(breakiterator, end); 3742 } 3743 #endif 3744 3745 return TRUE; 3746 } 3747 #endif 3748 3749 typedef enum { 3750 U_CE_MATCH = -1, 3751 U_CE_NO_MATCH = 0, 3752 U_CE_SKIP_TARG, 3753 U_CE_SKIP_PATN 3754 } UCompareCEsResult; 3755 #define U_CE_LEVEL2_BASE 0x00000005 3756 #define U_CE_LEVEL3_BASE 0x00050000 3757 3758 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 3759 if (targCE == patCE) { 3760 return U_CE_MATCH; 3761 } 3762 if (compareType == 0) { 3763 return U_CE_NO_MATCH; 3764 } 3765 3766 int64_t targCEshifted = targCE >> 32; 3767 int64_t patCEshifted = patCE >> 32; 3768 int64_t mask; 3769 3770 mask = 0xFFFF0000; 3771 int32_t targLev1 = (int32_t)(targCEshifted & mask); 3772 int32_t patLev1 = (int32_t)(patCEshifted & mask); 3773 if ( targLev1 != patLev1 ) { 3774 if ( targLev1 == 0 ) { 3775 return U_CE_SKIP_TARG; 3776 } 3777 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3778 return U_CE_SKIP_PATN; 3779 } 3780 return U_CE_NO_MATCH; 3781 } 3782 3783 mask = 0x0000FFFF; 3784 int32_t targLev2 = (int32_t)(targCEshifted & mask); 3785 int32_t patLev2 = (int32_t)(patCEshifted & mask); 3786 if ( targLev2 != patLev2 ) { 3787 if ( targLev2 == 0 ) { 3788 return U_CE_SKIP_TARG; 3789 } 3790 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3791 return U_CE_SKIP_PATN; 3792 } 3793 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 3794 U_CE_MATCH: U_CE_NO_MATCH; 3795 } 3796 3797 mask = 0xFFFF0000; 3798 int32_t targLev3 = (int32_t)(targCE & mask); 3799 int32_t patLev3 = (int32_t)(patCE & mask); 3800 if ( targLev3 != patLev3 ) { 3801 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 3802 U_CE_MATCH: U_CE_NO_MATCH; 3803 } 3804 3805 return U_CE_MATCH; 3806 } 3807 3808 #if BOYER_MOORE 3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 3810 #endif 3811 3812 namespace { 3813 3814 UChar32 codePointAt(const USearch &search, int32_t index) { 3815 if (index < search.textLength) { 3816 UChar32 c; 3817 U16_NEXT(search.text, index, search.textLength, c); 3818 return c; 3819 } 3820 return U_SENTINEL; 3821 } 3822 3823 UChar32 codePointBefore(const USearch &search, int32_t index) { 3824 if (0 < index) { 3825 UChar32 c; 3826 U16_PREV(search.text, 0, index, c); 3827 return c; 3828 } 3829 return U_SENTINEL; 3830 } 3831 3832 } // namespace 3833 3834 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3835 int32_t startIdx, 3836 int32_t *matchStart, 3837 int32_t *matchLimit, 3838 UErrorCode *status) 3839 { 3840 if (U_FAILURE(*status)) { 3841 return FALSE; 3842 } 3843 3844 // TODO: reject search patterns beginning with a combining char. 3845 3846 #ifdef USEARCH_DEBUG 3847 if (getenv("USEARCH_DEBUG") != NULL) { 3848 printf("Pattern CEs\n"); 3849 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 3850 printf(" %8x", strsrch->pattern.ces[ii]); 3851 } 3852 printf("\n"); 3853 } 3854 3855 #endif 3856 // Input parameter sanity check. 3857 // TODO: should input indicies clip to the text length 3858 // in the same way that UText does. 3859 if(strsrch->pattern.cesLength == 0 || 3860 startIdx < 0 || 3861 startIdx > strsrch->search->textLength || 3862 strsrch->pattern.ces == NULL) { 3863 *status = U_ILLEGAL_ARGUMENT_ERROR; 3864 return FALSE; 3865 } 3866 3867 if (strsrch->pattern.pces == NULL) { 3868 initializePatternPCETable(strsrch, status); 3869 } 3870 3871 ucol_setOffset(strsrch->textIter, startIdx, status); 3872 CEIBuffer ceb(strsrch, status); 3873 3874 3875 int32_t targetIx = 0; 3876 const CEI *targetCEI = NULL; 3877 int32_t patIx; 3878 UBool found; 3879 3880 int32_t mStart = -1; 3881 int32_t mLimit = -1; 3882 int32_t minLimit; 3883 int32_t maxLimit; 3884 3885 3886 3887 // Outer loop moves over match starting positions in the 3888 // target CE space. 3889 // Here we see the target as a sequence of collation elements, resulting from the following: 3890 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 3891 // (for example, digraphs such as IJ may be broken into two characters). 3892 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 3893 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 3894 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 3895 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 3896 // then the CE is deleted, so the following code sees only CEs that are relevant. 3897 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 3898 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 3899 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 3900 // 3901 for(targetIx=0; ; targetIx++) 3902 { 3903 found = TRUE; 3904 // Inner loop checks for a match beginning at each 3905 // position from the outer loop. 3906 int32_t targetIxOffset = 0; 3907 int64_t patCE = 0; 3908 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 3909 // (compared to the last CE fetched for the previous targetIx value) as we need to go 3910 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 3911 const CEI *firstCEI = ceb.get(targetIx); 3912 if (firstCEI == NULL) { 3913 *status = U_INTERNAL_PROGRAM_ERROR; 3914 found = FALSE; 3915 break; 3916 } 3917 3918 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) { 3919 patCE = strsrch->pattern.pces[patIx]; 3920 targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 3921 // Compare CE from target string with CE from the pattern. 3922 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 3923 // which will fail the compare, below. 3924 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 3925 if ( ceMatch == U_CE_NO_MATCH ) { 3926 found = FALSE; 3927 break; 3928 } else if ( ceMatch > U_CE_NO_MATCH ) { 3929 if ( ceMatch == U_CE_SKIP_TARG ) { 3930 // redo with same patCE, next targCE 3931 patIx--; 3932 targetIxOffset++; 3933 } else { // ceMatch == U_CE_SKIP_PATN 3934 // redo with same targCE, next patCE 3935 targetIxOffset--; 3936 } 3937 } 3938 } 3939 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far 3940 3941 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 3942 // No match at this targetIx. Try again at the next. 3943 continue; 3944 } 3945 3946 if (!found) { 3947 // No match at all, we have run off the end of the target text. 3948 break; 3949 } 3950 3951 3952 // We have found a match in CE space. 3953 // Now determine the bounds in string index space. 3954 // There still is a chance of match failure if the CE range not correspond to 3955 // an acceptable character range. 3956 // 3957 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 3958 3959 mStart = firstCEI->lowIndex; 3960 minLimit = lastCEI->lowIndex; 3961 3962 // Look at the CE following the match. If it is UCOL_NULLORDER the match 3963 // extended to the end of input, and the match is good. 3964 3965 // Look at the high and low indices of the CE following the match. If 3966 // they are the same it means one of two things: 3967 // 1. The match extended to the last CE from the target text, which is OK, or 3968 // 2. The last CE that was part of the match is in an expansion that extends 3969 // to the first CE after the match. In this case, we reject the match. 3970 const CEI *nextCEI = 0; 3971 if (strsrch->search->elementComparisonType == 0) { 3972 nextCEI = ceb.get(targetIx + targetIxOffset); 3973 maxLimit = nextCEI->lowIndex; 3974 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 3975 found = FALSE; 3976 } 3977 } else { 3978 for ( ; ; ++targetIxOffset ) { 3979 nextCEI = ceb.get(targetIx + targetIxOffset); 3980 maxLimit = nextCEI->lowIndex; 3981 // If we are at the end of the target too, match succeeds 3982 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 3983 break; 3984 } 3985 // As long as the next CE has primary weight of 0, 3986 // it is part of the last target element matched by the pattern; 3987 // make sure it can be part of a match with the last patCE 3988 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 3989 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 3990 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 3991 found = FALSE; 3992 break; 3993 } 3994 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 3995 // target element, but it has non-zero primary weight => match fails 3996 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 3997 found = false; 3998 break; 3999 // Else the target CE is not part of an expansion of the last matched element, match succeeds 4000 } else { 4001 break; 4002 } 4003 } 4004 } 4005 4006 4007 // Check for the start of the match being within a combining sequence. 4008 // This can happen if the pattern itself begins with a combining char, and 4009 // the match found combining marks in the target text that were attached 4010 // to something else. 4011 // This type of match should be rejected for not completely consuming a 4012 // combining sequence. 4013 if (!isBreakBoundary(strsrch, mStart)) { 4014 found = FALSE; 4015 } 4016 4017 // Check for the start of the match being within an Collation Element Expansion, 4018 // meaning that the first char of the match is only partially matched. 4019 // With exapnsions, the first CE will report the index of the source 4020 // character, and all subsequent (expansions) CEs will report the source index of the 4021 // _following_ character. 4022 int32_t secondIx = firstCEI->highIndex; 4023 if (mStart == secondIx) { 4024 found = FALSE; 4025 } 4026 4027 // Allow matches to end in the middle of a grapheme cluster if the following 4028 // conditions are met; this is needed to make prefix search work properly in 4029 // Indic, see #11750 4030 // * the default breakIter is being used 4031 // * the next collation element after this combining sequence 4032 // - has non-zero primary weight 4033 // - corresponds to a separate character following the one at end of the current match 4034 // (the second of these conditions, and perhaps both, may be redundant given the 4035 // subsequent check for normalization boundary; however they are likely much faster 4036 // tests in any case) 4037 // * the match limit is a normalization boundary 4038 UBool allowMidclusterMatch = FALSE; 4039 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { 4040 allowMidclusterMatch = 4041 strsrch->search->breakIter == NULL && 4042 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 4043 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 4044 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 4045 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 4046 } 4047 // If those conditions are met, then: 4048 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 4049 // the match limit may be backed off to a previous break boundary. This handles 4050 // cases in which mLimit includes target characters that are ignorable with current 4051 // settings (such as space) and which extend beyond the pattern match. 4052 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 4053 // * do NOT require that match limit be on a breakIter boundary 4054 4055 // Advance the match end position to the first acceptable match boundary. 4056 // This advances the index over any combining charcters. 4057 mLimit = maxLimit; 4058 if (minLimit < maxLimit) { 4059 // When the last CE's low index is same with its high index, the CE is likely 4060 // a part of expansion. In this case, the index is located just after the 4061 // character corresponding to the CEs compared above. If the index is right 4062 // at the break boundary, move the position to the next boundary will result 4063 // incorrect match length when there are ignorable characters exist between 4064 // the position and the next character produces CE(s). See ticket#8482. 4065 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 4066 mLimit = minLimit; 4067 } else { 4068 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4069 // Note that we can have nba < maxLimit && nba >= minLImit, in which 4070 // case we want to set mLimit to nba regardless of allowMidclusterMatch 4071 // (i.e. we back off mLimit to the previous breakIterator boundary). 4072 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 4073 mLimit = nba; 4074 } 4075 } 4076 } 4077 4078 #ifdef USEARCH_DEBUG 4079 if (getenv("USEARCH_DEBUG") != NULL) { 4080 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4081 } 4082 #endif 4083 4084 if (!allowMidclusterMatch) { 4085 // If advancing to the end of a combining sequence in character indexing space 4086 // advanced us beyond the end of the match in CE space, reject this match. 4087 if (mLimit > maxLimit) { 4088 found = FALSE; 4089 } 4090 4091 if (!isBreakBoundary(strsrch, mLimit)) { 4092 found = FALSE; 4093 } 4094 } 4095 4096 if (! checkIdentical(strsrch, mStart, mLimit)) { 4097 found = FALSE; 4098 } 4099 4100 if (found) { 4101 break; 4102 } 4103 } 4104 4105 #ifdef USEARCH_DEBUG 4106 if (getenv("USEARCH_DEBUG") != NULL) { 4107 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4108 int32_t lastToPrint = ceb.limitIx+2; 4109 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4110 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4111 } 4112 printf("\n%s\n", found? "match found" : "no match"); 4113 } 4114 #endif 4115 4116 // All Done. Store back the match bounds to the caller. 4117 // 4118 if (found==FALSE) { 4119 mLimit = -1; 4120 mStart = -1; 4121 } 4122 4123 if (matchStart != NULL) { 4124 *matchStart= mStart; 4125 } 4126 4127 if (matchLimit != NULL) { 4128 *matchLimit = mLimit; 4129 } 4130 4131 return found; 4132 } 4133 4134 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 4135 int32_t startIdx, 4136 int32_t *matchStart, 4137 int32_t *matchLimit, 4138 UErrorCode *status) 4139 { 4140 if (U_FAILURE(*status)) { 4141 return FALSE; 4142 } 4143 4144 // TODO: reject search patterns beginning with a combining char. 4145 4146 #ifdef USEARCH_DEBUG 4147 if (getenv("USEARCH_DEBUG") != NULL) { 4148 printf("Pattern CEs\n"); 4149 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 4150 printf(" %8x", strsrch->pattern.ces[ii]); 4151 } 4152 printf("\n"); 4153 } 4154 4155 #endif 4156 // Input parameter sanity check. 4157 // TODO: should input indicies clip to the text length 4158 // in the same way that UText does. 4159 if(strsrch->pattern.cesLength == 0 || 4160 startIdx < 0 || 4161 startIdx > strsrch->search->textLength || 4162 strsrch->pattern.ces == NULL) { 4163 *status = U_ILLEGAL_ARGUMENT_ERROR; 4164 return FALSE; 4165 } 4166 4167 if (strsrch->pattern.pces == NULL) { 4168 initializePatternPCETable(strsrch, status); 4169 } 4170 4171 CEIBuffer ceb(strsrch, status); 4172 int32_t targetIx = 0; 4173 4174 /* 4175 * Pre-load the buffer with the CE's for the grapheme 4176 * after our starting position so that we're sure that 4177 * we can look at the CE following the match when we 4178 * check the match boundaries. 4179 * 4180 * This will also pre-fetch the first CE that we'll 4181 * consider for the match. 4182 */ 4183 if (startIdx < strsrch->search->textLength) { 4184 UBreakIterator *bi = strsrch->search->internalBreakIter; 4185 int32_t next = ubrk_following(bi, startIdx); 4186 4187 ucol_setOffset(strsrch->textIter, next, status); 4188 4189 for (targetIx = 0; ; targetIx += 1) { 4190 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 4191 break; 4192 } 4193 } 4194 } else { 4195 ucol_setOffset(strsrch->textIter, startIdx, status); 4196 } 4197 4198 4199 const CEI *targetCEI = NULL; 4200 int32_t patIx; 4201 UBool found; 4202 4203 int32_t limitIx = targetIx; 4204 int32_t mStart = -1; 4205 int32_t mLimit = -1; 4206 int32_t minLimit; 4207 int32_t maxLimit; 4208 4209 4210 4211 // Outer loop moves over match starting positions in the 4212 // target CE space. 4213 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 4214 // But patIx is 0 at the beginning of the pattern and increases toward the end. 4215 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 4216 // and the beginning of the base text. 4217 for(targetIx = limitIx; ; targetIx += 1) 4218 { 4219 found = TRUE; 4220 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 4221 // (compared to the last CE fetched for the previous targetIx value) as we need to go 4222 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 4223 const CEI *lastCEI = ceb.getPrevious(targetIx); 4224 if (lastCEI == NULL) { 4225 *status = U_INTERNAL_PROGRAM_ERROR; 4226 found = FALSE; 4227 break; 4228 } 4229 // Inner loop checks for a match beginning at each 4230 // position from the outer loop. 4231 int32_t targetIxOffset = 0; 4232 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) { 4233 int64_t patCE = strsrch->pattern.pces[patIx]; 4234 4235 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset); 4236 // Compare CE from target string with CE from the pattern. 4237 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4238 // which will fail the compare, below. 4239 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 4240 if ( ceMatch == U_CE_NO_MATCH ) { 4241 found = FALSE; 4242 break; 4243 } else if ( ceMatch > U_CE_NO_MATCH ) { 4244 if ( ceMatch == U_CE_SKIP_TARG ) { 4245 // redo with same patCE, next targCE 4246 patIx++; 4247 targetIxOffset++; 4248 } else { // ceMatch == U_CE_SKIP_PATN 4249 // redo with same targCE, next patCE 4250 targetIxOffset--; 4251 } 4252 } 4253 } 4254 4255 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 4256 // No match at this targetIx. Try again at the next. 4257 continue; 4258 } 4259 4260 if (!found) { 4261 // No match at all, we have run off the end of the target text. 4262 break; 4263 } 4264 4265 4266 // We have found a match in CE space. 4267 // Now determine the bounds in string index space. 4268 // There still is a chance of match failure if the CE range not correspond to 4269 // an acceptable character range. 4270 // 4271 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset); 4272 mStart = firstCEI->lowIndex; 4273 4274 // Check for the start of the match being within a combining sequence. 4275 // This can happen if the pattern itself begins with a combining char, and 4276 // the match found combining marks in the target text that were attached 4277 // to something else. 4278 // This type of match should be rejected for not completely consuming a 4279 // combining sequence. 4280 if (!isBreakBoundary(strsrch, mStart)) { 4281 found = FALSE; 4282 } 4283 4284 // Look at the high index of the first CE in the match. If it's the same as the 4285 // low index, the first CE in the match is in the middle of an expansion. 4286 if (mStart == firstCEI->highIndex) { 4287 found = FALSE; 4288 } 4289 4290 4291 minLimit = lastCEI->lowIndex; 4292 4293 if (targetIx > 0) { 4294 // Look at the CE following the match. If it is UCOL_NULLORDER the match 4295 // extended to the end of input, and the match is good. 4296 4297 // Look at the high and low indices of the CE following the match. If 4298 // they are the same it means one of two things: 4299 // 1. The match extended to the last CE from the target text, which is OK, or 4300 // 2. The last CE that was part of the match is in an expansion that extends 4301 // to the first CE after the match. In this case, we reject the match. 4302 const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4303 4304 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4305 found = FALSE; 4306 } 4307 4308 mLimit = maxLimit = nextCEI->lowIndex; 4309 4310 // Allow matches to end in the middle of a grapheme cluster if the following 4311 // conditions are met; this is needed to make prefix search work properly in 4312 // Indic, see #11750 4313 // * the default breakIter is being used 4314 // * the next collation element after this combining sequence 4315 // - has non-zero primary weight 4316 // - corresponds to a separate character following the one at end of the current match 4317 // (the second of these conditions, and perhaps both, may be redundant given the 4318 // subsequent check for normalization boundary; however they are likely much faster 4319 // tests in any case) 4320 // * the match limit is a normalization boundary 4321 UBool allowMidclusterMatch = FALSE; 4322 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) { 4323 allowMidclusterMatch = 4324 strsrch->search->breakIter == NULL && 4325 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 4326 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 4327 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 4328 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 4329 } 4330 // If those conditions are met, then: 4331 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 4332 // the match limit may be backed off to a previous break boundary. This handles 4333 // cases in which mLimit includes target characters that are ignorable with current 4334 // settings (such as space) and which extend beyond the pattern match. 4335 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 4336 // * do NOT require that match limit be on a breakIter boundary 4337 4338 // Advance the match end position to the first acceptable match boundary. 4339 // This advances the index over any combining characters. 4340 if (minLimit < maxLimit) { 4341 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4342 // Note that we can have nba < maxLimit && nba >= minLImit, in which 4343 // case we want to set mLimit to nba regardless of allowMidclusterMatch 4344 // (i.e. we back off mLimit to the previous breakIterator boundary). 4345 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 4346 mLimit = nba; 4347 } 4348 } 4349 4350 if (!allowMidclusterMatch) { 4351 // If advancing to the end of a combining sequence in character indexing space 4352 // advanced us beyond the end of the match in CE space, reject this match. 4353 if (mLimit > maxLimit) { 4354 found = FALSE; 4355 } 4356 4357 // Make sure the end of the match is on a break boundary 4358 if (!isBreakBoundary(strsrch, mLimit)) { 4359 found = FALSE; 4360 } 4361 } 4362 4363 } else { 4364 // No non-ignorable CEs after this point. 4365 // The maximum position is detected by boundary after 4366 // the last non-ignorable CE. Combining sequence 4367 // across the start index will be truncated. 4368 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4369 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4370 } 4371 4372 #ifdef USEARCH_DEBUG 4373 if (getenv("USEARCH_DEBUG") != NULL) { 4374 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4375 } 4376 #endif 4377 4378 4379 if (! checkIdentical(strsrch, mStart, mLimit)) { 4380 found = FALSE; 4381 } 4382 4383 if (found) { 4384 break; 4385 } 4386 } 4387 4388 #ifdef USEARCH_DEBUG 4389 if (getenv("USEARCH_DEBUG") != NULL) { 4390 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4391 int32_t lastToPrint = ceb.limitIx+2; 4392 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4393 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4394 } 4395 printf("\n%s\n", found? "match found" : "no match"); 4396 } 4397 #endif 4398 4399 // All Done. Store back the match bounds to the caller. 4400 // 4401 if (found==FALSE) { 4402 mLimit = -1; 4403 mStart = -1; 4404 } 4405 4406 if (matchStart != NULL) { 4407 *matchStart= mStart; 4408 } 4409 4410 if (matchLimit != NULL) { 4411 *matchLimit = mLimit; 4412 } 4413 4414 return found; 4415 } 4416 4417 // internal use methods declared in usrchimp.h ----------------------------- 4418 4419 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4420 { 4421 if (U_FAILURE(*status)) { 4422 setMatchNotFound(strsrch); 4423 return FALSE; 4424 } 4425 4426 #if BOYER_MOORE 4427 UCollationElements *coleiter = strsrch->textIter; 4428 int32_t textlength = strsrch->search->textLength; 4429 int32_t *patternce = strsrch->pattern.ces; 4430 int32_t patterncelength = strsrch->pattern.cesLength; 4431 int32_t textoffset = ucol_getOffset(coleiter); 4432 4433 // status used in setting coleiter offset, since offset is checked in 4434 // shiftForward before setting the coleiter offset, status never 4435 // a failure 4436 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4437 patterncelength); 4438 while (textoffset <= textlength) 4439 { 4440 uint32_t patternceindex = patterncelength - 1; 4441 int32_t targetce; 4442 UBool found = FALSE; 4443 int32_t lastce = UCOL_NULLORDER; 4444 4445 setColEIterOffset(coleiter, textoffset); 4446 4447 for (;;) { 4448 // finding the last pattern ce match, imagine composite characters 4449 // for example: search for pattern A in text \u00C0 4450 // we'll have to skip \u0300 the grave first before we get to A 4451 targetce = ucol_previous(coleiter, status); 4452 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4453 found = FALSE; 4454 break; 4455 } 4456 targetce = getCE(strsrch, targetce); 4457 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4458 // this is for the text \u0315\u0300 that requires 4459 // normalization and pattern \u0300, where \u0315 is ignorable 4460 continue; 4461 } 4462 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4463 lastce = targetce; 4464 } 4465 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4466 if (targetce == patternce[patternceindex]) { 4467 // the first ce can be a contraction 4468 found = TRUE; 4469 break; 4470 } 4471 if (!hasExpansion(coleiter)) { 4472 found = FALSE; 4473 break; 4474 } 4475 } 4476 4477 //targetce = lastce; 4478 4479 while (found && patternceindex > 0) { 4480 lastce = targetce; 4481 targetce = ucol_previous(coleiter, status); 4482 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4483 found = FALSE; 4484 break; 4485 } 4486 targetce = getCE(strsrch, targetce); 4487 if (targetce == UCOL_IGNORABLE) { 4488 continue; 4489 } 4490 4491 patternceindex --; 4492 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4493 found = found && targetce == patternce[patternceindex]; 4494 } 4495 4496 targetce = lastce; 4497 4498 if (!found) { 4499 if (U_FAILURE(*status)) { 4500 break; 4501 } 4502 textoffset = shiftForward(strsrch, textoffset, lastce, 4503 patternceindex); 4504 // status checked at loop. 4505 patternceindex = patterncelength; 4506 continue; 4507 } 4508 4509 if (checkNextExactMatch(strsrch, &textoffset, status)) { 4510 // status checked in ucol_setOffset 4511 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4512 return TRUE; 4513 } 4514 } 4515 setMatchNotFound(strsrch); 4516 return FALSE; 4517 #else 4518 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4519 int32_t start = -1; 4520 int32_t end = -1; 4521 4522 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4523 strsrch->search->matchedIndex = start; 4524 strsrch->search->matchedLength = end - start; 4525 return TRUE; 4526 } else { 4527 setMatchNotFound(strsrch); 4528 return FALSE; 4529 } 4530 #endif 4531 } 4532 4533 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4534 { 4535 if (U_FAILURE(*status)) { 4536 setMatchNotFound(strsrch); 4537 return FALSE; 4538 } 4539 4540 #if BOYER_MOORE 4541 UCollationElements *coleiter = strsrch->textIter; 4542 int32_t textlength = strsrch->search->textLength; 4543 int32_t *patternce = strsrch->pattern.ces; 4544 int32_t patterncelength = strsrch->pattern.cesLength; 4545 int32_t textoffset = ucol_getOffset(coleiter); 4546 UBool hasPatternAccents = 4547 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4548 4549 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4550 patterncelength); 4551 strsrch->canonicalPrefixAccents[0] = 0; 4552 strsrch->canonicalSuffixAccents[0] = 0; 4553 4554 while (textoffset <= textlength) 4555 { 4556 int32_t patternceindex = patterncelength - 1; 4557 int32_t targetce; 4558 UBool found = FALSE; 4559 int32_t lastce = UCOL_NULLORDER; 4560 4561 setColEIterOffset(coleiter, textoffset); 4562 4563 for (;;) { 4564 // finding the last pattern ce match, imagine composite characters 4565 // for example: search for pattern A in text \u00C0 4566 // we'll have to skip \u0300 the grave first before we get to A 4567 targetce = ucol_previous(coleiter, status); 4568 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4569 found = FALSE; 4570 break; 4571 } 4572 targetce = getCE(strsrch, targetce); 4573 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4574 lastce = targetce; 4575 } 4576 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4577 if (targetce == patternce[patternceindex]) { 4578 // the first ce can be a contraction 4579 found = TRUE; 4580 break; 4581 } 4582 if (!hasExpansion(coleiter)) { 4583 found = FALSE; 4584 break; 4585 } 4586 } 4587 4588 while (found && patternceindex > 0) { 4589 targetce = ucol_previous(coleiter, status); 4590 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4591 found = FALSE; 4592 break; 4593 } 4594 targetce = getCE(strsrch, targetce); 4595 if (targetce == UCOL_IGNORABLE) { 4596 continue; 4597 } 4598 4599 patternceindex --; 4600 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4601 found = found && targetce == patternce[patternceindex]; 4602 } 4603 4604 // initializing the rearranged accent array 4605 if (hasPatternAccents && !found) { 4606 strsrch->canonicalPrefixAccents[0] = 0; 4607 strsrch->canonicalSuffixAccents[0] = 0; 4608 if (U_FAILURE(*status)) { 4609 break; 4610 } 4611 found = doNextCanonicalMatch(strsrch, textoffset, status); 4612 } 4613 4614 if (!found) { 4615 if (U_FAILURE(*status)) { 4616 break; 4617 } 4618 textoffset = shiftForward(strsrch, textoffset, lastce, 4619 patternceindex); 4620 // status checked at loop 4621 patternceindex = patterncelength; 4622 continue; 4623 } 4624 4625 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4626 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4627 return TRUE; 4628 } 4629 } 4630 setMatchNotFound(strsrch); 4631 return FALSE; 4632 #else 4633 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4634 int32_t start = -1; 4635 int32_t end = -1; 4636 4637 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4638 strsrch->search->matchedIndex = start; 4639 strsrch->search->matchedLength = end - start; 4640 return TRUE; 4641 } else { 4642 setMatchNotFound(strsrch); 4643 return FALSE; 4644 } 4645 #endif 4646 } 4647 4648 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4649 { 4650 if (U_FAILURE(*status)) { 4651 setMatchNotFound(strsrch); 4652 return FALSE; 4653 } 4654 4655 #if BOYER_MOORE 4656 UCollationElements *coleiter = strsrch->textIter; 4657 int32_t *patternce = strsrch->pattern.ces; 4658 int32_t patterncelength = strsrch->pattern.cesLength; 4659 int32_t textoffset = ucol_getOffset(coleiter); 4660 4661 // shifting it check for setting offset 4662 // if setOffset is called previously or there was no previous match, we 4663 // leave the offset as it is. 4664 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4665 textoffset = strsrch->search->matchedIndex; 4666 } 4667 4668 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4669 patterncelength); 4670 4671 while (textoffset >= 0) 4672 { 4673 int32_t patternceindex = 1; 4674 int32_t targetce; 4675 UBool found = FALSE; 4676 int32_t firstce = UCOL_NULLORDER; 4677 4678 // if status is a failure, ucol_setOffset does nothing 4679 setColEIterOffset(coleiter, textoffset); 4680 4681 for (;;) { 4682 // finding the first pattern ce match, imagine composite 4683 // characters. for example: search for pattern \u0300 in text 4684 // \u00C0, we'll have to skip A first before we get to 4685 // \u0300 the grave accent 4686 targetce = ucol_next(coleiter, status); 4687 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4688 found = FALSE; 4689 break; 4690 } 4691 targetce = getCE(strsrch, targetce); 4692 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4693 firstce = targetce; 4694 } 4695 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4696 continue; 4697 } 4698 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4699 if (targetce == patternce[0]) { 4700 found = TRUE; 4701 break; 4702 } 4703 if (!hasExpansion(coleiter)) { 4704 // checking for accents in composite character 4705 found = FALSE; 4706 break; 4707 } 4708 } 4709 4710 //targetce = firstce; 4711 4712 while (found && (patternceindex < patterncelength)) { 4713 firstce = targetce; 4714 targetce = ucol_next(coleiter, status); 4715 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4716 found = FALSE; 4717 break; 4718 } 4719 targetce = getCE(strsrch, targetce); 4720 if (targetce == UCOL_IGNORABLE) { 4721 continue; 4722 } 4723 4724 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4725 found = found && targetce == patternce[patternceindex]; 4726 patternceindex ++; 4727 } 4728 4729 targetce = firstce; 4730 4731 if (!found) { 4732 if (U_FAILURE(*status)) { 4733 break; 4734 } 4735 4736 textoffset = reverseShift(strsrch, textoffset, targetce, 4737 patternceindex); 4738 patternceindex = 0; 4739 continue; 4740 } 4741 4742 if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4743 setColEIterOffset(coleiter, textoffset); 4744 return TRUE; 4745 } 4746 } 4747 setMatchNotFound(strsrch); 4748 return FALSE; 4749 #else 4750 int32_t textOffset; 4751 4752 if (strsrch->search->isOverlap) { 4753 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4754 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4755 } else { 4756 // move the start position at the end of possible match 4757 initializePatternPCETable(strsrch, status); 4758 if (!initTextProcessedIter(strsrch, status)) { 4759 setMatchNotFound(strsrch); 4760 return FALSE; 4761 } 4762 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4763 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4764 if (pce == UCOL_PROCESSED_NULLORDER) { 4765 // at the end of the text 4766 break; 4767 } 4768 } 4769 if (U_FAILURE(*status)) { 4770 setMatchNotFound(strsrch); 4771 return FALSE; 4772 } 4773 textOffset = ucol_getOffset(strsrch->textIter); 4774 } 4775 } else { 4776 textOffset = ucol_getOffset(strsrch->textIter); 4777 } 4778 4779 int32_t start = -1; 4780 int32_t end = -1; 4781 4782 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4783 strsrch->search->matchedIndex = start; 4784 strsrch->search->matchedLength = end - start; 4785 return TRUE; 4786 } else { 4787 setMatchNotFound(strsrch); 4788 return FALSE; 4789 } 4790 #endif 4791 } 4792 4793 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4794 UErrorCode *status) 4795 { 4796 if (U_FAILURE(*status)) { 4797 setMatchNotFound(strsrch); 4798 return FALSE; 4799 } 4800 4801 #if BOYER_MOORE 4802 UCollationElements *coleiter = strsrch->textIter; 4803 int32_t *patternce = strsrch->pattern.ces; 4804 int32_t patterncelength = strsrch->pattern.cesLength; 4805 int32_t textoffset = ucol_getOffset(coleiter); 4806 UBool hasPatternAccents = 4807 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4808 4809 // shifting it check for setting offset 4810 // if setOffset is called previously or there was no previous match, we 4811 // leave the offset as it is. 4812 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4813 textoffset = strsrch->search->matchedIndex; 4814 } 4815 4816 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4817 patterncelength); 4818 strsrch->canonicalPrefixAccents[0] = 0; 4819 strsrch->canonicalSuffixAccents[0] = 0; 4820 4821 while (textoffset >= 0) 4822 { 4823 int32_t patternceindex = 1; 4824 int32_t targetce; 4825 UBool found = FALSE; 4826 int32_t firstce = UCOL_NULLORDER; 4827 4828 setColEIterOffset(coleiter, textoffset); 4829 for (;;) { 4830 // finding the first pattern ce match, imagine composite 4831 // characters. for example: search for pattern \u0300 in text 4832 // \u00C0, we'll have to skip A first before we get to 4833 // \u0300 the grave accent 4834 targetce = ucol_next(coleiter, status); 4835 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4836 found = FALSE; 4837 break; 4838 } 4839 targetce = getCE(strsrch, targetce); 4840 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4841 firstce = targetce; 4842 } 4843 4844 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4845 if (targetce == patternce[0]) { 4846 // the first ce can be a contraction 4847 found = TRUE; 4848 break; 4849 } 4850 if (!hasExpansion(coleiter)) { 4851 // checking for accents in composite character 4852 found = FALSE; 4853 break; 4854 } 4855 } 4856 4857 targetce = firstce; 4858 4859 while (found && patternceindex < patterncelength) { 4860 targetce = ucol_next(coleiter, status); 4861 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4862 found = FALSE; 4863 break; 4864 } 4865 targetce = getCE(strsrch, targetce); 4866 if (targetce == UCOL_IGNORABLE) { 4867 continue; 4868 } 4869 4870 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4871 found = found && targetce == patternce[patternceindex]; 4872 patternceindex ++; 4873 } 4874 4875 // initializing the rearranged accent array 4876 if (hasPatternAccents && !found) { 4877 strsrch->canonicalPrefixAccents[0] = 0; 4878 strsrch->canonicalSuffixAccents[0] = 0; 4879 if (U_FAILURE(*status)) { 4880 break; 4881 } 4882 found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4883 } 4884 4885 if (!found) { 4886 if (U_FAILURE(*status)) { 4887 break; 4888 } 4889 textoffset = reverseShift(strsrch, textoffset, targetce, 4890 patternceindex); 4891 patternceindex = 0; 4892 continue; 4893 } 4894 4895 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4896 setColEIterOffset(coleiter, textoffset); 4897 return TRUE; 4898 } 4899 } 4900 setMatchNotFound(strsrch); 4901 return FALSE; 4902 #else 4903 int32_t textOffset; 4904 4905 if (strsrch->search->isOverlap) { 4906 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4907 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4908 } else { 4909 // move the start position at the end of possible match 4910 initializePatternPCETable(strsrch, status); 4911 if (!initTextProcessedIter(strsrch, status)) { 4912 setMatchNotFound(strsrch); 4913 return FALSE; 4914 } 4915 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4916 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4917 if (pce == UCOL_PROCESSED_NULLORDER) { 4918 // at the end of the text 4919 break; 4920 } 4921 } 4922 if (U_FAILURE(*status)) { 4923 setMatchNotFound(strsrch); 4924 return FALSE; 4925 } 4926 textOffset = ucol_getOffset(strsrch->textIter); 4927 } 4928 } else { 4929 textOffset = ucol_getOffset(strsrch->textIter); 4930 } 4931 4932 int32_t start = -1; 4933 int32_t end = -1; 4934 4935 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4936 strsrch->search->matchedIndex = start; 4937 strsrch->search->matchedLength = end - start; 4938 return TRUE; 4939 } else { 4940 setMatchNotFound(strsrch); 4941 return FALSE; 4942 } 4943 #endif 4944 } 4945 4946 #endif /* #if !UCONFIG_NO_COLLATION */ 4947