1 /* 2 ********************************************************************** 3 * Copyright (C) 2001-2014 IBM and others. All rights reserved. 4 ********************************************************************** 5 * Date Name Description 6 * 07/02/2001 synwee Creation. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/usearch.h" 15 #include "unicode/ustring.h" 16 #include "unicode/uchar.h" 17 #include "unicode/utf16.h" 18 #include "normalizer2impl.h" 19 #include "usrchimp.h" 20 #include "cmemory.h" 21 #include "ucln_in.h" 22 #include "uassert.h" 23 #include "ustr_imp.h" 24 25 U_NAMESPACE_USE 26 27 // don't use Boyer-Moore 28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 29 #define BOYER_MOORE 0 30 31 // internal definition --------------------------------------------------- 32 33 #define LAST_BYTE_MASK_ 0xFF 34 #define SECOND_LAST_BYTE_SHIFT_ 8 35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 36 37 static const Normalizer2Impl *g_nfcImpl = NULL; 38 39 // internal methods ------------------------------------------------- 40 41 /** 42 * Fast collation element iterator setOffset. 43 * This function does not check for bounds. 44 * @param coleiter collation element iterator 45 * @param offset to set 46 */ 47 static 48 inline void setColEIterOffset(UCollationElements *elems, 49 int32_t offset) 50 { 51 // Note: Not "fast" any more after the 2013 collation rewrite. 52 // We do not want to expose more internals than necessary. 53 UErrorCode status = U_ZERO_ERROR; 54 ucol_setOffset(elems, offset, &status); 55 } 56 57 /** 58 * Getting the mask for collation strength 59 * @param strength collation strength 60 * @return collation element mask 61 */ 62 static 63 inline uint32_t getMask(UCollationStrength strength) 64 { 65 switch (strength) 66 { 67 case UCOL_PRIMARY: 68 return UCOL_PRIMARYORDERMASK; 69 case UCOL_SECONDARY: 70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 71 default: 72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 73 UCOL_PRIMARYORDERMASK; 74 } 75 } 76 77 /** 78 * This is to squeeze the 21bit ces into a 256 table 79 * @param ce collation element 80 * @return collapsed version of the collation element 81 */ 82 static 83 inline int hash(uint32_t ce) 84 { 85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work 86 // well with the new collation where most of the latin 1 characters 87 // are of the value xx000xxx. their hashes will most of the time be 0 88 // to be discussed on the hash algo. 89 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; 90 } 91 92 U_CDECL_BEGIN 93 static UBool U_CALLCONV 94 usearch_cleanup(void) { 95 g_nfcImpl = NULL; 96 return TRUE; 97 } 98 U_CDECL_END 99 100 /** 101 * Initializing the fcd tables. 102 * Internal method, status assumed to be a success. 103 * @param status output error if any, caller to check status before calling 104 * method, status assumed to be success when passed in. 105 */ 106 static 107 inline void initializeFCD(UErrorCode *status) 108 { 109 if (g_nfcImpl == NULL) { 110 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 111 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 112 } 113 } 114 115 /** 116 * Gets the fcd value for a character at the argument index. 117 * This method takes into accounts of the supplementary characters. 118 * @param str UTF16 string where character for fcd retrieval resides 119 * @param offset position of the character whose fcd is to be retrieved, to be 120 * overwritten with the next character position, taking 121 * surrogate characters into consideration. 122 * @param strlength length of the argument string 123 * @return fcd value 124 */ 125 static 126 uint16_t getFCD(const UChar *str, int32_t *offset, 127 int32_t strlength) 128 { 129 const UChar *temp = str + *offset; 130 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 131 *offset = (int32_t)(temp - str); 132 return result; 133 } 134 135 /** 136 * Getting the modified collation elements taking into account the collation 137 * attributes 138 * @param strsrch string search data 139 * @param sourcece 140 * @return the modified collation element 141 */ 142 static 143 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 144 { 145 // note for tertiary we can't use the collator->tertiaryMask, that 146 // is a preprocessed mask that takes into account case options. since 147 // we are only concerned with exact matches, we don't need that. 148 sourcece &= strsrch->ceMask; 149 150 if (strsrch->toShift) { 151 // alternate handling here, since only the 16 most significant digits 152 // is only used, we can safely do a compare without masking 153 // if the ce is a variable, we mask and get only the primary values 154 // no shifting to quartenary is required since all primary values 155 // less than variabletop will need to be masked off anyway. 156 if (strsrch->variableTop > sourcece) { 157 if (strsrch->strength >= UCOL_QUATERNARY) { 158 sourcece &= UCOL_PRIMARYORDERMASK; 159 } 160 else { 161 sourcece = UCOL_IGNORABLE; 162 } 163 } 164 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 165 sourcece = 0xFFFF; 166 } 167 168 return sourcece; 169 } 170 171 /** 172 * Allocate a memory and returns NULL if it failed. 173 * Internal method, status assumed to be a success. 174 * @param size to allocate 175 * @param status output error if any, caller to check status before calling 176 * method, status assumed to be success when passed in. 177 * @return newly allocated array, NULL otherwise 178 */ 179 static 180 inline void * allocateMemory(uint32_t size, UErrorCode *status) 181 { 182 uint32_t *result = (uint32_t *)uprv_malloc(size); 183 if (result == NULL) { 184 *status = U_MEMORY_ALLOCATION_ERROR; 185 } 186 return result; 187 } 188 189 /** 190 * Adds a uint32_t value to a destination array. 191 * Creates a new array if we run out of space. The caller will have to 192 * manually deallocate the newly allocated array. 193 * Internal method, status assumed to be success, caller has to check status 194 * before calling this method. destination not to be NULL and has at least 195 * size destinationlength. 196 * @param destination target array 197 * @param offset destination offset to add value 198 * @param destinationlength target array size, return value for the new size 199 * @param value to be added 200 * @param increments incremental size expected 201 * @param status output error if any, caller to check status before calling 202 * method, status assumed to be success when passed in. 203 * @return new destination array, destination if there was no new allocation 204 */ 205 static 206 inline int32_t * addTouint32_tArray(int32_t *destination, 207 uint32_t offset, 208 uint32_t *destinationlength, 209 uint32_t value, 210 uint32_t increments, 211 UErrorCode *status) 212 { 213 uint32_t newlength = *destinationlength; 214 if (offset + 1 == newlength) { 215 newlength += increments; 216 int32_t *temp = (int32_t *)allocateMemory( 217 sizeof(int32_t) * newlength, status); 218 if (U_FAILURE(*status)) { 219 return NULL; 220 } 221 uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 222 *destinationlength = newlength; 223 destination = temp; 224 } 225 destination[offset] = value; 226 return destination; 227 } 228 229 /** 230 * Adds a uint64_t value to a destination array. 231 * Creates a new array if we run out of space. The caller will have to 232 * manually deallocate the newly allocated array. 233 * Internal method, status assumed to be success, caller has to check status 234 * before calling this method. destination not to be NULL and has at least 235 * size destinationlength. 236 * @param destination target array 237 * @param offset destination offset to add value 238 * @param destinationlength target array size, return value for the new size 239 * @param value to be added 240 * @param increments incremental size expected 241 * @param status output error if any, caller to check status before calling 242 * method, status assumed to be success when passed in. 243 * @return new destination array, destination if there was no new allocation 244 */ 245 static 246 inline int64_t * addTouint64_tArray(int64_t *destination, 247 uint32_t offset, 248 uint32_t *destinationlength, 249 uint64_t value, 250 uint32_t increments, 251 UErrorCode *status) 252 { 253 uint32_t newlength = *destinationlength; 254 if (offset + 1 == newlength) { 255 newlength += increments; 256 int64_t *temp = (int64_t *)allocateMemory( 257 sizeof(int64_t) * newlength, status); 258 259 if (U_FAILURE(*status)) { 260 return NULL; 261 } 262 263 uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 264 *destinationlength = newlength; 265 destination = temp; 266 } 267 268 destination[offset] = value; 269 270 return destination; 271 } 272 273 /** 274 * Initializing the ce table for a pattern. 275 * Stores non-ignorable collation keys. 276 * Table size will be estimated by the size of the pattern text. Table 277 * expansion will be perform as we go along. Adding 1 to ensure that the table 278 * size definitely increases. 279 * Internal method, status assumed to be a success. 280 * @param strsrch string search data 281 * @param status output error if any, caller to check status before calling 282 * method, status assumed to be success when passed in. 283 * @return total number of expansions 284 */ 285 static 286 inline uint16_t initializePatternCETable(UStringSearch *strsrch, 287 UErrorCode *status) 288 { 289 UPattern *pattern = &(strsrch->pattern); 290 uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 291 int32_t *cetable = pattern->cesBuffer; 292 uint32_t patternlength = pattern->textLength; 293 UCollationElements *coleiter = strsrch->utilIter; 294 295 if (coleiter == NULL) { 296 coleiter = ucol_openElements(strsrch->collator, pattern->text, 297 patternlength, status); 298 // status will be checked in ucol_next(..) later and if it is an 299 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 300 // returned. 301 strsrch->utilIter = coleiter; 302 } 303 else { 304 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 305 } 306 if(U_FAILURE(*status)) { 307 return 0; 308 } 309 310 if (pattern->ces != cetable && pattern->ces) { 311 uprv_free(pattern->ces); 312 } 313 314 uint16_t offset = 0; 315 uint16_t result = 0; 316 int32_t ce; 317 318 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 319 U_SUCCESS(*status)) { 320 uint32_t newce = getCE(strsrch, ce); 321 if (newce) { 322 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 323 newce, 324 patternlength - ucol_getOffset(coleiter) + 1, 325 status); 326 if (U_FAILURE(*status)) { 327 return 0; 328 } 329 offset ++; 330 if (cetable != temp && cetable != pattern->cesBuffer) { 331 uprv_free(cetable); 332 } 333 cetable = temp; 334 } 335 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 336 } 337 338 cetable[offset] = 0; 339 pattern->ces = cetable; 340 pattern->cesLength = offset; 341 342 return result; 343 } 344 345 /** 346 * Initializing the pce table for a pattern. 347 * Stores non-ignorable collation keys. 348 * Table size will be estimated by the size of the pattern text. Table 349 * expansion will be perform as we go along. Adding 1 to ensure that the table 350 * size definitely increases. 351 * Internal method, status assumed to be a success. 352 * @param strsrch string search data 353 * @param status output error if any, caller to check status before calling 354 * method, status assumed to be success when passed in. 355 * @return total number of expansions 356 */ 357 static 358 inline uint16_t initializePatternPCETable(UStringSearch *strsrch, 359 UErrorCode *status) 360 { 361 UPattern *pattern = &(strsrch->pattern); 362 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 363 int64_t *pcetable = pattern->pcesBuffer; 364 uint32_t patternlength = pattern->textLength; 365 UCollationElements *coleiter = strsrch->utilIter; 366 367 if (coleiter == NULL) { 368 coleiter = ucol_openElements(strsrch->collator, pattern->text, 369 patternlength, status); 370 // status will be checked in ucol_next(..) later and if it is an 371 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 372 // returned. 373 strsrch->utilIter = coleiter; 374 } else { 375 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 376 } 377 if(U_FAILURE(*status)) { 378 return 0; 379 } 380 381 if (pattern->pces != pcetable && pattern->pces != NULL) { 382 uprv_free(pattern->pces); 383 } 384 385 uint16_t offset = 0; 386 uint16_t result = 0; 387 int64_t pce; 388 389 icu::UCollationPCE iter(coleiter); 390 391 // ** Should processed CEs be signed or unsigned? 392 // ** (the rest of the code in this file seems to play fast-and-loose with 393 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 394 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 395 U_SUCCESS(*status)) { 396 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 397 pce, 398 patternlength - ucol_getOffset(coleiter) + 1, 399 status); 400 401 if (U_FAILURE(*status)) { 402 return 0; 403 } 404 405 offset += 1; 406 407 if (pcetable != temp && pcetable != pattern->pcesBuffer) { 408 uprv_free(pcetable); 409 } 410 411 pcetable = temp; 412 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 413 } 414 415 pcetable[offset] = 0; 416 pattern->pces = pcetable; 417 pattern->pcesLength = offset; 418 419 return result; 420 } 421 422 /** 423 * Initializes the pattern struct. 424 * Internal method, status assumed to be success. 425 * @param strsrch UStringSearch data storage 426 * @param status output error if any, caller to check status before calling 427 * method, status assumed to be success when passed in. 428 * @return expansionsize the total expansion size of the pattern 429 */ 430 static 431 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 432 { 433 if (U_FAILURE(*status)) { return 0; } 434 UPattern *pattern = &(strsrch->pattern); 435 const UChar *patterntext = pattern->text; 436 int32_t length = pattern->textLength; 437 int32_t index = 0; 438 439 // Since the strength is primary, accents are ignored in the pattern. 440 if (strsrch->strength == UCOL_PRIMARY) { 441 pattern->hasPrefixAccents = 0; 442 pattern->hasSuffixAccents = 0; 443 } else { 444 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 445 SECOND_LAST_BYTE_SHIFT_; 446 index = length; 447 U16_BACK_1(patterntext, 0, index); 448 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 449 LAST_BYTE_MASK_; 450 } 451 452 // ** HACK ** 453 if (strsrch->pattern.pces != NULL) { 454 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 455 uprv_free(strsrch->pattern.pces); 456 } 457 458 strsrch->pattern.pces = NULL; 459 } 460 461 // since intializePattern is an internal method status is a success. 462 return initializePatternCETable(strsrch, status); 463 } 464 465 /** 466 * Initializing shift tables, with the default values. 467 * If a corresponding default value is 0, the shift table is not set. 468 * @param shift table for forwards shift 469 * @param backshift table for backwards shift 470 * @param cetable table containing pattern ce 471 * @param cesize size of the pattern ces 472 * @param expansionsize total size of the expansions 473 * @param defaultforward the default forward value 474 * @param defaultbackward the default backward value 475 */ 476 static 477 inline void setShiftTable(int16_t shift[], int16_t backshift[], 478 int32_t *cetable, int32_t cesize, 479 int16_t expansionsize, 480 int16_t defaultforward, 481 int16_t defaultbackward) 482 { 483 // estimate the value to shift. to do that we estimate the smallest 484 // number of characters to give the relevant ces, ie approximately 485 // the number of ces minus their expansion, since expansions can come 486 // from a character. 487 int32_t count; 488 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 489 shift[count] = defaultforward; 490 } 491 cesize --; // down to the last index 492 for (count = 0; count < cesize; count ++) { 493 // number of ces from right of array to the count 494 int temp = defaultforward - count - 1; 495 shift[hash(cetable[count])] = temp > 1 ? temp : 1; 496 } 497 shift[hash(cetable[cesize])] = 1; 498 // for ignorables we just shift by one. see test examples. 499 shift[hash(0)] = 1; 500 501 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 502 backshift[count] = defaultbackward; 503 } 504 for (count = cesize; count > 0; count --) { 505 // the original value count does not seem to work 506 backshift[hash(cetable[count])] = count > expansionsize ? 507 (int16_t)(count - expansionsize) : 1; 508 } 509 backshift[hash(cetable[0])] = 1; 510 backshift[hash(0)] = 1; 511 } 512 513 /** 514 * Building of the pattern collation element list and the boyer moore strsrch 515 * table. 516 * The canonical match will only be performed after the default match fails. 517 * For both cases we need to remember the size of the composed and decomposed 518 * versions of the string. Since the Boyer-Moore shift calculations shifts by 519 * a number of characters in the text and tries to match the pattern from that 520 * offset, the shift value can not be too large in case we miss some 521 * characters. To choose a right shift size, we estimate the NFC form of the 522 * and use its size as a shift guide. The NFC form should be the small 523 * possible representation of the pattern. Anyways, we'll err on the smaller 524 * shift size. Hence the calculation for minlength. 525 * Canonical match will be performed slightly differently. We'll split the 526 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by 527 * the first and last base character (MS), the ending accents (EA). Matches 528 * will be done on MS first, and only when we match MS then some processing 529 * will be required for the prefix and end accents in order to determine if 530 * they match PA and EA. Hence the default shift values 531 * for the canonical match will take the size of either end's accent into 532 * consideration. Forwards search will take the end accents into consideration 533 * for the default shift values and the backwards search will take the prefix 534 * accents into consideration. 535 * If pattern has no non-ignorable ce, we return a illegal argument error. 536 * Internal method, status assumed to be success. 537 * @param strsrch UStringSearch data storage 538 * @param status for output errors if it occurs, status is assumed to be a 539 * success when it is passed in. 540 */ 541 static 542 inline void initialize(UStringSearch *strsrch, UErrorCode *status) 543 { 544 int16_t expandlength = initializePattern(strsrch, status); 545 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) { 546 UPattern *pattern = &strsrch->pattern; 547 int32_t cesize = pattern->cesLength; 548 549 int16_t minlength = cesize > expandlength 550 ? (int16_t)cesize - expandlength : 1; 551 pattern->defaultShiftSize = minlength; 552 setShiftTable(pattern->shift, pattern->backShift, pattern->ces, 553 cesize, expandlength, minlength, minlength); 554 return; 555 } 556 strsrch->pattern.defaultShiftSize = 0; 557 } 558 559 #if BOYER_MOORE 560 /** 561 * Check to make sure that the match length is at the end of the character by 562 * using the breakiterator. 563 * @param strsrch string search data 564 * @param start target text start offset 565 * @param end target text end offset 566 */ 567 static 568 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 569 int32_t *end) 570 { 571 #if !UCONFIG_NO_BREAK_ITERATION 572 UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 573 if (breakiterator) { 574 int32_t matchend = *end; 575 //int32_t matchstart = *start; 576 577 if (!ubrk_isBoundary(breakiterator, matchend)) { 578 *end = ubrk_following(breakiterator, matchend); 579 } 580 581 /* Check the start of the matched text to make sure it doesn't have any accents 582 * before it. This code may not be necessary and so it is commented out */ 583 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 584 *start = ubrk_preceding(breakiterator, matchstart); 585 }*/ 586 } 587 #endif 588 } 589 590 /** 591 * Determine whether the target text in UStringSearch bounded by the offset 592 * start and end is one or more whole units of text as 593 * determined by the breakiterator in UStringSearch. 594 * @param strsrch string search data 595 * @param start target text start offset 596 * @param end target text end offset 597 */ 598 static 599 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 600 int32_t end) 601 { 602 #if !UCONFIG_NO_BREAK_ITERATION 603 UBreakIterator *breakiterator = strsrch->search->breakIter; 604 //TODO: Add here. 605 if (breakiterator) { 606 int32_t startindex = ubrk_first(breakiterator); 607 int32_t endindex = ubrk_last(breakiterator); 608 609 // out-of-range indexes are never boundary positions 610 if (start < startindex || start > endindex || 611 end < startindex || end > endindex) { 612 return FALSE; 613 } 614 // otherwise, we can use following() on the position before the 615 // specified one and return true of the position we get back is the 616 // one the user specified 617 UBool result = (start == startindex || 618 ubrk_following(breakiterator, start - 1) == start) && 619 (end == endindex || 620 ubrk_following(breakiterator, end - 1) == end); 621 if (result) { 622 // iterates the individual ces 623 UCollationElements *coleiter = strsrch->utilIter; 624 const UChar *text = strsrch->search->text + 625 start; 626 UErrorCode status = U_ZERO_ERROR; 627 ucol_setText(coleiter, text, end - start, &status); 628 for (int32_t count = 0; count < strsrch->pattern.cesLength; 629 count ++) { 630 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 631 if (ce == UCOL_IGNORABLE) { 632 count --; 633 continue; 634 } 635 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) { 636 return FALSE; 637 } 638 } 639 int32_t nextce = ucol_next(coleiter, &status); 640 while (ucol_getOffset(coleiter) == (end - start) 641 && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 642 nextce = ucol_next(coleiter, &status); 643 } 644 if (ucol_getOffset(coleiter) == (end - start) 645 && nextce != UCOL_NULLORDER) { 646 // extra collation elements at the end of the match 647 return FALSE; 648 } 649 } 650 return result; 651 } 652 #endif 653 return TRUE; 654 } 655 656 /** 657 * Getting the next base character offset if current offset is an accent, 658 * or the current offset if the current character contains a base character. 659 * accents the following base character will be returned 660 * @param text string 661 * @param textoffset current offset 662 * @param textlength length of text string 663 * @return the next base character or the current offset 664 * if the current character is contains a base character. 665 */ 666 static 667 inline int32_t getNextBaseOffset(const UChar *text, 668 int32_t textoffset, 669 int32_t textlength) 670 { 671 if (textoffset < textlength) { 672 int32_t temp = textoffset; 673 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 674 while (temp < textlength) { 675 int32_t result = temp; 676 if ((getFCD(text, &temp, textlength) >> 677 SECOND_LAST_BYTE_SHIFT_) == 0) { 678 return result; 679 } 680 } 681 return textlength; 682 } 683 } 684 return textoffset; 685 } 686 687 /** 688 * Gets the next base character offset depending on the string search pattern 689 * data 690 * @param strsrch string search data 691 * @param textoffset current offset, one offset away from the last character 692 * to search for. 693 * @return start index of the next base character or the current offset 694 * if the current character is contains a base character. 695 */ 696 static 697 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 698 int32_t textoffset) 699 { 700 int32_t textlength = strsrch->search->textLength; 701 if (strsrch->pattern.hasSuffixAccents && 702 textoffset < textlength) { 703 int32_t temp = textoffset; 704 const UChar *text = strsrch->search->text; 705 U16_BACK_1(text, 0, temp); 706 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 707 return getNextBaseOffset(text, textoffset, textlength); 708 } 709 } 710 return textoffset; 711 } 712 713 /** 714 * Shifting the collation element iterator position forward to prepare for 715 * a following match. If the last character is a unsafe character, we'll only 716 * shift by 1 to capture contractions, normalization etc. 717 * Internal method, status assumed to be success. 718 * @param text strsrch string search data 719 * @param textoffset start text position to do search 720 * @param ce the text ce which failed the match. 721 * @param patternceindex index of the ce within the pattern ce buffer which 722 * failed the match 723 * @return final offset 724 */ 725 static 726 inline int32_t shiftForward(UStringSearch *strsrch, 727 int32_t textoffset, 728 int32_t ce, 729 int32_t patternceindex) 730 { 731 UPattern *pattern = &(strsrch->pattern); 732 if (ce != UCOL_NULLORDER) { 733 int32_t shift = pattern->shift[hash(ce)]; 734 // this is to adjust for characters in the middle of the 735 // substring for matching that failed. 736 int32_t adjust = pattern->cesLength - patternceindex; 737 if (adjust > 1 && shift >= adjust) { 738 shift -= adjust - 1; 739 } 740 textoffset += shift; 741 } 742 else { 743 textoffset += pattern->defaultShiftSize; 744 } 745 746 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 747 // check for unsafe characters 748 // * if it is the start or middle of a contraction: to be done after 749 // a initial match is found 750 // * thai or lao base consonant character: similar to contraction 751 // * high surrogate character: similar to contraction 752 // * next character is a accent: shift to the next base character 753 return textoffset; 754 } 755 #endif // #if BOYER_MOORE 756 757 /** 758 * sets match not found 759 * @param strsrch string search data 760 */ 761 static 762 inline void setMatchNotFound(UStringSearch *strsrch) 763 { 764 // this method resets the match result regardless of the error status. 765 strsrch->search->matchedIndex = USEARCH_DONE; 766 strsrch->search->matchedLength = 0; 767 if (strsrch->search->isForwardSearching) { 768 setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 769 } 770 else { 771 setColEIterOffset(strsrch->textIter, 0); 772 } 773 } 774 775 #if BOYER_MOORE 776 /** 777 * Gets the offset to the next safe point in text. 778 * ie. not the middle of a contraction, swappable characters or supplementary 779 * characters. 780 * @param collator collation sata 781 * @param text string to work with 782 * @param textoffset offset in string 783 * @param textlength length of text string 784 * @return offset to the next safe character 785 */ 786 static 787 inline int32_t getNextSafeOffset(const UCollator *collator, 788 const UChar *text, 789 int32_t textoffset, 790 int32_t textlength) 791 { 792 int32_t result = textoffset; // first contraction character 793 while (result != textlength && ucol_unsafeCP(text[result], collator)) { 794 result ++; 795 } 796 return result; 797 } 798 799 /** 800 * This checks for accents in the potential match started with a . 801 * composite character. 802 * This is really painful... we have to check that composite character do not 803 * have any extra accents. We have to normalize the potential match and find 804 * the immediate decomposed character before the match. 805 * The first composite character would have been taken care of by the fcd 806 * checks in checkForwardExactMatch. 807 * This is the slow path after the fcd of the first character and 808 * the last character has been checked by checkForwardExactMatch and we 809 * determine that the potential match has extra non-ignorable preceding 810 * ces. 811 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 812 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 813 * Note here that accents checking are slow and cautioned in the API docs. 814 * Internal method, status assumed to be a success, caller should check status 815 * before calling this method 816 * @param strsrch string search data 817 * @param start index of the potential unfriendly composite character 818 * @param end index of the potential unfriendly composite character 819 * @param status output error status if any. 820 * @return TRUE if there is non-ignorable accents before at the beginning 821 * of the match, FALSE otherwise. 822 */ 823 824 static 825 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 826 int32_t end, 827 UErrorCode *status) 828 { 829 UBool result = FALSE; 830 if (strsrch->pattern.hasPrefixAccents) { 831 int32_t length = end - start; 832 int32_t offset = 0; 833 const UChar *text = strsrch->search->text + start; 834 835 U16_FWD_1(text, offset, length); 836 // we are only concerned with the first composite character 837 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 838 int32_t safeoffset = getNextSafeOffset(strsrch->collator, 839 text, 0, length); 840 if (safeoffset != length) { 841 safeoffset ++; 842 } 843 UChar *norm = NULL; 844 UChar buffer[INITIAL_ARRAY_SIZE_]; 845 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 846 buffer, INITIAL_ARRAY_SIZE_, 847 status); 848 if (U_FAILURE(*status)) { 849 return FALSE; 850 } 851 if (size >= INITIAL_ARRAY_SIZE_) { 852 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 853 status); 854 // if allocation failed, status will be set to 855 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 856 // checks for it. 857 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 858 size, status); 859 if (U_FAILURE(*status) && norm != NULL) { 860 uprv_free(norm); 861 return FALSE; 862 } 863 } 864 else { 865 norm = buffer; 866 } 867 868 UCollationElements *coleiter = strsrch->utilIter; 869 ucol_setText(coleiter, norm, size, status); 870 uint32_t firstce = strsrch->pattern.ces[0]; 871 UBool ignorable = TRUE; 872 uint32_t ce = UCOL_IGNORABLE; 873 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 874 offset = ucol_getOffset(coleiter); 875 if (ce != firstce && ce != UCOL_IGNORABLE) { 876 ignorable = FALSE; 877 } 878 ce = ucol_next(coleiter, status); 879 } 880 UChar32 codepoint; 881 U16_PREV(norm, 0, offset, codepoint); 882 result = !ignorable && (u_getCombiningClass(codepoint) != 0); 883 884 if (norm != buffer) { 885 uprv_free(norm); 886 } 887 } 888 } 889 890 return result; 891 } 892 893 /** 894 * Used by exact matches, checks if there are accents before the match. 895 * This is really painful... we have to check that composite characters at 896 * the start of the matches have to not have any extra accents. 897 * We check the FCD of the character first, if it starts with an accent and 898 * the first pattern ce does not match the first ce of the character, we bail. 899 * Otherwise we try normalizing the first composite 900 * character and find the immediate decomposed character before the match to 901 * see if it is an non-ignorable accent. 902 * Now normalizing the first composite character is enough because we ensure 903 * that when the match is passed in here with extra beginning ces, the 904 * first or last ce that match has to occur within the first character. 905 * E.g. looking for \u0301 acute in \u01FA A ring above and acute, 906 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA 907 * Note here that accents checking are slow and cautioned in the API docs. 908 * @param strsrch string search data 909 * @param start offset 910 * @param end offset 911 * @return TRUE if there are accents on either side of the match, 912 * FALSE otherwise 913 */ 914 static 915 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 916 int32_t end) 917 { 918 if (strsrch->pattern.hasPrefixAccents) { 919 UCollationElements *coleiter = strsrch->textIter; 920 UErrorCode status = U_ZERO_ERROR; 921 // we have been iterating forwards previously 922 uint32_t ignorable = TRUE; 923 int32_t firstce = strsrch->pattern.ces[0]; 924 925 setColEIterOffset(coleiter, start); 926 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 927 if (U_FAILURE(status)) { 928 return TRUE; 929 } 930 while (ce != firstce) { 931 if (ce != UCOL_IGNORABLE) { 932 ignorable = FALSE; 933 } 934 ce = getCE(strsrch, ucol_next(coleiter, &status)); 935 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 936 return TRUE; 937 } 938 } 939 if (!ignorable && inNormBuf(coleiter)) { 940 // within normalization buffer, discontiguous handled here 941 return TRUE; 942 } 943 944 // within text 945 int32_t temp = start; 946 // original code 947 // accent = (getFCD(strsrch->search->text, &temp, 948 // strsrch->search->textLength) 949 // >> SECOND_LAST_BYTE_SHIFT_); 950 // however this code does not work well with VC7 .net in release mode. 951 // maybe the inlines for getFCD combined with shifting has bugs in 952 // VC7. anyways this is a work around. 953 UBool accent = getFCD(strsrch->search->text, &temp, 954 strsrch->search->textLength) > 0xFF; 955 if (!accent) { 956 return checkExtraMatchAccents(strsrch, start, end, &status); 957 } 958 if (!ignorable) { 959 return TRUE; 960 } 961 if (start > 0) { 962 temp = start; 963 U16_BACK_1(strsrch->search->text, 0, temp); 964 if (getFCD(strsrch->search->text, &temp, 965 strsrch->search->textLength) & LAST_BYTE_MASK_) { 966 setColEIterOffset(coleiter, start); 967 ce = ucol_previous(coleiter, &status); 968 if (U_FAILURE(status) || 969 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 970 return TRUE; 971 } 972 } 973 } 974 } 975 976 return FALSE; 977 } 978 979 /** 980 * Used by exact matches, checks if there are accents bounding the match. 981 * Note this is the initial boundary check. If the potential match 982 * starts or ends with composite characters, the accents in those 983 * characters will be determined later. 984 * Not doing backwards iteration here, since discontiguos contraction for 985 * backwards collation element iterator, use up too many characters. 986 * E.g. looking for \u030A ring in \u01FA A ring above and acute, 987 * should fail since there is a acute at the end of \u01FA 988 * Note here that accents checking are slow and cautioned in the API docs. 989 * @param strsrch string search data 990 * @param start offset of match 991 * @param end end offset of the match 992 * @return TRUE if there are accents on either side of the match, 993 * FALSE otherwise 994 */ 995 static 996 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 997 int32_t end) 998 { 999 if (strsrch->pattern.hasSuffixAccents) { 1000 const UChar *text = strsrch->search->text; 1001 int32_t temp = end; 1002 int32_t textlength = strsrch->search->textLength; 1003 U16_BACK_1(text, 0, temp); 1004 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1005 int32_t firstce = strsrch->pattern.ces[0]; 1006 UCollationElements *coleiter = strsrch->textIter; 1007 UErrorCode status = U_ZERO_ERROR; 1008 int32_t ce; 1009 setColEIterOffset(coleiter, start); 1010 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1011 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1012 return TRUE; 1013 } 1014 } 1015 int32_t count = 1; 1016 while (count < strsrch->pattern.cesLength) { 1017 if (getCE(strsrch, ucol_next(coleiter, &status)) 1018 == UCOL_IGNORABLE) { 1019 // Thai can give an ignorable here. 1020 count --; 1021 } 1022 if (U_FAILURE(status)) { 1023 return TRUE; 1024 } 1025 count ++; 1026 } 1027 1028 ce = ucol_next(coleiter, &status); 1029 if (U_FAILURE(status)) { 1030 return TRUE; 1031 } 1032 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1033 ce = getCE(strsrch, ce); 1034 } 1035 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1036 if (ucol_getOffset(coleiter) <= end) { 1037 return TRUE; 1038 } 1039 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1040 return TRUE; 1041 } 1042 } 1043 } 1044 } 1045 return FALSE; 1046 } 1047 #endif // #if BOYER_MOORE 1048 1049 /** 1050 * Checks if the offset runs out of the text string 1051 * @param offset 1052 * @param textlength of the text string 1053 * @return TRUE if offset is out of bounds, FALSE otherwise 1054 */ 1055 static 1056 inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1057 { 1058 return offset < 0 || offset > textlength; 1059 } 1060 1061 /** 1062 * Checks for identical match 1063 * @param strsrch string search data 1064 * @param start offset of possible match 1065 * @param end offset of possible match 1066 * @return TRUE if identical match is found 1067 */ 1068 static 1069 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1070 int32_t end) 1071 { 1072 if (strsrch->strength != UCOL_IDENTICAL) { 1073 return TRUE; 1074 } 1075 1076 // Note: We could use Normalizer::compare() or similar, but for short strings 1077 // which may not be in FCD it might be faster to just NFD them. 1078 UErrorCode status = U_ZERO_ERROR; 1079 UnicodeString t2, p2; 1080 strsrch->nfd->normalize( 1081 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 1082 strsrch->nfd->normalize( 1083 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1084 // return FALSE if NFD failed 1085 return U_SUCCESS(status) && t2 == p2; 1086 } 1087 1088 #if BOYER_MOORE 1089 /** 1090 * Checks to see if the match is repeated 1091 * @param strsrch string search data 1092 * @param start new match start index 1093 * @param end new match end index 1094 * @return TRUE if the the match is repeated, FALSE otherwise 1095 */ 1096 static 1097 inline UBool checkRepeatedMatch(UStringSearch *strsrch, 1098 int32_t start, 1099 int32_t end) 1100 { 1101 int32_t lastmatchindex = strsrch->search->matchedIndex; 1102 UBool result; 1103 if (lastmatchindex == USEARCH_DONE) { 1104 return FALSE; 1105 } 1106 if (strsrch->search->isForwardSearching) { 1107 result = start <= lastmatchindex; 1108 } 1109 else { 1110 result = start >= lastmatchindex; 1111 } 1112 if (!result && !strsrch->search->isOverlap) { 1113 if (strsrch->search->isForwardSearching) { 1114 result = start < lastmatchindex + strsrch->search->matchedLength; 1115 } 1116 else { 1117 result = end > lastmatchindex; 1118 } 1119 } 1120 return result; 1121 } 1122 1123 /** 1124 * Gets the collation element iterator's current offset. 1125 * @param coleiter collation element iterator 1126 * @param forwards flag TRUE if we are moving in th forwards direction 1127 * @return current offset 1128 */ 1129 static 1130 inline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1131 UBool forwards) 1132 { 1133 int32_t result = ucol_getOffset(coleiter); 1134 // intricacies of the the backwards collation element iterator 1135 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1136 result ++; 1137 } 1138 return result; 1139 } 1140 1141 /** 1142 * Checks match for contraction. 1143 * If the match ends with a partial contraction we fail. 1144 * If the match starts too far off (because of backwards iteration) we try to 1145 * chip off the extra characters depending on whether a breakiterator has 1146 * been used. 1147 * Internal method, error assumed to be success, caller has to check status 1148 * before calling this method. 1149 * @param strsrch string search data 1150 * @param start offset of potential match, to be modified if necessary 1151 * @param end offset of potential match, to be modified if necessary 1152 * @param status output error status if any 1153 * @return TRUE if match passes the contraction test, FALSE otherwise 1154 */ 1155 1156 static 1157 UBool checkNextExactContractionMatch(UStringSearch *strsrch, 1158 int32_t *start, 1159 int32_t *end, UErrorCode *status) 1160 { 1161 UCollationElements *coleiter = strsrch->textIter; 1162 int32_t textlength = strsrch->search->textLength; 1163 int32_t temp = *start; 1164 const UCollator *collator = strsrch->collator; 1165 const UChar *text = strsrch->search->text; 1166 // This part checks if either ends of the match contains potential 1167 // contraction. If so we'll have to iterate through them 1168 // The start contraction needs to be checked since ucol_previous dumps 1169 // all characters till the first safe character into the buffer. 1170 // *start + 1 is used to test for the unsafe characters instead of *start 1171 // because ucol_prev takes all unsafe characters till the first safe 1172 // character ie *start. so by testing *start + 1, we can estimate if 1173 // excess prefix characters has been included in the potential search 1174 // results. 1175 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1176 (*start + 1 < textlength 1177 && ucol_unsafeCP(text[*start + 1], collator))) { 1178 int32_t expansion = getExpansionPrefix(coleiter); 1179 UBool expandflag = expansion > 0; 1180 setColEIterOffset(coleiter, *start); 1181 while (expansion > 0) { 1182 // getting rid of the redundant ce, caused by setOffset. 1183 // since backward contraction/expansion may have extra ces if we 1184 // are in the normalization buffer, hasAccentsBeforeMatch would 1185 // have taken care of it. 1186 // E.g. the character \u01FA will have an expansion of 3, but if 1187 // we are only looking for acute and ring \u030A and \u0301, we'll 1188 // have to skip the first ce in the expansion buffer. 1189 ucol_next(coleiter, status); 1190 if (U_FAILURE(*status)) { 1191 return FALSE; 1192 } 1193 if (ucol_getOffset(coleiter) != temp) { 1194 *start = temp; 1195 temp = ucol_getOffset(coleiter); 1196 } 1197 expansion --; 1198 } 1199 1200 int32_t *patternce = strsrch->pattern.ces; 1201 int32_t patterncelength = strsrch->pattern.cesLength; 1202 int32_t count = 0; 1203 while (count < patterncelength) { 1204 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1205 if (ce == UCOL_IGNORABLE) { 1206 continue; 1207 } 1208 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1209 *start = temp; 1210 temp = ucol_getOffset(coleiter); 1211 } 1212 if (U_FAILURE(*status) || ce != patternce[count]) { 1213 (*end) ++; 1214 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1215 return FALSE; 1216 } 1217 count ++; 1218 } 1219 } 1220 return TRUE; 1221 } 1222 1223 /** 1224 * Checks and sets the match information if found. 1225 * Checks 1226 * <ul> 1227 * <li> the potential match does not repeat the previous match 1228 * <li> boundaries are correct 1229 * <li> exact matches has no extra accents 1230 * <li> identical matchesb 1231 * <li> potential match does not end in the middle of a contraction 1232 * <\ul> 1233 * Otherwise the offset will be shifted to the next character. 1234 * Internal method, status assumed to be success, caller has to check status 1235 * before calling this method. 1236 * @param strsrch string search data 1237 * @param textoffset offset in the collation element text. the returned value 1238 * will be the truncated end offset of the match or the new start 1239 * search offset. 1240 * @param status output error status if any 1241 * @return TRUE if the match is valid, FALSE otherwise 1242 */ 1243 static 1244 inline UBool checkNextExactMatch(UStringSearch *strsrch, 1245 int32_t *textoffset, UErrorCode *status) 1246 { 1247 UCollationElements *coleiter = strsrch->textIter; 1248 int32_t start = getColElemIterOffset(coleiter, FALSE); 1249 1250 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1251 return FALSE; 1252 } 1253 1254 // this totally matches, however we need to check if it is repeating 1255 if (!isBreakUnit(strsrch, start, *textoffset) || 1256 checkRepeatedMatch(strsrch, start, *textoffset) || 1257 hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1258 !checkIdentical(strsrch, start, *textoffset) || 1259 hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1260 1261 (*textoffset) ++; 1262 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1263 return FALSE; 1264 } 1265 1266 //Add breakiterator boundary check for primary strength search. 1267 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1268 checkBreakBoundary(strsrch, &start, textoffset); 1269 } 1270 1271 // totally match, we will get rid of the ending ignorables. 1272 strsrch->search->matchedIndex = start; 1273 strsrch->search->matchedLength = *textoffset - start; 1274 return TRUE; 1275 } 1276 1277 /** 1278 * Getting the previous base character offset, or the current offset if the 1279 * current character is a base character 1280 * @param text string 1281 * @param textoffset one offset after the current character 1282 * @return the offset of the next character after the base character or the first 1283 * composed character with accents 1284 */ 1285 static 1286 inline int32_t getPreviousBaseOffset(const UChar *text, 1287 int32_t textoffset) 1288 { 1289 if (textoffset > 0) { 1290 for (;;) { 1291 int32_t result = textoffset; 1292 U16_BACK_1(text, 0, textoffset); 1293 int32_t temp = textoffset; 1294 uint16_t fcd = getFCD(text, &temp, result); 1295 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1296 if (fcd & LAST_BYTE_MASK_) { 1297 return textoffset; 1298 } 1299 return result; 1300 } 1301 if (textoffset == 0) { 1302 return 0; 1303 } 1304 } 1305 } 1306 return textoffset; 1307 } 1308 1309 /** 1310 * Getting the indexes of the accents that are not blocked in the argument 1311 * accent array 1312 * @param accents array of accents in nfd terminated by a 0. 1313 * @param accentsindex array of indexes of the accents that are not blocked 1314 */ 1315 static 1316 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1317 { 1318 int32_t index = 0; 1319 int32_t length = u_strlen(accents); 1320 UChar32 codepoint = 0; 1321 int cclass = 0; 1322 int result = 0; 1323 int32_t temp; 1324 while (index < length) { 1325 temp = index; 1326 U16_NEXT(accents, index, length, codepoint); 1327 if (u_getCombiningClass(codepoint) != cclass) { 1328 cclass = u_getCombiningClass(codepoint); 1329 accentsindex[result] = temp; 1330 result ++; 1331 } 1332 } 1333 accentsindex[result] = length; 1334 return result; 1335 } 1336 1337 /** 1338 * Appends 3 UChar arrays to a destination array. 1339 * Creates a new array if we run out of space. The caller will have to 1340 * manually deallocate the newly allocated array. 1341 * Internal method, status assumed to be success, caller has to check status 1342 * before calling this method. destination not to be NULL and has at least 1343 * size destinationlength. 1344 * @param destination target array 1345 * @param destinationlength target array size, returning the appended length 1346 * @param source1 null-terminated first array 1347 * @param source2 second array 1348 * @param source2length length of seond array 1349 * @param source3 null-terminated third array 1350 * @param status error status if any 1351 * @return new destination array, destination if there was no new allocation 1352 */ 1353 static 1354 inline UChar * addToUCharArray( UChar *destination, 1355 int32_t *destinationlength, 1356 const UChar *source1, 1357 const UChar *source2, 1358 int32_t source2length, 1359 const UChar *source3, 1360 UErrorCode *status) 1361 { 1362 int32_t source1length = source1 ? u_strlen(source1) : 0; 1363 int32_t source3length = source3 ? u_strlen(source3) : 0; 1364 if (*destinationlength < source1length + source2length + source3length + 1365 1) 1366 { 1367 destination = (UChar *)allocateMemory( 1368 (source1length + source2length + source3length + 1) * sizeof(UChar), 1369 status); 1370 // if error allocating memory, status will be 1371 // U_MEMORY_ALLOCATION_ERROR 1372 if (U_FAILURE(*status)) { 1373 *destinationlength = 0; 1374 return NULL; 1375 } 1376 } 1377 if (source1length != 0) { 1378 uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1379 } 1380 if (source2length != 0) { 1381 uprv_memcpy(destination + source1length, source2, 1382 sizeof(UChar) * source2length); 1383 } 1384 if (source3length != 0) { 1385 uprv_memcpy(destination + source1length + source2length, source3, 1386 sizeof(UChar) * source3length); 1387 } 1388 *destinationlength = source1length + source2length + source3length; 1389 return destination; 1390 } 1391 1392 /** 1393 * Running through a collation element iterator to see if the contents matches 1394 * pattern in string search data 1395 * @param strsrch string search data 1396 * @param coleiter collation element iterator 1397 * @return TRUE if a match if found, FALSE otherwise 1398 */ 1399 static 1400 inline UBool checkCollationMatch(const UStringSearch *strsrch, 1401 UCollationElements *coleiter) 1402 { 1403 int patternceindex = strsrch->pattern.cesLength; 1404 int32_t *patternce = strsrch->pattern.ces; 1405 UErrorCode status = U_ZERO_ERROR; 1406 while (patternceindex > 0) { 1407 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1408 if (ce == UCOL_IGNORABLE) { 1409 continue; 1410 } 1411 if (U_FAILURE(status) || ce != *patternce) { 1412 return FALSE; 1413 } 1414 patternce ++; 1415 patternceindex --; 1416 } 1417 return TRUE; 1418 } 1419 1420 /** 1421 * Rearranges the front accents to try matching. 1422 * Prefix accents in the text will be grouped according to their combining 1423 * class and the groups will be mixed and matched to try find the perfect 1424 * match with the pattern. 1425 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1426 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1427 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1428 * "\u0301\u0325". 1429 * step 2: check if any of the generated substrings matches the pattern. 1430 * Internal method, status is assumed to be success, caller has to check status 1431 * before calling this method. 1432 * @param strsrch string search match 1433 * @param start first offset of the accents to start searching 1434 * @param end start of the last accent set 1435 * @param status output error status if any 1436 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1437 * offset of the match. Note this start includes all preceding accents. 1438 */ 1439 static 1440 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1441 int32_t start, 1442 int32_t end, 1443 UErrorCode *status) 1444 { 1445 const UChar *text = strsrch->search->text; 1446 int32_t textlength = strsrch->search->textLength; 1447 int32_t tempstart = start; 1448 1449 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1450 // die... failed at a base character 1451 return USEARCH_DONE; 1452 } 1453 1454 int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1455 start = getPreviousBaseOffset(text, tempstart); 1456 1457 UChar accents[INITIAL_ARRAY_SIZE_]; 1458 // normalizing the offensive string 1459 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1460 INITIAL_ARRAY_SIZE_, status); 1461 if (U_FAILURE(*status)) { 1462 return USEARCH_DONE; 1463 } 1464 1465 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1466 int32_t accentsize = getUnblockedAccentIndex(accents, 1467 accentsindex); 1468 int32_t count = (2 << (accentsize - 1)) - 1; 1469 UChar buffer[INITIAL_ARRAY_SIZE_]; 1470 UCollationElements *coleiter = strsrch->utilIter; 1471 while (U_SUCCESS(*status) && count > 0) { 1472 UChar *rearrange = strsrch->canonicalPrefixAccents; 1473 // copy the base characters 1474 for (int k = 0; k < accentsindex[0]; k ++) { 1475 *rearrange ++ = accents[k]; 1476 } 1477 // forming all possible canonical rearrangement by dropping 1478 // sets of accents 1479 for (int i = 0; i <= accentsize - 1; i ++) { 1480 int32_t mask = 1 << (accentsize - i - 1); 1481 if (count & mask) { 1482 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1483 *rearrange ++ = accents[j]; 1484 } 1485 } 1486 } 1487 *rearrange = 0; 1488 int32_t matchsize = INITIAL_ARRAY_SIZE_; 1489 UChar *match = addToUCharArray(buffer, &matchsize, 1490 strsrch->canonicalPrefixAccents, 1491 strsrch->search->text + offset, 1492 end - offset, 1493 strsrch->canonicalSuffixAccents, 1494 status); 1495 1496 // if status is a failure, ucol_setText does nothing. 1497 // run the collator iterator through this match 1498 ucol_setText(coleiter, match, matchsize, status); 1499 if (U_SUCCESS(*status)) { 1500 if (checkCollationMatch(strsrch, coleiter)) { 1501 if (match != buffer) { 1502 uprv_free(match); 1503 } 1504 return start; 1505 } 1506 } 1507 count --; 1508 } 1509 return USEARCH_DONE; 1510 } 1511 1512 /** 1513 * Gets the offset to the safe point in text before textoffset. 1514 * ie. not the middle of a contraction, swappable characters or supplementary 1515 * characters. 1516 * @param collator collation sata 1517 * @param text string to work with 1518 * @param textoffset offset in string 1519 * @param textlength length of text string 1520 * @return offset to the previous safe character 1521 */ 1522 static 1523 inline uint32_t getPreviousSafeOffset(const UCollator *collator, 1524 const UChar *text, 1525 int32_t textoffset) 1526 { 1527 int32_t result = textoffset; // first contraction character 1528 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1529 result --; 1530 } 1531 if (result != 0) { 1532 // the first contraction character is consider unsafe here 1533 result --; 1534 } 1535 return result; 1536 } 1537 1538 /** 1539 * Cleaning up after we passed the safe zone 1540 * @param strsrch string search data 1541 * @param safetext safe text array 1542 * @param safebuffer safe text buffer 1543 * @param coleiter collation element iterator for safe text 1544 */ 1545 static 1546 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1547 UChar *safebuffer) 1548 { 1549 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1550 { 1551 uprv_free(safetext); 1552 } 1553 } 1554 1555 /** 1556 * Take the rearranged end accents and tries matching. If match failed at 1557 * a seperate preceding set of accents (seperated from the rearranged on by 1558 * at least a base character) then we rearrange the preceding accents and 1559 * tries matching again. 1560 * We allow skipping of the ends of the accent set if the ces do not match. 1561 * However if the failure is found before the accent set, it fails. 1562 * Internal method, status assumed to be success, caller has to check status 1563 * before calling this method. 1564 * @param strsrch string search data 1565 * @param textoffset of the start of the rearranged accent 1566 * @param status output error status if any 1567 * @return USEARCH_DONE if a match is not found, otherwise return the starting 1568 * offset of the match. Note this start includes all preceding accents. 1569 */ 1570 static 1571 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1572 int32_t textoffset, 1573 UErrorCode *status) 1574 { 1575 const UChar *text = strsrch->search->text; 1576 const UCollator *collator = strsrch->collator; 1577 int32_t safelength = 0; 1578 UChar *safetext; 1579 int32_t safetextlength; 1580 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1581 UCollationElements *coleiter = strsrch->utilIter; 1582 int32_t safeoffset = textoffset; 1583 1584 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1585 collator)) { 1586 safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1587 safelength = textoffset - safeoffset; 1588 safetextlength = INITIAL_ARRAY_SIZE_; 1589 safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1590 text + safeoffset, safelength, 1591 strsrch->canonicalSuffixAccents, 1592 status); 1593 } 1594 else { 1595 safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1596 safetext = strsrch->canonicalSuffixAccents; 1597 } 1598 1599 // if status is a failure, ucol_setText does nothing 1600 ucol_setText(coleiter, safetext, safetextlength, status); 1601 // status checked in loop below 1602 1603 int32_t *ce = strsrch->pattern.ces; 1604 int32_t celength = strsrch->pattern.cesLength; 1605 int ceindex = celength - 1; 1606 UBool isSafe = TRUE; // indication flag for position in safe zone 1607 1608 while (ceindex >= 0) { 1609 int32_t textce = ucol_previous(coleiter, status); 1610 if (U_FAILURE(*status)) { 1611 if (isSafe) { 1612 cleanUpSafeText(strsrch, safetext, safebuffer); 1613 } 1614 return USEARCH_DONE; 1615 } 1616 if (textce == UCOL_NULLORDER) { 1617 // check if we have passed the safe buffer 1618 if (coleiter == strsrch->textIter) { 1619 cleanUpSafeText(strsrch, safetext, safebuffer); 1620 return USEARCH_DONE; 1621 } 1622 cleanUpSafeText(strsrch, safetext, safebuffer); 1623 safetext = safebuffer; 1624 coleiter = strsrch->textIter; 1625 setColEIterOffset(coleiter, safeoffset); 1626 // status checked at the start of the loop 1627 isSafe = FALSE; 1628 continue; 1629 } 1630 textce = getCE(strsrch, textce); 1631 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1632 // do the beginning stuff 1633 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1634 if (isSafe && failedoffset >= safelength) { 1635 // alas... no hope. failed at rearranged accent set 1636 cleanUpSafeText(strsrch, safetext, safebuffer); 1637 return USEARCH_DONE; 1638 } 1639 else { 1640 if (isSafe) { 1641 failedoffset += safeoffset; 1642 cleanUpSafeText(strsrch, safetext, safebuffer); 1643 } 1644 1645 // try rearranging the front accents 1646 int32_t result = doNextCanonicalPrefixMatch(strsrch, 1647 failedoffset, textoffset, status); 1648 if (result != USEARCH_DONE) { 1649 // if status is a failure, ucol_setOffset does nothing 1650 setColEIterOffset(strsrch->textIter, result); 1651 } 1652 if (U_FAILURE(*status)) { 1653 return USEARCH_DONE; 1654 } 1655 return result; 1656 } 1657 } 1658 if (textce == ce[ceindex]) { 1659 ceindex --; 1660 } 1661 } 1662 // set offset here 1663 if (isSafe) { 1664 int32_t result = getColElemIterOffset(coleiter, FALSE); 1665 // sets the text iterator here with the correct expansion and offset 1666 int32_t leftoverces = getExpansionPrefix(coleiter); 1667 cleanUpSafeText(strsrch, safetext, safebuffer); 1668 if (result >= safelength) { 1669 result = textoffset; 1670 } 1671 else { 1672 result += safeoffset; 1673 } 1674 setColEIterOffset(strsrch->textIter, result); 1675 strsrch->textIter->iteratordata_.toReturn = 1676 setExpansionPrefix(strsrch->textIter, leftoverces); 1677 return result; 1678 } 1679 1680 return ucol_getOffset(coleiter); 1681 } 1682 1683 /** 1684 * Trying out the substring and sees if it can be a canonical match. 1685 * This will try normalizing the end accents and arranging them into canonical 1686 * equivalents and check their corresponding ces with the pattern ce. 1687 * Suffix accents in the text will be grouped according to their combining 1688 * class and the groups will be mixed and matched to try find the perfect 1689 * match with the pattern. 1690 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1691 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1692 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1693 * "\u0301\u0325". 1694 * step 2: check if any of the generated substrings matches the pattern. 1695 * Internal method, status assumed to be success, caller has to check status 1696 * before calling this method. 1697 * @param strsrch string search data 1698 * @param textoffset end offset in the collation element text that ends with 1699 * the accents to be rearranged 1700 * @param status error status if any 1701 * @return TRUE if the match is valid, FALSE otherwise 1702 */ 1703 static 1704 UBool doNextCanonicalMatch(UStringSearch *strsrch, 1705 int32_t textoffset, 1706 UErrorCode *status) 1707 { 1708 const UChar *text = strsrch->search->text; 1709 int32_t temp = textoffset; 1710 U16_BACK_1(text, 0, temp); 1711 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1712 UCollationElements *coleiter = strsrch->textIter; 1713 int32_t offset = getColElemIterOffset(coleiter, FALSE); 1714 if (strsrch->pattern.hasPrefixAccents) { 1715 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1716 status); 1717 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1718 setColEIterOffset(coleiter, offset); 1719 return TRUE; 1720 } 1721 } 1722 return FALSE; 1723 } 1724 1725 if (!strsrch->pattern.hasSuffixAccents) { 1726 return FALSE; 1727 } 1728 1729 UChar accents[INITIAL_ARRAY_SIZE_]; 1730 // offset to the last base character in substring to search 1731 int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1732 // normalizing the offensive string 1733 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1734 0, accents, INITIAL_ARRAY_SIZE_, status); 1735 // status checked in loop below 1736 1737 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1738 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1739 1740 // 2 power n - 1 plus the full set of accents 1741 int32_t count = (2 << (size - 1)) - 1; 1742 while (U_SUCCESS(*status) && count > 0) { 1743 UChar *rearrange = strsrch->canonicalSuffixAccents; 1744 // copy the base characters 1745 for (int k = 0; k < accentsindex[0]; k ++) { 1746 *rearrange ++ = accents[k]; 1747 } 1748 // forming all possible canonical rearrangement by dropping 1749 // sets of accents 1750 for (int i = 0; i <= size - 1; i ++) { 1751 int32_t mask = 1 << (size - i - 1); 1752 if (count & mask) { 1753 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1754 *rearrange ++ = accents[j]; 1755 } 1756 } 1757 } 1758 *rearrange = 0; 1759 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1760 status); 1761 if (offset != USEARCH_DONE) { 1762 return TRUE; // match found 1763 } 1764 count --; 1765 } 1766 return FALSE; 1767 } 1768 1769 /** 1770 * Gets the previous base character offset depending on the string search 1771 * pattern data 1772 * @param strsrch string search data 1773 * @param textoffset current offset, current character 1774 * @return the offset of the next character after this base character or itself 1775 * if it is a composed character with accents 1776 */ 1777 static 1778 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1779 int32_t textoffset) 1780 { 1781 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1782 const UChar *text = strsrch->search->text; 1783 int32_t offset = textoffset; 1784 if (getFCD(text, &offset, strsrch->search->textLength) >> 1785 SECOND_LAST_BYTE_SHIFT_) { 1786 return getPreviousBaseOffset(text, textoffset); 1787 } 1788 } 1789 return textoffset; 1790 } 1791 1792 /** 1793 * Checks match for contraction. 1794 * If the match ends with a partial contraction we fail. 1795 * If the match starts too far off (because of backwards iteration) we try to 1796 * chip off the extra characters 1797 * Internal method, status assumed to be success, caller has to check status 1798 * before calling this method. 1799 * @param strsrch string search data 1800 * @param start offset of potential match, to be modified if necessary 1801 * @param end offset of potential match, to be modified if necessary 1802 * @param status output error status if any 1803 * @return TRUE if match passes the contraction test, FALSE otherwise 1804 */ 1805 static 1806 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1807 int32_t *start, 1808 int32_t *end, 1809 UErrorCode *status) 1810 { 1811 UCollationElements *coleiter = strsrch->textIter; 1812 int32_t textlength = strsrch->search->textLength; 1813 int32_t temp = *start; 1814 const UCollator *collator = strsrch->collator; 1815 const UChar *text = strsrch->search->text; 1816 // This part checks if either ends of the match contains potential 1817 // contraction. If so we'll have to iterate through them 1818 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1819 (*start + 1 < textlength 1820 && ucol_unsafeCP(text[*start + 1], collator))) { 1821 int32_t expansion = getExpansionPrefix(coleiter); 1822 UBool expandflag = expansion > 0; 1823 setColEIterOffset(coleiter, *start); 1824 while (expansion > 0) { 1825 // getting rid of the redundant ce, caused by setOffset. 1826 // since backward contraction/expansion may have extra ces if we 1827 // are in the normalization buffer, hasAccentsBeforeMatch would 1828 // have taken care of it. 1829 // E.g. the character \u01FA will have an expansion of 3, but if 1830 // we are only looking for acute and ring \u030A and \u0301, we'll 1831 // have to skip the first ce in the expansion buffer. 1832 ucol_next(coleiter, status); 1833 if (U_FAILURE(*status)) { 1834 return FALSE; 1835 } 1836 if (ucol_getOffset(coleiter) != temp) { 1837 *start = temp; 1838 temp = ucol_getOffset(coleiter); 1839 } 1840 expansion --; 1841 } 1842 1843 int32_t *patternce = strsrch->pattern.ces; 1844 int32_t patterncelength = strsrch->pattern.cesLength; 1845 int32_t count = 0; 1846 int32_t textlength = strsrch->search->textLength; 1847 while (count < patterncelength) { 1848 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1849 // status checked below, note that if status is a failure 1850 // ucol_next returns UCOL_NULLORDER 1851 if (ce == UCOL_IGNORABLE) { 1852 continue; 1853 } 1854 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1855 *start = temp; 1856 temp = ucol_getOffset(coleiter); 1857 } 1858 1859 if (count == 0 && ce != patternce[0]) { 1860 // accents may have extra starting ces, this occurs when a 1861 // pure accent pattern is matched without rearrangement 1862 // text \u0325\u0300 and looking for \u0300 1863 int32_t expected = patternce[0]; 1864 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1865 ce = getCE(strsrch, ucol_next(coleiter, status)); 1866 while (U_SUCCESS(*status) && ce != expected && 1867 ce != UCOL_NULLORDER && 1868 ucol_getOffset(coleiter) <= *end) { 1869 ce = getCE(strsrch, ucol_next(coleiter, status)); 1870 } 1871 } 1872 } 1873 if (U_FAILURE(*status) || ce != patternce[count]) { 1874 (*end) ++; 1875 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1876 return FALSE; 1877 } 1878 count ++; 1879 } 1880 } 1881 return TRUE; 1882 } 1883 1884 /** 1885 * Checks and sets the match information if found. 1886 * Checks 1887 * <ul> 1888 * <li> the potential match does not repeat the previous match 1889 * <li> boundaries are correct 1890 * <li> potential match does not end in the middle of a contraction 1891 * <li> identical matches 1892 * <\ul> 1893 * Otherwise the offset will be shifted to the next character. 1894 * Internal method, status assumed to be success, caller has to check the 1895 * status before calling this method. 1896 * @param strsrch string search data 1897 * @param textoffset offset in the collation element text. the returned value 1898 * will be the truncated end offset of the match or the new start 1899 * search offset. 1900 * @param status output error status if any 1901 * @return TRUE if the match is valid, FALSE otherwise 1902 */ 1903 static 1904 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1905 int32_t *textoffset, 1906 UErrorCode *status) 1907 { 1908 // to ensure that the start and ends are not composite characters 1909 UCollationElements *coleiter = strsrch->textIter; 1910 // if we have a canonical accent match 1911 if ((strsrch->pattern.hasSuffixAccents && 1912 strsrch->canonicalSuffixAccents[0]) || 1913 (strsrch->pattern.hasPrefixAccents && 1914 strsrch->canonicalPrefixAccents[0])) { 1915 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1916 strsrch, 1917 ucol_getOffset(coleiter)); 1918 strsrch->search->matchedLength = *textoffset - 1919 strsrch->search->matchedIndex; 1920 return TRUE; 1921 } 1922 1923 int32_t start = getColElemIterOffset(coleiter, FALSE); 1924 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1925 status) || U_FAILURE(*status)) { 1926 return FALSE; 1927 } 1928 1929 start = getPreviousUStringSearchBaseOffset(strsrch, start); 1930 // this totally matches, however we need to check if it is repeating 1931 if (checkRepeatedMatch(strsrch, start, *textoffset) || 1932 !isBreakUnit(strsrch, start, *textoffset) || 1933 !checkIdentical(strsrch, start, *textoffset)) { 1934 (*textoffset) ++; 1935 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1936 strsrch->search->textLength); 1937 return FALSE; 1938 } 1939 1940 strsrch->search->matchedIndex = start; 1941 strsrch->search->matchedLength = *textoffset - start; 1942 return TRUE; 1943 } 1944 1945 /** 1946 * Shifting the collation element iterator position forward to prepare for 1947 * a preceding match. If the first character is a unsafe character, we'll only 1948 * shift by 1 to capture contractions, normalization etc. 1949 * Internal method, status assumed to be success, caller has to check status 1950 * before calling this method. 1951 * @param text strsrch string search data 1952 * @param textoffset start text position to do search 1953 * @param ce the text ce which failed the match. 1954 * @param patternceindex index of the ce within the pattern ce buffer which 1955 * failed the match 1956 * @return final offset 1957 */ 1958 static 1959 inline int32_t reverseShift(UStringSearch *strsrch, 1960 int32_t textoffset, 1961 int32_t ce, 1962 int32_t patternceindex) 1963 { 1964 if (strsrch->search->isOverlap) { 1965 if (textoffset != strsrch->search->textLength) { 1966 textoffset --; 1967 } 1968 else { 1969 textoffset -= strsrch->pattern.defaultShiftSize; 1970 } 1971 } 1972 else { 1973 if (ce != UCOL_NULLORDER) { 1974 int32_t shift = strsrch->pattern.backShift[hash(ce)]; 1975 1976 // this is to adjust for characters in the middle of the substring 1977 // for matching that failed. 1978 int32_t adjust = patternceindex; 1979 if (adjust > 1 && shift > adjust) { 1980 shift -= adjust - 1; 1981 } 1982 textoffset -= shift; 1983 } 1984 else { 1985 textoffset -= strsrch->pattern.defaultShiftSize; 1986 } 1987 } 1988 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 1989 return textoffset; 1990 } 1991 1992 /** 1993 * Checks match for contraction. 1994 * If the match starts with a partial contraction we fail. 1995 * Internal method, status assumed to be success, caller has to check status 1996 * before calling this method. 1997 * @param strsrch string search data 1998 * @param start offset of potential match, to be modified if necessary 1999 * @param end offset of potential match, to be modified if necessary 2000 * @param status output error status if any 2001 * @return TRUE if match passes the contraction test, FALSE otherwise 2002 */ 2003 static 2004 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2005 int32_t *start, 2006 int32_t *end, UErrorCode *status) 2007 { 2008 UCollationElements *coleiter = strsrch->textIter; 2009 int32_t textlength = strsrch->search->textLength; 2010 int32_t temp = *end; 2011 const UCollator *collator = strsrch->collator; 2012 const UChar *text = strsrch->search->text; 2013 // This part checks if either if the start of the match contains potential 2014 // contraction. If so we'll have to iterate through them 2015 // Since we used ucol_next while previously looking for the potential 2016 // match, this guarantees that our end will not be a partial contraction, 2017 // or a partial supplementary character. 2018 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2019 int32_t expansion = getExpansionSuffix(coleiter); 2020 UBool expandflag = expansion > 0; 2021 setColEIterOffset(coleiter, *end); 2022 while (U_SUCCESS(*status) && expansion > 0) { 2023 // getting rid of the redundant ce 2024 // since forward contraction/expansion may have extra ces 2025 // if we are in the normalization buffer, hasAccentsBeforeMatch 2026 // would have taken care of it. 2027 // E.g. the character \u01FA will have an expansion of 3, but if 2028 // we are only looking for A ring A\u030A, we'll have to skip the 2029 // last ce in the expansion buffer 2030 ucol_previous(coleiter, status); 2031 if (U_FAILURE(*status)) { 2032 return FALSE; 2033 } 2034 if (ucol_getOffset(coleiter) != temp) { 2035 *end = temp; 2036 temp = ucol_getOffset(coleiter); 2037 } 2038 expansion --; 2039 } 2040 2041 int32_t *patternce = strsrch->pattern.ces; 2042 int32_t patterncelength = strsrch->pattern.cesLength; 2043 int32_t count = patterncelength; 2044 while (count > 0) { 2045 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2046 // status checked below, note that if status is a failure 2047 // ucol_previous returns UCOL_NULLORDER 2048 if (ce == UCOL_IGNORABLE) { 2049 continue; 2050 } 2051 if (expandflag && count == 0 && 2052 getColElemIterOffset(coleiter, FALSE) != temp) { 2053 *end = temp; 2054 temp = ucol_getOffset(coleiter); 2055 } 2056 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2057 (*start) --; 2058 *start = getPreviousBaseOffset(text, *start); 2059 return FALSE; 2060 } 2061 count --; 2062 } 2063 } 2064 return TRUE; 2065 } 2066 2067 /** 2068 * Checks and sets the match information if found. 2069 * Checks 2070 * <ul> 2071 * <li> the current match does not repeat the last match 2072 * <li> boundaries are correct 2073 * <li> exact matches has no extra accents 2074 * <li> identical matches 2075 * <\ul> 2076 * Otherwise the offset will be shifted to the preceding character. 2077 * Internal method, status assumed to be success, caller has to check status 2078 * before calling this method. 2079 * @param strsrch string search data 2080 * @param collator 2081 * @param coleiter collation element iterator 2082 * @param text string 2083 * @param textoffset offset in the collation element text. the returned value 2084 * will be the truncated start offset of the match or the new start 2085 * search offset. 2086 * @param status output error status if any 2087 * @return TRUE if the match is valid, FALSE otherwise 2088 */ 2089 static 2090 inline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2091 int32_t *textoffset, 2092 UErrorCode *status) 2093 { 2094 // to ensure that the start and ends are not composite characters 2095 int32_t end = ucol_getOffset(strsrch->textIter); 2096 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2097 || U_FAILURE(*status)) { 2098 return FALSE; 2099 } 2100 2101 // this totally matches, however we need to check if it is repeating 2102 // the old match 2103 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2104 !isBreakUnit(strsrch, *textoffset, end) || 2105 hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2106 !checkIdentical(strsrch, *textoffset, end) || 2107 hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2108 (*textoffset) --; 2109 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2110 *textoffset); 2111 return FALSE; 2112 } 2113 2114 //Add breakiterator boundary check for primary strength search. 2115 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2116 checkBreakBoundary(strsrch, textoffset, &end); 2117 } 2118 2119 strsrch->search->matchedIndex = *textoffset; 2120 strsrch->search->matchedLength = end - *textoffset; 2121 return TRUE; 2122 } 2123 2124 /** 2125 * Rearranges the end accents to try matching. 2126 * Suffix accents in the text will be grouped according to their combining 2127 * class and the groups will be mixed and matched to try find the perfect 2128 * match with the pattern. 2129 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2130 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2131 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2132 * "\u0301\u0325". 2133 * step 2: check if any of the generated substrings matches the pattern. 2134 * Internal method, status assumed to be success, user has to check status 2135 * before calling this method. 2136 * @param strsrch string search match 2137 * @param start offset of the first base character 2138 * @param end start of the last accent set 2139 * @param status only error status if any 2140 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2141 * offset of the match. Note this start includes all following accents. 2142 */ 2143 static 2144 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2145 int32_t start, 2146 int32_t end, 2147 UErrorCode *status) 2148 { 2149 const UChar *text = strsrch->search->text; 2150 int32_t tempend = end; 2151 2152 U16_BACK_1(text, 0, tempend); 2153 if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2154 LAST_BYTE_MASK_)) { 2155 // die... failed at a base character 2156 return USEARCH_DONE; 2157 } 2158 end = getNextBaseOffset(text, end, strsrch->search->textLength); 2159 2160 if (U_SUCCESS(*status)) { 2161 UChar accents[INITIAL_ARRAY_SIZE_]; 2162 int32_t offset = getPreviousBaseOffset(text, end); 2163 // normalizing the offensive string 2164 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2165 INITIAL_ARRAY_SIZE_, status); 2166 2167 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2168 int32_t accentsize = getUnblockedAccentIndex(accents, 2169 accentsindex); 2170 int32_t count = (2 << (accentsize - 1)) - 1; 2171 UChar buffer[INITIAL_ARRAY_SIZE_]; 2172 UCollationElements *coleiter = strsrch->utilIter; 2173 while (U_SUCCESS(*status) && count > 0) { 2174 UChar *rearrange = strsrch->canonicalSuffixAccents; 2175 // copy the base characters 2176 for (int k = 0; k < accentsindex[0]; k ++) { 2177 *rearrange ++ = accents[k]; 2178 } 2179 // forming all possible canonical rearrangement by dropping 2180 // sets of accents 2181 for (int i = 0; i <= accentsize - 1; i ++) { 2182 int32_t mask = 1 << (accentsize - i - 1); 2183 if (count & mask) { 2184 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2185 *rearrange ++ = accents[j]; 2186 } 2187 } 2188 } 2189 *rearrange = 0; 2190 int32_t matchsize = INITIAL_ARRAY_SIZE_; 2191 UChar *match = addToUCharArray(buffer, &matchsize, 2192 strsrch->canonicalPrefixAccents, 2193 strsrch->search->text + start, 2194 offset - start, 2195 strsrch->canonicalSuffixAccents, 2196 status); 2197 2198 // run the collator iterator through this match 2199 // if status is a failure ucol_setText does nothing 2200 ucol_setText(coleiter, match, matchsize, status); 2201 if (U_SUCCESS(*status)) { 2202 if (checkCollationMatch(strsrch, coleiter)) { 2203 if (match != buffer) { 2204 uprv_free(match); 2205 } 2206 return end; 2207 } 2208 } 2209 count --; 2210 } 2211 } 2212 return USEARCH_DONE; 2213 } 2214 2215 /** 2216 * Take the rearranged start accents and tries matching. If match failed at 2217 * a seperate following set of accents (seperated from the rearranged on by 2218 * at least a base character) then we rearrange the preceding accents and 2219 * tries matching again. 2220 * We allow skipping of the ends of the accent set if the ces do not match. 2221 * However if the failure is found before the accent set, it fails. 2222 * Internal method, status assumed to be success, caller has to check status 2223 * before calling this method. 2224 * @param strsrch string search data 2225 * @param textoffset of the ends of the rearranged accent 2226 * @param status output error status if any 2227 * @return USEARCH_DONE if a match is not found, otherwise return the ending 2228 * offset of the match. Note this start includes all following accents. 2229 */ 2230 static 2231 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2232 int32_t textoffset, 2233 UErrorCode *status) 2234 { 2235 const UChar *text = strsrch->search->text; 2236 const UCollator *collator = strsrch->collator; 2237 int32_t safelength = 0; 2238 UChar *safetext; 2239 int32_t safetextlength; 2240 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2241 int32_t safeoffset = textoffset; 2242 2243 if (textoffset && 2244 ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2245 u_strlen(strsrch->canonicalPrefixAccents) - 1 2246 ], collator)) { 2247 safeoffset = getNextSafeOffset(collator, text, textoffset, 2248 strsrch->search->textLength); 2249 safelength = safeoffset - textoffset; 2250 safetextlength = INITIAL_ARRAY_SIZE_; 2251 safetext = addToUCharArray(safebuffer, &safetextlength, 2252 strsrch->canonicalPrefixAccents, 2253 text + textoffset, safelength, 2254 NULL, status); 2255 } 2256 else { 2257 safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2258 safetext = strsrch->canonicalPrefixAccents; 2259 } 2260 2261 UCollationElements *coleiter = strsrch->utilIter; 2262 // if status is a failure, ucol_setText does nothing 2263 ucol_setText(coleiter, safetext, safetextlength, status); 2264 // status checked in loop below 2265 2266 int32_t *ce = strsrch->pattern.ces; 2267 int32_t celength = strsrch->pattern.cesLength; 2268 int ceindex = 0; 2269 UBool isSafe = TRUE; // safe zone indication flag for position 2270 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2271 2272 while (ceindex < celength) { 2273 int32_t textce = ucol_next(coleiter, status); 2274 if (U_FAILURE(*status)) { 2275 if (isSafe) { 2276 cleanUpSafeText(strsrch, safetext, safebuffer); 2277 } 2278 return USEARCH_DONE; 2279 } 2280 if (textce == UCOL_NULLORDER) { 2281 // check if we have passed the safe buffer 2282 if (coleiter == strsrch->textIter) { 2283 cleanUpSafeText(strsrch, safetext, safebuffer); 2284 return USEARCH_DONE; 2285 } 2286 cleanUpSafeText(strsrch, safetext, safebuffer); 2287 safetext = safebuffer; 2288 coleiter = strsrch->textIter; 2289 setColEIterOffset(coleiter, safeoffset); 2290 // status checked at the start of the loop 2291 isSafe = FALSE; 2292 continue; 2293 } 2294 textce = getCE(strsrch, textce); 2295 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2296 // do the beginning stuff 2297 int32_t failedoffset = ucol_getOffset(coleiter); 2298 if (isSafe && failedoffset <= prefixlength) { 2299 // alas... no hope. failed at rearranged accent set 2300 cleanUpSafeText(strsrch, safetext, safebuffer); 2301 return USEARCH_DONE; 2302 } 2303 else { 2304 if (isSafe) { 2305 failedoffset = safeoffset - failedoffset; 2306 cleanUpSafeText(strsrch, safetext, safebuffer); 2307 } 2308 2309 // try rearranging the end accents 2310 int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2311 textoffset, failedoffset, status); 2312 if (result != USEARCH_DONE) { 2313 // if status is a failure, ucol_setOffset does nothing 2314 setColEIterOffset(strsrch->textIter, result); 2315 } 2316 if (U_FAILURE(*status)) { 2317 return USEARCH_DONE; 2318 } 2319 return result; 2320 } 2321 } 2322 if (textce == ce[ceindex]) { 2323 ceindex ++; 2324 } 2325 } 2326 // set offset here 2327 if (isSafe) { 2328 int32_t result = ucol_getOffset(coleiter); 2329 // sets the text iterator here with the correct expansion and offset 2330 int32_t leftoverces = getExpansionSuffix(coleiter); 2331 cleanUpSafeText(strsrch, safetext, safebuffer); 2332 if (result <= prefixlength) { 2333 result = textoffset; 2334 } 2335 else { 2336 result = textoffset + (safeoffset - result); 2337 } 2338 setColEIterOffset(strsrch->textIter, result); 2339 setExpansionSuffix(strsrch->textIter, leftoverces); 2340 return result; 2341 } 2342 2343 return ucol_getOffset(coleiter); 2344 } 2345 2346 /** 2347 * Trying out the substring and sees if it can be a canonical match. 2348 * This will try normalizing the starting accents and arranging them into 2349 * canonical equivalents and check their corresponding ces with the pattern ce. 2350 * Prefix accents in the text will be grouped according to their combining 2351 * class and the groups will be mixed and matched to try find the perfect 2352 * match with the pattern. 2353 * So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2354 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2355 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2356 * "\u0301\u0325". 2357 * step 2: check if any of the generated substrings matches the pattern. 2358 * Internal method, status assumed to be success, caller has to check status 2359 * before calling this method. 2360 * @param strsrch string search data 2361 * @param textoffset start offset in the collation element text that starts 2362 * with the accents to be rearranged 2363 * @param status output error status if any 2364 * @return TRUE if the match is valid, FALSE otherwise 2365 */ 2366 static 2367 UBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2368 int32_t textoffset, 2369 UErrorCode *status) 2370 { 2371 const UChar *text = strsrch->search->text; 2372 int32_t temp = textoffset; 2373 int32_t textlength = strsrch->search->textLength; 2374 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2375 UCollationElements *coleiter = strsrch->textIter; 2376 int32_t offset = ucol_getOffset(coleiter); 2377 if (strsrch->pattern.hasSuffixAccents) { 2378 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2379 offset, status); 2380 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2381 setColEIterOffset(coleiter, offset); 2382 return TRUE; 2383 } 2384 } 2385 return FALSE; 2386 } 2387 2388 if (!strsrch->pattern.hasPrefixAccents) { 2389 return FALSE; 2390 } 2391 2392 UChar accents[INITIAL_ARRAY_SIZE_]; 2393 // offset to the last base character in substring to search 2394 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2395 // normalizing the offensive string 2396 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2397 0, accents, INITIAL_ARRAY_SIZE_, status); 2398 // status checked in loop 2399 2400 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2401 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2402 2403 // 2 power n - 1 plus the full set of accents 2404 int32_t count = (2 << (size - 1)) - 1; 2405 while (U_SUCCESS(*status) && count > 0) { 2406 UChar *rearrange = strsrch->canonicalPrefixAccents; 2407 // copy the base characters 2408 for (int k = 0; k < accentsindex[0]; k ++) { 2409 *rearrange ++ = accents[k]; 2410 } 2411 // forming all possible canonical rearrangement by dropping 2412 // sets of accents 2413 for (int i = 0; i <= size - 1; i ++) { 2414 int32_t mask = 1 << (size - i - 1); 2415 if (count & mask) { 2416 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2417 *rearrange ++ = accents[j]; 2418 } 2419 } 2420 } 2421 *rearrange = 0; 2422 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2423 baseoffset, status); 2424 if (offset != USEARCH_DONE) { 2425 return TRUE; // match found 2426 } 2427 count --; 2428 } 2429 return FALSE; 2430 } 2431 2432 /** 2433 * Checks match for contraction. 2434 * If the match starts with a partial contraction we fail. 2435 * Internal method, status assumed to be success, caller has to check status 2436 * before calling this method. 2437 * @param strsrch string search data 2438 * @param start offset of potential match, to be modified if necessary 2439 * @param end offset of potential match, to be modified if necessary 2440 * @param status only error status if any 2441 * @return TRUE if match passes the contraction test, FALSE otherwise 2442 */ 2443 static 2444 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2445 int32_t *start, 2446 int32_t *end, UErrorCode *status) 2447 { 2448 UCollationElements *coleiter = strsrch->textIter; 2449 int32_t textlength = strsrch->search->textLength; 2450 int32_t temp = *end; 2451 const UCollator *collator = strsrch->collator; 2452 const UChar *text = strsrch->search->text; 2453 // This part checks if either if the start of the match contains potential 2454 // contraction. If so we'll have to iterate through them 2455 // Since we used ucol_next while previously looking for the potential 2456 // match, this guarantees that our end will not be a partial contraction, 2457 // or a partial supplementary character. 2458 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2459 int32_t expansion = getExpansionSuffix(coleiter); 2460 UBool expandflag = expansion > 0; 2461 setColEIterOffset(coleiter, *end); 2462 while (expansion > 0) { 2463 // getting rid of the redundant ce 2464 // since forward contraction/expansion may have extra ces 2465 // if we are in the normalization buffer, hasAccentsBeforeMatch 2466 // would have taken care of it. 2467 // E.g. the character \u01FA will have an expansion of 3, but if 2468 // we are only looking for A ring A\u030A, we'll have to skip the 2469 // last ce in the expansion buffer 2470 ucol_previous(coleiter, status); 2471 if (U_FAILURE(*status)) { 2472 return FALSE; 2473 } 2474 if (ucol_getOffset(coleiter) != temp) { 2475 *end = temp; 2476 temp = ucol_getOffset(coleiter); 2477 } 2478 expansion --; 2479 } 2480 2481 int32_t *patternce = strsrch->pattern.ces; 2482 int32_t patterncelength = strsrch->pattern.cesLength; 2483 int32_t count = patterncelength; 2484 while (count > 0) { 2485 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2486 // status checked below, note that if status is a failure 2487 // ucol_previous returns UCOL_NULLORDER 2488 if (ce == UCOL_IGNORABLE) { 2489 continue; 2490 } 2491 if (expandflag && count == 0 && 2492 getColElemIterOffset(coleiter, FALSE) != temp) { 2493 *end = temp; 2494 temp = ucol_getOffset(coleiter); 2495 } 2496 if (count == patterncelength && 2497 ce != patternce[patterncelength - 1]) { 2498 // accents may have extra starting ces, this occurs when a 2499 // pure accent pattern is matched without rearrangement 2500 int32_t expected = patternce[patterncelength - 1]; 2501 U16_BACK_1(text, 0, *end); 2502 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2503 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2504 while (U_SUCCESS(*status) && ce != expected && 2505 ce != UCOL_NULLORDER && 2506 ucol_getOffset(coleiter) <= *start) { 2507 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2508 } 2509 } 2510 } 2511 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2512 (*start) --; 2513 *start = getPreviousBaseOffset(text, *start); 2514 return FALSE; 2515 } 2516 count --; 2517 } 2518 } 2519 return TRUE; 2520 } 2521 2522 /** 2523 * Checks and sets the match information if found. 2524 * Checks 2525 * <ul> 2526 * <li> the potential match does not repeat the previous match 2527 * <li> boundaries are correct 2528 * <li> potential match does not end in the middle of a contraction 2529 * <li> identical matches 2530 * <\ul> 2531 * Otherwise the offset will be shifted to the next character. 2532 * Internal method, status assumed to be success, caller has to check status 2533 * before calling this method. 2534 * @param strsrch string search data 2535 * @param textoffset offset in the collation element text. the returned value 2536 * will be the truncated start offset of the match or the new start 2537 * search offset. 2538 * @param status only error status if any 2539 * @return TRUE if the match is valid, FALSE otherwise 2540 */ 2541 static 2542 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2543 int32_t *textoffset, 2544 UErrorCode *status) 2545 { 2546 // to ensure that the start and ends are not composite characters 2547 UCollationElements *coleiter = strsrch->textIter; 2548 // if we have a canonical accent match 2549 if ((strsrch->pattern.hasSuffixAccents && 2550 strsrch->canonicalSuffixAccents[0]) || 2551 (strsrch->pattern.hasPrefixAccents && 2552 strsrch->canonicalPrefixAccents[0])) { 2553 strsrch->search->matchedIndex = *textoffset; 2554 strsrch->search->matchedLength = 2555 getNextUStringSearchBaseOffset(strsrch, 2556 getColElemIterOffset(coleiter, FALSE)) 2557 - *textoffset; 2558 return TRUE; 2559 } 2560 2561 int32_t end = ucol_getOffset(coleiter); 2562 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2563 status) || 2564 U_FAILURE(*status)) { 2565 return FALSE; 2566 } 2567 2568 end = getNextUStringSearchBaseOffset(strsrch, end); 2569 // this totally matches, however we need to check if it is repeating 2570 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2571 !isBreakUnit(strsrch, *textoffset, end) || 2572 !checkIdentical(strsrch, *textoffset, end)) { 2573 (*textoffset) --; 2574 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2575 *textoffset); 2576 return FALSE; 2577 } 2578 2579 strsrch->search->matchedIndex = *textoffset; 2580 strsrch->search->matchedLength = end - *textoffset; 2581 return TRUE; 2582 } 2583 #endif // #if BOYER_MOORE 2584 2585 // constructors and destructor ------------------------------------------- 2586 2587 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2588 int32_t patternlength, 2589 const UChar *text, 2590 int32_t textlength, 2591 const char *locale, 2592 UBreakIterator *breakiter, 2593 UErrorCode *status) 2594 { 2595 if (U_FAILURE(*status)) { 2596 return NULL; 2597 } 2598 #if UCONFIG_NO_BREAK_ITERATION 2599 if (breakiter != NULL) { 2600 *status = U_UNSUPPORTED_ERROR; 2601 return NULL; 2602 } 2603 #endif 2604 if (locale) { 2605 // ucol_open internally checks for status 2606 UCollator *collator = ucol_open(locale, status); 2607 // pattern, text checks are done in usearch_openFromCollator 2608 UStringSearch *result = usearch_openFromCollator(pattern, 2609 patternlength, text, textlength, 2610 collator, breakiter, status); 2611 2612 if (result == NULL || U_FAILURE(*status)) { 2613 if (collator) { 2614 ucol_close(collator); 2615 } 2616 return NULL; 2617 } 2618 else { 2619 result->ownCollator = TRUE; 2620 } 2621 return result; 2622 } 2623 *status = U_ILLEGAL_ARGUMENT_ERROR; 2624 return NULL; 2625 } 2626 2627 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2628 const UChar *pattern, 2629 int32_t patternlength, 2630 const UChar *text, 2631 int32_t textlength, 2632 const UCollator *collator, 2633 UBreakIterator *breakiter, 2634 UErrorCode *status) 2635 { 2636 if (U_FAILURE(*status)) { 2637 return NULL; 2638 } 2639 #if UCONFIG_NO_BREAK_ITERATION 2640 if (breakiter != NULL) { 2641 *status = U_UNSUPPORTED_ERROR; 2642 return NULL; 2643 } 2644 #endif 2645 if (pattern == NULL || text == NULL || collator == NULL) { 2646 *status = U_ILLEGAL_ARGUMENT_ERROR; 2647 return NULL; 2648 } 2649 2650 // string search does not really work when numeric collation is turned on 2651 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2652 *status = U_UNSUPPORTED_ERROR; 2653 return NULL; 2654 } 2655 2656 if (U_SUCCESS(*status)) { 2657 initializeFCD(status); 2658 if (U_FAILURE(*status)) { 2659 return NULL; 2660 } 2661 2662 UStringSearch *result; 2663 if (textlength == -1) { 2664 textlength = u_strlen(text); 2665 } 2666 if (patternlength == -1) { 2667 patternlength = u_strlen(pattern); 2668 } 2669 if (textlength <= 0 || patternlength <= 0) { 2670 *status = U_ILLEGAL_ARGUMENT_ERROR; 2671 return NULL; 2672 } 2673 2674 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2675 if (result == NULL) { 2676 *status = U_MEMORY_ALLOCATION_ERROR; 2677 return NULL; 2678 } 2679 2680 result->collator = collator; 2681 result->strength = ucol_getStrength(collator); 2682 result->ceMask = getMask(result->strength); 2683 result->toShift = 2684 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2685 UCOL_SHIFTED; 2686 result->variableTop = ucol_getVariableTop(collator, status); 2687 2688 result->nfd = Normalizer2::getNFDInstance(*status); 2689 2690 if (U_FAILURE(*status)) { 2691 uprv_free(result); 2692 return NULL; 2693 } 2694 2695 result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2696 if (result->search == NULL) { 2697 *status = U_MEMORY_ALLOCATION_ERROR; 2698 uprv_free(result); 2699 return NULL; 2700 } 2701 2702 result->search->text = text; 2703 result->search->textLength = textlength; 2704 2705 result->pattern.text = pattern; 2706 result->pattern.textLength = patternlength; 2707 result->pattern.ces = NULL; 2708 result->pattern.pces = NULL; 2709 2710 result->search->breakIter = breakiter; 2711 #if !UCONFIG_NO_BREAK_ITERATION 2712 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2713 if (breakiter) { 2714 ubrk_setText(breakiter, text, textlength, status); 2715 } 2716 #endif 2717 2718 result->ownCollator = FALSE; 2719 result->search->matchedLength = 0; 2720 result->search->matchedIndex = USEARCH_DONE; 2721 result->utilIter = NULL; 2722 result->textIter = ucol_openElements(collator, text, 2723 textlength, status); 2724 result->textProcessedIter = NULL; 2725 if (U_FAILURE(*status)) { 2726 usearch_close(result); 2727 return NULL; 2728 } 2729 2730 result->search->isOverlap = FALSE; 2731 result->search->isCanonicalMatch = FALSE; 2732 result->search->elementComparisonType = 0; 2733 result->search->isForwardSearching = TRUE; 2734 result->search->reset = TRUE; 2735 2736 initialize(result, status); 2737 2738 if (U_FAILURE(*status)) { 2739 usearch_close(result); 2740 return NULL; 2741 } 2742 2743 return result; 2744 } 2745 return NULL; 2746 } 2747 2748 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2749 { 2750 if (strsrch) { 2751 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer && 2752 strsrch->pattern.ces) { 2753 uprv_free(strsrch->pattern.ces); 2754 } 2755 2756 if (strsrch->pattern.pces != NULL && 2757 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 2758 uprv_free(strsrch->pattern.pces); 2759 } 2760 2761 delete strsrch->textProcessedIter; 2762 ucol_closeElements(strsrch->textIter); 2763 ucol_closeElements(strsrch->utilIter); 2764 2765 if (strsrch->ownCollator && strsrch->collator) { 2766 ucol_close((UCollator *)strsrch->collator); 2767 } 2768 2769 #if !UCONFIG_NO_BREAK_ITERATION 2770 if (strsrch->search->internalBreakIter) { 2771 ubrk_close(strsrch->search->internalBreakIter); 2772 } 2773 #endif 2774 2775 uprv_free(strsrch->search); 2776 uprv_free(strsrch); 2777 } 2778 } 2779 2780 namespace { 2781 2782 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { 2783 if (U_FAILURE(*status)) { return FALSE; } 2784 if (strsrch->textProcessedIter == NULL) { 2785 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); 2786 if (strsrch->textProcessedIter == NULL) { 2787 *status = U_MEMORY_ALLOCATION_ERROR; 2788 return FALSE; 2789 } 2790 } else { 2791 strsrch->textProcessedIter->init(strsrch->textIter); 2792 } 2793 return TRUE; 2794 } 2795 2796 } 2797 2798 // set and get methods -------------------------------------------------- 2799 2800 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2801 int32_t position, 2802 UErrorCode *status) 2803 { 2804 if (U_SUCCESS(*status) && strsrch) { 2805 if (isOutOfBounds(strsrch->search->textLength, position)) { 2806 *status = U_INDEX_OUTOFBOUNDS_ERROR; 2807 } 2808 else { 2809 setColEIterOffset(strsrch->textIter, position); 2810 } 2811 strsrch->search->matchedIndex = USEARCH_DONE; 2812 strsrch->search->matchedLength = 0; 2813 strsrch->search->reset = FALSE; 2814 } 2815 } 2816 2817 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2818 { 2819 if (strsrch) { 2820 int32_t result = ucol_getOffset(strsrch->textIter); 2821 if (isOutOfBounds(strsrch->search->textLength, result)) { 2822 return USEARCH_DONE; 2823 } 2824 return result; 2825 } 2826 return USEARCH_DONE; 2827 } 2828 2829 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2830 USearchAttribute attribute, 2831 USearchAttributeValue value, 2832 UErrorCode *status) 2833 { 2834 if (U_SUCCESS(*status) && strsrch) { 2835 switch (attribute) 2836 { 2837 case USEARCH_OVERLAP : 2838 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2839 break; 2840 case USEARCH_CANONICAL_MATCH : 2841 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2842 FALSE); 2843 break; 2844 case USEARCH_ELEMENT_COMPARISON : 2845 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2846 strsrch->search->elementComparisonType = (int16_t)value; 2847 } else { 2848 strsrch->search->elementComparisonType = 0; 2849 } 2850 break; 2851 case USEARCH_ATTRIBUTE_COUNT : 2852 default: 2853 *status = U_ILLEGAL_ARGUMENT_ERROR; 2854 } 2855 } 2856 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2857 *status = U_ILLEGAL_ARGUMENT_ERROR; 2858 } 2859 } 2860 2861 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2862 const UStringSearch *strsrch, 2863 USearchAttribute attribute) 2864 { 2865 if (strsrch) { 2866 switch (attribute) { 2867 case USEARCH_OVERLAP : 2868 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2869 USEARCH_OFF); 2870 case USEARCH_CANONICAL_MATCH : 2871 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2872 USEARCH_OFF); 2873 case USEARCH_ELEMENT_COMPARISON : 2874 { 2875 int16_t value = strsrch->search->elementComparisonType; 2876 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2877 return (USearchAttributeValue)value; 2878 } else { 2879 return USEARCH_STANDARD_ELEMENT_COMPARISON; 2880 } 2881 } 2882 case USEARCH_ATTRIBUTE_COUNT : 2883 return USEARCH_DEFAULT; 2884 } 2885 } 2886 return USEARCH_DEFAULT; 2887 } 2888 2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2890 const UStringSearch *strsrch) 2891 { 2892 if (strsrch == NULL) { 2893 return USEARCH_DONE; 2894 } 2895 return strsrch->search->matchedIndex; 2896 } 2897 2898 2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2900 UChar *result, 2901 int32_t resultCapacity, 2902 UErrorCode *status) 2903 { 2904 if (U_FAILURE(*status)) { 2905 return USEARCH_DONE; 2906 } 2907 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2908 result == NULL)) { 2909 *status = U_ILLEGAL_ARGUMENT_ERROR; 2910 return USEARCH_DONE; 2911 } 2912 2913 int32_t copylength = strsrch->search->matchedLength; 2914 int32_t copyindex = strsrch->search->matchedIndex; 2915 if (copyindex == USEARCH_DONE) { 2916 u_terminateUChars(result, resultCapacity, 0, status); 2917 return USEARCH_DONE; 2918 } 2919 2920 if (resultCapacity < copylength) { 2921 copylength = resultCapacity; 2922 } 2923 if (copylength > 0) { 2924 uprv_memcpy(result, strsrch->search->text + copyindex, 2925 copylength * sizeof(UChar)); 2926 } 2927 return u_terminateUChars(result, resultCapacity, 2928 strsrch->search->matchedLength, status); 2929 } 2930 2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2932 const UStringSearch *strsrch) 2933 { 2934 if (strsrch) { 2935 return strsrch->search->matchedLength; 2936 } 2937 return USEARCH_DONE; 2938 } 2939 2940 #if !UCONFIG_NO_BREAK_ITERATION 2941 2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2943 UBreakIterator *breakiter, 2944 UErrorCode *status) 2945 { 2946 if (U_SUCCESS(*status) && strsrch) { 2947 strsrch->search->breakIter = breakiter; 2948 if (breakiter) { 2949 ubrk_setText(breakiter, strsrch->search->text, 2950 strsrch->search->textLength, status); 2951 } 2952 } 2953 } 2954 2955 U_CAPI const UBreakIterator* U_EXPORT2 2956 usearch_getBreakIterator(const UStringSearch *strsrch) 2957 { 2958 if (strsrch) { 2959 return strsrch->search->breakIter; 2960 } 2961 return NULL; 2962 } 2963 2964 #endif 2965 2966 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2967 const UChar *text, 2968 int32_t textlength, 2969 UErrorCode *status) 2970 { 2971 if (U_SUCCESS(*status)) { 2972 if (strsrch == NULL || text == NULL || textlength < -1 || 2973 textlength == 0) { 2974 *status = U_ILLEGAL_ARGUMENT_ERROR; 2975 } 2976 else { 2977 if (textlength == -1) { 2978 textlength = u_strlen(text); 2979 } 2980 strsrch->search->text = text; 2981 strsrch->search->textLength = textlength; 2982 ucol_setText(strsrch->textIter, text, textlength, status); 2983 strsrch->search->matchedIndex = USEARCH_DONE; 2984 strsrch->search->matchedLength = 0; 2985 strsrch->search->reset = TRUE; 2986 #if !UCONFIG_NO_BREAK_ITERATION 2987 if (strsrch->search->breakIter != NULL) { 2988 ubrk_setText(strsrch->search->breakIter, text, 2989 textlength, status); 2990 } 2991 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2992 #endif 2993 } 2994 } 2995 } 2996 2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 2998 int32_t *length) 2999 { 3000 if (strsrch) { 3001 *length = strsrch->search->textLength; 3002 return strsrch->search->text; 3003 } 3004 return NULL; 3005 } 3006 3007 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3008 const UCollator *collator, 3009 UErrorCode *status) 3010 { 3011 if (U_SUCCESS(*status)) { 3012 if (collator == NULL) { 3013 *status = U_ILLEGAL_ARGUMENT_ERROR; 3014 return; 3015 } 3016 3017 if (strsrch) { 3018 delete strsrch->textProcessedIter; 3019 strsrch->textProcessedIter = NULL; 3020 ucol_closeElements(strsrch->textIter); 3021 ucol_closeElements(strsrch->utilIter); 3022 strsrch->textIter = strsrch->utilIter = NULL; 3023 if (strsrch->ownCollator && (strsrch->collator != collator)) { 3024 ucol_close((UCollator *)strsrch->collator); 3025 strsrch->ownCollator = FALSE; 3026 } 3027 strsrch->collator = collator; 3028 strsrch->strength = ucol_getStrength(collator); 3029 strsrch->ceMask = getMask(strsrch->strength); 3030 #if !UCONFIG_NO_BREAK_ITERATION 3031 ubrk_close(strsrch->search->internalBreakIter); 3032 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3033 strsrch->search->text, strsrch->search->textLength, status); 3034 #endif 3035 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3036 strsrch->toShift = 3037 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3038 UCOL_SHIFTED; 3039 // if status is a failure, ucol_getVariableTop returns 0 3040 strsrch->variableTop = ucol_getVariableTop(collator, status); 3041 strsrch->textIter = ucol_openElements(collator, 3042 strsrch->search->text, 3043 strsrch->search->textLength, 3044 status); 3045 strsrch->utilIter = ucol_openElements( 3046 collator, strsrch->pattern.text, strsrch->pattern.textLength, status); 3047 // initialize() _after_ setting the iterators for the new collator. 3048 initialize(strsrch, status); 3049 } 3050 3051 // **** are these calls needed? 3052 // **** we call uprv_init_pce in initializePatternPCETable 3053 // **** and the CEIBuffer constructor... 3054 #if 0 3055 uprv_init_pce(strsrch->textIter); 3056 uprv_init_pce(strsrch->utilIter); 3057 #endif 3058 } 3059 } 3060 3061 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3062 { 3063 if (strsrch) { 3064 return (UCollator *)strsrch->collator; 3065 } 3066 return NULL; 3067 } 3068 3069 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3070 const UChar *pattern, 3071 int32_t patternlength, 3072 UErrorCode *status) 3073 { 3074 if (U_SUCCESS(*status)) { 3075 if (strsrch == NULL || pattern == NULL) { 3076 *status = U_ILLEGAL_ARGUMENT_ERROR; 3077 } 3078 else { 3079 if (patternlength == -1) { 3080 patternlength = u_strlen(pattern); 3081 } 3082 if (patternlength == 0) { 3083 *status = U_ILLEGAL_ARGUMENT_ERROR; 3084 return; 3085 } 3086 strsrch->pattern.text = pattern; 3087 strsrch->pattern.textLength = patternlength; 3088 initialize(strsrch, status); 3089 } 3090 } 3091 } 3092 3093 U_CAPI const UChar* U_EXPORT2 3094 usearch_getPattern(const UStringSearch *strsrch, 3095 int32_t *length) 3096 { 3097 if (strsrch) { 3098 *length = strsrch->pattern.textLength; 3099 return strsrch->pattern.text; 3100 } 3101 return NULL; 3102 } 3103 3104 // miscellanous methods -------------------------------------------------- 3105 3106 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3107 UErrorCode *status) 3108 { 3109 if (strsrch && U_SUCCESS(*status)) { 3110 strsrch->search->isForwardSearching = TRUE; 3111 usearch_setOffset(strsrch, 0, status); 3112 if (U_SUCCESS(*status)) { 3113 return usearch_next(strsrch, status); 3114 } 3115 } 3116 return USEARCH_DONE; 3117 } 3118 3119 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3120 int32_t position, 3121 UErrorCode *status) 3122 { 3123 if (strsrch && U_SUCCESS(*status)) { 3124 strsrch->search->isForwardSearching = TRUE; 3125 // position checked in usearch_setOffset 3126 usearch_setOffset(strsrch, position, status); 3127 if (U_SUCCESS(*status)) { 3128 return usearch_next(strsrch, status); 3129 } 3130 } 3131 return USEARCH_DONE; 3132 } 3133 3134 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3135 UErrorCode *status) 3136 { 3137 if (strsrch && U_SUCCESS(*status)) { 3138 strsrch->search->isForwardSearching = FALSE; 3139 usearch_setOffset(strsrch, strsrch->search->textLength, status); 3140 if (U_SUCCESS(*status)) { 3141 return usearch_previous(strsrch, status); 3142 } 3143 } 3144 return USEARCH_DONE; 3145 } 3146 3147 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3148 int32_t position, 3149 UErrorCode *status) 3150 { 3151 if (strsrch && U_SUCCESS(*status)) { 3152 strsrch->search->isForwardSearching = FALSE; 3153 // position checked in usearch_setOffset 3154 usearch_setOffset(strsrch, position, status); 3155 if (U_SUCCESS(*status)) { 3156 return usearch_previous(strsrch, status); 3157 } 3158 } 3159 return USEARCH_DONE; 3160 } 3161 3162 /** 3163 * If a direction switch is required, we'll count the number of ces till the 3164 * beginning of the collation element iterator and iterate forwards that 3165 * number of times. This is so that we get to the correct point within the 3166 * string to continue the search in. Imagine when we are in the middle of the 3167 * normalization buffer when the change in direction is request. arrrgghh.... 3168 * After searching the offset within the collation element iterator will be 3169 * shifted to the start of the match. If a match is not found, the offset would 3170 * have been set to the end of the text string in the collation element 3171 * iterator. 3172 * Okay, here's my take on normalization buffer. The only time when there can 3173 * be 2 matches within the same normalization is when the pattern is consists 3174 * of all accents. But since the offset returned is from the text string, we 3175 * should not confuse the caller by returning the second match within the 3176 * same normalization buffer. If we do, the 2 results will have the same match 3177 * offsets, and that'll be confusing. I'll return the next match that doesn't 3178 * fall within the same normalization buffer. Note this does not affect the 3179 * results of matches spanning the text and the normalization buffer. 3180 * The position to start searching is taken from the collation element 3181 * iterator. Callers of this API would have to set the offset in the collation 3182 * element iterator before using this method. 3183 */ 3184 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3185 UErrorCode *status) 3186 { 3187 if (U_SUCCESS(*status) && strsrch) { 3188 // note offset is either equivalent to the start of the previous match 3189 // or is set by the user 3190 int32_t offset = usearch_getOffset(strsrch); 3191 USearch *search = strsrch->search; 3192 search->reset = FALSE; 3193 int32_t textlength = search->textLength; 3194 if (search->isForwardSearching) { 3195 #if BOYER_MOORE 3196 if (offset == textlength 3197 || (!search->isOverlap && 3198 (offset + strsrch->pattern.defaultShiftSize > textlength || 3199 (search->matchedIndex != USEARCH_DONE && 3200 offset + search->matchedLength >= textlength)))) { 3201 // not enough characters to match 3202 setMatchNotFound(strsrch); 3203 return USEARCH_DONE; 3204 } 3205 #else 3206 if (offset == textlength || 3207 (! search->isOverlap && 3208 (search->matchedIndex != USEARCH_DONE && 3209 offset + search->matchedLength > textlength))) { 3210 // not enough characters to match 3211 setMatchNotFound(strsrch); 3212 return USEARCH_DONE; 3213 } 3214 #endif 3215 } 3216 else { 3217 // switching direction. 3218 // if matchedIndex == USEARCH_DONE, it means that either a 3219 // setOffset has been called or that previous ran off the text 3220 // string. the iterator would have been set to offset 0 if a 3221 // match is not found. 3222 search->isForwardSearching = TRUE; 3223 if (search->matchedIndex != USEARCH_DONE) { 3224 // there's no need to set the collation element iterator 3225 // the next call to next will set the offset. 3226 return search->matchedIndex; 3227 } 3228 } 3229 3230 if (U_SUCCESS(*status)) { 3231 if (strsrch->pattern.cesLength == 0) { 3232 if (search->matchedIndex == USEARCH_DONE) { 3233 search->matchedIndex = offset; 3234 } 3235 else { // moves by codepoints 3236 U16_FWD_1(search->text, search->matchedIndex, textlength); 3237 } 3238 3239 search->matchedLength = 0; 3240 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3241 // status checked below 3242 if (search->matchedIndex == textlength) { 3243 search->matchedIndex = USEARCH_DONE; 3244 } 3245 } 3246 else { 3247 if (search->matchedLength > 0) { 3248 // if matchlength is 0 we are at the start of the iteration 3249 if (search->isOverlap) { 3250 ucol_setOffset(strsrch->textIter, offset + 1, status); 3251 } 3252 else { 3253 ucol_setOffset(strsrch->textIter, 3254 offset + search->matchedLength, status); 3255 } 3256 } 3257 else { 3258 // for boundary check purposes. this will ensure that the 3259 // next match will not preceed the current offset 3260 // note search->matchedIndex will always be set to something 3261 // in the code 3262 search->matchedIndex = offset - 1; 3263 } 3264 3265 if (search->isCanonicalMatch) { 3266 // can't use exact here since extra accents are allowed. 3267 usearch_handleNextCanonical(strsrch, status); 3268 } 3269 else { 3270 usearch_handleNextExact(strsrch, status); 3271 } 3272 } 3273 3274 if (U_FAILURE(*status)) { 3275 return USEARCH_DONE; 3276 } 3277 3278 #if !BOYER_MOORE 3279 if (search->matchedIndex == USEARCH_DONE) { 3280 ucol_setOffset(strsrch->textIter, search->textLength, status); 3281 } else { 3282 ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3283 } 3284 #endif 3285 3286 return search->matchedIndex; 3287 } 3288 } 3289 return USEARCH_DONE; 3290 } 3291 3292 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3293 UErrorCode *status) 3294 { 3295 if (U_SUCCESS(*status) && strsrch) { 3296 int32_t offset; 3297 USearch *search = strsrch->search; 3298 if (search->reset) { 3299 offset = search->textLength; 3300 search->isForwardSearching = FALSE; 3301 search->reset = FALSE; 3302 setColEIterOffset(strsrch->textIter, offset); 3303 } 3304 else { 3305 offset = usearch_getOffset(strsrch); 3306 } 3307 3308 int32_t matchedindex = search->matchedIndex; 3309 if (search->isForwardSearching == TRUE) { 3310 // switching direction. 3311 // if matchedIndex == USEARCH_DONE, it means that either a 3312 // setOffset has been called or that next ran off the text 3313 // string. the iterator would have been set to offset textLength if 3314 // a match is not found. 3315 search->isForwardSearching = FALSE; 3316 if (matchedindex != USEARCH_DONE) { 3317 return matchedindex; 3318 } 3319 } 3320 else { 3321 #if BOYER_MOORE 3322 if (offset == 0 || matchedindex == 0 || 3323 (!search->isOverlap && 3324 (offset < strsrch->pattern.defaultShiftSize || 3325 (matchedindex != USEARCH_DONE && 3326 matchedindex < strsrch->pattern.defaultShiftSize)))) { 3327 // not enough characters to match 3328 setMatchNotFound(strsrch); 3329 return USEARCH_DONE; 3330 } 3331 #else 3332 // Could check pattern length, but the 3333 // linear search will do the right thing 3334 if (offset == 0 || matchedindex == 0) { 3335 setMatchNotFound(strsrch); 3336 return USEARCH_DONE; 3337 } 3338 #endif 3339 } 3340 3341 if (U_SUCCESS(*status)) { 3342 if (strsrch->pattern.cesLength == 0) { 3343 search->matchedIndex = 3344 (matchedindex == USEARCH_DONE ? offset : matchedindex); 3345 if (search->matchedIndex == 0) { 3346 setMatchNotFound(strsrch); 3347 // status checked below 3348 } 3349 else { // move by codepoints 3350 U16_BACK_1(search->text, 0, search->matchedIndex); 3351 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3352 // status checked below 3353 search->matchedLength = 0; 3354 } 3355 } 3356 else { 3357 if (strsrch->search->isCanonicalMatch) { 3358 // can't use exact here since extra accents are allowed. 3359 usearch_handlePreviousCanonical(strsrch, status); 3360 // status checked below 3361 } 3362 else { 3363 usearch_handlePreviousExact(strsrch, status); 3364 // status checked below 3365 } 3366 } 3367 3368 if (U_FAILURE(*status)) { 3369 return USEARCH_DONE; 3370 } 3371 3372 return search->matchedIndex; 3373 } 3374 } 3375 return USEARCH_DONE; 3376 } 3377 3378 3379 3380 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3381 { 3382 /* 3383 reset is setting the attributes that are already in 3384 string search, hence all attributes in the collator should 3385 be retrieved without any problems 3386 */ 3387 if (strsrch) { 3388 UErrorCode status = U_ZERO_ERROR; 3389 UBool sameCollAttribute = TRUE; 3390 uint32_t ceMask; 3391 UBool shift; 3392 uint32_t varTop; 3393 3394 // **** hack to deal w/ how processed CEs encode quaternary **** 3395 UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3396 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3397 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3398 sameCollAttribute = FALSE; 3399 } 3400 3401 strsrch->strength = ucol_getStrength(strsrch->collator); 3402 ceMask = getMask(strsrch->strength); 3403 if (strsrch->ceMask != ceMask) { 3404 strsrch->ceMask = ceMask; 3405 sameCollAttribute = FALSE; 3406 } 3407 3408 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3409 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3410 &status) == UCOL_SHIFTED; 3411 if (strsrch->toShift != shift) { 3412 strsrch->toShift = shift; 3413 sameCollAttribute = FALSE; 3414 } 3415 3416 // if status is a failure, ucol_getVariableTop returns 0 3417 varTop = ucol_getVariableTop(strsrch->collator, &status); 3418 if (strsrch->variableTop != varTop) { 3419 strsrch->variableTop = varTop; 3420 sameCollAttribute = FALSE; 3421 } 3422 if (!sameCollAttribute) { 3423 initialize(strsrch, &status); 3424 } 3425 ucol_setText(strsrch->textIter, strsrch->search->text, 3426 strsrch->search->textLength, 3427 &status); 3428 strsrch->search->matchedLength = 0; 3429 strsrch->search->matchedIndex = USEARCH_DONE; 3430 strsrch->search->isOverlap = FALSE; 3431 strsrch->search->isCanonicalMatch = FALSE; 3432 strsrch->search->elementComparisonType = 0; 3433 strsrch->search->isForwardSearching = TRUE; 3434 strsrch->search->reset = TRUE; 3435 } 3436 } 3437 3438 // 3439 // CEI Collation Element + source text index. 3440 // These structs are kept in the circular buffer. 3441 // 3442 struct CEI { 3443 int64_t ce; 3444 int32_t lowIndex; 3445 int32_t highIndex; 3446 }; 3447 3448 U_NAMESPACE_BEGIN 3449 3450 namespace { 3451 // 3452 // CEIBuffer A circular buffer of CEs-with-index from the text being searched. 3453 // 3454 #define DEFAULT_CEBUFFER_SIZE 96 3455 #define CEBUFFER_EXTRA 32 3456 // Some typical max values to make buffer size more reasonable for asymmetric search. 3457 // #8694 is for a better long-term solution to allocation of this buffer. 3458 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 3459 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 3460 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 3461 struct CEIBuffer { 3462 CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3463 CEI *buf; 3464 int32_t bufSize; 3465 int32_t firstIx; 3466 int32_t limitIx; 3467 UCollationElements *ceIter; 3468 UStringSearch *strSearch; 3469 3470 3471 3472 CEIBuffer(UStringSearch *ss, UErrorCode *status); 3473 ~CEIBuffer(); 3474 const CEI *get(int32_t index); 3475 const CEI *getPrevious(int32_t index); 3476 }; 3477 3478 3479 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { 3480 buf = defBuf; 3481 strSearch = ss; 3482 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA; 3483 if (ss->search->elementComparisonType != 0) { 3484 const UChar * patText = ss->pattern.text; 3485 if (patText) { 3486 const UChar * patTextLimit = patText + ss->pattern.textLength; 3487 while ( patText < patTextLimit ) { 3488 UChar c = *patText++; 3489 if (MIGHT_BE_JAMO_L(c)) { 3490 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 3491 } else { 3492 // No check for surrogates, we might allocate slightly more buffer than necessary. 3493 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 3494 } 3495 } 3496 } 3497 } 3498 ceIter = ss->textIter; 3499 firstIx = 0; 3500 limitIx = 0; 3501 3502 if (!initTextProcessedIter(ss, status)) { return; } 3503 3504 if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3505 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3506 if (buf == NULL) { 3507 *status = U_MEMORY_ALLOCATION_ERROR; 3508 } 3509 } 3510 } 3511 3512 // TODO: add a reset or init function so that allocated 3513 // buffers can be retained & reused. 3514 3515 CEIBuffer::~CEIBuffer() { 3516 if (buf != defBuf) { 3517 uprv_free(buf); 3518 } 3519 } 3520 3521 3522 // Get the CE with the specified index. 3523 // Index must be in the range 3524 // n-history_size < index < n+1 3525 // where n is the largest index to have been fetched by some previous call to this function. 3526 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3527 // 3528 const CEI *CEIBuffer::get(int32_t index) { 3529 int i = index % bufSize; 3530 3531 if (index>=firstIx && index<limitIx) { 3532 // The request was for an entry already in our buffer. 3533 // Just return it. 3534 return &buf[i]; 3535 } 3536 3537 // Caller is requesting a new, never accessed before, CE. 3538 // Verify that it is the next one in sequence, which is all 3539 // that is allowed. 3540 if (index != limitIx) { 3541 U_ASSERT(FALSE); 3542 3543 return NULL; 3544 } 3545 3546 // Manage the circular CE buffer indexing 3547 limitIx++; 3548 3549 if (limitIx - firstIx >= bufSize) { 3550 // The buffer is full, knock out the lowest-indexed entry. 3551 firstIx++; 3552 } 3553 3554 UErrorCode status = U_ZERO_ERROR; 3555 3556 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3557 3558 return &buf[i]; 3559 } 3560 3561 // Get the CE with the specified index. 3562 // Index must be in the range 3563 // n-history_size < index < n+1 3564 // where n is the largest index to have been fetched by some previous call to this function. 3565 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3566 // 3567 const CEI *CEIBuffer::getPrevious(int32_t index) { 3568 int i = index % bufSize; 3569 3570 if (index>=firstIx && index<limitIx) { 3571 // The request was for an entry already in our buffer. 3572 // Just return it. 3573 return &buf[i]; 3574 } 3575 3576 // Caller is requesting a new, never accessed before, CE. 3577 // Verify that it is the next one in sequence, which is all 3578 // that is allowed. 3579 if (index != limitIx) { 3580 U_ASSERT(FALSE); 3581 3582 return NULL; 3583 } 3584 3585 // Manage the circular CE buffer indexing 3586 limitIx++; 3587 3588 if (limitIx - firstIx >= bufSize) { 3589 // The buffer is full, knock out the lowest-indexed entry. 3590 firstIx++; 3591 } 3592 3593 UErrorCode status = U_ZERO_ERROR; 3594 3595 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3596 3597 return &buf[i]; 3598 } 3599 3600 } 3601 3602 U_NAMESPACE_END 3603 3604 3605 // #define USEARCH_DEBUG 3606 3607 #ifdef USEARCH_DEBUG 3608 #include <stdio.h> 3609 #include <stdlib.h> 3610 #endif 3611 3612 /* 3613 * Find the next break boundary after startIndex. If the UStringSearch object 3614 * has an external break iterator, use that. Otherwise use the internal character 3615 * break iterator. 3616 */ 3617 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3618 #if 0 3619 const UChar *text = strsrch->search->text; 3620 int32_t textLen = strsrch->search->textLength; 3621 3622 U_ASSERT(startIndex>=0); 3623 U_ASSERT(startIndex<=textLen); 3624 3625 if (startIndex >= textLen) { 3626 return startIndex; 3627 } 3628 3629 UChar32 c; 3630 int32_t i = startIndex; 3631 U16_NEXT(text, i, textLen, c); 3632 3633 // If we are on a control character, stop without looking for combining marks. 3634 // Control characters do not combine. 3635 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3636 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3637 return i; 3638 } 3639 3640 // The initial character was not a control, and can thus accept trailing 3641 // combining characters. Advance over however many of them there are. 3642 int32_t indexOfLastCharChecked; 3643 for (;;) { 3644 indexOfLastCharChecked = i; 3645 if (i>=textLen) { 3646 break; 3647 } 3648 U16_NEXT(text, i, textLen, c); 3649 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3650 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3651 break; 3652 } 3653 } 3654 return indexOfLastCharChecked; 3655 #elif !UCONFIG_NO_BREAK_ITERATION 3656 UBreakIterator *breakiterator = strsrch->search->breakIter; 3657 3658 if (breakiterator == NULL) { 3659 breakiterator = strsrch->search->internalBreakIter; 3660 } 3661 3662 if (breakiterator != NULL) { 3663 return ubrk_following(breakiterator, startIndex); 3664 } 3665 3666 return startIndex; 3667 #else 3668 // **** or should we use the original code? **** 3669 return startIndex; 3670 #endif 3671 3672 } 3673 3674 /* 3675 * Returns TRUE if index is on a break boundary. If the UStringSearch 3676 * has an external break iterator, test using that, otherwise test 3677 * using the internal character break iterator. 3678 */ 3679 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3680 #if 0 3681 const UChar *text = strsrch->search->text; 3682 int32_t textLen = strsrch->search->textLength; 3683 3684 U_ASSERT(index>=0); 3685 U_ASSERT(index<=textLen); 3686 3687 if (index>=textLen || index<=0) { 3688 return TRUE; 3689 } 3690 3691 // If the character at the current index is not a GRAPHEME_EXTEND 3692 // then we can not be within a combining sequence. 3693 UChar32 c; 3694 U16_GET(text, 0, index, textLen, c); 3695 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3696 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3697 return TRUE; 3698 } 3699 3700 // We are at a combining mark. If the preceding character is anything 3701 // except a CONTROL, CR or LF, we are in a combining sequence. 3702 U16_PREV(text, 0, index, c); 3703 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3704 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3705 return !combining; 3706 #elif !UCONFIG_NO_BREAK_ITERATION 3707 UBreakIterator *breakiterator = strsrch->search->breakIter; 3708 3709 if (breakiterator == NULL) { 3710 breakiterator = strsrch->search->internalBreakIter; 3711 } 3712 3713 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 3714 #else 3715 // **** or use the original code? **** 3716 return TRUE; 3717 #endif 3718 } 3719 3720 #if 0 3721 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3722 { 3723 #if !UCONFIG_NO_BREAK_ITERATION 3724 UBreakIterator *breakiterator = strsrch->search->breakIter; 3725 3726 if (breakiterator != NULL) { 3727 int32_t startindex = ubrk_first(breakiterator); 3728 int32_t endindex = ubrk_last(breakiterator); 3729 3730 // out-of-range indexes are never boundary positions 3731 if (start < startindex || start > endindex || 3732 end < startindex || end > endindex) { 3733 return FALSE; 3734 } 3735 3736 return ubrk_isBoundary(breakiterator, start) && 3737 ubrk_isBoundary(breakiterator, end); 3738 } 3739 #endif 3740 3741 return TRUE; 3742 } 3743 #endif 3744 3745 typedef enum { 3746 U_CE_MATCH = -1, 3747 U_CE_NO_MATCH = 0, 3748 U_CE_SKIP_TARG, 3749 U_CE_SKIP_PATN 3750 } UCompareCEsResult; 3751 #define U_CE_LEVEL2_BASE 0x00000005 3752 #define U_CE_LEVEL3_BASE 0x00050000 3753 3754 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 3755 if (targCE == patCE) { 3756 return U_CE_MATCH; 3757 } 3758 if (compareType == 0) { 3759 return U_CE_NO_MATCH; 3760 } 3761 3762 int64_t targCEshifted = targCE >> 32; 3763 int64_t patCEshifted = patCE >> 32; 3764 int64_t mask; 3765 3766 mask = 0xFFFF0000; 3767 int32_t targLev1 = (int32_t)(targCEshifted & mask); 3768 int32_t patLev1 = (int32_t)(patCEshifted & mask); 3769 if ( targLev1 != patLev1 ) { 3770 if ( targLev1 == 0 ) { 3771 return U_CE_SKIP_TARG; 3772 } 3773 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3774 return U_CE_SKIP_PATN; 3775 } 3776 return U_CE_NO_MATCH; 3777 } 3778 3779 mask = 0x0000FFFF; 3780 int32_t targLev2 = (int32_t)(targCEshifted & mask); 3781 int32_t patLev2 = (int32_t)(patCEshifted & mask); 3782 if ( targLev2 != patLev2 ) { 3783 if ( targLev2 == 0 ) { 3784 return U_CE_SKIP_TARG; 3785 } 3786 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3787 return U_CE_SKIP_PATN; 3788 } 3789 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 3790 U_CE_MATCH: U_CE_NO_MATCH; 3791 } 3792 3793 mask = 0xFFFF0000; 3794 int32_t targLev3 = (int32_t)(targCE & mask); 3795 int32_t patLev3 = (int32_t)(patCE & mask); 3796 if ( targLev3 != patLev3 ) { 3797 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 3798 U_CE_MATCH: U_CE_NO_MATCH; 3799 } 3800 3801 return U_CE_MATCH; 3802 } 3803 3804 #if BOYER_MOORE 3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 3806 #endif 3807 3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3809 int32_t startIdx, 3810 int32_t *matchStart, 3811 int32_t *matchLimit, 3812 UErrorCode *status) 3813 { 3814 if (U_FAILURE(*status)) { 3815 return FALSE; 3816 } 3817 3818 // TODO: reject search patterns beginning with a combining char. 3819 3820 #ifdef USEARCH_DEBUG 3821 if (getenv("USEARCH_DEBUG") != NULL) { 3822 printf("Pattern CEs\n"); 3823 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 3824 printf(" %8x", strsrch->pattern.ces[ii]); 3825 } 3826 printf("\n"); 3827 } 3828 3829 #endif 3830 // Input parameter sanity check. 3831 // TODO: should input indicies clip to the text length 3832 // in the same way that UText does. 3833 if(strsrch->pattern.cesLength == 0 || 3834 startIdx < 0 || 3835 startIdx > strsrch->search->textLength || 3836 strsrch->pattern.ces == NULL) { 3837 *status = U_ILLEGAL_ARGUMENT_ERROR; 3838 return FALSE; 3839 } 3840 3841 if (strsrch->pattern.pces == NULL) { 3842 initializePatternPCETable(strsrch, status); 3843 } 3844 3845 ucol_setOffset(strsrch->textIter, startIdx, status); 3846 CEIBuffer ceb(strsrch, status); 3847 3848 3849 int32_t targetIx = 0; 3850 const CEI *targetCEI = NULL; 3851 int32_t patIx; 3852 UBool found; 3853 3854 int32_t mStart = -1; 3855 int32_t mLimit = -1; 3856 int32_t minLimit; 3857 int32_t maxLimit; 3858 3859 3860 3861 // Outer loop moves over match starting positions in the 3862 // target CE space. 3863 // Here we see the target as a sequence of collation elements, resulting from the following: 3864 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 3865 // (for example, digraphs such as IJ may be broken into two characters). 3866 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 3867 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 3868 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 3869 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 3870 // then the CE is deleted, so the following code sees only CEs that are relevant. 3871 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 3872 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 3873 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 3874 // 3875 for(targetIx=0; ; targetIx++) 3876 { 3877 found = TRUE; 3878 // Inner loop checks for a match beginning at each 3879 // position from the outer loop. 3880 int32_t targetIxOffset = 0; 3881 int64_t patCE = 0; 3882 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 3883 // (compared to the last CE fetched for the previous targetIx value) as we need to go 3884 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 3885 const CEI *firstCEI = ceb.get(targetIx); 3886 if (firstCEI == NULL) { 3887 *status = U_INTERNAL_PROGRAM_ERROR; 3888 found = FALSE; 3889 break; 3890 } 3891 3892 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) { 3893 patCE = strsrch->pattern.pces[patIx]; 3894 targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 3895 // Compare CE from target string with CE from the pattern. 3896 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 3897 // which will fail the compare, below. 3898 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 3899 if ( ceMatch == U_CE_NO_MATCH ) { 3900 found = FALSE; 3901 break; 3902 } else if ( ceMatch > U_CE_NO_MATCH ) { 3903 if ( ceMatch == U_CE_SKIP_TARG ) { 3904 // redo with same patCE, next targCE 3905 patIx--; 3906 targetIxOffset++; 3907 } else { // ceMatch == U_CE_SKIP_PATN 3908 // redo with same targCE, next patCE 3909 targetIxOffset--; 3910 } 3911 } 3912 } 3913 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far 3914 3915 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 3916 // No match at this targetIx. Try again at the next. 3917 continue; 3918 } 3919 3920 if (!found) { 3921 // No match at all, we have run off the end of the target text. 3922 break; 3923 } 3924 3925 3926 // We have found a match in CE space. 3927 // Now determine the bounds in string index space. 3928 // There still is a chance of match failure if the CE range not correspond to 3929 // an acceptable character range. 3930 // 3931 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 3932 3933 mStart = firstCEI->lowIndex; 3934 minLimit = lastCEI->lowIndex; 3935 3936 // Look at the CE following the match. If it is UCOL_NULLORDER the match 3937 // extended to the end of input, and the match is good. 3938 3939 // Look at the high and low indices of the CE following the match. If 3940 // they are the same it means one of two things: 3941 // 1. The match extended to the last CE from the target text, which is OK, or 3942 // 2. The last CE that was part of the match is in an expansion that extends 3943 // to the first CE after the match. In this case, we reject the match. 3944 const CEI *nextCEI = 0; 3945 if (strsrch->search->elementComparisonType == 0) { 3946 nextCEI = ceb.get(targetIx + targetIxOffset); 3947 maxLimit = nextCEI->lowIndex; 3948 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 3949 found = FALSE; 3950 } 3951 } else { 3952 for ( ; ; ++targetIxOffset ) { 3953 nextCEI = ceb.get(targetIx + targetIxOffset); 3954 maxLimit = nextCEI->lowIndex; 3955 // If we are at the end of the target too, match succeeds 3956 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 3957 break; 3958 } 3959 // As long as the next CE has primary weight of 0, 3960 // it is part of the last target element matched by the pattern; 3961 // make sure it can be part of a match with the last patCE 3962 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 3963 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 3964 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 3965 found = FALSE; 3966 break; 3967 } 3968 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 3969 // target element, but it has non-zero primary weight => match fails 3970 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 3971 found = false; 3972 break; 3973 // Else the target CE is not part of an expansion of the last matched element, match succeeds 3974 } else { 3975 break; 3976 } 3977 } 3978 } 3979 3980 3981 // Check for the start of the match being within a combining sequence. 3982 // This can happen if the pattern itself begins with a combining char, and 3983 // the match found combining marks in the target text that were attached 3984 // to something else. 3985 // This type of match should be rejected for not completely consuming a 3986 // combining sequence. 3987 if (!isBreakBoundary(strsrch, mStart)) { 3988 found = FALSE; 3989 } 3990 3991 // Check for the start of the match being within an Collation Element Expansion, 3992 // meaning that the first char of the match is only partially matched. 3993 // With exapnsions, the first CE will report the index of the source 3994 // character, and all subsequent (expansions) CEs will report the source index of the 3995 // _following_ character. 3996 int32_t secondIx = firstCEI->highIndex; 3997 if (mStart == secondIx) { 3998 found = FALSE; 3999 } 4000 4001 // Advance the match end position to the first acceptable match boundary. 4002 // This advances the index over any combining charcters. 4003 mLimit = maxLimit; 4004 if (minLimit < maxLimit) { 4005 // When the last CE's low index is same with its high index, the CE is likely 4006 // a part of expansion. In this case, the index is located just after the 4007 // character corresponding to the CEs compared above. If the index is right 4008 // at the break boundary, move the position to the next boundary will result 4009 // incorrect match length when there are ignorable characters exist between 4010 // the position and the next character produces CE(s). See ticket#8482. 4011 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 4012 mLimit = minLimit; 4013 } else { 4014 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4015 if (nba >= lastCEI->highIndex) { 4016 mLimit = nba; 4017 } 4018 } 4019 } 4020 4021 #ifdef USEARCH_DEBUG 4022 if (getenv("USEARCH_DEBUG") != NULL) { 4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4024 } 4025 #endif 4026 4027 // If advancing to the end of a combining sequence in character indexing space 4028 // advanced us beyond the end of the match in CE space, reject this match. 4029 if (mLimit > maxLimit) { 4030 found = FALSE; 4031 } 4032 4033 if (!isBreakBoundary(strsrch, mLimit)) { 4034 found = FALSE; 4035 } 4036 4037 if (! checkIdentical(strsrch, mStart, mLimit)) { 4038 found = FALSE; 4039 } 4040 4041 if (found) { 4042 break; 4043 } 4044 } 4045 4046 #ifdef USEARCH_DEBUG 4047 if (getenv("USEARCH_DEBUG") != NULL) { 4048 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4049 int32_t lastToPrint = ceb.limitIx+2; 4050 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4051 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4052 } 4053 printf("\n%s\n", found? "match found" : "no match"); 4054 } 4055 #endif 4056 4057 // All Done. Store back the match bounds to the caller. 4058 // 4059 if (found==FALSE) { 4060 mLimit = -1; 4061 mStart = -1; 4062 } 4063 4064 if (matchStart != NULL) { 4065 *matchStart= mStart; 4066 } 4067 4068 if (matchLimit != NULL) { 4069 *matchLimit = mLimit; 4070 } 4071 4072 return found; 4073 } 4074 4075 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 4076 int32_t startIdx, 4077 int32_t *matchStart, 4078 int32_t *matchLimit, 4079 UErrorCode *status) 4080 { 4081 if (U_FAILURE(*status)) { 4082 return FALSE; 4083 } 4084 4085 // TODO: reject search patterns beginning with a combining char. 4086 4087 #ifdef USEARCH_DEBUG 4088 if (getenv("USEARCH_DEBUG") != NULL) { 4089 printf("Pattern CEs\n"); 4090 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 4091 printf(" %8x", strsrch->pattern.ces[ii]); 4092 } 4093 printf("\n"); 4094 } 4095 4096 #endif 4097 // Input parameter sanity check. 4098 // TODO: should input indicies clip to the text length 4099 // in the same way that UText does. 4100 if(strsrch->pattern.cesLength == 0 || 4101 startIdx < 0 || 4102 startIdx > strsrch->search->textLength || 4103 strsrch->pattern.ces == NULL) { 4104 *status = U_ILLEGAL_ARGUMENT_ERROR; 4105 return FALSE; 4106 } 4107 4108 if (strsrch->pattern.pces == NULL) { 4109 initializePatternPCETable(strsrch, status); 4110 } 4111 4112 CEIBuffer ceb(strsrch, status); 4113 int32_t targetIx = 0; 4114 4115 /* 4116 * Pre-load the buffer with the CE's for the grapheme 4117 * after our starting position so that we're sure that 4118 * we can look at the CE following the match when we 4119 * check the match boundaries. 4120 * 4121 * This will also pre-fetch the first CE that we'll 4122 * consider for the match. 4123 */ 4124 if (startIdx < strsrch->search->textLength) { 4125 UBreakIterator *bi = strsrch->search->internalBreakIter; 4126 int32_t next = ubrk_following(bi, startIdx); 4127 4128 ucol_setOffset(strsrch->textIter, next, status); 4129 4130 for (targetIx = 0; ; targetIx += 1) { 4131 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 4132 break; 4133 } 4134 } 4135 } else { 4136 ucol_setOffset(strsrch->textIter, startIdx, status); 4137 } 4138 4139 4140 const CEI *targetCEI = NULL; 4141 int32_t patIx; 4142 UBool found; 4143 4144 int32_t limitIx = targetIx; 4145 int32_t mStart = -1; 4146 int32_t mLimit = -1; 4147 int32_t minLimit; 4148 int32_t maxLimit; 4149 4150 4151 4152 // Outer loop moves over match starting positions in the 4153 // target CE space. 4154 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 4155 // But patIx is 0 at the beginning of the pattern and increases toward the end. 4156 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 4157 // and the beginning of the base text. 4158 for(targetIx = limitIx; ; targetIx += 1) 4159 { 4160 found = TRUE; 4161 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 4162 // (compared to the last CE fetched for the previous targetIx value) as we need to go 4163 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 4164 const CEI *lastCEI = ceb.getPrevious(targetIx); 4165 if (lastCEI == NULL) { 4166 *status = U_INTERNAL_PROGRAM_ERROR; 4167 found = FALSE; 4168 break; 4169 } 4170 // Inner loop checks for a match beginning at each 4171 // position from the outer loop. 4172 int32_t targetIxOffset = 0; 4173 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) { 4174 int64_t patCE = strsrch->pattern.pces[patIx]; 4175 4176 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset); 4177 // Compare CE from target string with CE from the pattern. 4178 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4179 // which will fail the compare, below. 4180 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 4181 if ( ceMatch == U_CE_NO_MATCH ) { 4182 found = FALSE; 4183 break; 4184 } else if ( ceMatch > U_CE_NO_MATCH ) { 4185 if ( ceMatch == U_CE_SKIP_TARG ) { 4186 // redo with same patCE, next targCE 4187 patIx++; 4188 targetIxOffset++; 4189 } else { // ceMatch == U_CE_SKIP_PATN 4190 // redo with same targCE, next patCE 4191 targetIxOffset--; 4192 } 4193 } 4194 } 4195 4196 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 4197 // No match at this targetIx. Try again at the next. 4198 continue; 4199 } 4200 4201 if (!found) { 4202 // No match at all, we have run off the end of the target text. 4203 break; 4204 } 4205 4206 4207 // We have found a match in CE space. 4208 // Now determine the bounds in string index space. 4209 // There still is a chance of match failure if the CE range not correspond to 4210 // an acceptable character range. 4211 // 4212 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset); 4213 mStart = firstCEI->lowIndex; 4214 4215 // Check for the start of the match being within a combining sequence. 4216 // This can happen if the pattern itself begins with a combining char, and 4217 // the match found combining marks in the target text that were attached 4218 // to something else. 4219 // This type of match should be rejected for not completely consuming a 4220 // combining sequence. 4221 if (!isBreakBoundary(strsrch, mStart)) { 4222 found = FALSE; 4223 } 4224 4225 // Look at the high index of the first CE in the match. If it's the same as the 4226 // low index, the first CE in the match is in the middle of an expansion. 4227 if (mStart == firstCEI->highIndex) { 4228 found = FALSE; 4229 } 4230 4231 4232 minLimit = lastCEI->lowIndex; 4233 4234 if (targetIx > 0) { 4235 // Look at the CE following the match. If it is UCOL_NULLORDER the match 4236 // extended to the end of input, and the match is good. 4237 4238 // Look at the high and low indices of the CE following the match. If 4239 // they are the same it means one of two things: 4240 // 1. The match extended to the last CE from the target text, which is OK, or 4241 // 2. The last CE that was part of the match is in an expansion that extends 4242 // to the first CE after the match. In this case, we reject the match. 4243 const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4244 4245 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4246 found = FALSE; 4247 } 4248 4249 mLimit = maxLimit = nextCEI->lowIndex; 4250 4251 // Advance the match end position to the first acceptable match boundary. 4252 // This advances the index over any combining charcters. 4253 if (minLimit < maxLimit) { 4254 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4255 4256 if (nba >= lastCEI->highIndex) { 4257 mLimit = nba; 4258 } 4259 } 4260 4261 // If advancing to the end of a combining sequence in character indexing space 4262 // advanced us beyond the end of the match in CE space, reject this match. 4263 if (mLimit > maxLimit) { 4264 found = FALSE; 4265 } 4266 4267 // Make sure the end of the match is on a break boundary 4268 if (!isBreakBoundary(strsrch, mLimit)) { 4269 found = FALSE; 4270 } 4271 4272 } else { 4273 // No non-ignorable CEs after this point. 4274 // The maximum position is detected by boundary after 4275 // the last non-ignorable CE. Combining sequence 4276 // across the start index will be truncated. 4277 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4278 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4279 } 4280 4281 #ifdef USEARCH_DEBUG 4282 if (getenv("USEARCH_DEBUG") != NULL) { 4283 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4284 } 4285 #endif 4286 4287 4288 if (! checkIdentical(strsrch, mStart, mLimit)) { 4289 found = FALSE; 4290 } 4291 4292 if (found) { 4293 break; 4294 } 4295 } 4296 4297 #ifdef USEARCH_DEBUG 4298 if (getenv("USEARCH_DEBUG") != NULL) { 4299 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4300 int32_t lastToPrint = ceb.limitIx+2; 4301 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4302 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4303 } 4304 printf("\n%s\n", found? "match found" : "no match"); 4305 } 4306 #endif 4307 4308 // All Done. Store back the match bounds to the caller. 4309 // 4310 if (found==FALSE) { 4311 mLimit = -1; 4312 mStart = -1; 4313 } 4314 4315 if (matchStart != NULL) { 4316 *matchStart= mStart; 4317 } 4318 4319 if (matchLimit != NULL) { 4320 *matchLimit = mLimit; 4321 } 4322 4323 return found; 4324 } 4325 4326 // internal use methods declared in usrchimp.h ----------------------------- 4327 4328 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4329 { 4330 if (U_FAILURE(*status)) { 4331 setMatchNotFound(strsrch); 4332 return FALSE; 4333 } 4334 4335 #if BOYER_MOORE 4336 UCollationElements *coleiter = strsrch->textIter; 4337 int32_t textlength = strsrch->search->textLength; 4338 int32_t *patternce = strsrch->pattern.ces; 4339 int32_t patterncelength = strsrch->pattern.cesLength; 4340 int32_t textoffset = ucol_getOffset(coleiter); 4341 4342 // status used in setting coleiter offset, since offset is checked in 4343 // shiftForward before setting the coleiter offset, status never 4344 // a failure 4345 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4346 patterncelength); 4347 while (textoffset <= textlength) 4348 { 4349 uint32_t patternceindex = patterncelength - 1; 4350 int32_t targetce; 4351 UBool found = FALSE; 4352 int32_t lastce = UCOL_NULLORDER; 4353 4354 setColEIterOffset(coleiter, textoffset); 4355 4356 for (;;) { 4357 // finding the last pattern ce match, imagine composite characters 4358 // for example: search for pattern A in text \u00C0 4359 // we'll have to skip \u0300 the grave first before we get to A 4360 targetce = ucol_previous(coleiter, status); 4361 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4362 found = FALSE; 4363 break; 4364 } 4365 targetce = getCE(strsrch, targetce); 4366 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4367 // this is for the text \u0315\u0300 that requires 4368 // normalization and pattern \u0300, where \u0315 is ignorable 4369 continue; 4370 } 4371 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4372 lastce = targetce; 4373 } 4374 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4375 if (targetce == patternce[patternceindex]) { 4376 // the first ce can be a contraction 4377 found = TRUE; 4378 break; 4379 } 4380 if (!hasExpansion(coleiter)) { 4381 found = FALSE; 4382 break; 4383 } 4384 } 4385 4386 //targetce = lastce; 4387 4388 while (found && patternceindex > 0) { 4389 lastce = targetce; 4390 targetce = ucol_previous(coleiter, status); 4391 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4392 found = FALSE; 4393 break; 4394 } 4395 targetce = getCE(strsrch, targetce); 4396 if (targetce == UCOL_IGNORABLE) { 4397 continue; 4398 } 4399 4400 patternceindex --; 4401 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4402 found = found && targetce == patternce[patternceindex]; 4403 } 4404 4405 targetce = lastce; 4406 4407 if (!found) { 4408 if (U_FAILURE(*status)) { 4409 break; 4410 } 4411 textoffset = shiftForward(strsrch, textoffset, lastce, 4412 patternceindex); 4413 // status checked at loop. 4414 patternceindex = patterncelength; 4415 continue; 4416 } 4417 4418 if (checkNextExactMatch(strsrch, &textoffset, status)) { 4419 // status checked in ucol_setOffset 4420 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4421 return TRUE; 4422 } 4423 } 4424 setMatchNotFound(strsrch); 4425 return FALSE; 4426 #else 4427 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4428 int32_t start = -1; 4429 int32_t end = -1; 4430 4431 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4432 strsrch->search->matchedIndex = start; 4433 strsrch->search->matchedLength = end - start; 4434 return TRUE; 4435 } else { 4436 setMatchNotFound(strsrch); 4437 return FALSE; 4438 } 4439 #endif 4440 } 4441 4442 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4443 { 4444 if (U_FAILURE(*status)) { 4445 setMatchNotFound(strsrch); 4446 return FALSE; 4447 } 4448 4449 #if BOYER_MOORE 4450 UCollationElements *coleiter = strsrch->textIter; 4451 int32_t textlength = strsrch->search->textLength; 4452 int32_t *patternce = strsrch->pattern.ces; 4453 int32_t patterncelength = strsrch->pattern.cesLength; 4454 int32_t textoffset = ucol_getOffset(coleiter); 4455 UBool hasPatternAccents = 4456 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4457 4458 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4459 patterncelength); 4460 strsrch->canonicalPrefixAccents[0] = 0; 4461 strsrch->canonicalSuffixAccents[0] = 0; 4462 4463 while (textoffset <= textlength) 4464 { 4465 int32_t patternceindex = patterncelength - 1; 4466 int32_t targetce; 4467 UBool found = FALSE; 4468 int32_t lastce = UCOL_NULLORDER; 4469 4470 setColEIterOffset(coleiter, textoffset); 4471 4472 for (;;) { 4473 // finding the last pattern ce match, imagine composite characters 4474 // for example: search for pattern A in text \u00C0 4475 // we'll have to skip \u0300 the grave first before we get to A 4476 targetce = ucol_previous(coleiter, status); 4477 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4478 found = FALSE; 4479 break; 4480 } 4481 targetce = getCE(strsrch, targetce); 4482 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4483 lastce = targetce; 4484 } 4485 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4486 if (targetce == patternce[patternceindex]) { 4487 // the first ce can be a contraction 4488 found = TRUE; 4489 break; 4490 } 4491 if (!hasExpansion(coleiter)) { 4492 found = FALSE; 4493 break; 4494 } 4495 } 4496 4497 while (found && patternceindex > 0) { 4498 targetce = ucol_previous(coleiter, status); 4499 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4500 found = FALSE; 4501 break; 4502 } 4503 targetce = getCE(strsrch, targetce); 4504 if (targetce == UCOL_IGNORABLE) { 4505 continue; 4506 } 4507 4508 patternceindex --; 4509 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4510 found = found && targetce == patternce[patternceindex]; 4511 } 4512 4513 // initializing the rearranged accent array 4514 if (hasPatternAccents && !found) { 4515 strsrch->canonicalPrefixAccents[0] = 0; 4516 strsrch->canonicalSuffixAccents[0] = 0; 4517 if (U_FAILURE(*status)) { 4518 break; 4519 } 4520 found = doNextCanonicalMatch(strsrch, textoffset, status); 4521 } 4522 4523 if (!found) { 4524 if (U_FAILURE(*status)) { 4525 break; 4526 } 4527 textoffset = shiftForward(strsrch, textoffset, lastce, 4528 patternceindex); 4529 // status checked at loop 4530 patternceindex = patterncelength; 4531 continue; 4532 } 4533 4534 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4535 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4536 return TRUE; 4537 } 4538 } 4539 setMatchNotFound(strsrch); 4540 return FALSE; 4541 #else 4542 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4543 int32_t start = -1; 4544 int32_t end = -1; 4545 4546 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4547 strsrch->search->matchedIndex = start; 4548 strsrch->search->matchedLength = end - start; 4549 return TRUE; 4550 } else { 4551 setMatchNotFound(strsrch); 4552 return FALSE; 4553 } 4554 #endif 4555 } 4556 4557 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4558 { 4559 if (U_FAILURE(*status)) { 4560 setMatchNotFound(strsrch); 4561 return FALSE; 4562 } 4563 4564 #if BOYER_MOORE 4565 UCollationElements *coleiter = strsrch->textIter; 4566 int32_t *patternce = strsrch->pattern.ces; 4567 int32_t patterncelength = strsrch->pattern.cesLength; 4568 int32_t textoffset = ucol_getOffset(coleiter); 4569 4570 // shifting it check for setting offset 4571 // if setOffset is called previously or there was no previous match, we 4572 // leave the offset as it is. 4573 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4574 textoffset = strsrch->search->matchedIndex; 4575 } 4576 4577 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4578 patterncelength); 4579 4580 while (textoffset >= 0) 4581 { 4582 int32_t patternceindex = 1; 4583 int32_t targetce; 4584 UBool found = FALSE; 4585 int32_t firstce = UCOL_NULLORDER; 4586 4587 // if status is a failure, ucol_setOffset does nothing 4588 setColEIterOffset(coleiter, textoffset); 4589 4590 for (;;) { 4591 // finding the first pattern ce match, imagine composite 4592 // characters. for example: search for pattern \u0300 in text 4593 // \u00C0, we'll have to skip A first before we get to 4594 // \u0300 the grave accent 4595 targetce = ucol_next(coleiter, status); 4596 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4597 found = FALSE; 4598 break; 4599 } 4600 targetce = getCE(strsrch, targetce); 4601 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4602 firstce = targetce; 4603 } 4604 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4605 continue; 4606 } 4607 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4608 if (targetce == patternce[0]) { 4609 found = TRUE; 4610 break; 4611 } 4612 if (!hasExpansion(coleiter)) { 4613 // checking for accents in composite character 4614 found = FALSE; 4615 break; 4616 } 4617 } 4618 4619 //targetce = firstce; 4620 4621 while (found && (patternceindex < patterncelength)) { 4622 firstce = targetce; 4623 targetce = ucol_next(coleiter, status); 4624 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4625 found = FALSE; 4626 break; 4627 } 4628 targetce = getCE(strsrch, targetce); 4629 if (targetce == UCOL_IGNORABLE) { 4630 continue; 4631 } 4632 4633 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4634 found = found && targetce == patternce[patternceindex]; 4635 patternceindex ++; 4636 } 4637 4638 targetce = firstce; 4639 4640 if (!found) { 4641 if (U_FAILURE(*status)) { 4642 break; 4643 } 4644 4645 textoffset = reverseShift(strsrch, textoffset, targetce, 4646 patternceindex); 4647 patternceindex = 0; 4648 continue; 4649 } 4650 4651 if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4652 setColEIterOffset(coleiter, textoffset); 4653 return TRUE; 4654 } 4655 } 4656 setMatchNotFound(strsrch); 4657 return FALSE; 4658 #else 4659 int32_t textOffset; 4660 4661 if (strsrch->search->isOverlap) { 4662 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4663 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4664 } else { 4665 // move the start position at the end of possible match 4666 initializePatternPCETable(strsrch, status); 4667 if (!initTextProcessedIter(strsrch, status)) { 4668 setMatchNotFound(strsrch); 4669 return FALSE; 4670 } 4671 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4672 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4673 if (pce == UCOL_PROCESSED_NULLORDER) { 4674 // at the end of the text 4675 break; 4676 } 4677 } 4678 if (U_FAILURE(*status)) { 4679 setMatchNotFound(strsrch); 4680 return FALSE; 4681 } 4682 textOffset = ucol_getOffset(strsrch->textIter); 4683 } 4684 } else { 4685 textOffset = ucol_getOffset(strsrch->textIter); 4686 } 4687 4688 int32_t start = -1; 4689 int32_t end = -1; 4690 4691 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4692 strsrch->search->matchedIndex = start; 4693 strsrch->search->matchedLength = end - start; 4694 return TRUE; 4695 } else { 4696 setMatchNotFound(strsrch); 4697 return FALSE; 4698 } 4699 #endif 4700 } 4701 4702 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4703 UErrorCode *status) 4704 { 4705 if (U_FAILURE(*status)) { 4706 setMatchNotFound(strsrch); 4707 return FALSE; 4708 } 4709 4710 #if BOYER_MOORE 4711 UCollationElements *coleiter = strsrch->textIter; 4712 int32_t *patternce = strsrch->pattern.ces; 4713 int32_t patterncelength = strsrch->pattern.cesLength; 4714 int32_t textoffset = ucol_getOffset(coleiter); 4715 UBool hasPatternAccents = 4716 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4717 4718 // shifting it check for setting offset 4719 // if setOffset is called previously or there was no previous match, we 4720 // leave the offset as it is. 4721 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4722 textoffset = strsrch->search->matchedIndex; 4723 } 4724 4725 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4726 patterncelength); 4727 strsrch->canonicalPrefixAccents[0] = 0; 4728 strsrch->canonicalSuffixAccents[0] = 0; 4729 4730 while (textoffset >= 0) 4731 { 4732 int32_t patternceindex = 1; 4733 int32_t targetce; 4734 UBool found = FALSE; 4735 int32_t firstce = UCOL_NULLORDER; 4736 4737 setColEIterOffset(coleiter, textoffset); 4738 for (;;) { 4739 // finding the first pattern ce match, imagine composite 4740 // characters. for example: search for pattern \u0300 in text 4741 // \u00C0, we'll have to skip A first before we get to 4742 // \u0300 the grave accent 4743 targetce = ucol_next(coleiter, status); 4744 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4745 found = FALSE; 4746 break; 4747 } 4748 targetce = getCE(strsrch, targetce); 4749 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4750 firstce = targetce; 4751 } 4752 4753 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4754 if (targetce == patternce[0]) { 4755 // the first ce can be a contraction 4756 found = TRUE; 4757 break; 4758 } 4759 if (!hasExpansion(coleiter)) { 4760 // checking for accents in composite character 4761 found = FALSE; 4762 break; 4763 } 4764 } 4765 4766 targetce = firstce; 4767 4768 while (found && patternceindex < patterncelength) { 4769 targetce = ucol_next(coleiter, status); 4770 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4771 found = FALSE; 4772 break; 4773 } 4774 targetce = getCE(strsrch, targetce); 4775 if (targetce == UCOL_IGNORABLE) { 4776 continue; 4777 } 4778 4779 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4780 found = found && targetce == patternce[patternceindex]; 4781 patternceindex ++; 4782 } 4783 4784 // initializing the rearranged accent array 4785 if (hasPatternAccents && !found) { 4786 strsrch->canonicalPrefixAccents[0] = 0; 4787 strsrch->canonicalSuffixAccents[0] = 0; 4788 if (U_FAILURE(*status)) { 4789 break; 4790 } 4791 found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4792 } 4793 4794 if (!found) { 4795 if (U_FAILURE(*status)) { 4796 break; 4797 } 4798 textoffset = reverseShift(strsrch, textoffset, targetce, 4799 patternceindex); 4800 patternceindex = 0; 4801 continue; 4802 } 4803 4804 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4805 setColEIterOffset(coleiter, textoffset); 4806 return TRUE; 4807 } 4808 } 4809 setMatchNotFound(strsrch); 4810 return FALSE; 4811 #else 4812 int32_t textOffset; 4813 4814 if (strsrch->search->isOverlap) { 4815 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4816 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4817 } else { 4818 // move the start position at the end of possible match 4819 initializePatternPCETable(strsrch, status); 4820 if (!initTextProcessedIter(strsrch, status)) { 4821 setMatchNotFound(strsrch); 4822 return FALSE; 4823 } 4824 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 4825 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4826 if (pce == UCOL_PROCESSED_NULLORDER) { 4827 // at the end of the text 4828 break; 4829 } 4830 } 4831 if (U_FAILURE(*status)) { 4832 setMatchNotFound(strsrch); 4833 return FALSE; 4834 } 4835 textOffset = ucol_getOffset(strsrch->textIter); 4836 } 4837 } else { 4838 textOffset = ucol_getOffset(strsrch->textIter); 4839 } 4840 4841 int32_t start = -1; 4842 int32_t end = -1; 4843 4844 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4845 strsrch->search->matchedIndex = start; 4846 strsrch->search->matchedLength = end - start; 4847 return TRUE; 4848 } else { 4849 setMatchNotFound(strsrch); 4850 return FALSE; 4851 } 4852 #endif 4853 } 4854 4855 #endif /* #if !UCONFIG_NO_COLLATION */ 4856