1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2004-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: uregex.cpp 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 14 15 #include "unicode/regex.h" 16 #include "unicode/uregex.h" 17 #include "unicode/unistr.h" 18 #include "unicode/ustring.h" 19 #include "unicode/uchar.h" 20 #include "unicode/uobject.h" 21 #include "unicode/utf16.h" 22 #include "cmemory.h" 23 #include "uassert.h" 24 #include "uhash.h" 25 #include "umutex.h" 26 #include "uvectr32.h" 27 28 #include "regextxt.h" 29 30 U_NAMESPACE_BEGIN 31 32 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 33 34 struct RegularExpression: public UMemory { 35 public: 36 RegularExpression(); 37 ~RegularExpression(); 38 int32_t fMagic; 39 RegexPattern *fPat; 40 u_atomic_int32_t *fPatRefCount; 41 UChar *fPatString; 42 int32_t fPatStringLen; 43 RegexMatcher *fMatcher; 44 const UChar *fText; // Text from setText() 45 int32_t fTextLength; // Length provided by user with setText(), which 46 // may be -1. 47 UBool fOwnsText; 48 }; 49 50 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 51 52 RegularExpression::RegularExpression() { 53 fMagic = REXP_MAGIC; 54 fPat = NULL; 55 fPatRefCount = NULL; 56 fPatString = NULL; 57 fPatStringLen = 0; 58 fMatcher = NULL; 59 fText = NULL; 60 fTextLength = 0; 61 fOwnsText = FALSE; 62 } 63 64 RegularExpression::~RegularExpression() { 65 delete fMatcher; 66 fMatcher = NULL; 67 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 68 delete fPat; 69 uprv_free(fPatString); 70 uprv_free((void *)fPatRefCount); 71 } 72 if (fOwnsText && fText!=NULL) { 73 uprv_free((void *)fText); 74 } 75 fMagic = 0; 76 } 77 78 U_NAMESPACE_END 79 80 U_NAMESPACE_USE 81 82 //---------------------------------------------------------------------------------------- 83 // 84 // validateRE Do boilerplate style checks on API function parameters. 85 // Return TRUE if they look OK. 86 //---------------------------------------------------------------------------------------- 87 static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { 88 if (U_FAILURE(*status)) { 89 return FALSE; 90 } 91 if (re == NULL || re->fMagic != REXP_MAGIC) { 92 *status = U_ILLEGAL_ARGUMENT_ERROR; 93 return FALSE; 94 } 95 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 96 if (requiresText && re->fText == NULL && !re->fOwnsText) { 97 *status = U_REGEX_INVALID_STATE; 98 return FALSE; 99 } 100 return TRUE; 101 } 102 103 //---------------------------------------------------------------------------------------- 104 // 105 // uregex_open 106 // 107 //---------------------------------------------------------------------------------------- 108 U_CAPI URegularExpression * U_EXPORT2 109 uregex_open( const UChar *pattern, 110 int32_t patternLength, 111 uint32_t flags, 112 UParseError *pe, 113 UErrorCode *status) { 114 115 if (U_FAILURE(*status)) { 116 return NULL; 117 } 118 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 119 *status = U_ILLEGAL_ARGUMENT_ERROR; 120 return NULL; 121 } 122 int32_t actualPatLen = patternLength; 123 if (actualPatLen == -1) { 124 actualPatLen = u_strlen(pattern); 125 } 126 127 RegularExpression *re = new RegularExpression; 128 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); 129 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 130 if (re == NULL || refC == NULL || patBuf == NULL) { 131 *status = U_MEMORY_ALLOCATION_ERROR; 132 delete re; 133 uprv_free((void *)refC); 134 uprv_free(patBuf); 135 return NULL; 136 } 137 re->fPatRefCount = refC; 138 *re->fPatRefCount = 1; 139 140 // 141 // Make a copy of the pattern string, so we can return it later if asked. 142 // For compiling the pattern, we will use a UText wrapper around 143 // this local copy, to avoid making even more copies. 144 // 145 re->fPatString = patBuf; 146 re->fPatStringLen = patternLength; 147 u_memcpy(patBuf, pattern, actualPatLen); 148 patBuf[actualPatLen] = 0; 149 150 UText patText = UTEXT_INITIALIZER; 151 utext_openUChars(&patText, patBuf, patternLength, status); 152 153 // 154 // Compile the pattern 155 // 156 if (pe != NULL) { 157 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 158 } else { 159 re->fPat = RegexPattern::compile(&patText, flags, *status); 160 } 161 utext_close(&patText); 162 163 if (U_FAILURE(*status)) { 164 goto ErrorExit; 165 } 166 167 // 168 // Create the matcher object 169 // 170 re->fMatcher = re->fPat->matcher(*status); 171 if (U_SUCCESS(*status)) { 172 return (URegularExpression*)re; 173 } 174 175 ErrorExit: 176 delete re; 177 return NULL; 178 179 } 180 181 //---------------------------------------------------------------------------------------- 182 // 183 // uregex_openUText 184 // 185 //---------------------------------------------------------------------------------------- 186 U_CAPI URegularExpression * U_EXPORT2 187 uregex_openUText(UText *pattern, 188 uint32_t flags, 189 UParseError *pe, 190 UErrorCode *status) { 191 192 if (U_FAILURE(*status)) { 193 return NULL; 194 } 195 if (pattern == NULL) { 196 *status = U_ILLEGAL_ARGUMENT_ERROR; 197 return NULL; 198 } 199 200 int64_t patternNativeLength = utext_nativeLength(pattern); 201 202 if (patternNativeLength == 0) { 203 *status = U_ILLEGAL_ARGUMENT_ERROR; 204 return NULL; 205 } 206 207 RegularExpression *re = new RegularExpression; 208 209 UErrorCode lengthStatus = U_ZERO_ERROR; 210 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 211 212 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); 213 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 214 if (re == NULL || refC == NULL || patBuf == NULL) { 215 *status = U_MEMORY_ALLOCATION_ERROR; 216 delete re; 217 uprv_free((void *)refC); 218 uprv_free(patBuf); 219 return NULL; 220 } 221 re->fPatRefCount = refC; 222 *re->fPatRefCount = 1; 223 224 // 225 // Make a copy of the pattern string, so we can return it later if asked. 226 // For compiling the pattern, we will use a read-only UText wrapper 227 // around this local copy, to avoid making even more copies. 228 // 229 re->fPatString = patBuf; 230 re->fPatStringLen = pattern16Length; 231 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 232 233 UText patText = UTEXT_INITIALIZER; 234 utext_openUChars(&patText, patBuf, pattern16Length, status); 235 236 // 237 // Compile the pattern 238 // 239 if (pe != NULL) { 240 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 241 } else { 242 re->fPat = RegexPattern::compile(&patText, flags, *status); 243 } 244 utext_close(&patText); 245 246 if (U_FAILURE(*status)) { 247 goto ErrorExit; 248 } 249 250 // 251 // Create the matcher object 252 // 253 re->fMatcher = re->fPat->matcher(*status); 254 if (U_SUCCESS(*status)) { 255 return (URegularExpression*)re; 256 } 257 258 ErrorExit: 259 delete re; 260 return NULL; 261 262 } 263 264 //---------------------------------------------------------------------------------------- 265 // 266 // uregex_close 267 // 268 //---------------------------------------------------------------------------------------- 269 U_CAPI void U_EXPORT2 270 uregex_close(URegularExpression *re2) { 271 RegularExpression *re = (RegularExpression*)re2; 272 UErrorCode status = U_ZERO_ERROR; 273 if (validateRE(re, FALSE, &status) == FALSE) { 274 return; 275 } 276 delete re; 277 } 278 279 280 //---------------------------------------------------------------------------------------- 281 // 282 // uregex_clone 283 // 284 //---------------------------------------------------------------------------------------- 285 U_CAPI URegularExpression * U_EXPORT2 286 uregex_clone(const URegularExpression *source2, UErrorCode *status) { 287 RegularExpression *source = (RegularExpression*)source2; 288 if (validateRE(source, FALSE, status) == FALSE) { 289 return NULL; 290 } 291 292 RegularExpression *clone = new RegularExpression; 293 if (clone == NULL) { 294 *status = U_MEMORY_ALLOCATION_ERROR; 295 return NULL; 296 } 297 298 clone->fMatcher = source->fPat->matcher(*status); 299 if (U_FAILURE(*status)) { 300 delete clone; 301 return NULL; 302 } 303 304 clone->fPat = source->fPat; 305 clone->fPatRefCount = source->fPatRefCount; 306 clone->fPatString = source->fPatString; 307 clone->fPatStringLen = source->fPatStringLen; 308 umtx_atomic_inc(source->fPatRefCount); 309 // Note: fText is not cloned. 310 311 return (URegularExpression*)clone; 312 } 313 314 315 316 317 //------------------------------------------------------------------------------ 318 // 319 // uregex_pattern 320 // 321 //------------------------------------------------------------------------------ 322 U_CAPI const UChar * U_EXPORT2 323 uregex_pattern(const URegularExpression *regexp2, 324 int32_t *patLength, 325 UErrorCode *status) { 326 RegularExpression *regexp = (RegularExpression*)regexp2; 327 328 if (validateRE(regexp, FALSE, status) == FALSE) { 329 return NULL; 330 } 331 if (patLength != NULL) { 332 *patLength = regexp->fPatStringLen; 333 } 334 return regexp->fPatString; 335 } 336 337 338 //------------------------------------------------------------------------------ 339 // 340 // uregex_patternUText 341 // 342 //------------------------------------------------------------------------------ 343 U_CAPI UText * U_EXPORT2 344 uregex_patternUText(const URegularExpression *regexp2, 345 UErrorCode *status) { 346 RegularExpression *regexp = (RegularExpression*)regexp2; 347 return regexp->fPat->patternText(*status); 348 } 349 350 351 //------------------------------------------------------------------------------ 352 // 353 // uregex_flags 354 // 355 //------------------------------------------------------------------------------ 356 U_CAPI int32_t U_EXPORT2 357 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { 358 RegularExpression *regexp = (RegularExpression*)regexp2; 359 if (validateRE(regexp, FALSE, status) == FALSE) { 360 return 0; 361 } 362 int32_t flags = regexp->fPat->flags(); 363 return flags; 364 } 365 366 367 //------------------------------------------------------------------------------ 368 // 369 // uregex_setText 370 // 371 //------------------------------------------------------------------------------ 372 U_CAPI void U_EXPORT2 373 uregex_setText(URegularExpression *regexp2, 374 const UChar *text, 375 int32_t textLength, 376 UErrorCode *status) { 377 RegularExpression *regexp = (RegularExpression*)regexp2; 378 if (validateRE(regexp, FALSE, status) == FALSE) { 379 return; 380 } 381 if (text == NULL || textLength < -1) { 382 *status = U_ILLEGAL_ARGUMENT_ERROR; 383 return; 384 } 385 386 if (regexp->fOwnsText && regexp->fText != NULL) { 387 uprv_free((void *)regexp->fText); 388 } 389 390 regexp->fText = text; 391 regexp->fTextLength = textLength; 392 regexp->fOwnsText = FALSE; 393 394 UText input = UTEXT_INITIALIZER; 395 utext_openUChars(&input, text, textLength, status); 396 regexp->fMatcher->reset(&input); 397 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 398 } 399 400 401 //------------------------------------------------------------------------------ 402 // 403 // uregex_setUText 404 // 405 //------------------------------------------------------------------------------ 406 U_CAPI void U_EXPORT2 407 uregex_setUText(URegularExpression *regexp2, 408 UText *text, 409 UErrorCode *status) { 410 RegularExpression *regexp = (RegularExpression*)regexp2; 411 if (validateRE(regexp, FALSE, status) == FALSE) { 412 return; 413 } 414 if (text == NULL) { 415 *status = U_ILLEGAL_ARGUMENT_ERROR; 416 return; 417 } 418 419 if (regexp->fOwnsText && regexp->fText != NULL) { 420 uprv_free((void *)regexp->fText); 421 } 422 423 regexp->fText = NULL; // only fill it in on request 424 regexp->fTextLength = -1; 425 regexp->fOwnsText = TRUE; 426 regexp->fMatcher->reset(text); 427 } 428 429 430 431 //------------------------------------------------------------------------------ 432 // 433 // uregex_getText 434 // 435 //------------------------------------------------------------------------------ 436 U_CAPI const UChar * U_EXPORT2 437 uregex_getText(URegularExpression *regexp2, 438 int32_t *textLength, 439 UErrorCode *status) { 440 RegularExpression *regexp = (RegularExpression*)regexp2; 441 if (validateRE(regexp, FALSE, status) == FALSE) { 442 return NULL; 443 } 444 445 if (regexp->fText == NULL) { 446 // need to fill in the text 447 UText *inputText = regexp->fMatcher->inputText(); 448 int64_t inputNativeLength = utext_nativeLength(inputText); 449 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 450 regexp->fText = inputText->chunkContents; 451 regexp->fTextLength = (int32_t)inputNativeLength; 452 regexp->fOwnsText = FALSE; // because the UText owns it 453 } else { 454 UErrorCode lengthStatus = U_ZERO_ERROR; 455 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 456 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 457 458 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 459 regexp->fText = inputChars; 460 regexp->fOwnsText = TRUE; // should already be set but just in case 461 } 462 } 463 464 if (textLength != NULL) { 465 *textLength = regexp->fTextLength; 466 } 467 return regexp->fText; 468 } 469 470 471 //------------------------------------------------------------------------------ 472 // 473 // uregex_getUText 474 // 475 //------------------------------------------------------------------------------ 476 U_CAPI UText * U_EXPORT2 477 uregex_getUText(URegularExpression *regexp2, 478 UText *dest, 479 UErrorCode *status) { 480 RegularExpression *regexp = (RegularExpression*)regexp2; 481 if (validateRE(regexp, FALSE, status) == FALSE) { 482 return dest; 483 } 484 return regexp->fMatcher->getInput(dest, *status); 485 } 486 487 488 //------------------------------------------------------------------------------ 489 // 490 // uregex_refreshUText 491 // 492 //------------------------------------------------------------------------------ 493 U_CAPI void U_EXPORT2 494 uregex_refreshUText(URegularExpression *regexp2, 495 UText *text, 496 UErrorCode *status) { 497 RegularExpression *regexp = (RegularExpression*)regexp2; 498 if (validateRE(regexp, FALSE, status) == FALSE) { 499 return; 500 } 501 regexp->fMatcher->refreshInputText(text, *status); 502 } 503 504 505 //------------------------------------------------------------------------------ 506 // 507 // uregex_matches 508 // 509 //------------------------------------------------------------------------------ 510 U_CAPI UBool U_EXPORT2 511 uregex_matches(URegularExpression *regexp2, 512 int32_t startIndex, 513 UErrorCode *status) { 514 return uregex_matches64( regexp2, (int64_t)startIndex, status); 515 } 516 517 U_CAPI UBool U_EXPORT2 518 uregex_matches64(URegularExpression *regexp2, 519 int64_t startIndex, 520 UErrorCode *status) { 521 RegularExpression *regexp = (RegularExpression*)regexp2; 522 UBool result = FALSE; 523 if (validateRE(regexp, TRUE, status) == FALSE) { 524 return result; 525 } 526 if (startIndex == -1) { 527 result = regexp->fMatcher->matches(*status); 528 } else { 529 result = regexp->fMatcher->matches(startIndex, *status); 530 } 531 return result; 532 } 533 534 535 //------------------------------------------------------------------------------ 536 // 537 // uregex_lookingAt 538 // 539 //------------------------------------------------------------------------------ 540 U_CAPI UBool U_EXPORT2 541 uregex_lookingAt(URegularExpression *regexp2, 542 int32_t startIndex, 543 UErrorCode *status) { 544 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); 545 } 546 547 U_CAPI UBool U_EXPORT2 548 uregex_lookingAt64(URegularExpression *regexp2, 549 int64_t startIndex, 550 UErrorCode *status) { 551 RegularExpression *regexp = (RegularExpression*)regexp2; 552 UBool result = FALSE; 553 if (validateRE(regexp, TRUE, status) == FALSE) { 554 return result; 555 } 556 if (startIndex == -1) { 557 result = regexp->fMatcher->lookingAt(*status); 558 } else { 559 result = regexp->fMatcher->lookingAt(startIndex, *status); 560 } 561 return result; 562 } 563 564 565 566 //------------------------------------------------------------------------------ 567 // 568 // uregex_find 569 // 570 //------------------------------------------------------------------------------ 571 U_CAPI UBool U_EXPORT2 572 uregex_find(URegularExpression *regexp2, 573 int32_t startIndex, 574 UErrorCode *status) { 575 return uregex_find64( regexp2, (int64_t)startIndex, status); 576 } 577 578 U_CAPI UBool U_EXPORT2 579 uregex_find64(URegularExpression *regexp2, 580 int64_t startIndex, 581 UErrorCode *status) { 582 RegularExpression *regexp = (RegularExpression*)regexp2; 583 UBool result = FALSE; 584 if (validateRE(regexp, TRUE, status) == FALSE) { 585 return result; 586 } 587 if (startIndex == -1) { 588 regexp->fMatcher->resetPreserveRegion(); 589 result = regexp->fMatcher->find(*status); 590 } else { 591 result = regexp->fMatcher->find(startIndex, *status); 592 } 593 return result; 594 } 595 596 597 //------------------------------------------------------------------------------ 598 // 599 // uregex_findNext 600 // 601 //------------------------------------------------------------------------------ 602 U_CAPI UBool U_EXPORT2 603 uregex_findNext(URegularExpression *regexp2, 604 UErrorCode *status) { 605 RegularExpression *regexp = (RegularExpression*)regexp2; 606 if (validateRE(regexp, TRUE, status) == FALSE) { 607 return FALSE; 608 } 609 UBool result = regexp->fMatcher->find(*status); 610 return result; 611 } 612 613 //------------------------------------------------------------------------------ 614 // 615 // uregex_groupCount 616 // 617 //------------------------------------------------------------------------------ 618 U_CAPI int32_t U_EXPORT2 619 uregex_groupCount(URegularExpression *regexp2, 620 UErrorCode *status) { 621 RegularExpression *regexp = (RegularExpression*)regexp2; 622 if (validateRE(regexp, FALSE, status) == FALSE) { 623 return 0; 624 } 625 int32_t result = regexp->fMatcher->groupCount(); 626 return result; 627 } 628 629 630 //------------------------------------------------------------------------------ 631 // 632 // uregex_groupNumberFromName 633 // 634 //------------------------------------------------------------------------------ 635 int32_t 636 uregex_groupNumberFromName(URegularExpression *regexp2, 637 const UChar *groupName, 638 int32_t nameLength, 639 UErrorCode *status) { 640 RegularExpression *regexp = (RegularExpression*)regexp2; 641 if (validateRE(regexp, FALSE, status) == FALSE) { 642 return 0; 643 } 644 int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); 645 return result; 646 } 647 648 int32_t 649 uregex_groupNumberFromCName(URegularExpression *regexp2, 650 const char *groupName, 651 int32_t nameLength, 652 UErrorCode *status) { 653 RegularExpression *regexp = (RegularExpression*)regexp2; 654 if (validateRE(regexp, FALSE, status) == FALSE) { 655 return 0; 656 } 657 return regexp->fPat->groupNumberFromName(groupName, nameLength, *status); 658 } 659 660 //------------------------------------------------------------------------------ 661 // 662 // uregex_group 663 // 664 //------------------------------------------------------------------------------ 665 U_CAPI int32_t U_EXPORT2 666 uregex_group(URegularExpression *regexp2, 667 int32_t groupNum, 668 UChar *dest, 669 int32_t destCapacity, 670 UErrorCode *status) { 671 RegularExpression *regexp = (RegularExpression*)regexp2; 672 if (validateRE(regexp, TRUE, status) == FALSE) { 673 return 0; 674 } 675 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 676 *status = U_ILLEGAL_ARGUMENT_ERROR; 677 return 0; 678 } 679 680 if (destCapacity == 0 || regexp->fText != NULL) { 681 // If preflighting or if we already have the text as UChars, 682 // this is a little cheaper than extracting from the UText 683 684 // 685 // Pick up the range of characters from the matcher 686 // 687 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 688 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 689 if (U_FAILURE(*status)) { 690 return 0; 691 } 692 693 // 694 // Trim length based on buffer capacity 695 // 696 int32_t fullLength = endIx - startIx; 697 int32_t copyLength = fullLength; 698 if (copyLength < destCapacity) { 699 dest[copyLength] = 0; 700 } else if (copyLength == destCapacity) { 701 *status = U_STRING_NOT_TERMINATED_WARNING; 702 } else { 703 copyLength = destCapacity; 704 *status = U_BUFFER_OVERFLOW_ERROR; 705 } 706 707 // 708 // Copy capture group to user's buffer 709 // 710 if (copyLength > 0) { 711 u_memcpy(dest, ®exp->fText[startIx], copyLength); 712 } 713 return fullLength; 714 } else { 715 int64_t start = regexp->fMatcher->start64(groupNum, *status); 716 int64_t limit = regexp->fMatcher->end64(groupNum, *status); 717 if (U_FAILURE(*status)) { 718 return 0; 719 } 720 // Note edge cases: 721 // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. 722 // Zero Length Match: start == end. 723 int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); 724 return length; 725 } 726 727 } 728 729 730 //------------------------------------------------------------------------------ 731 // 732 // uregex_groupUText 733 // 734 //------------------------------------------------------------------------------ 735 U_CAPI UText * U_EXPORT2 736 uregex_groupUText(URegularExpression *regexp2, 737 int32_t groupNum, 738 UText *dest, 739 int64_t *groupLength, 740 UErrorCode *status) { 741 RegularExpression *regexp = (RegularExpression*)regexp2; 742 if (validateRE(regexp, TRUE, status) == FALSE) { 743 UErrorCode emptyTextStatus = U_ZERO_ERROR; 744 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 745 } 746 747 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); 748 } 749 750 //------------------------------------------------------------------------------ 751 // 752 // uregex_start 753 // 754 //------------------------------------------------------------------------------ 755 U_CAPI int32_t U_EXPORT2 756 uregex_start(URegularExpression *regexp2, 757 int32_t groupNum, 758 UErrorCode *status) { 759 return (int32_t)uregex_start64( regexp2, groupNum, status); 760 } 761 762 U_CAPI int64_t U_EXPORT2 763 uregex_start64(URegularExpression *regexp2, 764 int32_t groupNum, 765 UErrorCode *status) { 766 RegularExpression *regexp = (RegularExpression*)regexp2; 767 if (validateRE(regexp, TRUE, status) == FALSE) { 768 return 0; 769 } 770 int32_t result = regexp->fMatcher->start(groupNum, *status); 771 return result; 772 } 773 774 //------------------------------------------------------------------------------ 775 // 776 // uregex_end 777 // 778 //------------------------------------------------------------------------------ 779 U_CAPI int32_t U_EXPORT2 780 uregex_end(URegularExpression *regexp2, 781 int32_t groupNum, 782 UErrorCode *status) { 783 return (int32_t)uregex_end64( regexp2, groupNum, status); 784 } 785 786 U_CAPI int64_t U_EXPORT2 787 uregex_end64(URegularExpression *regexp2, 788 int32_t groupNum, 789 UErrorCode *status) { 790 RegularExpression *regexp = (RegularExpression*)regexp2; 791 if (validateRE(regexp, TRUE, status) == FALSE) { 792 return 0; 793 } 794 int32_t result = regexp->fMatcher->end(groupNum, *status); 795 return result; 796 } 797 798 //------------------------------------------------------------------------------ 799 // 800 // uregex_reset 801 // 802 //------------------------------------------------------------------------------ 803 U_CAPI void U_EXPORT2 804 uregex_reset(URegularExpression *regexp2, 805 int32_t index, 806 UErrorCode *status) { 807 uregex_reset64( regexp2, (int64_t)index, status); 808 } 809 810 U_CAPI void U_EXPORT2 811 uregex_reset64(URegularExpression *regexp2, 812 int64_t index, 813 UErrorCode *status) { 814 RegularExpression *regexp = (RegularExpression*)regexp2; 815 if (validateRE(regexp, TRUE, status) == FALSE) { 816 return; 817 } 818 regexp->fMatcher->reset(index, *status); 819 } 820 821 822 //------------------------------------------------------------------------------ 823 // 824 // uregex_setRegion 825 // 826 //------------------------------------------------------------------------------ 827 U_CAPI void U_EXPORT2 828 uregex_setRegion(URegularExpression *regexp2, 829 int32_t regionStart, 830 int32_t regionLimit, 831 UErrorCode *status) { 832 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); 833 } 834 835 U_CAPI void U_EXPORT2 836 uregex_setRegion64(URegularExpression *regexp2, 837 int64_t regionStart, 838 int64_t regionLimit, 839 UErrorCode *status) { 840 RegularExpression *regexp = (RegularExpression*)regexp2; 841 if (validateRE(regexp, TRUE, status) == FALSE) { 842 return; 843 } 844 regexp->fMatcher->region(regionStart, regionLimit, *status); 845 } 846 847 848 //------------------------------------------------------------------------------ 849 // 850 // uregex_setRegionAndStart 851 // 852 //------------------------------------------------------------------------------ 853 U_CAPI void U_EXPORT2 854 uregex_setRegionAndStart(URegularExpression *regexp2, 855 int64_t regionStart, 856 int64_t regionLimit, 857 int64_t startIndex, 858 UErrorCode *status) { 859 RegularExpression *regexp = (RegularExpression*)regexp2; 860 if (validateRE(regexp, TRUE, status) == FALSE) { 861 return; 862 } 863 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); 864 } 865 866 //------------------------------------------------------------------------------ 867 // 868 // uregex_regionStart 869 // 870 //------------------------------------------------------------------------------ 871 U_CAPI int32_t U_EXPORT2 872 uregex_regionStart(const URegularExpression *regexp2, 873 UErrorCode *status) { 874 return (int32_t)uregex_regionStart64(regexp2, status); 875 } 876 877 U_CAPI int64_t U_EXPORT2 878 uregex_regionStart64(const URegularExpression *regexp2, 879 UErrorCode *status) { 880 RegularExpression *regexp = (RegularExpression*)regexp2; 881 if (validateRE(regexp, TRUE, status) == FALSE) { 882 return 0; 883 } 884 return regexp->fMatcher->regionStart(); 885 } 886 887 888 //------------------------------------------------------------------------------ 889 // 890 // uregex_regionEnd 891 // 892 //------------------------------------------------------------------------------ 893 U_CAPI int32_t U_EXPORT2 894 uregex_regionEnd(const URegularExpression *regexp2, 895 UErrorCode *status) { 896 return (int32_t)uregex_regionEnd64(regexp2, status); 897 } 898 899 U_CAPI int64_t U_EXPORT2 900 uregex_regionEnd64(const URegularExpression *regexp2, 901 UErrorCode *status) { 902 RegularExpression *regexp = (RegularExpression*)regexp2; 903 if (validateRE(regexp, TRUE, status) == FALSE) { 904 return 0; 905 } 906 return regexp->fMatcher->regionEnd(); 907 } 908 909 910 //------------------------------------------------------------------------------ 911 // 912 // uregex_hasTransparentBounds 913 // 914 //------------------------------------------------------------------------------ 915 U_CAPI UBool U_EXPORT2 916 uregex_hasTransparentBounds(const URegularExpression *regexp2, 917 UErrorCode *status) { 918 RegularExpression *regexp = (RegularExpression*)regexp2; 919 if (validateRE(regexp, FALSE, status) == FALSE) { 920 return FALSE; 921 } 922 return regexp->fMatcher->hasTransparentBounds(); 923 } 924 925 926 //------------------------------------------------------------------------------ 927 // 928 // uregex_useTransparentBounds 929 // 930 //------------------------------------------------------------------------------ 931 U_CAPI void U_EXPORT2 932 uregex_useTransparentBounds(URegularExpression *regexp2, 933 UBool b, 934 UErrorCode *status) { 935 RegularExpression *regexp = (RegularExpression*)regexp2; 936 if (validateRE(regexp, FALSE, status) == FALSE) { 937 return; 938 } 939 regexp->fMatcher->useTransparentBounds(b); 940 } 941 942 943 //------------------------------------------------------------------------------ 944 // 945 // uregex_hasAnchoringBounds 946 // 947 //------------------------------------------------------------------------------ 948 U_CAPI UBool U_EXPORT2 949 uregex_hasAnchoringBounds(const URegularExpression *regexp2, 950 UErrorCode *status) { 951 RegularExpression *regexp = (RegularExpression*)regexp2; 952 if (validateRE(regexp, FALSE, status) == FALSE) { 953 return FALSE; 954 } 955 return regexp->fMatcher->hasAnchoringBounds(); 956 } 957 958 959 //------------------------------------------------------------------------------ 960 // 961 // uregex_useAnchoringBounds 962 // 963 //------------------------------------------------------------------------------ 964 U_CAPI void U_EXPORT2 965 uregex_useAnchoringBounds(URegularExpression *regexp2, 966 UBool b, 967 UErrorCode *status) { 968 RegularExpression *regexp = (RegularExpression*)regexp2; 969 if (validateRE(regexp, FALSE, status) == FALSE) { 970 return; 971 } 972 regexp->fMatcher->useAnchoringBounds(b); 973 } 974 975 976 //------------------------------------------------------------------------------ 977 // 978 // uregex_hitEnd 979 // 980 //------------------------------------------------------------------------------ 981 U_CAPI UBool U_EXPORT2 982 uregex_hitEnd(const URegularExpression *regexp2, 983 UErrorCode *status) { 984 RegularExpression *regexp = (RegularExpression*)regexp2; 985 if (validateRE(regexp, TRUE, status) == FALSE) { 986 return FALSE; 987 } 988 return regexp->fMatcher->hitEnd(); 989 } 990 991 992 //------------------------------------------------------------------------------ 993 // 994 // uregex_requireEnd 995 // 996 //------------------------------------------------------------------------------ 997 U_CAPI UBool U_EXPORT2 998 uregex_requireEnd(const URegularExpression *regexp2, 999 UErrorCode *status) { 1000 RegularExpression *regexp = (RegularExpression*)regexp2; 1001 if (validateRE(regexp, TRUE, status) == FALSE) { 1002 return FALSE; 1003 } 1004 return regexp->fMatcher->requireEnd(); 1005 } 1006 1007 1008 //------------------------------------------------------------------------------ 1009 // 1010 // uregex_setTimeLimit 1011 // 1012 //------------------------------------------------------------------------------ 1013 U_CAPI void U_EXPORT2 1014 uregex_setTimeLimit(URegularExpression *regexp2, 1015 int32_t limit, 1016 UErrorCode *status) { 1017 RegularExpression *regexp = (RegularExpression*)regexp2; 1018 if (validateRE(regexp, FALSE, status)) { 1019 regexp->fMatcher->setTimeLimit(limit, *status); 1020 } 1021 } 1022 1023 1024 1025 //------------------------------------------------------------------------------ 1026 // 1027 // uregex_getTimeLimit 1028 // 1029 //------------------------------------------------------------------------------ 1030 U_CAPI int32_t U_EXPORT2 1031 uregex_getTimeLimit(const URegularExpression *regexp2, 1032 UErrorCode *status) { 1033 int32_t retVal = 0; 1034 RegularExpression *regexp = (RegularExpression*)regexp2; 1035 if (validateRE(regexp, FALSE, status)) { 1036 retVal = regexp->fMatcher->getTimeLimit(); 1037 } 1038 return retVal; 1039 } 1040 1041 1042 1043 //------------------------------------------------------------------------------ 1044 // 1045 // uregex_setStackLimit 1046 // 1047 //------------------------------------------------------------------------------ 1048 U_CAPI void U_EXPORT2 1049 uregex_setStackLimit(URegularExpression *regexp2, 1050 int32_t limit, 1051 UErrorCode *status) { 1052 RegularExpression *regexp = (RegularExpression*)regexp2; 1053 if (validateRE(regexp, FALSE, status)) { 1054 regexp->fMatcher->setStackLimit(limit, *status); 1055 } 1056 } 1057 1058 1059 1060 //------------------------------------------------------------------------------ 1061 // 1062 // uregex_getStackLimit 1063 // 1064 //------------------------------------------------------------------------------ 1065 U_CAPI int32_t U_EXPORT2 1066 uregex_getStackLimit(const URegularExpression *regexp2, 1067 UErrorCode *status) { 1068 int32_t retVal = 0; 1069 RegularExpression *regexp = (RegularExpression*)regexp2; 1070 if (validateRE(regexp, FALSE, status)) { 1071 retVal = regexp->fMatcher->getStackLimit(); 1072 } 1073 return retVal; 1074 } 1075 1076 1077 //------------------------------------------------------------------------------ 1078 // 1079 // uregex_setMatchCallback 1080 // 1081 //------------------------------------------------------------------------------ 1082 U_CAPI void U_EXPORT2 1083 uregex_setMatchCallback(URegularExpression *regexp2, 1084 URegexMatchCallback *callback, 1085 const void *context, 1086 UErrorCode *status) { 1087 RegularExpression *regexp = (RegularExpression*)regexp2; 1088 if (validateRE(regexp, FALSE, status)) { 1089 regexp->fMatcher->setMatchCallback(callback, context, *status); 1090 } 1091 } 1092 1093 1094 //------------------------------------------------------------------------------ 1095 // 1096 // uregex_getMatchCallback 1097 // 1098 //------------------------------------------------------------------------------ 1099 U_CAPI void U_EXPORT2 1100 uregex_getMatchCallback(const URegularExpression *regexp2, 1101 URegexMatchCallback **callback, 1102 const void **context, 1103 UErrorCode *status) { 1104 RegularExpression *regexp = (RegularExpression*)regexp2; 1105 if (validateRE(regexp, FALSE, status)) { 1106 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1107 } 1108 } 1109 1110 1111 //------------------------------------------------------------------------------ 1112 // 1113 // uregex_setMatchProgressCallback 1114 // 1115 //------------------------------------------------------------------------------ 1116 U_CAPI void U_EXPORT2 1117 uregex_setFindProgressCallback(URegularExpression *regexp2, 1118 URegexFindProgressCallback *callback, 1119 const void *context, 1120 UErrorCode *status) { 1121 RegularExpression *regexp = (RegularExpression*)regexp2; 1122 if (validateRE(regexp, FALSE, status)) { 1123 regexp->fMatcher->setFindProgressCallback(callback, context, *status); 1124 } 1125 } 1126 1127 1128 //------------------------------------------------------------------------------ 1129 // 1130 // uregex_getMatchCallback 1131 // 1132 //------------------------------------------------------------------------------ 1133 U_CAPI void U_EXPORT2 1134 uregex_getFindProgressCallback(const URegularExpression *regexp2, 1135 URegexFindProgressCallback **callback, 1136 const void **context, 1137 UErrorCode *status) { 1138 RegularExpression *regexp = (RegularExpression*)regexp2; 1139 if (validateRE(regexp, FALSE, status)) { 1140 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); 1141 } 1142 } 1143 1144 1145 //------------------------------------------------------------------------------ 1146 // 1147 // uregex_replaceAll 1148 // 1149 //------------------------------------------------------------------------------ 1150 U_CAPI int32_t U_EXPORT2 1151 uregex_replaceAll(URegularExpression *regexp2, 1152 const UChar *replacementText, 1153 int32_t replacementLength, 1154 UChar *destBuf, 1155 int32_t destCapacity, 1156 UErrorCode *status) { 1157 RegularExpression *regexp = (RegularExpression*)regexp2; 1158 if (validateRE(regexp, TRUE, status) == FALSE) { 1159 return 0; 1160 } 1161 if (replacementText == NULL || replacementLength < -1 || 1162 (destBuf == NULL && destCapacity > 0) || 1163 destCapacity < 0) { 1164 *status = U_ILLEGAL_ARGUMENT_ERROR; 1165 return 0; 1166 } 1167 1168 int32_t len = 0; 1169 1170 uregex_reset(regexp2, 0, status); 1171 1172 // Note: Seperate error code variables for findNext() and appendReplacement() 1173 // are used so that destination buffer overflow errors 1174 // in appendReplacement won't stop findNext() from working. 1175 // appendReplacement() and appendTail() special case incoming buffer 1176 // overflow errors, continuing to return the correct length. 1177 UErrorCode findStatus = *status; 1178 while (uregex_findNext(regexp2, &findStatus)) { 1179 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1180 &destBuf, &destCapacity, status); 1181 } 1182 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1183 1184 if (U_FAILURE(findStatus)) { 1185 // If anything went wrong with the findNext(), make that error trump 1186 // whatever may have happened with the append() operations. 1187 // Errors in findNext() are not expected. 1188 *status = findStatus; 1189 } 1190 1191 return len; 1192 } 1193 1194 1195 //------------------------------------------------------------------------------ 1196 // 1197 // uregex_replaceAllUText 1198 // 1199 //------------------------------------------------------------------------------ 1200 U_CAPI UText * U_EXPORT2 1201 uregex_replaceAllUText(URegularExpression *regexp2, 1202 UText *replacementText, 1203 UText *dest, 1204 UErrorCode *status) { 1205 RegularExpression *regexp = (RegularExpression*)regexp2; 1206 if (validateRE(regexp, TRUE, status) == FALSE) { 1207 return 0; 1208 } 1209 if (replacementText == NULL) { 1210 *status = U_ILLEGAL_ARGUMENT_ERROR; 1211 return 0; 1212 } 1213 1214 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1215 return dest; 1216 } 1217 1218 1219 //------------------------------------------------------------------------------ 1220 // 1221 // uregex_replaceFirst 1222 // 1223 //------------------------------------------------------------------------------ 1224 U_CAPI int32_t U_EXPORT2 1225 uregex_replaceFirst(URegularExpression *regexp2, 1226 const UChar *replacementText, 1227 int32_t replacementLength, 1228 UChar *destBuf, 1229 int32_t destCapacity, 1230 UErrorCode *status) { 1231 RegularExpression *regexp = (RegularExpression*)regexp2; 1232 if (validateRE(regexp, TRUE, status) == FALSE) { 1233 return 0; 1234 } 1235 if (replacementText == NULL || replacementLength < -1 || 1236 (destBuf == NULL && destCapacity > 0) || 1237 destCapacity < 0) { 1238 *status = U_ILLEGAL_ARGUMENT_ERROR; 1239 return 0; 1240 } 1241 1242 int32_t len = 0; 1243 UBool findSucceeded; 1244 uregex_reset(regexp2, 0, status); 1245 findSucceeded = uregex_find(regexp2, 0, status); 1246 if (findSucceeded) { 1247 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1248 &destBuf, &destCapacity, status); 1249 } 1250 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1251 1252 return len; 1253 } 1254 1255 1256 //------------------------------------------------------------------------------ 1257 // 1258 // uregex_replaceFirstUText 1259 // 1260 //------------------------------------------------------------------------------ 1261 U_CAPI UText * U_EXPORT2 1262 uregex_replaceFirstUText(URegularExpression *regexp2, 1263 UText *replacementText, 1264 UText *dest, 1265 UErrorCode *status) { 1266 RegularExpression *regexp = (RegularExpression*)regexp2; 1267 if (validateRE(regexp, TRUE, status) == FALSE) { 1268 return 0; 1269 } 1270 if (replacementText == NULL) { 1271 *status = U_ILLEGAL_ARGUMENT_ERROR; 1272 return 0; 1273 } 1274 1275 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); 1276 return dest; 1277 } 1278 1279 1280 //------------------------------------------------------------------------------ 1281 // 1282 // uregex_appendReplacement 1283 // 1284 //------------------------------------------------------------------------------ 1285 1286 U_NAMESPACE_BEGIN 1287 // 1288 // Dummy class, because these functions need to be friends of class RegexMatcher, 1289 // and stand-alone C functions don't work as friends 1290 // 1291 class RegexCImpl { 1292 public: 1293 inline static int32_t appendReplacement(RegularExpression *regexp, 1294 const UChar *replacementText, 1295 int32_t replacementLength, 1296 UChar **destBuf, 1297 int32_t *destCapacity, 1298 UErrorCode *status); 1299 1300 inline static int32_t appendTail(RegularExpression *regexp, 1301 UChar **destBuf, 1302 int32_t *destCapacity, 1303 UErrorCode *status); 1304 1305 inline static int32_t split(RegularExpression *regexp, 1306 UChar *destBuf, 1307 int32_t destCapacity, 1308 int32_t *requiredCapacity, 1309 UChar *destFields[], 1310 int32_t destFieldsCapacity, 1311 UErrorCode *status); 1312 }; 1313 1314 U_NAMESPACE_END 1315 1316 1317 1318 static const UChar BACKSLASH = 0x5c; 1319 static const UChar DOLLARSIGN = 0x24; 1320 static const UChar LEFTBRACKET = 0x7b; 1321 static const UChar RIGHTBRACKET = 0x7d; 1322 1323 // 1324 // Move a character to an output buffer, with bounds checking on the index. 1325 // Index advances even if capacity is exceeded, for preflight size computations. 1326 // This little sequence is used a LOT. 1327 // 1328 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1329 if (*idx < bufCapacity) { 1330 buf[*idx] = c; 1331 } 1332 (*idx)++; 1333 } 1334 1335 1336 // 1337 // appendReplacement, the actual implementation. 1338 // 1339 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1340 const UChar *replacementText, 1341 int32_t replacementLength, 1342 UChar **destBuf, 1343 int32_t *destCapacity, 1344 UErrorCode *status) { 1345 1346 // If we come in with a buffer overflow error, don't suppress the operation. 1347 // A series of appendReplacements, appendTail need to correctly preflight 1348 // the buffer size when an overflow happens somewhere in the middle. 1349 UBool pendingBufferOverflow = FALSE; 1350 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1351 pendingBufferOverflow = TRUE; 1352 *status = U_ZERO_ERROR; 1353 } 1354 1355 // 1356 // Validate all paramters 1357 // 1358 if (validateRE(regexp, TRUE, status) == FALSE) { 1359 return 0; 1360 } 1361 if (replacementText == NULL || replacementLength < -1 || 1362 destCapacity == NULL || destBuf == NULL || 1363 (*destBuf == NULL && *destCapacity > 0) || 1364 *destCapacity < 0) { 1365 *status = U_ILLEGAL_ARGUMENT_ERROR; 1366 return 0; 1367 } 1368 1369 RegexMatcher *m = regexp->fMatcher; 1370 if (m->fMatch == FALSE) { 1371 *status = U_REGEX_INVALID_STATE; 1372 return 0; 1373 } 1374 1375 UChar *dest = *destBuf; 1376 int32_t capacity = *destCapacity; 1377 int32_t destIdx = 0; 1378 int32_t i; 1379 1380 // If it wasn't supplied by the caller, get the length of the replacement text. 1381 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1382 // the fly and avoid this step. 1383 if (replacementLength == -1) { 1384 replacementLength = u_strlen(replacementText); 1385 } 1386 1387 // Copy input string from the end of previous match to start of current match 1388 if (regexp->fText != NULL) { 1389 int32_t matchStart; 1390 int32_t lastMatchEnd; 1391 if (UTEXT_USES_U16(m->fInputText)) { 1392 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1393 matchStart = (int32_t)m->fMatchStart; 1394 } else { 1395 // !!!: Would like a better way to do this! 1396 UErrorCode tempStatus = U_ZERO_ERROR; 1397 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus); 1398 tempStatus = U_ZERO_ERROR; 1399 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus); 1400 } 1401 for (i=lastMatchEnd; i<matchStart; i++) { 1402 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1403 } 1404 } else { 1405 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1406 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1407 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), 1408 &possibleOverflowError); 1409 } 1410 U_ASSERT(destIdx >= 0); 1411 1412 // scan the replacement text, looking for substitutions ($n) and \escapes. 1413 int32_t replIdx = 0; 1414 while (replIdx < replacementLength && U_SUCCESS(*status)) { 1415 UChar c = replacementText[replIdx]; 1416 replIdx++; 1417 if (c != DOLLARSIGN && c != BACKSLASH) { 1418 // Common case, no substitution, no escaping, 1419 // just copy the char to the dest buf. 1420 appendToBuf(c, &destIdx, dest, capacity); 1421 continue; 1422 } 1423 1424 if (c == BACKSLASH) { 1425 // Backslash Escape. Copy the following char out without further checks. 1426 // Note: Surrogate pairs don't need any special handling 1427 // The second half wont be a '$' or a '\', and 1428 // will move to the dest normally on the next 1429 // loop iteration. 1430 if (replIdx >= replacementLength) { 1431 break; 1432 } 1433 c = replacementText[replIdx]; 1434 1435 if (c==0x55/*U*/ || c==0x75/*u*/) { 1436 // We have a \udddd or \Udddddddd escape sequence. 1437 UChar32 escapedChar = 1438 u_unescapeAt(uregex_ucstr_unescape_charAt, 1439 &replIdx, // Index is updated by unescapeAt 1440 replacementLength, // Length of replacement text 1441 (void *)replacementText); 1442 1443 if (escapedChar != (UChar32)0xFFFFFFFF) { 1444 if (escapedChar <= 0xffff) { 1445 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1446 } else { 1447 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1448 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1449 } 1450 continue; 1451 } 1452 // Note: if the \u escape was invalid, just fall through and 1453 // treat it as a plain \<anything> escape. 1454 } 1455 1456 // Plain backslash escape. Just put out the escaped character. 1457 appendToBuf(c, &destIdx, dest, capacity); 1458 1459 replIdx++; 1460 continue; 1461 } 1462 1463 // We've got a $. Pick up the following capture group name or number. 1464 // For numbers, consume only digits that produce a valid capture group for the pattern. 1465 1466 int32_t groupNum = 0; 1467 U_ASSERT(c == DOLLARSIGN); 1468 UChar32 c32; 1469 U16_GET(replacementText, 0, replIdx, replacementLength, c32); 1470 if (u_isdigit(c32)) { 1471 int32_t numDigits = 0; 1472 int32_t numCaptureGroups = m->fPattern->fGroupMap->size(); 1473 for (;;) { 1474 if (replIdx >= replacementLength) { 1475 break; 1476 } 1477 U16_GET(replacementText, 0, replIdx, replacementLength, c32); 1478 if (u_isdigit(c32) == FALSE) { 1479 break; 1480 } 1481 1482 int32_t digitVal = u_charDigitValue(c32); 1483 if (groupNum * 10 + digitVal <= numCaptureGroups) { 1484 groupNum = groupNum * 10 + digitVal; 1485 U16_FWD_1(replacementText, replIdx, replacementLength); 1486 numDigits++; 1487 } else { 1488 if (numDigits == 0) { 1489 *status = U_INDEX_OUTOFBOUNDS_ERROR; 1490 } 1491 break; 1492 } 1493 } 1494 } else if (c32 == LEFTBRACKET) { 1495 // Scan for Named Capture Group, ${name}. 1496 UnicodeString groupName; 1497 U16_FWD_1(replacementText, replIdx, replacementLength); 1498 while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { 1499 if (replIdx >= replacementLength) { 1500 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1501 break; 1502 } 1503 U16_NEXT(replacementText, replIdx, replacementLength, c32); 1504 if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z 1505 (c32 >= 0x61 && c32 <= 0x7a) || // a..z 1506 (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 1507 groupName.append(c32); 1508 } else if (c32 == RIGHTBRACKET) { 1509 groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName); 1510 if (groupNum == 0) { 1511 // Name not defined by pattern. 1512 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1513 } 1514 } else { 1515 // Character was something other than a name char or a closing '}' 1516 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1517 } 1518 } 1519 } else { 1520 // $ not followed by {name} or digits. 1521 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 1522 } 1523 1524 1525 // Finally, append the capture group data to the destination. 1526 if (U_SUCCESS(*status)) { 1527 destIdx += uregex_group((URegularExpression*)regexp, groupNum, 1528 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1529 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1530 // Ignore buffer overflow when extracting the group. We need to 1531 // continue on to get full size of the untruncated result. We will 1532 // raise our own buffer overflow error at the end. 1533 *status = U_ZERO_ERROR; 1534 } 1535 } 1536 1537 if (U_FAILURE(*status)) { 1538 // bad group number or name. 1539 break; 1540 } 1541 } 1542 1543 // 1544 // Nul Terminate the dest buffer if possible. 1545 // Set the appropriate buffer overflow or not terminated error, if needed. 1546 // 1547 if (destIdx < capacity) { 1548 dest[destIdx] = 0; 1549 } else if (U_SUCCESS(*status)) { 1550 if (destIdx == *destCapacity) { 1551 *status = U_STRING_NOT_TERMINATED_WARNING; 1552 } else { 1553 *status = U_BUFFER_OVERFLOW_ERROR; 1554 } 1555 } 1556 1557 // 1558 // Return an updated dest buffer and capacity to the caller. 1559 // 1560 if (destIdx > 0 && *destCapacity > 0) { 1561 if (destIdx < capacity) { 1562 *destBuf += destIdx; 1563 *destCapacity -= destIdx; 1564 } else { 1565 *destBuf += capacity; 1566 *destCapacity = 0; 1567 } 1568 } 1569 1570 // If we came in with a buffer overflow, make sure we go out with one also. 1571 // (A zero length match right at the end of the previous match could 1572 // make this function succeed even though a previous call had overflowed the buf) 1573 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1574 *status = U_BUFFER_OVERFLOW_ERROR; 1575 } 1576 1577 return destIdx; 1578 } 1579 1580 // 1581 // appendReplacement the actual API function, 1582 // 1583 U_CAPI int32_t U_EXPORT2 1584 uregex_appendReplacement(URegularExpression *regexp2, 1585 const UChar *replacementText, 1586 int32_t replacementLength, 1587 UChar **destBuf, 1588 int32_t *destCapacity, 1589 UErrorCode *status) { 1590 1591 RegularExpression *regexp = (RegularExpression*)regexp2; 1592 return RegexCImpl::appendReplacement( 1593 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1594 } 1595 1596 // 1597 // uregex_appendReplacementUText...can just use the normal C++ method 1598 // 1599 U_CAPI void U_EXPORT2 1600 uregex_appendReplacementUText(URegularExpression *regexp2, 1601 UText *replText, 1602 UText *dest, 1603 UErrorCode *status) { 1604 RegularExpression *regexp = (RegularExpression*)regexp2; 1605 regexp->fMatcher->appendReplacement(dest, replText, *status); 1606 } 1607 1608 1609 //------------------------------------------------------------------------------ 1610 // 1611 // uregex_appendTail 1612 // 1613 //------------------------------------------------------------------------------ 1614 int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1615 UChar **destBuf, 1616 int32_t *destCapacity, 1617 UErrorCode *status) 1618 { 1619 1620 // If we come in with a buffer overflow error, don't suppress the operation. 1621 // A series of appendReplacements, appendTail need to correctly preflight 1622 // the buffer size when an overflow happens somewhere in the middle. 1623 UBool pendingBufferOverflow = FALSE; 1624 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1625 pendingBufferOverflow = TRUE; 1626 *status = U_ZERO_ERROR; 1627 } 1628 1629 if (validateRE(regexp, TRUE, status) == FALSE) { 1630 return 0; 1631 } 1632 1633 if (destCapacity == NULL || destBuf == NULL || 1634 (*destBuf == NULL && *destCapacity > 0) || 1635 *destCapacity < 0) 1636 { 1637 *status = U_ILLEGAL_ARGUMENT_ERROR; 1638 return 0; 1639 } 1640 1641 RegexMatcher *m = regexp->fMatcher; 1642 1643 int32_t destIdx = 0; 1644 int32_t destCap = *destCapacity; 1645 UChar *dest = *destBuf; 1646 1647 if (regexp->fText != NULL) { 1648 int32_t srcIdx; 1649 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1650 if (nativeIdx == -1) { 1651 srcIdx = 0; 1652 } else if (UTEXT_USES_U16(m->fInputText)) { 1653 srcIdx = (int32_t)nativeIdx; 1654 } else { 1655 UErrorCode status = U_ZERO_ERROR; 1656 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1657 } 1658 1659 for (;;) { 1660 U_ASSERT(destIdx >= 0); 1661 1662 if (srcIdx == regexp->fTextLength) { 1663 break; 1664 } 1665 UChar c = regexp->fText[srcIdx]; 1666 if (c == 0 && regexp->fTextLength == -1) { 1667 regexp->fTextLength = srcIdx; 1668 break; 1669 } 1670 1671 if (destIdx < destCap) { 1672 dest[destIdx] = c; 1673 } else { 1674 // We've overflowed the dest buffer. 1675 // If the total input string length is known, we can 1676 // compute the total buffer size needed without scanning through the string. 1677 if (regexp->fTextLength > 0) { 1678 destIdx += (regexp->fTextLength - srcIdx); 1679 break; 1680 } 1681 } 1682 srcIdx++; 1683 destIdx++; 1684 } 1685 } else { 1686 int64_t srcIdx; 1687 if (m->fMatch) { 1688 // The most recent call to find() succeeded. 1689 srcIdx = m->fMatchEnd; 1690 } else { 1691 // The last call to find() on this matcher failed(). 1692 // Look back to the end of the last find() that succeeded for src index. 1693 srcIdx = m->fLastMatchEnd; 1694 if (srcIdx == -1) { 1695 // There has been no successful match with this matcher. 1696 // We want to copy the whole string. 1697 srcIdx = 0; 1698 } 1699 } 1700 1701 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1702 } 1703 1704 // 1705 // NUL terminate the output string, if possible, otherwise issue the 1706 // appropriate error or warning. 1707 // 1708 if (destIdx < destCap) { 1709 dest[destIdx] = 0; 1710 } else if (destIdx == destCap) { 1711 *status = U_STRING_NOT_TERMINATED_WARNING; 1712 } else { 1713 *status = U_BUFFER_OVERFLOW_ERROR; 1714 } 1715 1716 // 1717 // Update the user's buffer ptr and capacity vars to reflect the 1718 // amount used. 1719 // 1720 if (destIdx < destCap) { 1721 *destBuf += destIdx; 1722 *destCapacity -= destIdx; 1723 } else if (*destBuf != NULL) { 1724 *destBuf += destCap; 1725 *destCapacity = 0; 1726 } 1727 1728 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1729 *status = U_BUFFER_OVERFLOW_ERROR; 1730 } 1731 1732 return destIdx; 1733 } 1734 1735 1736 // 1737 // appendTail the actual API function 1738 // 1739 U_CAPI int32_t U_EXPORT2 1740 uregex_appendTail(URegularExpression *regexp2, 1741 UChar **destBuf, 1742 int32_t *destCapacity, 1743 UErrorCode *status) { 1744 RegularExpression *regexp = (RegularExpression*)regexp2; 1745 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1746 } 1747 1748 1749 // 1750 // uregex_appendTailUText...can just use the normal C++ method 1751 // 1752 U_CAPI UText * U_EXPORT2 1753 uregex_appendTailUText(URegularExpression *regexp2, 1754 UText *dest, 1755 UErrorCode *status) { 1756 RegularExpression *regexp = (RegularExpression*)regexp2; 1757 return regexp->fMatcher->appendTail(dest, *status); 1758 } 1759 1760 1761 //------------------------------------------------------------------------------ 1762 // 1763 // copyString Internal utility to copy a string to an output buffer, 1764 // while managing buffer overflow and preflight size 1765 // computation. NUL termination is added to destination, 1766 // and the NUL is counted in the output size. 1767 // 1768 //------------------------------------------------------------------------------ 1769 #if 0 1770 static void copyString(UChar *destBuffer, // Destination buffer. 1771 int32_t destCapacity, // Total capacity of dest buffer 1772 int32_t *destIndex, // Index into dest buffer. Updated on return. 1773 // Update not clipped to destCapacity. 1774 const UChar *srcPtr, // Pointer to source string 1775 int32_t srcLen) // Source string len. 1776 { 1777 int32_t si; 1778 int32_t di = *destIndex; 1779 UChar c; 1780 1781 for (si=0; si<srcLen; si++) { 1782 c = srcPtr[si]; 1783 if (di < destCapacity) { 1784 destBuffer[di] = c; 1785 di++; 1786 } else { 1787 di += srcLen - si; 1788 break; 1789 } 1790 } 1791 if (di<destCapacity) { 1792 destBuffer[di] = 0; 1793 } 1794 di++; 1795 *destIndex = di; 1796 } 1797 #endif 1798 1799 //------------------------------------------------------------------------------ 1800 // 1801 // uregex_split 1802 // 1803 //------------------------------------------------------------------------------ 1804 int32_t RegexCImpl::split(RegularExpression *regexp, 1805 UChar *destBuf, 1806 int32_t destCapacity, 1807 int32_t *requiredCapacity, 1808 UChar *destFields[], 1809 int32_t destFieldsCapacity, 1810 UErrorCode *status) { 1811 // 1812 // Reset for the input text 1813 // 1814 regexp->fMatcher->reset(); 1815 UText *inputText = regexp->fMatcher->fInputText; 1816 int64_t nextOutputStringStart = 0; 1817 int64_t inputLen = regexp->fMatcher->fInputLength; 1818 if (inputLen == 0) { 1819 return 0; 1820 } 1821 1822 // 1823 // Loop through the input text, searching for the delimiter pattern 1824 // 1825 int32_t i; // Index of the field being processed. 1826 int32_t destIdx = 0; // Next available position in destBuf; 1827 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1828 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1829 for (i=0; ; i++) { 1830 if (i>=destFieldsCapacity-1) { 1831 // There are one or zero output strings left. 1832 // Fill the last output string with whatever is left from the input, then exit the loop. 1833 // ( i will be == destFieldsCapacity if we filled the output array while processing 1834 // capture groups of the delimiter expression, in which case we will discard the 1835 // last capture group saved in favor of the unprocessed remainder of the 1836 // input string.) 1837 if (inputLen > nextOutputStringStart) { 1838 if (i != destFieldsCapacity-1) { 1839 // No fields are left. Recycle the last one for holding the trailing part of 1840 // the input string. 1841 i = destFieldsCapacity-1; 1842 destIdx = (int32_t)(destFields[i] - destFields[0]); 1843 } 1844 1845 destFields[i] = &destBuf[destIdx]; 1846 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1847 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1848 } 1849 break; 1850 } 1851 1852 if (regexp->fMatcher->find()) { 1853 // We found another delimiter. Move everything from where we started looking 1854 // up until the start of the delimiter into the next output string. 1855 destFields[i] = &destBuf[destIdx]; 1856 1857 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1858 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1859 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1860 tStatus = U_ZERO_ERROR; 1861 } else { 1862 *status = tStatus; 1863 } 1864 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1865 1866 // If the delimiter pattern has capturing parentheses, the captured 1867 // text goes out into the next n destination strings. 1868 int32_t groupNum; 1869 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1870 // If we've run out of output string slots, bail out. 1871 if (i==destFieldsCapacity-1) { 1872 break; 1873 } 1874 i++; 1875 1876 // Set up to extract the capture group contents into the dest buffer. 1877 destFields[i] = &destBuf[destIdx]; 1878 tStatus = U_ZERO_ERROR; 1879 int32_t t = uregex_group((URegularExpression*)regexp, 1880 groupNum, 1881 destFields[i], 1882 REMAINING_CAPACITY(destIdx, destCapacity), 1883 &tStatus); 1884 destIdx += t + 1; // Record the space used in the output string buffer. 1885 // +1 for the NUL that terminates the string. 1886 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1887 tStatus = U_ZERO_ERROR; 1888 } else { 1889 *status = tStatus; 1890 } 1891 } 1892 1893 if (nextOutputStringStart == inputLen) { 1894 // The delimiter was at the end of the string. 1895 // Output an empty string, and then we are done. 1896 if (destIdx < destCapacity) { 1897 destBuf[destIdx] = 0; 1898 } 1899 if (i < destFieldsCapacity-1) { 1900 ++i; 1901 } 1902 if (destIdx < destCapacity) { 1903 destFields[i] = destBuf + destIdx; 1904 } 1905 ++destIdx; 1906 break; 1907 } 1908 1909 } 1910 else 1911 { 1912 // We ran off the end of the input while looking for the next delimiter. 1913 // All the remaining text goes into the current output string. 1914 destFields[i] = &destBuf[destIdx]; 1915 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1916 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1917 break; 1918 } 1919 } 1920 1921 // Zero out any unused portion of the destFields array 1922 int j; 1923 for (j=i+1; j<destFieldsCapacity; j++) { 1924 destFields[j] = NULL; 1925 } 1926 1927 if (requiredCapacity != NULL) { 1928 *requiredCapacity = destIdx; 1929 } 1930 if (destIdx > destCapacity) { 1931 *status = U_BUFFER_OVERFLOW_ERROR; 1932 } 1933 return i+1; 1934 } 1935 1936 // 1937 // uregex_split The actual API function 1938 // 1939 U_CAPI int32_t U_EXPORT2 1940 uregex_split(URegularExpression *regexp2, 1941 UChar *destBuf, 1942 int32_t destCapacity, 1943 int32_t *requiredCapacity, 1944 UChar *destFields[], 1945 int32_t destFieldsCapacity, 1946 UErrorCode *status) { 1947 RegularExpression *regexp = (RegularExpression*)regexp2; 1948 if (validateRE(regexp, TRUE, status) == FALSE) { 1949 return 0; 1950 } 1951 if ((destBuf == NULL && destCapacity > 0) || 1952 destCapacity < 0 || 1953 destFields == NULL || 1954 destFieldsCapacity < 1 ) { 1955 *status = U_ILLEGAL_ARGUMENT_ERROR; 1956 return 0; 1957 } 1958 1959 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1960 } 1961 1962 1963 // 1964 // uregex_splitUText...can just use the normal C++ method 1965 // 1966 U_CAPI int32_t U_EXPORT2 1967 uregex_splitUText(URegularExpression *regexp2, 1968 UText *destFields[], 1969 int32_t destFieldsCapacity, 1970 UErrorCode *status) { 1971 RegularExpression *regexp = (RegularExpression*)regexp2; 1972 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1973 } 1974 1975 1976 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1977 1978