1 /* 2 ******************************************************************************* 3 * Copyright (C) 2004-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: regex.cpp 7 */ 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13 #include "unicode/regex.h" 14 #include "unicode/uregex.h" 15 #include "unicode/unistr.h" 16 #include "unicode/ustring.h" 17 #include "unicode/uchar.h" 18 #include "unicode/uobject.h" 19 #include "umutex.h" 20 #include "uassert.h" 21 #include "cmemory.h" 22 23 #include "regextxt.h" 24 25 #include <stdio.h> 26 27 U_NAMESPACE_BEGIN 28 29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 30 31 struct RegularExpression: public UMemory { 32 public: 33 RegularExpression(); 34 ~RegularExpression(); 35 int32_t fMagic; 36 RegexPattern *fPat; 37 int32_t *fPatRefCount; 38 UChar *fPatString; 39 int32_t fPatStringLen; 40 RegexMatcher *fMatcher; 41 const UChar *fText; // Text from setText() 42 int32_t fTextLength; // Length provided by user with setText(), which 43 // may be -1. 44 UBool fOwnsText; 45 }; 46 47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 48 49 RegularExpression::RegularExpression() { 50 fMagic = REXP_MAGIC; 51 fPat = NULL; 52 fPatRefCount = NULL; 53 fPatString = NULL; 54 fPatStringLen = 0; 55 fMatcher = NULL; 56 fText = NULL; 57 fTextLength = 0; 58 fOwnsText = FALSE; 59 } 60 61 RegularExpression::~RegularExpression() { 62 delete fMatcher; 63 fMatcher = NULL; 64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 65 delete fPat; 66 uprv_free(fPatString); 67 uprv_free(fPatRefCount); 68 } 69 if (fOwnsText && fText!=NULL) { 70 uprv_free((void *)fText); 71 } 72 fMagic = 0; 73 } 74 75 U_NAMESPACE_END 76 77 U_NAMESPACE_USE 78 79 //---------------------------------------------------------------------------------------- 80 // 81 // validateRE Do boilerplate style checks on API function parameters. 82 // Return TRUE if they look OK. 83 //---------------------------------------------------------------------------------------- 84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { 85 if (U_FAILURE(*status)) { 86 return FALSE; 87 } 88 if (re == NULL || re->fMagic != REXP_MAGIC) { 89 *status = U_ILLEGAL_ARGUMENT_ERROR; 90 return FALSE; 91 } 92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 93 if (requiresText && re->fText == NULL && !re->fOwnsText) { 94 *status = U_REGEX_INVALID_STATE; 95 return FALSE; 96 } 97 return TRUE; 98 } 99 100 //---------------------------------------------------------------------------------------- 101 // 102 // uregex_open 103 // 104 //---------------------------------------------------------------------------------------- 105 U_CAPI URegularExpression * U_EXPORT2 106 uregex_open( const UChar *pattern, 107 int32_t patternLength, 108 uint32_t flags, 109 UParseError *pe, 110 UErrorCode *status) { 111 112 if (U_FAILURE(*status)) { 113 return NULL; 114 } 115 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 116 *status = U_ILLEGAL_ARGUMENT_ERROR; 117 return NULL; 118 } 119 int32_t actualPatLen = patternLength; 120 if (actualPatLen == -1) { 121 actualPatLen = u_strlen(pattern); 122 } 123 124 RegularExpression *re = new RegularExpression; 125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 127 if (re == NULL || refC == NULL || patBuf == NULL) { 128 *status = U_MEMORY_ALLOCATION_ERROR; 129 delete re; 130 uprv_free(refC); 131 uprv_free(patBuf); 132 return NULL; 133 } 134 re->fPatRefCount = refC; 135 *re->fPatRefCount = 1; 136 137 // 138 // Make a copy of the pattern string, so we can return it later if asked. 139 // For compiling the pattern, we will use a UText wrapper around 140 // this local copy, to avoid making even more copies. 141 // 142 re->fPatString = patBuf; 143 re->fPatStringLen = patternLength; 144 u_memcpy(patBuf, pattern, actualPatLen); 145 patBuf[actualPatLen] = 0; 146 147 UText patText = UTEXT_INITIALIZER; 148 utext_openUChars(&patText, patBuf, patternLength, status); 149 150 // 151 // Compile the pattern 152 // 153 if (pe != NULL) { 154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 155 } else { 156 re->fPat = RegexPattern::compile(&patText, flags, *status); 157 } 158 utext_close(&patText); 159 160 if (U_FAILURE(*status)) { 161 goto ErrorExit; 162 } 163 164 // 165 // Create the matcher object 166 // 167 re->fMatcher = re->fPat->matcher(*status); 168 if (U_SUCCESS(*status)) { 169 return (URegularExpression*)re; 170 } 171 172 ErrorExit: 173 delete re; 174 return NULL; 175 176 } 177 178 //---------------------------------------------------------------------------------------- 179 // 180 // uregex_openUText 181 // 182 //---------------------------------------------------------------------------------------- 183 U_CAPI URegularExpression * U_EXPORT2 184 uregex_openUText(UText *pattern, 185 uint32_t flags, 186 UParseError *pe, 187 UErrorCode *status) { 188 189 if (U_FAILURE(*status)) { 190 return NULL; 191 } 192 if (pattern == NULL) { 193 *status = U_ILLEGAL_ARGUMENT_ERROR; 194 return NULL; 195 } 196 197 int64_t patternNativeLength = utext_nativeLength(pattern); 198 199 if (patternNativeLength == 0) { 200 *status = U_ILLEGAL_ARGUMENT_ERROR; 201 return NULL; 202 } 203 204 RegularExpression *re = new RegularExpression; 205 206 UErrorCode lengthStatus = U_ZERO_ERROR; 207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 208 209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 211 if (re == NULL || refC == NULL || patBuf == NULL) { 212 *status = U_MEMORY_ALLOCATION_ERROR; 213 delete re; 214 uprv_free(refC); 215 uprv_free(patBuf); 216 return NULL; 217 } 218 re->fPatRefCount = refC; 219 *re->fPatRefCount = 1; 220 221 // 222 // Make a copy of the pattern string, so we can return it later if asked. 223 // For compiling the pattern, we will use a read-only UText wrapper 224 // around this local copy, to avoid making even more copies. 225 // 226 re->fPatString = patBuf; 227 re->fPatStringLen = pattern16Length; 228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 229 230 UText patText = UTEXT_INITIALIZER; 231 utext_openUChars(&patText, patBuf, pattern16Length, status); 232 233 // 234 // Compile the pattern 235 // 236 if (pe != NULL) { 237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 238 } else { 239 re->fPat = RegexPattern::compile(&patText, flags, *status); 240 } 241 utext_close(&patText); 242 243 if (U_FAILURE(*status)) { 244 goto ErrorExit; 245 } 246 247 // 248 // Create the matcher object 249 // 250 re->fMatcher = re->fPat->matcher(*status); 251 if (U_SUCCESS(*status)) { 252 return (URegularExpression*)re; 253 } 254 255 ErrorExit: 256 delete re; 257 return NULL; 258 259 } 260 261 //---------------------------------------------------------------------------------------- 262 // 263 // uregex_close 264 // 265 //---------------------------------------------------------------------------------------- 266 U_CAPI void U_EXPORT2 267 uregex_close(URegularExpression *re2) { 268 RegularExpression *re = (RegularExpression*)re2; 269 UErrorCode status = U_ZERO_ERROR; 270 if (validateRE(re, &status, FALSE) == FALSE) { 271 return; 272 } 273 delete re; 274 } 275 276 277 //---------------------------------------------------------------------------------------- 278 // 279 // uregex_clone 280 // 281 //---------------------------------------------------------------------------------------- 282 U_CAPI URegularExpression * U_EXPORT2 283 uregex_clone(const URegularExpression *source2, UErrorCode *status) { 284 RegularExpression *source = (RegularExpression*)source2; 285 if (validateRE(source, status, FALSE) == FALSE) { 286 return NULL; 287 } 288 289 RegularExpression *clone = new RegularExpression; 290 if (clone == NULL) { 291 *status = U_MEMORY_ALLOCATION_ERROR; 292 return NULL; 293 } 294 295 clone->fMatcher = source->fPat->matcher(*status); 296 if (U_FAILURE(*status)) { 297 delete clone; 298 return NULL; 299 } 300 301 clone->fPat = source->fPat; 302 clone->fPatRefCount = source->fPatRefCount; 303 clone->fPatString = source->fPatString; 304 clone->fPatStringLen = source->fPatStringLen; 305 umtx_atomic_inc(source->fPatRefCount); 306 // Note: fText is not cloned. 307 308 return (URegularExpression*)clone; 309 } 310 311 312 313 314 //------------------------------------------------------------------------------ 315 // 316 // uregex_pattern 317 // 318 //------------------------------------------------------------------------------ 319 U_CAPI const UChar * U_EXPORT2 320 uregex_pattern(const URegularExpression *regexp2, 321 int32_t *patLength, 322 UErrorCode *status) { 323 RegularExpression *regexp = (RegularExpression*)regexp2; 324 325 if (validateRE(regexp, status, FALSE) == FALSE) { 326 return NULL; 327 } 328 if (patLength != NULL) { 329 *patLength = regexp->fPatStringLen; 330 } 331 return regexp->fPatString; 332 } 333 334 335 //------------------------------------------------------------------------------ 336 // 337 // uregex_patternUText 338 // 339 //------------------------------------------------------------------------------ 340 U_CAPI UText * U_EXPORT2 341 uregex_patternUText(const URegularExpression *regexp2, 342 UErrorCode *status) { 343 RegularExpression *regexp = (RegularExpression*)regexp2; 344 (void)status; 345 return regexp->fPat->patternText(); 346 } 347 348 349 //------------------------------------------------------------------------------ 350 // 351 // uregex_flags 352 // 353 //------------------------------------------------------------------------------ 354 U_CAPI int32_t U_EXPORT2 355 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { 356 RegularExpression *regexp = (RegularExpression*)regexp2; 357 if (validateRE(regexp, status, FALSE) == FALSE) { 358 return 0; 359 } 360 int32_t flags = regexp->fPat->flags(); 361 return flags; 362 } 363 364 365 //------------------------------------------------------------------------------ 366 // 367 // uregex_setText 368 // 369 //------------------------------------------------------------------------------ 370 U_CAPI void U_EXPORT2 371 uregex_setText(URegularExpression *regexp2, 372 const UChar *text, 373 int32_t textLength, 374 UErrorCode *status) { 375 RegularExpression *regexp = (RegularExpression*)regexp2; 376 if (validateRE(regexp, status, FALSE) == FALSE) { 377 return; 378 } 379 if (text == NULL || textLength < -1) { 380 *status = U_ILLEGAL_ARGUMENT_ERROR; 381 return; 382 } 383 384 if (regexp->fOwnsText && regexp->fText != NULL) { 385 uprv_free((void *)regexp->fText); 386 } 387 388 regexp->fText = text; 389 regexp->fTextLength = textLength; 390 regexp->fOwnsText = FALSE; 391 392 UText input = UTEXT_INITIALIZER; 393 utext_openUChars(&input, text, textLength, status); 394 regexp->fMatcher->reset(&input); 395 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 396 } 397 398 399 //------------------------------------------------------------------------------ 400 // 401 // uregex_setUText 402 // 403 //------------------------------------------------------------------------------ 404 U_CAPI void U_EXPORT2 405 uregex_setUText(URegularExpression *regexp2, 406 UText *text, 407 UErrorCode *status) { 408 RegularExpression *regexp = (RegularExpression*)regexp2; 409 if (validateRE(regexp, status, FALSE) == FALSE) { 410 return; 411 } 412 if (text == NULL) { 413 *status = U_ILLEGAL_ARGUMENT_ERROR; 414 return; 415 } 416 417 if (regexp->fOwnsText && regexp->fText != NULL) { 418 uprv_free((void *)regexp->fText); 419 } 420 421 regexp->fText = NULL; // only fill it in on request 422 regexp->fTextLength = -1; 423 regexp->fOwnsText = TRUE; 424 regexp->fMatcher->reset(text); 425 } 426 427 428 429 //------------------------------------------------------------------------------ 430 // 431 // uregex_getText 432 // 433 //------------------------------------------------------------------------------ 434 U_CAPI const UChar * U_EXPORT2 435 uregex_getText(URegularExpression *regexp2, 436 int32_t *textLength, 437 UErrorCode *status) { 438 RegularExpression *regexp = (RegularExpression*)regexp2; 439 if (validateRE(regexp, status, FALSE) == FALSE) { 440 return NULL; 441 } 442 443 if (regexp->fText == NULL) { 444 // need to fill in the text 445 UText *inputText = regexp->fMatcher->inputText(); 446 int64_t inputNativeLength = utext_nativeLength(inputText); 447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 448 regexp->fText = inputText->chunkContents; 449 regexp->fTextLength = (int32_t)inputNativeLength; 450 regexp->fOwnsText = FALSE; // because the UText owns it 451 } else { 452 UErrorCode lengthStatus = U_ZERO_ERROR; 453 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 454 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 455 456 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 457 regexp->fText = inputChars; 458 regexp->fOwnsText = TRUE; // should already be set but just in case 459 } 460 } 461 462 if (textLength != NULL) { 463 *textLength = regexp->fTextLength; 464 } 465 return regexp->fText; 466 } 467 468 469 //------------------------------------------------------------------------------ 470 // 471 // uregex_getUText 472 // 473 //------------------------------------------------------------------------------ 474 U_CAPI UText * U_EXPORT2 475 uregex_getUText(URegularExpression *regexp2, 476 UText *dest, 477 UErrorCode *status) { 478 RegularExpression *regexp = (RegularExpression*)regexp2; 479 if (validateRE(regexp, status, FALSE) == FALSE) { 480 return dest; 481 } 482 return regexp->fMatcher->getInput(dest); 483 } 484 485 // BEGIN android-added 486 // Removed this function after Android upgrade to ICU4.6. 487 //------------------------------------------------------------------------------ 488 // 489 // uregex_refreshUText 490 // 491 //------------------------------------------------------------------------------ 492 U_CAPI void U_EXPORT2 493 uregex_refreshUText(URegularExpression *regexp2, 494 UText *text, 495 UErrorCode *status) { 496 RegularExpression *regexp = (RegularExpression*)regexp2; 497 if (validateRE(regexp, status, FALSE) == FALSE) { 498 return; 499 } 500 regexp->fMatcher->refreshInputText(text, *status); 501 } 502 // END android-added 503 504 505 //------------------------------------------------------------------------------ 506 // 507 // uregex_matches 508 // 509 //------------------------------------------------------------------------------ 510 U_CAPI UBool U_EXPORT2 511 uregex_matches(URegularExpression *regexp2, 512 int32_t startIndex, 513 UErrorCode *status) { 514 RegularExpression *regexp = (RegularExpression*)regexp2; 515 UBool result = FALSE; 516 if (validateRE(regexp, status) == FALSE) { 517 return result; 518 } 519 if (startIndex == -1) { 520 result = regexp->fMatcher->matches(*status); 521 } else { 522 result = regexp->fMatcher->matches(startIndex, *status); 523 } 524 return result; 525 } 526 527 528 529 //------------------------------------------------------------------------------ 530 // 531 // uregex_lookingAt 532 // 533 //------------------------------------------------------------------------------ 534 U_CAPI UBool U_EXPORT2 535 uregex_lookingAt(URegularExpression *regexp2, 536 int32_t startIndex, 537 UErrorCode *status) { 538 RegularExpression *regexp = (RegularExpression*)regexp2; 539 UBool result = FALSE; 540 if (validateRE(regexp, status) == FALSE) { 541 return result; 542 } 543 if (startIndex == -1) { 544 result = regexp->fMatcher->lookingAt(*status); 545 } else { 546 result = regexp->fMatcher->lookingAt(startIndex, *status); 547 } 548 return result; 549 } 550 551 552 553 //------------------------------------------------------------------------------ 554 // 555 // uregex_find 556 // 557 //------------------------------------------------------------------------------ 558 U_CAPI UBool U_EXPORT2 559 uregex_find(URegularExpression *regexp2, 560 int32_t startIndex, 561 UErrorCode *status) { 562 RegularExpression *regexp = (RegularExpression*)regexp2; 563 UBool result = FALSE; 564 if (validateRE(regexp, status) == FALSE) { 565 return result; 566 } 567 if (startIndex == -1) { 568 regexp->fMatcher->resetPreserveRegion(); 569 result = regexp->fMatcher->find(); 570 } else { 571 result = regexp->fMatcher->find(startIndex, *status); 572 } 573 return result; 574 } 575 576 //------------------------------------------------------------------------------ 577 // 578 // uregex_findNext 579 // 580 //------------------------------------------------------------------------------ 581 U_CAPI UBool U_EXPORT2 582 uregex_findNext(URegularExpression *regexp2, 583 UErrorCode *status) { 584 RegularExpression *regexp = (RegularExpression*)regexp2; 585 if (validateRE(regexp, status) == FALSE) { 586 return FALSE; 587 } 588 UBool result = regexp->fMatcher->find(); 589 return result; 590 } 591 592 //------------------------------------------------------------------------------ 593 // 594 // uregex_groupCount 595 // 596 //------------------------------------------------------------------------------ 597 U_CAPI int32_t U_EXPORT2 598 uregex_groupCount(URegularExpression *regexp2, 599 UErrorCode *status) { 600 RegularExpression *regexp = (RegularExpression*)regexp2; 601 if (validateRE(regexp, status, FALSE) == FALSE) { 602 return 0; 603 } 604 int32_t result = regexp->fMatcher->groupCount(); 605 return result; 606 } 607 608 609 //------------------------------------------------------------------------------ 610 // 611 // uregex_group 612 // 613 //------------------------------------------------------------------------------ 614 U_CAPI int32_t U_EXPORT2 615 uregex_group(URegularExpression *regexp2, 616 int32_t groupNum, 617 UChar *dest, 618 int32_t destCapacity, 619 UErrorCode *status) { 620 RegularExpression *regexp = (RegularExpression*)regexp2; 621 if (validateRE(regexp, status) == FALSE) { 622 return 0; 623 } 624 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 625 *status = U_ILLEGAL_ARGUMENT_ERROR; 626 return 0; 627 } 628 629 if (destCapacity == 0 || regexp->fText != NULL) { 630 // If preflighting or if we already have the text as UChars, 631 // this is a little cheaper than going through uregex_groupUText() 632 633 // 634 // Pick up the range of characters from the matcher 635 // 636 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 637 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 638 if (U_FAILURE(*status)) { 639 return 0; 640 } 641 642 // 643 // Trim length based on buffer capacity 644 // 645 int32_t fullLength = endIx - startIx; 646 int32_t copyLength = fullLength; 647 if (copyLength < destCapacity) { 648 dest[copyLength] = 0; 649 } else if (copyLength == destCapacity) { 650 *status = U_STRING_NOT_TERMINATED_WARNING; 651 } else { 652 copyLength = destCapacity; 653 *status = U_BUFFER_OVERFLOW_ERROR; 654 } 655 656 // 657 // Copy capture group to user's buffer 658 // 659 if (copyLength > 0) { 660 u_memcpy(dest, ®exp->fText[startIx], copyLength); 661 } 662 return fullLength; 663 } else { 664 UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status); 665 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); 666 utext_close(groupText); 667 return result; 668 } 669 } 670 671 672 //------------------------------------------------------------------------------ 673 // 674 // uregex_groupUText 675 // 676 //------------------------------------------------------------------------------ 677 U_CAPI UText * U_EXPORT2 678 uregex_groupUText(URegularExpression *regexp2, 679 int32_t groupNum, 680 UText *dest, 681 UErrorCode *status) { 682 RegularExpression *regexp = (RegularExpression*)regexp2; 683 if (validateRE(regexp, status) == FALSE) { 684 UErrorCode emptyTextStatus = U_ZERO_ERROR; 685 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 686 } 687 688 if (regexp->fText != NULL) { 689 // 690 // Pick up the range of characters from the matcher 691 // and use our already-extracted characters 692 // 693 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 694 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 695 if (U_FAILURE(*status)) { 696 UErrorCode emptyTextStatus = U_ZERO_ERROR; 697 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 698 } 699 700 if (dest) { 701 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); 702 } else { 703 UText groupText = UTEXT_INITIALIZER; 704 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); 705 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); 706 utext_close(&groupText); 707 } 708 709 return dest; 710 } else { 711 return regexp->fMatcher->group(groupNum, dest, *status); 712 } 713 } 714 715 716 //------------------------------------------------------------------------------ 717 // 718 // uregex_start 719 // 720 //------------------------------------------------------------------------------ 721 U_CAPI int32_t U_EXPORT2 722 uregex_start(URegularExpression *regexp2, 723 int32_t groupNum, 724 UErrorCode *status) { 725 RegularExpression *regexp = (RegularExpression*)regexp2; 726 if (validateRE(regexp, status) == FALSE) { 727 return 0; 728 } 729 int32_t result = regexp->fMatcher->start(groupNum, *status); 730 return result; 731 } 732 733 734 //------------------------------------------------------------------------------ 735 // 736 // uregex_end 737 // 738 //------------------------------------------------------------------------------ 739 U_CAPI int32_t U_EXPORT2 740 uregex_end(URegularExpression *regexp2, 741 int32_t groupNum, 742 UErrorCode *status) { 743 RegularExpression *regexp = (RegularExpression*)regexp2; 744 if (validateRE(regexp, status) == FALSE) { 745 return 0; 746 } 747 int32_t result = regexp->fMatcher->end(groupNum, *status); 748 return result; 749 } 750 751 //------------------------------------------------------------------------------ 752 // 753 // uregex_reset 754 // 755 //------------------------------------------------------------------------------ 756 U_CAPI void U_EXPORT2 757 uregex_reset(URegularExpression *regexp2, 758 int32_t index, 759 UErrorCode *status) { 760 RegularExpression *regexp = (RegularExpression*)regexp2; 761 if (validateRE(regexp, status) == FALSE) { 762 return; 763 } 764 regexp->fMatcher->reset(index, *status); 765 } 766 767 768 //------------------------------------------------------------------------------ 769 // 770 // uregex_setRegion 771 // 772 //------------------------------------------------------------------------------ 773 U_CAPI void U_EXPORT2 774 uregex_setRegion(URegularExpression *regexp2, 775 int32_t regionStart, 776 int32_t regionLimit, 777 UErrorCode *status) { 778 RegularExpression *regexp = (RegularExpression*)regexp2; 779 if (validateRE(regexp, status) == FALSE) { 780 return; 781 } 782 regexp->fMatcher->region(regionStart, regionLimit, *status); 783 } 784 785 786 //------------------------------------------------------------------------------ 787 // 788 // uregex_regionStart 789 // 790 //------------------------------------------------------------------------------ 791 U_CAPI int32_t U_EXPORT2 792 uregex_regionStart(const URegularExpression *regexp2, 793 UErrorCode *status) { 794 RegularExpression *regexp = (RegularExpression*)regexp2; 795 if (validateRE(regexp, status) == FALSE) { 796 return 0; 797 } 798 return regexp->fMatcher->regionStart(); 799 } 800 801 802 //------------------------------------------------------------------------------ 803 // 804 // uregex_regionEnd 805 // 806 //------------------------------------------------------------------------------ 807 U_CAPI int32_t U_EXPORT2 808 uregex_regionEnd(const URegularExpression *regexp2, 809 UErrorCode *status) { 810 RegularExpression *regexp = (RegularExpression*)regexp2; 811 if (validateRE(regexp, status) == FALSE) { 812 return 0; 813 } 814 return regexp->fMatcher->regionEnd(); 815 } 816 817 818 //------------------------------------------------------------------------------ 819 // 820 // uregex_hasTransparentBounds 821 // 822 //------------------------------------------------------------------------------ 823 U_CAPI UBool U_EXPORT2 824 uregex_hasTransparentBounds(const URegularExpression *regexp2, 825 UErrorCode *status) { 826 RegularExpression *regexp = (RegularExpression*)regexp2; 827 if (validateRE(regexp, status) == FALSE) { 828 return FALSE; 829 } 830 return regexp->fMatcher->hasTransparentBounds(); 831 } 832 833 834 //------------------------------------------------------------------------------ 835 // 836 // uregex_useTransparentBounds 837 // 838 //------------------------------------------------------------------------------ 839 U_CAPI void U_EXPORT2 840 uregex_useTransparentBounds(URegularExpression *regexp2, 841 UBool b, 842 UErrorCode *status) { 843 RegularExpression *regexp = (RegularExpression*)regexp2; 844 if (validateRE(regexp, status) == FALSE) { 845 return; 846 } 847 regexp->fMatcher->useTransparentBounds(b); 848 } 849 850 851 //------------------------------------------------------------------------------ 852 // 853 // uregex_hasAnchoringBounds 854 // 855 //------------------------------------------------------------------------------ 856 U_CAPI UBool U_EXPORT2 857 uregex_hasAnchoringBounds(const URegularExpression *regexp2, 858 UErrorCode *status) { 859 RegularExpression *regexp = (RegularExpression*)regexp2; 860 if (validateRE(regexp, status) == FALSE) { 861 return FALSE; 862 } 863 return regexp->fMatcher->hasAnchoringBounds(); 864 } 865 866 867 //------------------------------------------------------------------------------ 868 // 869 // uregex_useAnchoringBounds 870 // 871 //------------------------------------------------------------------------------ 872 U_CAPI void U_EXPORT2 873 uregex_useAnchoringBounds(URegularExpression *regexp2, 874 UBool b, 875 UErrorCode *status) { 876 RegularExpression *regexp = (RegularExpression*)regexp2; 877 if (validateRE(regexp, status) == FALSE) { 878 return; 879 } 880 regexp->fMatcher->useAnchoringBounds(b); 881 } 882 883 884 //------------------------------------------------------------------------------ 885 // 886 // uregex_hitEnd 887 // 888 //------------------------------------------------------------------------------ 889 U_CAPI UBool U_EXPORT2 890 uregex_hitEnd(const URegularExpression *regexp2, 891 UErrorCode *status) { 892 RegularExpression *regexp = (RegularExpression*)regexp2; 893 if (validateRE(regexp, status) == FALSE) { 894 return FALSE; 895 } 896 return regexp->fMatcher->hitEnd(); 897 } 898 899 900 //------------------------------------------------------------------------------ 901 // 902 // uregex_requireEnd 903 // 904 //------------------------------------------------------------------------------ 905 U_CAPI UBool U_EXPORT2 906 uregex_requireEnd(const URegularExpression *regexp2, 907 UErrorCode *status) { 908 RegularExpression *regexp = (RegularExpression*)regexp2; 909 if (validateRE(regexp, status) == FALSE) { 910 return FALSE; 911 } 912 return regexp->fMatcher->requireEnd(); 913 } 914 915 916 //------------------------------------------------------------------------------ 917 // 918 // uregex_setTimeLimit 919 // 920 //------------------------------------------------------------------------------ 921 U_CAPI void U_EXPORT2 922 uregex_setTimeLimit(URegularExpression *regexp2, 923 int32_t limit, 924 UErrorCode *status) { 925 RegularExpression *regexp = (RegularExpression*)regexp2; 926 if (validateRE(regexp, status)) { 927 regexp->fMatcher->setTimeLimit(limit, *status); 928 } 929 } 930 931 932 933 //------------------------------------------------------------------------------ 934 // 935 // uregex_getTimeLimit 936 // 937 //------------------------------------------------------------------------------ 938 U_CAPI int32_t U_EXPORT2 939 uregex_getTimeLimit(const URegularExpression *regexp2, 940 UErrorCode *status) { 941 int32_t retVal = 0; 942 RegularExpression *regexp = (RegularExpression*)regexp2; 943 if (validateRE(regexp, status)) { 944 retVal = regexp->fMatcher->getTimeLimit(); 945 } 946 return retVal; 947 } 948 949 950 951 //------------------------------------------------------------------------------ 952 // 953 // uregex_setStackLimit 954 // 955 //------------------------------------------------------------------------------ 956 U_CAPI void U_EXPORT2 957 uregex_setStackLimit(URegularExpression *regexp2, 958 int32_t limit, 959 UErrorCode *status) { 960 RegularExpression *regexp = (RegularExpression*)regexp2; 961 if (validateRE(regexp, status)) { 962 regexp->fMatcher->setStackLimit(limit, *status); 963 } 964 } 965 966 967 968 //------------------------------------------------------------------------------ 969 // 970 // uregex_getStackLimit 971 // 972 //------------------------------------------------------------------------------ 973 U_CAPI int32_t U_EXPORT2 974 uregex_getStackLimit(const URegularExpression *regexp2, 975 UErrorCode *status) { 976 int32_t retVal = 0; 977 RegularExpression *regexp = (RegularExpression*)regexp2; 978 if (validateRE(regexp, status)) { 979 retVal = regexp->fMatcher->getStackLimit(); 980 } 981 return retVal; 982 } 983 984 985 //------------------------------------------------------------------------------ 986 // 987 // uregex_setMatchCallback 988 // 989 //------------------------------------------------------------------------------ 990 U_CAPI void U_EXPORT2 991 uregex_setMatchCallback(URegularExpression *regexp2, 992 URegexMatchCallback *callback, 993 const void *context, 994 UErrorCode *status) { 995 RegularExpression *regexp = (RegularExpression*)regexp2; 996 if (validateRE(regexp, status)) { 997 regexp->fMatcher->setMatchCallback(callback, context, *status); 998 } 999 } 1000 1001 1002 //------------------------------------------------------------------------------ 1003 // 1004 // uregex_getMatchCallback 1005 // 1006 //------------------------------------------------------------------------------ 1007 U_CAPI void U_EXPORT2 1008 uregex_getMatchCallback(const URegularExpression *regexp2, 1009 URegexMatchCallback **callback, 1010 const void **context, 1011 UErrorCode *status) { 1012 RegularExpression *regexp = (RegularExpression*)regexp2; 1013 if (validateRE(regexp, status)) { 1014 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1015 } 1016 } 1017 1018 1019 //------------------------------------------------------------------------------ 1020 // 1021 // uregex_replaceAll 1022 // 1023 //------------------------------------------------------------------------------ 1024 U_CAPI int32_t U_EXPORT2 1025 uregex_replaceAll(URegularExpression *regexp2, 1026 const UChar *replacementText, 1027 int32_t replacementLength, 1028 UChar *destBuf, 1029 int32_t destCapacity, 1030 UErrorCode *status) { 1031 RegularExpression *regexp = (RegularExpression*)regexp2; 1032 if (validateRE(regexp, status) == FALSE) { 1033 return 0; 1034 } 1035 if (replacementText == NULL || replacementLength < -1 || 1036 destBuf == NULL && destCapacity > 0 || 1037 destCapacity < 0) { 1038 *status = U_ILLEGAL_ARGUMENT_ERROR; 1039 return 0; 1040 } 1041 1042 int32_t len = 0; 1043 1044 uregex_reset(regexp2, 0, status); 1045 1046 // Note: Seperate error code variables for findNext() and appendReplacement() 1047 // are used so that destination buffer overflow errors 1048 // in appendReplacement won't stop findNext() from working. 1049 // appendReplacement() and appendTail() special case incoming buffer 1050 // overflow errors, continuing to return the correct length. 1051 UErrorCode findStatus = *status; 1052 while (uregex_findNext(regexp2, &findStatus)) { 1053 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1054 &destBuf, &destCapacity, status); 1055 } 1056 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1057 1058 if (U_FAILURE(findStatus)) { 1059 // If anything went wrong with the findNext(), make that error trump 1060 // whatever may have happened with the append() operations. 1061 // Errors in findNext() are not expected. 1062 *status = findStatus; 1063 } 1064 1065 return len; 1066 } 1067 1068 1069 //------------------------------------------------------------------------------ 1070 // 1071 // uregex_replaceAllUText 1072 // 1073 //------------------------------------------------------------------------------ 1074 U_CAPI UText * U_EXPORT2 1075 uregex_replaceAllUText(URegularExpression *regexp2, 1076 UText *replacementText, 1077 UText *dest, 1078 UErrorCode *status) { 1079 RegularExpression *regexp = (RegularExpression*)regexp2; 1080 if (validateRE(regexp, status) == FALSE) { 1081 return 0; 1082 } 1083 if (replacementText == NULL) { 1084 *status = U_ILLEGAL_ARGUMENT_ERROR; 1085 return 0; 1086 } 1087 1088 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1089 return dest; 1090 } 1091 1092 1093 //------------------------------------------------------------------------------ 1094 // 1095 // uregex_replaceFirst 1096 // 1097 //------------------------------------------------------------------------------ 1098 U_CAPI int32_t U_EXPORT2 1099 uregex_replaceFirst(URegularExpression *regexp2, 1100 const UChar *replacementText, 1101 int32_t replacementLength, 1102 UChar *destBuf, 1103 int32_t destCapacity, 1104 UErrorCode *status) { 1105 RegularExpression *regexp = (RegularExpression*)regexp2; 1106 if (validateRE(regexp, status) == FALSE) { 1107 return 0; 1108 } 1109 if (replacementText == NULL || replacementLength < -1 || 1110 destBuf == NULL && destCapacity > 0 || 1111 destCapacity < 0) { 1112 *status = U_ILLEGAL_ARGUMENT_ERROR; 1113 return 0; 1114 } 1115 1116 int32_t len = 0; 1117 UBool findSucceeded; 1118 uregex_reset(regexp2, 0, status); 1119 findSucceeded = uregex_find(regexp2, 0, status); 1120 if (findSucceeded) { 1121 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1122 &destBuf, &destCapacity, status); 1123 } 1124 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1125 1126 return len; 1127 } 1128 1129 1130 //------------------------------------------------------------------------------ 1131 // 1132 // uregex_replaceFirstUText 1133 // 1134 //------------------------------------------------------------------------------ 1135 U_CAPI UText * U_EXPORT2 1136 uregex_replaceFirstUText(URegularExpression *regexp2, 1137 UText *replacementText, 1138 UText *dest, 1139 UErrorCode *status) { 1140 RegularExpression *regexp = (RegularExpression*)regexp2; 1141 if (validateRE(regexp, status) == FALSE) { 1142 return 0; 1143 } 1144 if (replacementText == NULL) { 1145 *status = U_ILLEGAL_ARGUMENT_ERROR; 1146 return 0; 1147 } 1148 1149 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); 1150 return dest; 1151 } 1152 1153 1154 //------------------------------------------------------------------------------ 1155 // 1156 // uregex_appendReplacement 1157 // 1158 //------------------------------------------------------------------------------ 1159 1160 U_NAMESPACE_BEGIN 1161 // 1162 // Dummy class, because these functions need to be friends of class RegexMatcher, 1163 // and stand-alone C functions don't work as friends 1164 // 1165 class RegexCImpl { 1166 public: 1167 inline static int32_t appendReplacement(RegularExpression *regexp, 1168 const UChar *replacementText, 1169 int32_t replacementLength, 1170 UChar **destBuf, 1171 int32_t *destCapacity, 1172 UErrorCode *status); 1173 1174 inline static int32_t appendTail(RegularExpression *regexp, 1175 UChar **destBuf, 1176 int32_t *destCapacity, 1177 UErrorCode *status); 1178 1179 inline static int32_t split(RegularExpression *regexp, 1180 UChar *destBuf, 1181 int32_t destCapacity, 1182 int32_t *requiredCapacity, 1183 UChar *destFields[], 1184 int32_t destFieldsCapacity, 1185 UErrorCode *status); 1186 }; 1187 1188 U_NAMESPACE_END 1189 1190 1191 1192 static const UChar BACKSLASH = 0x5c; 1193 static const UChar DOLLARSIGN = 0x24; 1194 1195 // 1196 // Move a character to an output buffer, with bounds checking on the index. 1197 // Index advances even if capacity is exceeded, for preflight size computations. 1198 // This little sequence is used a LOT. 1199 // 1200 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1201 if (*idx < bufCapacity) { 1202 buf[*idx] = c; 1203 } 1204 (*idx)++; 1205 } 1206 1207 1208 // 1209 // appendReplacement, the actual implementation. 1210 // 1211 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1212 const UChar *replacementText, 1213 int32_t replacementLength, 1214 UChar **destBuf, 1215 int32_t *destCapacity, 1216 UErrorCode *status) { 1217 1218 // If we come in with a buffer overflow error, don't suppress the operation. 1219 // A series of appendReplacements, appendTail need to correctly preflight 1220 // the buffer size when an overflow happens somewhere in the middle. 1221 UBool pendingBufferOverflow = FALSE; 1222 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1223 pendingBufferOverflow = TRUE; 1224 *status = U_ZERO_ERROR; 1225 } 1226 1227 // 1228 // Validate all paramters 1229 // 1230 if (validateRE(regexp, status) == FALSE) { 1231 return 0; 1232 } 1233 if (replacementText == NULL || replacementLength < -1 || 1234 destCapacity == NULL || destBuf == NULL || 1235 *destBuf == NULL && *destCapacity > 0 || 1236 *destCapacity < 0) { 1237 *status = U_ILLEGAL_ARGUMENT_ERROR; 1238 return 0; 1239 } 1240 1241 RegexMatcher *m = regexp->fMatcher; 1242 if (m->fMatch == FALSE) { 1243 *status = U_REGEX_INVALID_STATE; 1244 return 0; 1245 } 1246 1247 UChar *dest = *destBuf; 1248 int32_t capacity = *destCapacity; 1249 int32_t destIdx = 0; 1250 int32_t i; 1251 1252 // If it wasn't supplied by the caller, get the length of the replacement text. 1253 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1254 // the fly and avoid this step. 1255 if (replacementLength == -1) { 1256 replacementLength = u_strlen(replacementText); 1257 } 1258 1259 // Copy input string from the end of previous match to start of current match 1260 if (regexp->fText != NULL) { 1261 int32_t matchStart; 1262 int32_t lastMatchEnd; 1263 if (UTEXT_USES_U16(m->fInputText)) { 1264 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1265 matchStart = (int32_t)m->fMatchStart; 1266 } else { 1267 // !!!: Would like a better way to do this! 1268 UErrorCode status = U_ZERO_ERROR; 1269 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); 1270 status = U_ZERO_ERROR; 1271 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); 1272 } 1273 for (i=lastMatchEnd; i<matchStart; i++) { 1274 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1275 } 1276 } else { 1277 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1278 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1279 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); 1280 } 1281 1282 1283 // scan the replacement text, looking for substitutions ($n) and \escapes. 1284 int32_t replIdx = 0; 1285 while (replIdx < replacementLength) { 1286 UChar c = replacementText[replIdx]; 1287 replIdx++; 1288 if (c != DOLLARSIGN && c != BACKSLASH) { 1289 // Common case, no substitution, no escaping, 1290 // just copy the char to the dest buf. 1291 appendToBuf(c, &destIdx, dest, capacity); 1292 continue; 1293 } 1294 1295 if (c == BACKSLASH) { 1296 // Backslash Escape. Copy the following char out without further checks. 1297 // Note: Surrogate pairs don't need any special handling 1298 // The second half wont be a '$' or a '\', and 1299 // will move to the dest normally on the next 1300 // loop iteration. 1301 if (replIdx >= replacementLength) { 1302 break; 1303 } 1304 c = replacementText[replIdx]; 1305 1306 if (c==0x55/*U*/ || c==0x75/*u*/) { 1307 // We have a \udddd or \Udddddddd escape sequence. 1308 UChar32 escapedChar = 1309 u_unescapeAt(uregex_ucstr_unescape_charAt, 1310 &replIdx, // Index is updated by unescapeAt 1311 replacementLength, // Length of replacement text 1312 (void *)replacementText); 1313 1314 if (escapedChar != (UChar32)0xFFFFFFFF) { 1315 if (escapedChar <= 0xffff) { 1316 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1317 } else { 1318 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1319 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1320 } 1321 continue; 1322 } 1323 // Note: if the \u escape was invalid, just fall through and 1324 // treat it as a plain \<anything> escape. 1325 } 1326 1327 // Plain backslash escape. Just put out the escaped character. 1328 appendToBuf(c, &destIdx, dest, capacity); 1329 1330 replIdx++; 1331 continue; 1332 } 1333 1334 1335 1336 // We've got a $. Pick up a capture group number if one follows. 1337 // Consume at most the number of digits necessary for the largest capture 1338 // number that is valid for this pattern. 1339 1340 int32_t numDigits = 0; 1341 int32_t groupNum = 0; 1342 UChar32 digitC; 1343 for (;;) { 1344 if (replIdx >= replacementLength) { 1345 break; 1346 } 1347 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); 1348 if (u_isdigit(digitC) == FALSE) { 1349 break; 1350 } 1351 1352 U16_FWD_1(replacementText, replIdx, replacementLength); 1353 groupNum=groupNum*10 + u_charDigitValue(digitC); 1354 numDigits++; 1355 if (numDigits >= m->fPattern->fMaxCaptureDigits) { 1356 break; 1357 } 1358 } 1359 1360 1361 if (numDigits == 0) { 1362 // The $ didn't introduce a group number at all. 1363 // Treat it as just part of the substitution text. 1364 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); 1365 continue; 1366 } 1367 1368 // Finally, append the capture group data to the destination. 1369 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1370 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1371 // Ignore buffer overflow when extracting the group. We need to 1372 // continue on to get full size of the untruncated result. We will 1373 // raise our own buffer overflow error at the end. 1374 *status = U_ZERO_ERROR; 1375 } 1376 1377 if (U_FAILURE(*status)) { 1378 // Can fail if group number is out of range. 1379 break; 1380 } 1381 1382 } 1383 1384 // 1385 // Nul Terminate the dest buffer if possible. 1386 // Set the appropriate buffer overflow or not terminated error, if needed. 1387 // 1388 if (destIdx < capacity) { 1389 dest[destIdx] = 0; 1390 } else if (destIdx == *destCapacity) { 1391 *status = U_STRING_NOT_TERMINATED_WARNING; 1392 } else { 1393 *status = U_BUFFER_OVERFLOW_ERROR; 1394 } 1395 1396 // 1397 // Return an updated dest buffer and capacity to the caller. 1398 // 1399 if (destIdx > 0 && *destCapacity > 0) { 1400 if (destIdx < capacity) { 1401 *destBuf += destIdx; 1402 *destCapacity -= destIdx; 1403 } else { 1404 *destBuf += capacity; 1405 *destCapacity = 0; 1406 } 1407 } 1408 1409 // If we came in with a buffer overflow, make sure we go out with one also. 1410 // (A zero length match right at the end of the previous match could 1411 // make this function succeed even though a previous call had overflowed the buf) 1412 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1413 *status = U_BUFFER_OVERFLOW_ERROR; 1414 } 1415 1416 return destIdx; 1417 } 1418 1419 // 1420 // appendReplacement the actual API function, 1421 // 1422 U_CAPI int32_t U_EXPORT2 1423 uregex_appendReplacement(URegularExpression *regexp2, 1424 const UChar *replacementText, 1425 int32_t replacementLength, 1426 UChar **destBuf, 1427 int32_t *destCapacity, 1428 UErrorCode *status) { 1429 1430 RegularExpression *regexp = (RegularExpression*)regexp2; 1431 return RegexCImpl::appendReplacement( 1432 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1433 } 1434 1435 // 1436 // uregex_appendReplacementUText...can just use the normal C++ method 1437 // 1438 U_CAPI void U_EXPORT2 1439 uregex_appendReplacementUText(URegularExpression *regexp2, 1440 UText *replText, 1441 UText *dest, 1442 UErrorCode *status) { 1443 RegularExpression *regexp = (RegularExpression*)regexp2; 1444 regexp->fMatcher->appendReplacement(dest, replText, *status); 1445 } 1446 1447 1448 //------------------------------------------------------------------------------ 1449 // 1450 // uregex_appendTail 1451 // 1452 //------------------------------------------------------------------------------ 1453 int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1454 UChar **destBuf, 1455 int32_t *destCapacity, 1456 UErrorCode *status) 1457 { 1458 1459 // If we come in with a buffer overflow error, don't suppress the operation. 1460 // A series of appendReplacements, appendTail need to correctly preflight 1461 // the buffer size when an overflow happens somewhere in the middle. 1462 UBool pendingBufferOverflow = FALSE; 1463 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1464 pendingBufferOverflow = TRUE; 1465 *status = U_ZERO_ERROR; 1466 } 1467 1468 if (validateRE(regexp, status) == FALSE) { 1469 return 0; 1470 } 1471 1472 if (destCapacity == NULL || destBuf == NULL || 1473 *destBuf == NULL && *destCapacity > 0 || 1474 *destCapacity < 0) 1475 { 1476 *status = U_ILLEGAL_ARGUMENT_ERROR; 1477 return 0; 1478 } 1479 1480 RegexMatcher *m = regexp->fMatcher; 1481 1482 int32_t destIdx = 0; 1483 int32_t destCap = *destCapacity; 1484 UChar *dest = *destBuf; 1485 1486 if (regexp->fText != NULL) { 1487 int32_t srcIdx; 1488 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1489 if (nativeIdx == -1) { 1490 srcIdx = 0; 1491 } else if (UTEXT_USES_U16(m->fInputText)) { 1492 srcIdx = (int32_t)nativeIdx; 1493 } else { 1494 UErrorCode status = U_ZERO_ERROR; 1495 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1496 } 1497 1498 for (;;) { 1499 if (srcIdx == regexp->fTextLength) { 1500 break; 1501 } 1502 UChar c = regexp->fText[srcIdx]; 1503 if (c == 0 && regexp->fTextLength == -1) { 1504 regexp->fTextLength = srcIdx; 1505 break; 1506 } 1507 if (destIdx < destCap) { 1508 dest[destIdx] = c; 1509 } else { 1510 // We've overflowed the dest buffer. 1511 // If the total input string length is known, we can 1512 // compute the total buffer size needed without scanning through the string. 1513 if (regexp->fTextLength > 0) { 1514 destIdx += (regexp->fTextLength - srcIdx); 1515 break; 1516 } 1517 } 1518 srcIdx++; 1519 destIdx++; 1520 } 1521 } else { 1522 int64_t srcIdx; 1523 if (m->fMatch) { 1524 // The most recent call to find() succeeded. 1525 srcIdx = m->fMatchEnd; 1526 } else { 1527 // The last call to find() on this matcher failed(). 1528 // Look back to the end of the last find() that succeeded for src index. 1529 srcIdx = m->fLastMatchEnd; 1530 if (srcIdx == -1) { 1531 // There has been no successful match with this matcher. 1532 // We want to copy the whole string. 1533 srcIdx = 0; 1534 } 1535 } 1536 1537 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1538 } 1539 1540 // 1541 // NUL terminate the output string, if possible, otherwise issue the 1542 // appropriate error or warning. 1543 // 1544 if (destIdx < destCap) { 1545 dest[destIdx] = 0; 1546 } else if (destIdx == destCap) { 1547 *status = U_STRING_NOT_TERMINATED_WARNING; 1548 } else { 1549 *status = U_BUFFER_OVERFLOW_ERROR; 1550 } 1551 1552 // 1553 // Update the user's buffer ptr and capacity vars to reflect the 1554 // amount used. 1555 // 1556 if (destIdx < destCap) { 1557 *destBuf += destIdx; 1558 *destCapacity -= destIdx; 1559 } else { 1560 *destBuf += destCap; 1561 *destCapacity = 0; 1562 } 1563 1564 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1565 *status = U_BUFFER_OVERFLOW_ERROR; 1566 } 1567 1568 return destIdx; 1569 } 1570 1571 1572 // 1573 // appendTail the actual API function 1574 // 1575 U_CAPI int32_t U_EXPORT2 1576 uregex_appendTail(URegularExpression *regexp2, 1577 UChar **destBuf, 1578 int32_t *destCapacity, 1579 UErrorCode *status) { 1580 RegularExpression *regexp = (RegularExpression*)regexp2; 1581 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1582 } 1583 1584 1585 // 1586 // uregex_appendTailUText...can just use the normal C++ method 1587 // 1588 U_CAPI UText * U_EXPORT2 1589 uregex_appendTailUText(URegularExpression *regexp2, 1590 UText *dest) { 1591 RegularExpression *regexp = (RegularExpression*)regexp2; 1592 return regexp->fMatcher->appendTail(dest); 1593 } 1594 1595 1596 //------------------------------------------------------------------------------ 1597 // 1598 // copyString Internal utility to copy a string to an output buffer, 1599 // while managing buffer overflow and preflight size 1600 // computation. NUL termination is added to destination, 1601 // and the NUL is counted in the output size. 1602 // 1603 //------------------------------------------------------------------------------ 1604 #if 0 1605 static void copyString(UChar *destBuffer, // Destination buffer. 1606 int32_t destCapacity, // Total capacity of dest buffer 1607 int32_t *destIndex, // Index into dest buffer. Updated on return. 1608 // Update not clipped to destCapacity. 1609 const UChar *srcPtr, // Pointer to source string 1610 int32_t srcLen) // Source string len. 1611 { 1612 int32_t si; 1613 int32_t di = *destIndex; 1614 UChar c; 1615 1616 for (si=0; si<srcLen; si++) { 1617 c = srcPtr[si]; 1618 if (di < destCapacity) { 1619 destBuffer[di] = c; 1620 di++; 1621 } else { 1622 di += srcLen - si; 1623 break; 1624 } 1625 } 1626 if (di<destCapacity) { 1627 destBuffer[di] = 0; 1628 } 1629 di++; 1630 *destIndex = di; 1631 } 1632 #endif 1633 1634 //------------------------------------------------------------------------------ 1635 // 1636 // uregex_split 1637 // 1638 //------------------------------------------------------------------------------ 1639 int32_t RegexCImpl::split(RegularExpression *regexp, 1640 UChar *destBuf, 1641 int32_t destCapacity, 1642 int32_t *requiredCapacity, 1643 UChar *destFields[], 1644 int32_t destFieldsCapacity, 1645 UErrorCode *status) { 1646 // 1647 // Reset for the input text 1648 // 1649 regexp->fMatcher->reset(); 1650 UText *inputText = regexp->fMatcher->fInputText; 1651 int64_t nextOutputStringStart = 0; 1652 int64_t inputLen = regexp->fMatcher->fInputLength; 1653 if (inputLen == 0) { 1654 return 0; 1655 } 1656 1657 // 1658 // Loop through the input text, searching for the delimiter pattern 1659 // 1660 int32_t i; // Index of the field being processed. 1661 int32_t destIdx = 0; // Next available position in destBuf; 1662 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1663 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1664 for (i=0; ; i++) { 1665 if (i>=destFieldsCapacity-1) { 1666 // There are one or zero output strings left. 1667 // Fill the last output string with whatever is left from the input, then exit the loop. 1668 // ( i will be == destFieldsCapacity if we filled the output array while processing 1669 // capture groups of the delimiter expression, in which case we will discard the 1670 // last capture group saved in favor of the unprocessed remainder of the 1671 // input string.) 1672 if (inputLen > nextOutputStringStart) { 1673 if (i != destFieldsCapacity-1) { 1674 // No fields are left. Recycle the last one for holding the trailing part of 1675 // the input string. 1676 i = destFieldsCapacity-1; 1677 destIdx = (int32_t)(destFields[i] - destFields[0]); 1678 } 1679 1680 destFields[i] = &destBuf[destIdx]; 1681 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1682 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1683 } 1684 break; 1685 } 1686 1687 if (regexp->fMatcher->find()) { 1688 // We found another delimiter. Move everything from where we started looking 1689 // up until the start of the delimiter into the next output string. 1690 destFields[i] = &destBuf[destIdx]; 1691 1692 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1693 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1694 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1695 tStatus = U_ZERO_ERROR; 1696 } else { 1697 *status = tStatus; 1698 } 1699 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1700 1701 // If the delimiter pattern has capturing parentheses, the captured 1702 // text goes out into the next n destination strings. 1703 int32_t groupNum; 1704 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1705 // If we've run out of output string slots, bail out. 1706 if (i==destFieldsCapacity-1) { 1707 break; 1708 } 1709 i++; 1710 1711 // Set up to extract the capture group contents into the dest buffer. 1712 destFields[i] = &destBuf[destIdx]; 1713 tStatus = U_ZERO_ERROR; 1714 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1715 destIdx += t + 1; // Record the space used in the output string buffer. 1716 // +1 for the NUL that terminates the string. 1717 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1718 tStatus = U_ZERO_ERROR; 1719 } else { 1720 *status = tStatus; 1721 } 1722 } 1723 1724 if (nextOutputStringStart == inputLen) { 1725 // The delimiter was at the end of the string. We're done. 1726 break; 1727 } 1728 1729 } 1730 else 1731 { 1732 // We ran off the end of the input while looking for the next delimiter. 1733 // All the remaining text goes into the current output string. 1734 destFields[i] = &destBuf[destIdx]; 1735 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1736 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1737 break; 1738 } 1739 } 1740 1741 // Zero out any unused portion of the destFields array 1742 int j; 1743 for (j=i+1; j<destFieldsCapacity; j++) { 1744 destFields[j] = NULL; 1745 } 1746 1747 if (requiredCapacity != NULL) { 1748 *requiredCapacity = destIdx; 1749 } 1750 if (destIdx > destCapacity) { 1751 *status = U_BUFFER_OVERFLOW_ERROR; 1752 } 1753 return i+1; 1754 } 1755 1756 // 1757 // uregex_split The actual API function 1758 // 1759 U_CAPI int32_t U_EXPORT2 1760 uregex_split(URegularExpression *regexp2, 1761 UChar *destBuf, 1762 int32_t destCapacity, 1763 int32_t *requiredCapacity, 1764 UChar *destFields[], 1765 int32_t destFieldsCapacity, 1766 UErrorCode *status) { 1767 RegularExpression *regexp = (RegularExpression*)regexp2; 1768 if (validateRE(regexp, status) == FALSE) { 1769 return 0; 1770 } 1771 if (destBuf == NULL && destCapacity > 0 || 1772 destCapacity < 0 || 1773 destFields == NULL || 1774 destFieldsCapacity < 1 ) { 1775 *status = U_ILLEGAL_ARGUMENT_ERROR; 1776 return 0; 1777 } 1778 1779 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1780 } 1781 1782 1783 // 1784 // uregex_splitUText...can just use the normal C++ method 1785 // 1786 U_CAPI int32_t U_EXPORT2 1787 uregex_splitUText(URegularExpression *regexp2, 1788 UText *destFields[], 1789 int32_t destFieldsCapacity, 1790 UErrorCode *status) { 1791 RegularExpression *regexp = (RegularExpression*)regexp2; 1792 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1793 } 1794 1795 1796 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1797 1798