1 /* 2 ******************************************************************************* 3 * Copyright (C) 2004-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: regex.cpp 7 */ 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13 #include "unicode/regex.h" 14 #include "unicode/uregex.h" 15 #include "unicode/unistr.h" 16 #include "unicode/ustring.h" 17 #include "unicode/uchar.h" 18 #include "unicode/uobject.h" 19 #include "umutex.h" 20 #include "uassert.h" 21 #include "cmemory.h" 22 23 #include "regextxt.h" 24 25 #include <stdio.h> 26 27 U_NAMESPACE_BEGIN 28 29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 30 31 struct RegularExpression: public UMemory { 32 public: 33 RegularExpression(); 34 ~RegularExpression(); 35 int32_t fMagic; 36 RegexPattern *fPat; 37 int32_t *fPatRefCount; 38 UChar *fPatString; 39 int32_t fPatStringLen; 40 RegexMatcher *fMatcher; 41 const UChar *fText; // Text from setText() 42 int32_t fTextLength; // Length provided by user with setText(), which 43 // may be -1. 44 UBool fOwnsText; 45 }; 46 47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 48 49 RegularExpression::RegularExpression() { 50 fMagic = REXP_MAGIC; 51 fPat = NULL; 52 fPatRefCount = NULL; 53 fPatString = NULL; 54 fPatStringLen = 0; 55 fMatcher = NULL; 56 fText = NULL; 57 fTextLength = 0; 58 fOwnsText = FALSE; 59 } 60 61 RegularExpression::~RegularExpression() { 62 delete fMatcher; 63 fMatcher = NULL; 64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 65 delete fPat; 66 uprv_free(fPatString); 67 uprv_free(fPatRefCount); 68 } 69 if (fOwnsText && fText!=NULL) { 70 uprv_free((void *)fText); 71 } 72 fMagic = 0; 73 } 74 75 U_NAMESPACE_END 76 77 U_NAMESPACE_USE 78 79 //---------------------------------------------------------------------------------------- 80 // 81 // validateRE Do boilerplate style checks on API function parameters. 82 // Return TRUE if they look OK. 83 //---------------------------------------------------------------------------------------- 84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { 85 if (U_FAILURE(*status)) { 86 return FALSE; 87 } 88 if (re == NULL || re->fMagic != REXP_MAGIC) { 89 *status = U_ILLEGAL_ARGUMENT_ERROR; 90 return FALSE; 91 } 92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 93 if (requiresText && re->fText == NULL && !re->fOwnsText) { 94 *status = U_REGEX_INVALID_STATE; 95 return FALSE; 96 } 97 return TRUE; 98 } 99 100 //---------------------------------------------------------------------------------------- 101 // 102 // uregex_open 103 // 104 //---------------------------------------------------------------------------------------- 105 U_CAPI URegularExpression * U_EXPORT2 106 uregex_open( const UChar *pattern, 107 int32_t patternLength, 108 uint32_t flags, 109 UParseError *pe, 110 UErrorCode *status) { 111 112 if (U_FAILURE(*status)) { 113 return NULL; 114 } 115 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 116 *status = U_ILLEGAL_ARGUMENT_ERROR; 117 return NULL; 118 } 119 int32_t actualPatLen = patternLength; 120 if (actualPatLen == -1) { 121 actualPatLen = u_strlen(pattern); 122 } 123 124 RegularExpression *re = new RegularExpression; 125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 127 if (re == NULL || refC == NULL || patBuf == NULL) { 128 *status = U_MEMORY_ALLOCATION_ERROR; 129 delete re; 130 uprv_free(refC); 131 uprv_free(patBuf); 132 return NULL; 133 } 134 re->fPatRefCount = refC; 135 *re->fPatRefCount = 1; 136 137 // 138 // Make a copy of the pattern string, so we can return it later if asked. 139 // For compiling the pattern, we will use a UText wrapper around 140 // this local copy, to avoid making even more copies. 141 // 142 re->fPatString = patBuf; 143 re->fPatStringLen = patternLength; 144 u_memcpy(patBuf, pattern, actualPatLen); 145 patBuf[actualPatLen] = 0; 146 147 UText patText = UTEXT_INITIALIZER; 148 utext_openUChars(&patText, patBuf, patternLength, status); 149 150 // 151 // Compile the pattern 152 // 153 if (pe != NULL) { 154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 155 } else { 156 re->fPat = RegexPattern::compile(&patText, flags, *status); 157 } 158 utext_close(&patText); 159 160 if (U_FAILURE(*status)) { 161 goto ErrorExit; 162 } 163 164 // 165 // Create the matcher object 166 // 167 re->fMatcher = re->fPat->matcher(*status); 168 if (U_SUCCESS(*status)) { 169 return (URegularExpression*)re; 170 } 171 172 ErrorExit: 173 delete re; 174 return NULL; 175 176 } 177 178 //---------------------------------------------------------------------------------------- 179 // 180 // uregex_openUText 181 // 182 //---------------------------------------------------------------------------------------- 183 U_CAPI URegularExpression * U_EXPORT2 184 uregex_openUText(UText *pattern, 185 uint32_t flags, 186 UParseError *pe, 187 UErrorCode *status) { 188 189 if (U_FAILURE(*status)) { 190 return NULL; 191 } 192 if (pattern == NULL) { 193 *status = U_ILLEGAL_ARGUMENT_ERROR; 194 return NULL; 195 } 196 197 int64_t patternNativeLength = utext_nativeLength(pattern); 198 199 if (patternNativeLength == 0) { 200 *status = U_ILLEGAL_ARGUMENT_ERROR; 201 return NULL; 202 } 203 204 RegularExpression *re = new RegularExpression; 205 206 UErrorCode lengthStatus = U_ZERO_ERROR; 207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 208 209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 211 if (re == NULL || refC == NULL || patBuf == NULL) { 212 *status = U_MEMORY_ALLOCATION_ERROR; 213 delete re; 214 uprv_free(refC); 215 uprv_free(patBuf); 216 return NULL; 217 } 218 re->fPatRefCount = refC; 219 *re->fPatRefCount = 1; 220 221 // 222 // Make a copy of the pattern string, so we can return it later if asked. 223 // For compiling the pattern, we will use a read-only UText wrapper 224 // around this local copy, to avoid making even more copies. 225 // 226 re->fPatString = patBuf; 227 re->fPatStringLen = pattern16Length; 228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 229 230 UText patText = UTEXT_INITIALIZER; 231 utext_openUChars(&patText, patBuf, pattern16Length, status); 232 233 // 234 // Compile the pattern 235 // 236 if (pe != NULL) { 237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 238 } else { 239 re->fPat = RegexPattern::compile(&patText, flags, *status); 240 } 241 utext_close(&patText); 242 243 if (U_FAILURE(*status)) { 244 goto ErrorExit; 245 } 246 247 // 248 // Create the matcher object 249 // 250 re->fMatcher = re->fPat->matcher(*status); 251 if (U_SUCCESS(*status)) { 252 return (URegularExpression*)re; 253 } 254 255 ErrorExit: 256 delete re; 257 return NULL; 258 259 } 260 261 //---------------------------------------------------------------------------------------- 262 // 263 // uregex_close 264 // 265 //---------------------------------------------------------------------------------------- 266 U_CAPI void U_EXPORT2 267 uregex_close(URegularExpression *re2) { 268 RegularExpression *re = (RegularExpression*)re2; 269 UErrorCode status = U_ZERO_ERROR; 270 if (validateRE(re, &status, FALSE) == FALSE) { 271 return; 272 } 273 delete re; 274 } 275 276 277 //---------------------------------------------------------------------------------------- 278 // 279 // uregex_clone 280 // 281 //---------------------------------------------------------------------------------------- 282 U_CAPI URegularExpression * U_EXPORT2 283 uregex_clone(const URegularExpression *source2, UErrorCode *status) { 284 RegularExpression *source = (RegularExpression*)source2; 285 if (validateRE(source, status, FALSE) == FALSE) { 286 return NULL; 287 } 288 289 RegularExpression *clone = new RegularExpression; 290 if (clone == NULL) { 291 *status = U_MEMORY_ALLOCATION_ERROR; 292 return NULL; 293 } 294 295 clone->fMatcher = source->fPat->matcher(*status); 296 if (U_FAILURE(*status)) { 297 delete clone; 298 return NULL; 299 } 300 301 clone->fPat = source->fPat; 302 clone->fPatRefCount = source->fPatRefCount; 303 clone->fPatString = source->fPatString; 304 clone->fPatStringLen = source->fPatStringLen; 305 umtx_atomic_inc(source->fPatRefCount); 306 // Note: fText is not cloned. 307 308 return (URegularExpression*)clone; 309 } 310 311 312 313 314 //------------------------------------------------------------------------------ 315 // 316 // uregex_pattern 317 // 318 //------------------------------------------------------------------------------ 319 U_CAPI const UChar * U_EXPORT2 320 uregex_pattern(const URegularExpression *regexp2, 321 int32_t *patLength, 322 UErrorCode *status) { 323 RegularExpression *regexp = (RegularExpression*)regexp2; 324 325 if (validateRE(regexp, status, FALSE) == FALSE) { 326 return NULL; 327 } 328 if (patLength != NULL) { 329 *patLength = regexp->fPatStringLen; 330 } 331 return regexp->fPatString; 332 } 333 334 335 //------------------------------------------------------------------------------ 336 // 337 // uregex_patternUText 338 // 339 //------------------------------------------------------------------------------ 340 U_CAPI UText * U_EXPORT2 341 uregex_patternUText(const URegularExpression *regexp2, 342 UErrorCode *status) { 343 RegularExpression *regexp = (RegularExpression*)regexp2; 344 return regexp->fPat->patternText(*status); 345 } 346 347 348 //------------------------------------------------------------------------------ 349 // 350 // uregex_flags 351 // 352 //------------------------------------------------------------------------------ 353 U_CAPI int32_t U_EXPORT2 354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { 355 RegularExpression *regexp = (RegularExpression*)regexp2; 356 if (validateRE(regexp, status, FALSE) == FALSE) { 357 return 0; 358 } 359 int32_t flags = regexp->fPat->flags(); 360 return flags; 361 } 362 363 364 //------------------------------------------------------------------------------ 365 // 366 // uregex_setText 367 // 368 //------------------------------------------------------------------------------ 369 U_CAPI void U_EXPORT2 370 uregex_setText(URegularExpression *regexp2, 371 const UChar *text, 372 int32_t textLength, 373 UErrorCode *status) { 374 RegularExpression *regexp = (RegularExpression*)regexp2; 375 if (validateRE(regexp, status, FALSE) == FALSE) { 376 return; 377 } 378 if (text == NULL || textLength < -1) { 379 *status = U_ILLEGAL_ARGUMENT_ERROR; 380 return; 381 } 382 383 if (regexp->fOwnsText && regexp->fText != NULL) { 384 uprv_free((void *)regexp->fText); 385 } 386 387 regexp->fText = text; 388 regexp->fTextLength = textLength; 389 regexp->fOwnsText = FALSE; 390 391 UText input = UTEXT_INITIALIZER; 392 utext_openUChars(&input, text, textLength, status); 393 regexp->fMatcher->reset(&input); 394 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 395 } 396 397 398 //------------------------------------------------------------------------------ 399 // 400 // uregex_setUText 401 // 402 //------------------------------------------------------------------------------ 403 U_CAPI void U_EXPORT2 404 uregex_setUText(URegularExpression *regexp2, 405 UText *text, 406 UErrorCode *status) { 407 RegularExpression *regexp = (RegularExpression*)regexp2; 408 if (validateRE(regexp, status, FALSE) == FALSE) { 409 return; 410 } 411 if (text == NULL) { 412 *status = U_ILLEGAL_ARGUMENT_ERROR; 413 return; 414 } 415 416 if (regexp->fOwnsText && regexp->fText != NULL) { 417 uprv_free((void *)regexp->fText); 418 } 419 420 regexp->fText = NULL; // only fill it in on request 421 regexp->fTextLength = -1; 422 regexp->fOwnsText = TRUE; 423 regexp->fMatcher->reset(text); 424 } 425 426 427 428 //------------------------------------------------------------------------------ 429 // 430 // uregex_getText 431 // 432 //------------------------------------------------------------------------------ 433 U_CAPI const UChar * U_EXPORT2 434 uregex_getText(URegularExpression *regexp2, 435 int32_t *textLength, 436 UErrorCode *status) { 437 RegularExpression *regexp = (RegularExpression*)regexp2; 438 if (validateRE(regexp, status, FALSE) == FALSE) { 439 return NULL; 440 } 441 442 if (regexp->fText == NULL) { 443 // need to fill in the text 444 UText *inputText = regexp->fMatcher->inputText(); 445 int64_t inputNativeLength = utext_nativeLength(inputText); 446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 447 regexp->fText = inputText->chunkContents; 448 regexp->fTextLength = (int32_t)inputNativeLength; 449 regexp->fOwnsText = FALSE; // because the UText owns it 450 } else { 451 UErrorCode lengthStatus = U_ZERO_ERROR; 452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 454 455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 456 regexp->fText = inputChars; 457 regexp->fOwnsText = TRUE; // should already be set but just in case 458 } 459 } 460 461 if (textLength != NULL) { 462 *textLength = regexp->fTextLength; 463 } 464 return regexp->fText; 465 } 466 467 468 //------------------------------------------------------------------------------ 469 // 470 // uregex_getUText 471 // 472 //------------------------------------------------------------------------------ 473 U_CAPI UText * U_EXPORT2 474 uregex_getUText(URegularExpression *regexp2, 475 UText *dest, 476 UErrorCode *status) { 477 RegularExpression *regexp = (RegularExpression*)regexp2; 478 if (validateRE(regexp, status, FALSE) == FALSE) { 479 return dest; 480 } 481 return regexp->fMatcher->getInput(dest, *status); 482 } 483 484 485 //------------------------------------------------------------------------------ 486 // 487 // uregex_matches 488 // 489 //------------------------------------------------------------------------------ 490 U_CAPI UBool U_EXPORT2 491 uregex_matches(URegularExpression *regexp2, 492 int32_t startIndex, 493 UErrorCode *status) { 494 return uregex_matches64( regexp2, (int64_t)startIndex, status); 495 } 496 497 U_CAPI UBool U_EXPORT2 498 uregex_matches64(URegularExpression *regexp2, 499 int64_t startIndex, 500 UErrorCode *status) { 501 RegularExpression *regexp = (RegularExpression*)regexp2; 502 UBool result = FALSE; 503 if (validateRE(regexp, status) == FALSE) { 504 return result; 505 } 506 if (startIndex == -1) { 507 result = regexp->fMatcher->matches(*status); 508 } else { 509 result = regexp->fMatcher->matches(startIndex, *status); 510 } 511 return result; 512 } 513 514 515 //------------------------------------------------------------------------------ 516 // 517 // uregex_lookingAt 518 // 519 //------------------------------------------------------------------------------ 520 U_CAPI UBool U_EXPORT2 521 uregex_lookingAt(URegularExpression *regexp2, 522 int32_t startIndex, 523 UErrorCode *status) { 524 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); 525 } 526 527 U_CAPI UBool U_EXPORT2 528 uregex_lookingAt64(URegularExpression *regexp2, 529 int64_t startIndex, 530 UErrorCode *status) { 531 RegularExpression *regexp = (RegularExpression*)regexp2; 532 UBool result = FALSE; 533 if (validateRE(regexp, status) == FALSE) { 534 return result; 535 } 536 if (startIndex == -1) { 537 result = regexp->fMatcher->lookingAt(*status); 538 } else { 539 result = regexp->fMatcher->lookingAt(startIndex, *status); 540 } 541 return result; 542 } 543 544 545 546 //------------------------------------------------------------------------------ 547 // 548 // uregex_find 549 // 550 //------------------------------------------------------------------------------ 551 U_CAPI UBool U_EXPORT2 552 uregex_find(URegularExpression *regexp2, 553 int32_t startIndex, 554 UErrorCode *status) { 555 return uregex_find64( regexp2, (int64_t)startIndex, status); 556 } 557 558 U_CAPI UBool U_EXPORT2 559 uregex_find64(URegularExpression *regexp2, 560 int64_t startIndex, 561 UErrorCode *status) { 562 RegularExpression *regexp = (RegularExpression*)regexp2; 563 UBool result = FALSE; 564 if (validateRE(regexp, status) == FALSE) { 565 return result; 566 } 567 if (startIndex == -1) { 568 regexp->fMatcher->resetPreserveRegion(); 569 result = regexp->fMatcher->find(); 570 } else { 571 result = regexp->fMatcher->find(startIndex, *status); 572 } 573 return result; 574 } 575 576 577 //------------------------------------------------------------------------------ 578 // 579 // uregex_findNext 580 // 581 //------------------------------------------------------------------------------ 582 U_CAPI UBool U_EXPORT2 583 uregex_findNext(URegularExpression *regexp2, 584 UErrorCode *status) { 585 RegularExpression *regexp = (RegularExpression*)regexp2; 586 if (validateRE(regexp, status) == FALSE) { 587 return FALSE; 588 } 589 UBool result = regexp->fMatcher->find(); 590 return result; 591 } 592 593 //------------------------------------------------------------------------------ 594 // 595 // uregex_groupCount 596 // 597 //------------------------------------------------------------------------------ 598 U_CAPI int32_t U_EXPORT2 599 uregex_groupCount(URegularExpression *regexp2, 600 UErrorCode *status) { 601 RegularExpression *regexp = (RegularExpression*)regexp2; 602 if (validateRE(regexp, status, FALSE) == FALSE) { 603 return 0; 604 } 605 int32_t result = regexp->fMatcher->groupCount(); 606 return result; 607 } 608 609 610 //------------------------------------------------------------------------------ 611 // 612 // uregex_group 613 // 614 //------------------------------------------------------------------------------ 615 U_CAPI int32_t U_EXPORT2 616 uregex_group(URegularExpression *regexp2, 617 int32_t groupNum, 618 UChar *dest, 619 int32_t destCapacity, 620 UErrorCode *status) { 621 RegularExpression *regexp = (RegularExpression*)regexp2; 622 if (validateRE(regexp, status) == FALSE) { 623 return 0; 624 } 625 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 626 *status = U_ILLEGAL_ARGUMENT_ERROR; 627 return 0; 628 } 629 630 if (destCapacity == 0 || regexp->fText != NULL) { 631 // If preflighting or if we already have the text as UChars, 632 // this is a little cheaper than going through uregex_groupUTextDeep() 633 634 // 635 // Pick up the range of characters from the matcher 636 // 637 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 638 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 639 if (U_FAILURE(*status)) { 640 return 0; 641 } 642 643 // 644 // Trim length based on buffer capacity 645 // 646 int32_t fullLength = endIx - startIx; 647 int32_t copyLength = fullLength; 648 if (copyLength < destCapacity) { 649 dest[copyLength] = 0; 650 } else if (copyLength == destCapacity) { 651 *status = U_STRING_NOT_TERMINATED_WARNING; 652 } else { 653 copyLength = destCapacity; 654 *status = U_BUFFER_OVERFLOW_ERROR; 655 } 656 657 // 658 // Copy capture group to user's buffer 659 // 660 if (copyLength > 0) { 661 u_memcpy(dest, ®exp->fText[startIx], copyLength); 662 } 663 return fullLength; 664 } else { 665 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); 666 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); 667 utext_close(groupText); 668 return result; 669 } 670 } 671 672 673 //------------------------------------------------------------------------------ 674 // 675 // uregex_groupUText 676 // 677 //------------------------------------------------------------------------------ 678 U_CAPI UText * U_EXPORT2 679 uregex_groupUText(URegularExpression *regexp2, 680 int32_t groupNum, 681 UText *dest, 682 int64_t *groupLength, 683 UErrorCode *status) { 684 RegularExpression *regexp = (RegularExpression*)regexp2; 685 if (validateRE(regexp, status) == FALSE) { 686 UErrorCode emptyTextStatus = U_ZERO_ERROR; 687 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 688 } 689 690 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); 691 } 692 693 //------------------------------------------------------------------------------ 694 // 695 // uregex_groupUTextDeep 696 // 697 //------------------------------------------------------------------------------ 698 U_CAPI UText * U_EXPORT2 699 uregex_groupUTextDeep(URegularExpression *regexp2, 700 int32_t groupNum, 701 UText *dest, 702 UErrorCode *status) { 703 RegularExpression *regexp = (RegularExpression*)regexp2; 704 if (validateRE(regexp, status) == FALSE) { 705 UErrorCode emptyTextStatus = U_ZERO_ERROR; 706 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 707 } 708 709 if (regexp->fText != NULL) { 710 // 711 // Pick up the range of characters from the matcher 712 // and use our already-extracted characters 713 // 714 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 715 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 716 if (U_FAILURE(*status)) { 717 UErrorCode emptyTextStatus = U_ZERO_ERROR; 718 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 719 } 720 721 if (dest) { 722 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); 723 } else { 724 UText groupText = UTEXT_INITIALIZER; 725 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); 726 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); 727 utext_close(&groupText); 728 } 729 730 return dest; 731 } else { 732 return regexp->fMatcher->group(groupNum, dest, *status); 733 } 734 } 735 736 //------------------------------------------------------------------------------ 737 // 738 // uregex_start 739 // 740 //------------------------------------------------------------------------------ 741 U_CAPI int32_t U_EXPORT2 742 uregex_start(URegularExpression *regexp2, 743 int32_t groupNum, 744 UErrorCode *status) { 745 return (int32_t)uregex_start64( regexp2, groupNum, status); 746 } 747 748 U_CAPI int64_t U_EXPORT2 749 uregex_start64(URegularExpression *regexp2, 750 int32_t groupNum, 751 UErrorCode *status) { 752 RegularExpression *regexp = (RegularExpression*)regexp2; 753 if (validateRE(regexp, status) == FALSE) { 754 return 0; 755 } 756 int32_t result = regexp->fMatcher->start(groupNum, *status); 757 return result; 758 } 759 760 //------------------------------------------------------------------------------ 761 // 762 // uregex_end 763 // 764 //------------------------------------------------------------------------------ 765 U_CAPI int32_t U_EXPORT2 766 uregex_end(URegularExpression *regexp2, 767 int32_t groupNum, 768 UErrorCode *status) { 769 return (int32_t)uregex_end64( regexp2, groupNum, status); 770 } 771 772 U_CAPI int64_t U_EXPORT2 773 uregex_end64(URegularExpression *regexp2, 774 int32_t groupNum, 775 UErrorCode *status) { 776 RegularExpression *regexp = (RegularExpression*)regexp2; 777 if (validateRE(regexp, status) == FALSE) { 778 return 0; 779 } 780 int32_t result = regexp->fMatcher->end(groupNum, *status); 781 return result; 782 } 783 784 //------------------------------------------------------------------------------ 785 // 786 // uregex_reset 787 // 788 //------------------------------------------------------------------------------ 789 U_CAPI void U_EXPORT2 790 uregex_reset(URegularExpression *regexp2, 791 int32_t index, 792 UErrorCode *status) { 793 uregex_reset64( regexp2, (int64_t)index, status); 794 } 795 796 U_CAPI void U_EXPORT2 797 uregex_reset64(URegularExpression *regexp2, 798 int64_t index, 799 UErrorCode *status) { 800 RegularExpression *regexp = (RegularExpression*)regexp2; 801 if (validateRE(regexp, status) == FALSE) { 802 return; 803 } 804 regexp->fMatcher->reset(index, *status); 805 } 806 807 808 //------------------------------------------------------------------------------ 809 // 810 // uregex_setRegion 811 // 812 //------------------------------------------------------------------------------ 813 U_CAPI void U_EXPORT2 814 uregex_setRegion(URegularExpression *regexp2, 815 int32_t regionStart, 816 int32_t regionLimit, 817 UErrorCode *status) { 818 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); 819 } 820 821 U_CAPI void U_EXPORT2 822 uregex_setRegion64(URegularExpression *regexp2, 823 int64_t regionStart, 824 int64_t regionLimit, 825 UErrorCode *status) { 826 RegularExpression *regexp = (RegularExpression*)regexp2; 827 if (validateRE(regexp, status) == FALSE) { 828 return; 829 } 830 regexp->fMatcher->region(regionStart, regionLimit, *status); 831 } 832 833 834 //------------------------------------------------------------------------------ 835 // 836 // uregex_setRegionAndStart 837 // 838 //------------------------------------------------------------------------------ 839 U_DRAFT void U_EXPORT2 840 uregex_setRegionAndStart(URegularExpression *regexp2, 841 int64_t regionStart, 842 int64_t regionLimit, 843 int64_t startIndex, 844 UErrorCode *status) { 845 RegularExpression *regexp = (RegularExpression*)regexp2; 846 if (validateRE(regexp, status) == FALSE) { 847 return; 848 } 849 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); 850 } 851 852 //------------------------------------------------------------------------------ 853 // 854 // uregex_regionStart 855 // 856 //------------------------------------------------------------------------------ 857 U_CAPI int32_t U_EXPORT2 858 uregex_regionStart(const URegularExpression *regexp2, 859 UErrorCode *status) { 860 return (int32_t)uregex_regionStart64(regexp2, status); 861 } 862 863 U_CAPI int64_t U_EXPORT2 864 uregex_regionStart64(const URegularExpression *regexp2, 865 UErrorCode *status) { 866 RegularExpression *regexp = (RegularExpression*)regexp2; 867 if (validateRE(regexp, status) == FALSE) { 868 return 0; 869 } 870 return regexp->fMatcher->regionStart(); 871 } 872 873 874 //------------------------------------------------------------------------------ 875 // 876 // uregex_regionEnd 877 // 878 //------------------------------------------------------------------------------ 879 U_CAPI int32_t U_EXPORT2 880 uregex_regionEnd(const URegularExpression *regexp2, 881 UErrorCode *status) { 882 return (int32_t)uregex_regionEnd64(regexp2, status); 883 } 884 885 U_CAPI int64_t U_EXPORT2 886 uregex_regionEnd64(const URegularExpression *regexp2, 887 UErrorCode *status) { 888 RegularExpression *regexp = (RegularExpression*)regexp2; 889 if (validateRE(regexp, status) == FALSE) { 890 return 0; 891 } 892 return regexp->fMatcher->regionEnd(); 893 } 894 895 896 //------------------------------------------------------------------------------ 897 // 898 // uregex_hasTransparentBounds 899 // 900 //------------------------------------------------------------------------------ 901 U_CAPI UBool U_EXPORT2 902 uregex_hasTransparentBounds(const URegularExpression *regexp2, 903 UErrorCode *status) { 904 RegularExpression *regexp = (RegularExpression*)regexp2; 905 if (validateRE(regexp, status) == FALSE) { 906 return FALSE; 907 } 908 return regexp->fMatcher->hasTransparentBounds(); 909 } 910 911 912 //------------------------------------------------------------------------------ 913 // 914 // uregex_useTransparentBounds 915 // 916 //------------------------------------------------------------------------------ 917 U_CAPI void U_EXPORT2 918 uregex_useTransparentBounds(URegularExpression *regexp2, 919 UBool b, 920 UErrorCode *status) { 921 RegularExpression *regexp = (RegularExpression*)regexp2; 922 if (validateRE(regexp, status) == FALSE) { 923 return; 924 } 925 regexp->fMatcher->useTransparentBounds(b); 926 } 927 928 929 //------------------------------------------------------------------------------ 930 // 931 // uregex_hasAnchoringBounds 932 // 933 //------------------------------------------------------------------------------ 934 U_CAPI UBool U_EXPORT2 935 uregex_hasAnchoringBounds(const URegularExpression *regexp2, 936 UErrorCode *status) { 937 RegularExpression *regexp = (RegularExpression*)regexp2; 938 if (validateRE(regexp, status) == FALSE) { 939 return FALSE; 940 } 941 return regexp->fMatcher->hasAnchoringBounds(); 942 } 943 944 945 //------------------------------------------------------------------------------ 946 // 947 // uregex_useAnchoringBounds 948 // 949 //------------------------------------------------------------------------------ 950 U_CAPI void U_EXPORT2 951 uregex_useAnchoringBounds(URegularExpression *regexp2, 952 UBool b, 953 UErrorCode *status) { 954 RegularExpression *regexp = (RegularExpression*)regexp2; 955 if (validateRE(regexp, status) == FALSE) { 956 return; 957 } 958 regexp->fMatcher->useAnchoringBounds(b); 959 } 960 961 962 //------------------------------------------------------------------------------ 963 // 964 // uregex_hitEnd 965 // 966 //------------------------------------------------------------------------------ 967 U_CAPI UBool U_EXPORT2 968 uregex_hitEnd(const URegularExpression *regexp2, 969 UErrorCode *status) { 970 RegularExpression *regexp = (RegularExpression*)regexp2; 971 if (validateRE(regexp, status) == FALSE) { 972 return FALSE; 973 } 974 return regexp->fMatcher->hitEnd(); 975 } 976 977 978 //------------------------------------------------------------------------------ 979 // 980 // uregex_requireEnd 981 // 982 //------------------------------------------------------------------------------ 983 U_CAPI UBool U_EXPORT2 984 uregex_requireEnd(const URegularExpression *regexp2, 985 UErrorCode *status) { 986 RegularExpression *regexp = (RegularExpression*)regexp2; 987 if (validateRE(regexp, status) == FALSE) { 988 return FALSE; 989 } 990 return regexp->fMatcher->requireEnd(); 991 } 992 993 994 //------------------------------------------------------------------------------ 995 // 996 // uregex_setTimeLimit 997 // 998 //------------------------------------------------------------------------------ 999 U_CAPI void U_EXPORT2 1000 uregex_setTimeLimit(URegularExpression *regexp2, 1001 int32_t limit, 1002 UErrorCode *status) { 1003 RegularExpression *regexp = (RegularExpression*)regexp2; 1004 if (validateRE(regexp, status)) { 1005 regexp->fMatcher->setTimeLimit(limit, *status); 1006 } 1007 } 1008 1009 1010 1011 //------------------------------------------------------------------------------ 1012 // 1013 // uregex_getTimeLimit 1014 // 1015 //------------------------------------------------------------------------------ 1016 U_CAPI int32_t U_EXPORT2 1017 uregex_getTimeLimit(const URegularExpression *regexp2, 1018 UErrorCode *status) { 1019 int32_t retVal = 0; 1020 RegularExpression *regexp = (RegularExpression*)regexp2; 1021 if (validateRE(regexp, status)) { 1022 retVal = regexp->fMatcher->getTimeLimit(); 1023 } 1024 return retVal; 1025 } 1026 1027 1028 1029 //------------------------------------------------------------------------------ 1030 // 1031 // uregex_setStackLimit 1032 // 1033 //------------------------------------------------------------------------------ 1034 U_CAPI void U_EXPORT2 1035 uregex_setStackLimit(URegularExpression *regexp2, 1036 int32_t limit, 1037 UErrorCode *status) { 1038 RegularExpression *regexp = (RegularExpression*)regexp2; 1039 if (validateRE(regexp, status)) { 1040 regexp->fMatcher->setStackLimit(limit, *status); 1041 } 1042 } 1043 1044 1045 1046 //------------------------------------------------------------------------------ 1047 // 1048 // uregex_getStackLimit 1049 // 1050 //------------------------------------------------------------------------------ 1051 U_CAPI int32_t U_EXPORT2 1052 uregex_getStackLimit(const URegularExpression *regexp2, 1053 UErrorCode *status) { 1054 int32_t retVal = 0; 1055 RegularExpression *regexp = (RegularExpression*)regexp2; 1056 if (validateRE(regexp, status)) { 1057 retVal = regexp->fMatcher->getStackLimit(); 1058 } 1059 return retVal; 1060 } 1061 1062 1063 //------------------------------------------------------------------------------ 1064 // 1065 // uregex_setMatchCallback 1066 // 1067 //------------------------------------------------------------------------------ 1068 U_CAPI void U_EXPORT2 1069 uregex_setMatchCallback(URegularExpression *regexp2, 1070 URegexMatchCallback *callback, 1071 const void *context, 1072 UErrorCode *status) { 1073 RegularExpression *regexp = (RegularExpression*)regexp2; 1074 if (validateRE(regexp, status)) { 1075 regexp->fMatcher->setMatchCallback(callback, context, *status); 1076 } 1077 } 1078 1079 1080 //------------------------------------------------------------------------------ 1081 // 1082 // uregex_getMatchCallback 1083 // 1084 //------------------------------------------------------------------------------ 1085 U_CAPI void U_EXPORT2 1086 uregex_getMatchCallback(const URegularExpression *regexp2, 1087 URegexMatchCallback **callback, 1088 const void **context, 1089 UErrorCode *status) { 1090 RegularExpression *regexp = (RegularExpression*)regexp2; 1091 if (validateRE(regexp, status)) { 1092 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1093 } 1094 } 1095 1096 1097 //------------------------------------------------------------------------------ 1098 // 1099 // uregex_setMatchProgressCallback 1100 // 1101 //------------------------------------------------------------------------------ 1102 U_CAPI void U_EXPORT2 1103 uregex_setFindProgressCallback(URegularExpression *regexp2, 1104 URegexFindProgressCallback *callback, 1105 const void *context, 1106 UErrorCode *status) { 1107 RegularExpression *regexp = (RegularExpression*)regexp2; 1108 if (validateRE(regexp, status)) { 1109 regexp->fMatcher->setFindProgressCallback(callback, context, *status); 1110 } 1111 } 1112 1113 1114 //------------------------------------------------------------------------------ 1115 // 1116 // uregex_getMatchCallback 1117 // 1118 //------------------------------------------------------------------------------ 1119 U_CAPI void U_EXPORT2 1120 uregex_getFindProgressCallback(const URegularExpression *regexp2, 1121 URegexFindProgressCallback **callback, 1122 const void **context, 1123 UErrorCode *status) { 1124 RegularExpression *regexp = (RegularExpression*)regexp2; 1125 if (validateRE(regexp, status)) { 1126 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); 1127 } 1128 } 1129 1130 1131 //------------------------------------------------------------------------------ 1132 // 1133 // uregex_replaceAll 1134 // 1135 //------------------------------------------------------------------------------ 1136 U_CAPI int32_t U_EXPORT2 1137 uregex_replaceAll(URegularExpression *regexp2, 1138 const UChar *replacementText, 1139 int32_t replacementLength, 1140 UChar *destBuf, 1141 int32_t destCapacity, 1142 UErrorCode *status) { 1143 RegularExpression *regexp = (RegularExpression*)regexp2; 1144 if (validateRE(regexp, status) == FALSE) { 1145 return 0; 1146 } 1147 if (replacementText == NULL || replacementLength < -1 || 1148 (destBuf == NULL && destCapacity > 0) || 1149 destCapacity < 0) { 1150 *status = U_ILLEGAL_ARGUMENT_ERROR; 1151 return 0; 1152 } 1153 1154 int32_t len = 0; 1155 1156 uregex_reset(regexp2, 0, status); 1157 1158 // Note: Seperate error code variables for findNext() and appendReplacement() 1159 // are used so that destination buffer overflow errors 1160 // in appendReplacement won't stop findNext() from working. 1161 // appendReplacement() and appendTail() special case incoming buffer 1162 // overflow errors, continuing to return the correct length. 1163 UErrorCode findStatus = *status; 1164 while (uregex_findNext(regexp2, &findStatus)) { 1165 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1166 &destBuf, &destCapacity, status); 1167 } 1168 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1169 1170 if (U_FAILURE(findStatus)) { 1171 // If anything went wrong with the findNext(), make that error trump 1172 // whatever may have happened with the append() operations. 1173 // Errors in findNext() are not expected. 1174 *status = findStatus; 1175 } 1176 1177 return len; 1178 } 1179 1180 1181 //------------------------------------------------------------------------------ 1182 // 1183 // uregex_replaceAllUText 1184 // 1185 //------------------------------------------------------------------------------ 1186 U_CAPI UText * U_EXPORT2 1187 uregex_replaceAllUText(URegularExpression *regexp2, 1188 UText *replacementText, 1189 UText *dest, 1190 UErrorCode *status) { 1191 RegularExpression *regexp = (RegularExpression*)regexp2; 1192 if (validateRE(regexp, status) == FALSE) { 1193 return 0; 1194 } 1195 if (replacementText == NULL) { 1196 *status = U_ILLEGAL_ARGUMENT_ERROR; 1197 return 0; 1198 } 1199 1200 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1201 return dest; 1202 } 1203 1204 1205 //------------------------------------------------------------------------------ 1206 // 1207 // uregex_replaceFirst 1208 // 1209 //------------------------------------------------------------------------------ 1210 U_CAPI int32_t U_EXPORT2 1211 uregex_replaceFirst(URegularExpression *regexp2, 1212 const UChar *replacementText, 1213 int32_t replacementLength, 1214 UChar *destBuf, 1215 int32_t destCapacity, 1216 UErrorCode *status) { 1217 RegularExpression *regexp = (RegularExpression*)regexp2; 1218 if (validateRE(regexp, status) == FALSE) { 1219 return 0; 1220 } 1221 if (replacementText == NULL || replacementLength < -1 || 1222 (destBuf == NULL && destCapacity > 0) || 1223 destCapacity < 0) { 1224 *status = U_ILLEGAL_ARGUMENT_ERROR; 1225 return 0; 1226 } 1227 1228 int32_t len = 0; 1229 UBool findSucceeded; 1230 uregex_reset(regexp2, 0, status); 1231 findSucceeded = uregex_find(regexp2, 0, status); 1232 if (findSucceeded) { 1233 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1234 &destBuf, &destCapacity, status); 1235 } 1236 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1237 1238 return len; 1239 } 1240 1241 1242 //------------------------------------------------------------------------------ 1243 // 1244 // uregex_replaceFirstUText 1245 // 1246 //------------------------------------------------------------------------------ 1247 U_CAPI UText * U_EXPORT2 1248 uregex_replaceFirstUText(URegularExpression *regexp2, 1249 UText *replacementText, 1250 UText *dest, 1251 UErrorCode *status) { 1252 RegularExpression *regexp = (RegularExpression*)regexp2; 1253 if (validateRE(regexp, status) == FALSE) { 1254 return 0; 1255 } 1256 if (replacementText == NULL) { 1257 *status = U_ILLEGAL_ARGUMENT_ERROR; 1258 return 0; 1259 } 1260 1261 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); 1262 return dest; 1263 } 1264 1265 1266 //------------------------------------------------------------------------------ 1267 // 1268 // uregex_appendReplacement 1269 // 1270 //------------------------------------------------------------------------------ 1271 1272 U_NAMESPACE_BEGIN 1273 // 1274 // Dummy class, because these functions need to be friends of class RegexMatcher, 1275 // and stand-alone C functions don't work as friends 1276 // 1277 class RegexCImpl { 1278 public: 1279 inline static int32_t appendReplacement(RegularExpression *regexp, 1280 const UChar *replacementText, 1281 int32_t replacementLength, 1282 UChar **destBuf, 1283 int32_t *destCapacity, 1284 UErrorCode *status); 1285 1286 inline static int32_t appendTail(RegularExpression *regexp, 1287 UChar **destBuf, 1288 int32_t *destCapacity, 1289 UErrorCode *status); 1290 1291 inline static int32_t split(RegularExpression *regexp, 1292 UChar *destBuf, 1293 int32_t destCapacity, 1294 int32_t *requiredCapacity, 1295 UChar *destFields[], 1296 int32_t destFieldsCapacity, 1297 UErrorCode *status); 1298 }; 1299 1300 U_NAMESPACE_END 1301 1302 1303 1304 static const UChar BACKSLASH = 0x5c; 1305 static const UChar DOLLARSIGN = 0x24; 1306 1307 // 1308 // Move a character to an output buffer, with bounds checking on the index. 1309 // Index advances even if capacity is exceeded, for preflight size computations. 1310 // This little sequence is used a LOT. 1311 // 1312 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1313 if (*idx < bufCapacity) { 1314 buf[*idx] = c; 1315 } 1316 (*idx)++; 1317 } 1318 1319 1320 // 1321 // appendReplacement, the actual implementation. 1322 // 1323 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1324 const UChar *replacementText, 1325 int32_t replacementLength, 1326 UChar **destBuf, 1327 int32_t *destCapacity, 1328 UErrorCode *status) { 1329 1330 // If we come in with a buffer overflow error, don't suppress the operation. 1331 // A series of appendReplacements, appendTail need to correctly preflight 1332 // the buffer size when an overflow happens somewhere in the middle. 1333 UBool pendingBufferOverflow = FALSE; 1334 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1335 pendingBufferOverflow = TRUE; 1336 *status = U_ZERO_ERROR; 1337 } 1338 1339 // 1340 // Validate all paramters 1341 // 1342 if (validateRE(regexp, status) == FALSE) { 1343 return 0; 1344 } 1345 if (replacementText == NULL || replacementLength < -1 || 1346 destCapacity == NULL || destBuf == NULL || 1347 (*destBuf == NULL && *destCapacity > 0) || 1348 *destCapacity < 0) { 1349 *status = U_ILLEGAL_ARGUMENT_ERROR; 1350 return 0; 1351 } 1352 1353 RegexMatcher *m = regexp->fMatcher; 1354 if (m->fMatch == FALSE) { 1355 *status = U_REGEX_INVALID_STATE; 1356 return 0; 1357 } 1358 1359 UChar *dest = *destBuf; 1360 int32_t capacity = *destCapacity; 1361 int32_t destIdx = 0; 1362 int32_t i; 1363 1364 // If it wasn't supplied by the caller, get the length of the replacement text. 1365 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1366 // the fly and avoid this step. 1367 if (replacementLength == -1) { 1368 replacementLength = u_strlen(replacementText); 1369 } 1370 1371 // Copy input string from the end of previous match to start of current match 1372 if (regexp->fText != NULL) { 1373 int32_t matchStart; 1374 int32_t lastMatchEnd; 1375 if (UTEXT_USES_U16(m->fInputText)) { 1376 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1377 matchStart = (int32_t)m->fMatchStart; 1378 } else { 1379 // !!!: Would like a better way to do this! 1380 UErrorCode status = U_ZERO_ERROR; 1381 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); 1382 status = U_ZERO_ERROR; 1383 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); 1384 } 1385 for (i=lastMatchEnd; i<matchStart; i++) { 1386 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1387 } 1388 } else { 1389 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1390 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1391 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); 1392 } 1393 1394 1395 // scan the replacement text, looking for substitutions ($n) and \escapes. 1396 int32_t replIdx = 0; 1397 while (replIdx < replacementLength) { 1398 UChar c = replacementText[replIdx]; 1399 replIdx++; 1400 if (c != DOLLARSIGN && c != BACKSLASH) { 1401 // Common case, no substitution, no escaping, 1402 // just copy the char to the dest buf. 1403 appendToBuf(c, &destIdx, dest, capacity); 1404 continue; 1405 } 1406 1407 if (c == BACKSLASH) { 1408 // Backslash Escape. Copy the following char out without further checks. 1409 // Note: Surrogate pairs don't need any special handling 1410 // The second half wont be a '$' or a '\', and 1411 // will move to the dest normally on the next 1412 // loop iteration. 1413 if (replIdx >= replacementLength) { 1414 break; 1415 } 1416 c = replacementText[replIdx]; 1417 1418 if (c==0x55/*U*/ || c==0x75/*u*/) { 1419 // We have a \udddd or \Udddddddd escape sequence. 1420 UChar32 escapedChar = 1421 u_unescapeAt(uregex_ucstr_unescape_charAt, 1422 &replIdx, // Index is updated by unescapeAt 1423 replacementLength, // Length of replacement text 1424 (void *)replacementText); 1425 1426 if (escapedChar != (UChar32)0xFFFFFFFF) { 1427 if (escapedChar <= 0xffff) { 1428 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1429 } else { 1430 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1431 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1432 } 1433 continue; 1434 } 1435 // Note: if the \u escape was invalid, just fall through and 1436 // treat it as a plain \<anything> escape. 1437 } 1438 1439 // Plain backslash escape. Just put out the escaped character. 1440 appendToBuf(c, &destIdx, dest, capacity); 1441 1442 replIdx++; 1443 continue; 1444 } 1445 1446 1447 1448 // We've got a $. Pick up a capture group number if one follows. 1449 // Consume at most the number of digits necessary for the largest capture 1450 // number that is valid for this pattern. 1451 1452 int32_t numDigits = 0; 1453 int32_t groupNum = 0; 1454 UChar32 digitC; 1455 for (;;) { 1456 if (replIdx >= replacementLength) { 1457 break; 1458 } 1459 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); 1460 if (u_isdigit(digitC) == FALSE) { 1461 break; 1462 } 1463 1464 U16_FWD_1(replacementText, replIdx, replacementLength); 1465 groupNum=groupNum*10 + u_charDigitValue(digitC); 1466 numDigits++; 1467 if (numDigits >= m->fPattern->fMaxCaptureDigits) { 1468 break; 1469 } 1470 } 1471 1472 1473 if (numDigits == 0) { 1474 // The $ didn't introduce a group number at all. 1475 // Treat it as just part of the substitution text. 1476 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); 1477 continue; 1478 } 1479 1480 // Finally, append the capture group data to the destination. 1481 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1482 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1483 // Ignore buffer overflow when extracting the group. We need to 1484 // continue on to get full size of the untruncated result. We will 1485 // raise our own buffer overflow error at the end. 1486 *status = U_ZERO_ERROR; 1487 } 1488 1489 if (U_FAILURE(*status)) { 1490 // Can fail if group number is out of range. 1491 break; 1492 } 1493 1494 } 1495 1496 // 1497 // Nul Terminate the dest buffer if possible. 1498 // Set the appropriate buffer overflow or not terminated error, if needed. 1499 // 1500 if (destIdx < capacity) { 1501 dest[destIdx] = 0; 1502 } else if (destIdx == *destCapacity) { 1503 *status = U_STRING_NOT_TERMINATED_WARNING; 1504 } else { 1505 *status = U_BUFFER_OVERFLOW_ERROR; 1506 } 1507 1508 // 1509 // Return an updated dest buffer and capacity to the caller. 1510 // 1511 if (destIdx > 0 && *destCapacity > 0) { 1512 if (destIdx < capacity) { 1513 *destBuf += destIdx; 1514 *destCapacity -= destIdx; 1515 } else { 1516 *destBuf += capacity; 1517 *destCapacity = 0; 1518 } 1519 } 1520 1521 // If we came in with a buffer overflow, make sure we go out with one also. 1522 // (A zero length match right at the end of the previous match could 1523 // make this function succeed even though a previous call had overflowed the buf) 1524 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1525 *status = U_BUFFER_OVERFLOW_ERROR; 1526 } 1527 1528 return destIdx; 1529 } 1530 1531 // 1532 // appendReplacement the actual API function, 1533 // 1534 U_CAPI int32_t U_EXPORT2 1535 uregex_appendReplacement(URegularExpression *regexp2, 1536 const UChar *replacementText, 1537 int32_t replacementLength, 1538 UChar **destBuf, 1539 int32_t *destCapacity, 1540 UErrorCode *status) { 1541 1542 RegularExpression *regexp = (RegularExpression*)regexp2; 1543 return RegexCImpl::appendReplacement( 1544 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1545 } 1546 1547 // 1548 // uregex_appendReplacementUText...can just use the normal C++ method 1549 // 1550 U_CAPI void U_EXPORT2 1551 uregex_appendReplacementUText(URegularExpression *regexp2, 1552 UText *replText, 1553 UText *dest, 1554 UErrorCode *status) { 1555 RegularExpression *regexp = (RegularExpression*)regexp2; 1556 regexp->fMatcher->appendReplacement(dest, replText, *status); 1557 } 1558 1559 1560 //------------------------------------------------------------------------------ 1561 // 1562 // uregex_appendTail 1563 // 1564 //------------------------------------------------------------------------------ 1565 int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1566 UChar **destBuf, 1567 int32_t *destCapacity, 1568 UErrorCode *status) 1569 { 1570 1571 // If we come in with a buffer overflow error, don't suppress the operation. 1572 // A series of appendReplacements, appendTail need to correctly preflight 1573 // the buffer size when an overflow happens somewhere in the middle. 1574 UBool pendingBufferOverflow = FALSE; 1575 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1576 pendingBufferOverflow = TRUE; 1577 *status = U_ZERO_ERROR; 1578 } 1579 1580 if (validateRE(regexp, status) == FALSE) { 1581 return 0; 1582 } 1583 1584 if (destCapacity == NULL || destBuf == NULL || 1585 (*destBuf == NULL && *destCapacity > 0) || 1586 *destCapacity < 0) 1587 { 1588 *status = U_ILLEGAL_ARGUMENT_ERROR; 1589 return 0; 1590 } 1591 1592 RegexMatcher *m = regexp->fMatcher; 1593 1594 int32_t destIdx = 0; 1595 int32_t destCap = *destCapacity; 1596 UChar *dest = *destBuf; 1597 1598 if (regexp->fText != NULL) { 1599 int32_t srcIdx; 1600 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1601 if (nativeIdx == -1) { 1602 srcIdx = 0; 1603 } else if (UTEXT_USES_U16(m->fInputText)) { 1604 srcIdx = (int32_t)nativeIdx; 1605 } else { 1606 UErrorCode status = U_ZERO_ERROR; 1607 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1608 } 1609 1610 for (;;) { 1611 if (srcIdx == regexp->fTextLength) { 1612 break; 1613 } 1614 UChar c = regexp->fText[srcIdx]; 1615 if (c == 0 && regexp->fTextLength == -1) { 1616 regexp->fTextLength = srcIdx; 1617 break; 1618 } 1619 if (destIdx < destCap) { 1620 dest[destIdx] = c; 1621 } else { 1622 // We've overflowed the dest buffer. 1623 // If the total input string length is known, we can 1624 // compute the total buffer size needed without scanning through the string. 1625 if (regexp->fTextLength > 0) { 1626 destIdx += (regexp->fTextLength - srcIdx); 1627 break; 1628 } 1629 } 1630 srcIdx++; 1631 destIdx++; 1632 } 1633 } else { 1634 int64_t srcIdx; 1635 if (m->fMatch) { 1636 // The most recent call to find() succeeded. 1637 srcIdx = m->fMatchEnd; 1638 } else { 1639 // The last call to find() on this matcher failed(). 1640 // Look back to the end of the last find() that succeeded for src index. 1641 srcIdx = m->fLastMatchEnd; 1642 if (srcIdx == -1) { 1643 // There has been no successful match with this matcher. 1644 // We want to copy the whole string. 1645 srcIdx = 0; 1646 } 1647 } 1648 1649 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1650 } 1651 1652 // 1653 // NUL terminate the output string, if possible, otherwise issue the 1654 // appropriate error or warning. 1655 // 1656 if (destIdx < destCap) { 1657 dest[destIdx] = 0; 1658 } else if (destIdx == destCap) { 1659 *status = U_STRING_NOT_TERMINATED_WARNING; 1660 } else { 1661 *status = U_BUFFER_OVERFLOW_ERROR; 1662 } 1663 1664 // 1665 // Update the user's buffer ptr and capacity vars to reflect the 1666 // amount used. 1667 // 1668 if (destIdx < destCap) { 1669 *destBuf += destIdx; 1670 *destCapacity -= destIdx; 1671 } else { 1672 *destBuf += destCap; 1673 *destCapacity = 0; 1674 } 1675 1676 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1677 *status = U_BUFFER_OVERFLOW_ERROR; 1678 } 1679 1680 return destIdx; 1681 } 1682 1683 1684 // 1685 // appendTail the actual API function 1686 // 1687 U_CAPI int32_t U_EXPORT2 1688 uregex_appendTail(URegularExpression *regexp2, 1689 UChar **destBuf, 1690 int32_t *destCapacity, 1691 UErrorCode *status) { 1692 RegularExpression *regexp = (RegularExpression*)regexp2; 1693 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1694 } 1695 1696 1697 // 1698 // uregex_appendTailUText...can just use the normal C++ method 1699 // 1700 U_CAPI UText * U_EXPORT2 1701 uregex_appendTailUText(URegularExpression *regexp2, 1702 UText *dest, 1703 UErrorCode *status) { 1704 RegularExpression *regexp = (RegularExpression*)regexp2; 1705 return regexp->fMatcher->appendTail(dest, *status); 1706 } 1707 1708 1709 //------------------------------------------------------------------------------ 1710 // 1711 // copyString Internal utility to copy a string to an output buffer, 1712 // while managing buffer overflow and preflight size 1713 // computation. NUL termination is added to destination, 1714 // and the NUL is counted in the output size. 1715 // 1716 //------------------------------------------------------------------------------ 1717 #if 0 1718 static void copyString(UChar *destBuffer, // Destination buffer. 1719 int32_t destCapacity, // Total capacity of dest buffer 1720 int32_t *destIndex, // Index into dest buffer. Updated on return. 1721 // Update not clipped to destCapacity. 1722 const UChar *srcPtr, // Pointer to source string 1723 int32_t srcLen) // Source string len. 1724 { 1725 int32_t si; 1726 int32_t di = *destIndex; 1727 UChar c; 1728 1729 for (si=0; si<srcLen; si++) { 1730 c = srcPtr[si]; 1731 if (di < destCapacity) { 1732 destBuffer[di] = c; 1733 di++; 1734 } else { 1735 di += srcLen - si; 1736 break; 1737 } 1738 } 1739 if (di<destCapacity) { 1740 destBuffer[di] = 0; 1741 } 1742 di++; 1743 *destIndex = di; 1744 } 1745 #endif 1746 1747 //------------------------------------------------------------------------------ 1748 // 1749 // uregex_split 1750 // 1751 //------------------------------------------------------------------------------ 1752 int32_t RegexCImpl::split(RegularExpression *regexp, 1753 UChar *destBuf, 1754 int32_t destCapacity, 1755 int32_t *requiredCapacity, 1756 UChar *destFields[], 1757 int32_t destFieldsCapacity, 1758 UErrorCode *status) { 1759 // 1760 // Reset for the input text 1761 // 1762 regexp->fMatcher->reset(); 1763 UText *inputText = regexp->fMatcher->fInputText; 1764 int64_t nextOutputStringStart = 0; 1765 int64_t inputLen = regexp->fMatcher->fInputLength; 1766 if (inputLen == 0) { 1767 return 0; 1768 } 1769 1770 // 1771 // Loop through the input text, searching for the delimiter pattern 1772 // 1773 int32_t i; // Index of the field being processed. 1774 int32_t destIdx = 0; // Next available position in destBuf; 1775 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1776 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1777 for (i=0; ; i++) { 1778 if (i>=destFieldsCapacity-1) { 1779 // There are one or zero output strings left. 1780 // Fill the last output string with whatever is left from the input, then exit the loop. 1781 // ( i will be == destFieldsCapacity if we filled the output array while processing 1782 // capture groups of the delimiter expression, in which case we will discard the 1783 // last capture group saved in favor of the unprocessed remainder of the 1784 // input string.) 1785 if (inputLen > nextOutputStringStart) { 1786 if (i != destFieldsCapacity-1) { 1787 // No fields are left. Recycle the last one for holding the trailing part of 1788 // the input string. 1789 i = destFieldsCapacity-1; 1790 destIdx = (int32_t)(destFields[i] - destFields[0]); 1791 } 1792 1793 destFields[i] = &destBuf[destIdx]; 1794 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1795 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1796 } 1797 break; 1798 } 1799 1800 if (regexp->fMatcher->find()) { 1801 // We found another delimiter. Move everything from where we started looking 1802 // up until the start of the delimiter into the next output string. 1803 destFields[i] = &destBuf[destIdx]; 1804 1805 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1806 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1807 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1808 tStatus = U_ZERO_ERROR; 1809 } else { 1810 *status = tStatus; 1811 } 1812 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1813 1814 // If the delimiter pattern has capturing parentheses, the captured 1815 // text goes out into the next n destination strings. 1816 int32_t groupNum; 1817 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1818 // If we've run out of output string slots, bail out. 1819 if (i==destFieldsCapacity-1) { 1820 break; 1821 } 1822 i++; 1823 1824 // Set up to extract the capture group contents into the dest buffer. 1825 destFields[i] = &destBuf[destIdx]; 1826 tStatus = U_ZERO_ERROR; 1827 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1828 destIdx += t + 1; // Record the space used in the output string buffer. 1829 // +1 for the NUL that terminates the string. 1830 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1831 tStatus = U_ZERO_ERROR; 1832 } else { 1833 *status = tStatus; 1834 } 1835 } 1836 1837 if (nextOutputStringStart == inputLen) { 1838 // The delimiter was at the end of the string. We're done. 1839 break; 1840 } 1841 1842 } 1843 else 1844 { 1845 // We ran off the end of the input while looking for the next delimiter. 1846 // All the remaining text goes into the current output string. 1847 destFields[i] = &destBuf[destIdx]; 1848 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1849 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1850 break; 1851 } 1852 } 1853 1854 // Zero out any unused portion of the destFields array 1855 int j; 1856 for (j=i+1; j<destFieldsCapacity; j++) { 1857 destFields[j] = NULL; 1858 } 1859 1860 if (requiredCapacity != NULL) { 1861 *requiredCapacity = destIdx; 1862 } 1863 if (destIdx > destCapacity) { 1864 *status = U_BUFFER_OVERFLOW_ERROR; 1865 } 1866 return i+1; 1867 } 1868 1869 // 1870 // uregex_split The actual API function 1871 // 1872 U_CAPI int32_t U_EXPORT2 1873 uregex_split(URegularExpression *regexp2, 1874 UChar *destBuf, 1875 int32_t destCapacity, 1876 int32_t *requiredCapacity, 1877 UChar *destFields[], 1878 int32_t destFieldsCapacity, 1879 UErrorCode *status) { 1880 RegularExpression *regexp = (RegularExpression*)regexp2; 1881 if (validateRE(regexp, status) == FALSE) { 1882 return 0; 1883 } 1884 if ((destBuf == NULL && destCapacity > 0) || 1885 destCapacity < 0 || 1886 destFields == NULL || 1887 destFieldsCapacity < 1 ) { 1888 *status = U_ILLEGAL_ARGUMENT_ERROR; 1889 return 0; 1890 } 1891 1892 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1893 } 1894 1895 1896 // 1897 // uregex_splitUText...can just use the normal C++ method 1898 // 1899 U_CAPI int32_t U_EXPORT2 1900 uregex_splitUText(URegularExpression *regexp2, 1901 UText *destFields[], 1902 int32_t destFieldsCapacity, 1903 UErrorCode *status) { 1904 RegularExpression *regexp = (RegularExpression*)regexp2; 1905 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1906 } 1907 1908 1909 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1910 1911