1 /* 2 ******************************************************************************* 3 * Copyright (C) 2004-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: regex.cpp 7 */ 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 12 13 #include "unicode/regex.h" 14 #include "unicode/uregex.h" 15 #include "unicode/unistr.h" 16 #include "unicode/ustring.h" 17 #include "unicode/uchar.h" 18 #include "unicode/uobject.h" 19 #include "umutex.h" 20 #include "uassert.h" 21 #include "cmemory.h" 22 23 #include "regextxt.h" 24 25 #include <stdio.h> 26 27 U_NAMESPACE_BEGIN 28 29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) 30 31 struct RegularExpression: public UMemory { 32 public: 33 RegularExpression(); 34 ~RegularExpression(); 35 int32_t fMagic; 36 RegexPattern *fPat; 37 int32_t *fPatRefCount; 38 UChar *fPatString; 39 int32_t fPatStringLen; 40 RegexMatcher *fMatcher; 41 const UChar *fText; // Text from setText() 42 int32_t fTextLength; // Length provided by user with setText(), which 43 // may be -1. 44 UBool fOwnsText; 45 }; 46 47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII 48 49 RegularExpression::RegularExpression() { 50 fMagic = REXP_MAGIC; 51 fPat = NULL; 52 fPatRefCount = NULL; 53 fPatString = NULL; 54 fPatStringLen = 0; 55 fMatcher = NULL; 56 fText = NULL; 57 fTextLength = 0; 58 fOwnsText = FALSE; 59 } 60 61 RegularExpression::~RegularExpression() { 62 delete fMatcher; 63 fMatcher = NULL; 64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { 65 delete fPat; 66 uprv_free(fPatString); 67 uprv_free(fPatRefCount); 68 } 69 if (fOwnsText && fText!=NULL) { 70 uprv_free((void *)fText); 71 } 72 fMagic = 0; 73 } 74 75 U_NAMESPACE_END 76 77 U_NAMESPACE_USE 78 79 //---------------------------------------------------------------------------------------- 80 // 81 // validateRE Do boilerplate style checks on API function parameters. 82 // Return TRUE if they look OK. 83 //---------------------------------------------------------------------------------------- 84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { 85 if (U_FAILURE(*status)) { 86 return FALSE; 87 } 88 if (re == NULL || re->fMagic != REXP_MAGIC) { 89 *status = U_ILLEGAL_ARGUMENT_ERROR; 90 return FALSE; 91 } 92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway 93 if (requiresText && re->fText == NULL && !re->fOwnsText) { 94 *status = U_REGEX_INVALID_STATE; 95 return FALSE; 96 } 97 return TRUE; 98 } 99 100 //---------------------------------------------------------------------------------------- 101 // 102 // uregex_open 103 // 104 //---------------------------------------------------------------------------------------- 105 U_CAPI URegularExpression * U_EXPORT2 106 uregex_open( const UChar *pattern, 107 int32_t patternLength, 108 uint32_t flags, 109 UParseError *pe, 110 UErrorCode *status) { 111 112 if (U_FAILURE(*status)) { 113 return NULL; 114 } 115 if (pattern == NULL || patternLength < -1 || patternLength == 0) { 116 *status = U_ILLEGAL_ARGUMENT_ERROR; 117 return NULL; 118 } 119 int32_t actualPatLen = patternLength; 120 if (actualPatLen == -1) { 121 actualPatLen = u_strlen(pattern); 122 } 123 124 RegularExpression *re = new RegularExpression; 125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); 127 if (re == NULL || refC == NULL || patBuf == NULL) { 128 *status = U_MEMORY_ALLOCATION_ERROR; 129 delete re; 130 uprv_free(refC); 131 uprv_free(patBuf); 132 return NULL; 133 } 134 re->fPatRefCount = refC; 135 *re->fPatRefCount = 1; 136 137 // 138 // Make a copy of the pattern string, so we can return it later if asked. 139 // For compiling the pattern, we will use a UText wrapper around 140 // this local copy, to avoid making even more copies. 141 // 142 re->fPatString = patBuf; 143 re->fPatStringLen = patternLength; 144 u_memcpy(patBuf, pattern, actualPatLen); 145 patBuf[actualPatLen] = 0; 146 147 UText patText = UTEXT_INITIALIZER; 148 utext_openUChars(&patText, patBuf, patternLength, status); 149 150 // 151 // Compile the pattern 152 // 153 if (pe != NULL) { 154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 155 } else { 156 re->fPat = RegexPattern::compile(&patText, flags, *status); 157 } 158 utext_close(&patText); 159 160 if (U_FAILURE(*status)) { 161 goto ErrorExit; 162 } 163 164 // 165 // Create the matcher object 166 // 167 re->fMatcher = re->fPat->matcher(*status); 168 if (U_SUCCESS(*status)) { 169 return (URegularExpression*)re; 170 } 171 172 ErrorExit: 173 delete re; 174 return NULL; 175 176 } 177 178 //---------------------------------------------------------------------------------------- 179 // 180 // uregex_openUText 181 // 182 //---------------------------------------------------------------------------------------- 183 U_CAPI URegularExpression * U_EXPORT2 184 uregex_openUText(UText *pattern, 185 uint32_t flags, 186 UParseError *pe, 187 UErrorCode *status) { 188 189 if (U_FAILURE(*status)) { 190 return NULL; 191 } 192 if (pattern == NULL) { 193 *status = U_ILLEGAL_ARGUMENT_ERROR; 194 return NULL; 195 } 196 197 int64_t patternNativeLength = utext_nativeLength(pattern); 198 199 if (patternNativeLength == 0) { 200 *status = U_ILLEGAL_ARGUMENT_ERROR; 201 return NULL; 202 } 203 204 RegularExpression *re = new RegularExpression; 205 206 UErrorCode lengthStatus = U_ZERO_ERROR; 207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); 208 209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); 210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); 211 if (re == NULL || refC == NULL || patBuf == NULL) { 212 *status = U_MEMORY_ALLOCATION_ERROR; 213 delete re; 214 uprv_free(refC); 215 uprv_free(patBuf); 216 return NULL; 217 } 218 re->fPatRefCount = refC; 219 *re->fPatRefCount = 1; 220 221 // 222 // Make a copy of the pattern string, so we can return it later if asked. 223 // For compiling the pattern, we will use a read-only UText wrapper 224 // around this local copy, to avoid making even more copies. 225 // 226 re->fPatString = patBuf; 227 re->fPatStringLen = pattern16Length; 228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); 229 230 UText patText = UTEXT_INITIALIZER; 231 utext_openUChars(&patText, patBuf, pattern16Length, status); 232 233 // 234 // Compile the pattern 235 // 236 if (pe != NULL) { 237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); 238 } else { 239 re->fPat = RegexPattern::compile(&patText, flags, *status); 240 } 241 utext_close(&patText); 242 243 if (U_FAILURE(*status)) { 244 goto ErrorExit; 245 } 246 247 // 248 // Create the matcher object 249 // 250 re->fMatcher = re->fPat->matcher(*status); 251 if (U_SUCCESS(*status)) { 252 return (URegularExpression*)re; 253 } 254 255 ErrorExit: 256 delete re; 257 return NULL; 258 259 } 260 261 //---------------------------------------------------------------------------------------- 262 // 263 // uregex_close 264 // 265 //---------------------------------------------------------------------------------------- 266 U_CAPI void U_EXPORT2 267 uregex_close(URegularExpression *re2) { 268 RegularExpression *re = (RegularExpression*)re2; 269 UErrorCode status = U_ZERO_ERROR; 270 if (validateRE(re, &status, FALSE) == FALSE) { 271 return; 272 } 273 delete re; 274 } 275 276 277 //---------------------------------------------------------------------------------------- 278 // 279 // uregex_clone 280 // 281 //---------------------------------------------------------------------------------------- 282 U_CAPI URegularExpression * U_EXPORT2 283 uregex_clone(const URegularExpression *source2, UErrorCode *status) { 284 RegularExpression *source = (RegularExpression*)source2; 285 if (validateRE(source, status, FALSE) == FALSE) { 286 return NULL; 287 } 288 289 RegularExpression *clone = new RegularExpression; 290 if (clone == NULL) { 291 *status = U_MEMORY_ALLOCATION_ERROR; 292 return NULL; 293 } 294 295 clone->fMatcher = source->fPat->matcher(*status); 296 if (U_FAILURE(*status)) { 297 delete clone; 298 return NULL; 299 } 300 301 clone->fPat = source->fPat; 302 clone->fPatRefCount = source->fPatRefCount; 303 clone->fPatString = source->fPatString; 304 clone->fPatStringLen = source->fPatStringLen; 305 umtx_atomic_inc(source->fPatRefCount); 306 // Note: fText is not cloned. 307 308 return (URegularExpression*)clone; 309 } 310 311 312 313 314 //------------------------------------------------------------------------------ 315 // 316 // uregex_pattern 317 // 318 //------------------------------------------------------------------------------ 319 U_CAPI const UChar * U_EXPORT2 320 uregex_pattern(const URegularExpression *regexp2, 321 int32_t *patLength, 322 UErrorCode *status) { 323 RegularExpression *regexp = (RegularExpression*)regexp2; 324 325 if (validateRE(regexp, status, FALSE) == FALSE) { 326 return NULL; 327 } 328 if (patLength != NULL) { 329 *patLength = regexp->fPatStringLen; 330 } 331 return regexp->fPatString; 332 } 333 334 335 //------------------------------------------------------------------------------ 336 // 337 // uregex_patternUText 338 // 339 //------------------------------------------------------------------------------ 340 U_CAPI UText * U_EXPORT2 341 uregex_patternUText(const URegularExpression *regexp2, 342 UErrorCode *status) { 343 RegularExpression *regexp = (RegularExpression*)regexp2; 344 return regexp->fPat->patternText(*status); 345 } 346 347 348 //------------------------------------------------------------------------------ 349 // 350 // uregex_flags 351 // 352 //------------------------------------------------------------------------------ 353 U_CAPI int32_t U_EXPORT2 354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { 355 RegularExpression *regexp = (RegularExpression*)regexp2; 356 if (validateRE(regexp, status, FALSE) == FALSE) { 357 return 0; 358 } 359 int32_t flags = regexp->fPat->flags(); 360 return flags; 361 } 362 363 364 //------------------------------------------------------------------------------ 365 // 366 // uregex_setText 367 // 368 //------------------------------------------------------------------------------ 369 U_CAPI void U_EXPORT2 370 uregex_setText(URegularExpression *regexp2, 371 const UChar *text, 372 int32_t textLength, 373 UErrorCode *status) { 374 RegularExpression *regexp = (RegularExpression*)regexp2; 375 if (validateRE(regexp, status, FALSE) == FALSE) { 376 return; 377 } 378 if (text == NULL || textLength < -1) { 379 *status = U_ILLEGAL_ARGUMENT_ERROR; 380 return; 381 } 382 383 if (regexp->fOwnsText && regexp->fText != NULL) { 384 uprv_free((void *)regexp->fText); 385 } 386 387 regexp->fText = text; 388 regexp->fTextLength = textLength; 389 regexp->fOwnsText = FALSE; 390 391 UText input = UTEXT_INITIALIZER; 392 utext_openUChars(&input, text, textLength, status); 393 regexp->fMatcher->reset(&input); 394 utext_close(&input); // reset() made a shallow clone, so we don't need this copy 395 } 396 397 398 //------------------------------------------------------------------------------ 399 // 400 // uregex_setUText 401 // 402 //------------------------------------------------------------------------------ 403 U_CAPI void U_EXPORT2 404 uregex_setUText(URegularExpression *regexp2, 405 UText *text, 406 UErrorCode *status) { 407 RegularExpression *regexp = (RegularExpression*)regexp2; 408 if (validateRE(regexp, status, FALSE) == FALSE) { 409 return; 410 } 411 if (text == NULL) { 412 *status = U_ILLEGAL_ARGUMENT_ERROR; 413 return; 414 } 415 416 if (regexp->fOwnsText && regexp->fText != NULL) { 417 uprv_free((void *)regexp->fText); 418 } 419 420 regexp->fText = NULL; // only fill it in on request 421 regexp->fTextLength = -1; 422 regexp->fOwnsText = TRUE; 423 regexp->fMatcher->reset(text); 424 } 425 426 427 428 //------------------------------------------------------------------------------ 429 // 430 // uregex_getText 431 // 432 //------------------------------------------------------------------------------ 433 U_CAPI const UChar * U_EXPORT2 434 uregex_getText(URegularExpression *regexp2, 435 int32_t *textLength, 436 UErrorCode *status) { 437 RegularExpression *regexp = (RegularExpression*)regexp2; 438 if (validateRE(regexp, status, FALSE) == FALSE) { 439 return NULL; 440 } 441 442 if (regexp->fText == NULL) { 443 // need to fill in the text 444 UText *inputText = regexp->fMatcher->inputText(); 445 int64_t inputNativeLength = utext_nativeLength(inputText); 446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { 447 regexp->fText = inputText->chunkContents; 448 regexp->fTextLength = (int32_t)inputNativeLength; 449 regexp->fOwnsText = FALSE; // because the UText owns it 450 } else { 451 UErrorCode lengthStatus = U_ZERO_ERROR; 452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error 453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); 454 455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); 456 regexp->fText = inputChars; 457 regexp->fOwnsText = TRUE; // should already be set but just in case 458 } 459 } 460 461 if (textLength != NULL) { 462 *textLength = regexp->fTextLength; 463 } 464 return regexp->fText; 465 } 466 467 468 //------------------------------------------------------------------------------ 469 // 470 // uregex_getUText 471 // 472 //------------------------------------------------------------------------------ 473 U_CAPI UText * U_EXPORT2 474 uregex_getUText(URegularExpression *regexp2, 475 UText *dest, 476 UErrorCode *status) { 477 RegularExpression *regexp = (RegularExpression*)regexp2; 478 if (validateRE(regexp, status, FALSE) == FALSE) { 479 return dest; 480 } 481 return regexp->fMatcher->getInput(dest, *status); 482 } 483 484 // BEGIN android-added 485 // Removed this function after Android upgrade to ICU4.8. 486 //------------------------------------------------------------------------------ 487 // 488 // uregex_refreshUText 489 // 490 //------------------------------------------------------------------------------ 491 U_CAPI void U_EXPORT2 492 uregex_refreshUText(URegularExpression *regexp2, 493 UText *text, 494 UErrorCode *status) { 495 RegularExpression *regexp = (RegularExpression*)regexp2; 496 if (validateRE(regexp, status, FALSE) == FALSE) { 497 return; 498 } 499 regexp->fMatcher->refreshInputText(text, *status); 500 } 501 // END android-added 502 503 //------------------------------------------------------------------------------ 504 // 505 // uregex_matches 506 // 507 //------------------------------------------------------------------------------ 508 U_CAPI UBool U_EXPORT2 509 uregex_matches(URegularExpression *regexp2, 510 int32_t startIndex, 511 UErrorCode *status) { 512 return uregex_matches64( regexp2, (int64_t)startIndex, status); 513 } 514 515 U_CAPI UBool U_EXPORT2 516 uregex_matches64(URegularExpression *regexp2, 517 int64_t startIndex, 518 UErrorCode *status) { 519 RegularExpression *regexp = (RegularExpression*)regexp2; 520 UBool result = FALSE; 521 if (validateRE(regexp, status) == FALSE) { 522 return result; 523 } 524 if (startIndex == -1) { 525 result = regexp->fMatcher->matches(*status); 526 } else { 527 result = regexp->fMatcher->matches(startIndex, *status); 528 } 529 return result; 530 } 531 532 533 //------------------------------------------------------------------------------ 534 // 535 // uregex_lookingAt 536 // 537 //------------------------------------------------------------------------------ 538 U_CAPI UBool U_EXPORT2 539 uregex_lookingAt(URegularExpression *regexp2, 540 int32_t startIndex, 541 UErrorCode *status) { 542 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); 543 } 544 545 U_CAPI UBool U_EXPORT2 546 uregex_lookingAt64(URegularExpression *regexp2, 547 int64_t startIndex, 548 UErrorCode *status) { 549 RegularExpression *regexp = (RegularExpression*)regexp2; 550 UBool result = FALSE; 551 if (validateRE(regexp, status) == FALSE) { 552 return result; 553 } 554 if (startIndex == -1) { 555 result = regexp->fMatcher->lookingAt(*status); 556 } else { 557 result = regexp->fMatcher->lookingAt(startIndex, *status); 558 } 559 return result; 560 } 561 562 563 564 //------------------------------------------------------------------------------ 565 // 566 // uregex_find 567 // 568 //------------------------------------------------------------------------------ 569 U_CAPI UBool U_EXPORT2 570 uregex_find(URegularExpression *regexp2, 571 int32_t startIndex, 572 UErrorCode *status) { 573 return uregex_find64( regexp2, (int64_t)startIndex, status); 574 } 575 576 U_CAPI UBool U_EXPORT2 577 uregex_find64(URegularExpression *regexp2, 578 int64_t startIndex, 579 UErrorCode *status) { 580 RegularExpression *regexp = (RegularExpression*)regexp2; 581 UBool result = FALSE; 582 if (validateRE(regexp, status) == FALSE) { 583 return result; 584 } 585 if (startIndex == -1) { 586 regexp->fMatcher->resetPreserveRegion(); 587 result = regexp->fMatcher->find(); 588 } else { 589 result = regexp->fMatcher->find(startIndex, *status); 590 } 591 return result; 592 } 593 594 595 //------------------------------------------------------------------------------ 596 // 597 // uregex_findNext 598 // 599 //------------------------------------------------------------------------------ 600 U_CAPI UBool U_EXPORT2 601 uregex_findNext(URegularExpression *regexp2, 602 UErrorCode *status) { 603 RegularExpression *regexp = (RegularExpression*)regexp2; 604 if (validateRE(regexp, status) == FALSE) { 605 return FALSE; 606 } 607 UBool result = regexp->fMatcher->find(); 608 return result; 609 } 610 611 //------------------------------------------------------------------------------ 612 // 613 // uregex_groupCount 614 // 615 //------------------------------------------------------------------------------ 616 U_CAPI int32_t U_EXPORT2 617 uregex_groupCount(URegularExpression *regexp2, 618 UErrorCode *status) { 619 RegularExpression *regexp = (RegularExpression*)regexp2; 620 if (validateRE(regexp, status, FALSE) == FALSE) { 621 return 0; 622 } 623 int32_t result = regexp->fMatcher->groupCount(); 624 return result; 625 } 626 627 628 //------------------------------------------------------------------------------ 629 // 630 // uregex_group 631 // 632 //------------------------------------------------------------------------------ 633 U_CAPI int32_t U_EXPORT2 634 uregex_group(URegularExpression *regexp2, 635 int32_t groupNum, 636 UChar *dest, 637 int32_t destCapacity, 638 UErrorCode *status) { 639 RegularExpression *regexp = (RegularExpression*)regexp2; 640 if (validateRE(regexp, status) == FALSE) { 641 return 0; 642 } 643 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 644 *status = U_ILLEGAL_ARGUMENT_ERROR; 645 return 0; 646 } 647 648 if (destCapacity == 0 || regexp->fText != NULL) { 649 // If preflighting or if we already have the text as UChars, 650 // this is a little cheaper than going through uregex_groupUTextDeep() 651 652 // 653 // Pick up the range of characters from the matcher 654 // 655 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 656 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 657 if (U_FAILURE(*status)) { 658 return 0; 659 } 660 661 // 662 // Trim length based on buffer capacity 663 // 664 int32_t fullLength = endIx - startIx; 665 int32_t copyLength = fullLength; 666 if (copyLength < destCapacity) { 667 dest[copyLength] = 0; 668 } else if (copyLength == destCapacity) { 669 *status = U_STRING_NOT_TERMINATED_WARNING; 670 } else { 671 copyLength = destCapacity; 672 *status = U_BUFFER_OVERFLOW_ERROR; 673 } 674 675 // 676 // Copy capture group to user's buffer 677 // 678 if (copyLength > 0) { 679 u_memcpy(dest, ®exp->fText[startIx], copyLength); 680 } 681 return fullLength; 682 } else { 683 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); 684 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); 685 utext_close(groupText); 686 return result; 687 } 688 } 689 690 691 //------------------------------------------------------------------------------ 692 // 693 // uregex_groupUText 694 // 695 //------------------------------------------------------------------------------ 696 U_CAPI UText * U_EXPORT2 697 uregex_groupUText(URegularExpression *regexp2, 698 int32_t groupNum, 699 UText *dest, 700 int64_t *groupLength, 701 UErrorCode *status) { 702 RegularExpression *regexp = (RegularExpression*)regexp2; 703 if (validateRE(regexp, status) == FALSE) { 704 UErrorCode emptyTextStatus = U_ZERO_ERROR; 705 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 706 } 707 708 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); 709 } 710 711 //------------------------------------------------------------------------------ 712 // 713 // uregex_groupUTextDeep 714 // 715 //------------------------------------------------------------------------------ 716 U_CAPI UText * U_EXPORT2 717 uregex_groupUTextDeep(URegularExpression *regexp2, 718 int32_t groupNum, 719 UText *dest, 720 UErrorCode *status) { 721 RegularExpression *regexp = (RegularExpression*)regexp2; 722 if (validateRE(regexp, status) == FALSE) { 723 UErrorCode emptyTextStatus = U_ZERO_ERROR; 724 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 725 } 726 727 if (regexp->fText != NULL) { 728 // 729 // Pick up the range of characters from the matcher 730 // and use our already-extracted characters 731 // 732 int32_t startIx = regexp->fMatcher->start(groupNum, *status); 733 int32_t endIx = regexp->fMatcher->end (groupNum, *status); 734 if (U_FAILURE(*status)) { 735 UErrorCode emptyTextStatus = U_ZERO_ERROR; 736 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); 737 } 738 739 if (dest) { 740 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); 741 } else { 742 UText groupText = UTEXT_INITIALIZER; 743 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); 744 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); 745 utext_close(&groupText); 746 } 747 748 return dest; 749 } else { 750 return regexp->fMatcher->group(groupNum, dest, *status); 751 } 752 } 753 754 //------------------------------------------------------------------------------ 755 // 756 // uregex_start 757 // 758 //------------------------------------------------------------------------------ 759 U_CAPI int32_t U_EXPORT2 760 uregex_start(URegularExpression *regexp2, 761 int32_t groupNum, 762 UErrorCode *status) { 763 return (int32_t)uregex_start64( regexp2, groupNum, status); 764 } 765 766 U_CAPI int64_t U_EXPORT2 767 uregex_start64(URegularExpression *regexp2, 768 int32_t groupNum, 769 UErrorCode *status) { 770 RegularExpression *regexp = (RegularExpression*)regexp2; 771 if (validateRE(regexp, status) == FALSE) { 772 return 0; 773 } 774 int32_t result = regexp->fMatcher->start(groupNum, *status); 775 return result; 776 } 777 778 //------------------------------------------------------------------------------ 779 // 780 // uregex_end 781 // 782 //------------------------------------------------------------------------------ 783 U_CAPI int32_t U_EXPORT2 784 uregex_end(URegularExpression *regexp2, 785 int32_t groupNum, 786 UErrorCode *status) { 787 return (int32_t)uregex_end64( regexp2, groupNum, status); 788 } 789 790 U_CAPI int64_t U_EXPORT2 791 uregex_end64(URegularExpression *regexp2, 792 int32_t groupNum, 793 UErrorCode *status) { 794 RegularExpression *regexp = (RegularExpression*)regexp2; 795 if (validateRE(regexp, status) == FALSE) { 796 return 0; 797 } 798 int32_t result = regexp->fMatcher->end(groupNum, *status); 799 return result; 800 } 801 802 //------------------------------------------------------------------------------ 803 // 804 // uregex_reset 805 // 806 //------------------------------------------------------------------------------ 807 U_CAPI void U_EXPORT2 808 uregex_reset(URegularExpression *regexp2, 809 int32_t index, 810 UErrorCode *status) { 811 uregex_reset64( regexp2, (int64_t)index, status); 812 } 813 814 U_CAPI void U_EXPORT2 815 uregex_reset64(URegularExpression *regexp2, 816 int64_t index, 817 UErrorCode *status) { 818 RegularExpression *regexp = (RegularExpression*)regexp2; 819 if (validateRE(regexp, status) == FALSE) { 820 return; 821 } 822 regexp->fMatcher->reset(index, *status); 823 } 824 825 826 //------------------------------------------------------------------------------ 827 // 828 // uregex_setRegion 829 // 830 //------------------------------------------------------------------------------ 831 U_CAPI void U_EXPORT2 832 uregex_setRegion(URegularExpression *regexp2, 833 int32_t regionStart, 834 int32_t regionLimit, 835 UErrorCode *status) { 836 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); 837 } 838 839 U_CAPI void U_EXPORT2 840 uregex_setRegion64(URegularExpression *regexp2, 841 int64_t regionStart, 842 int64_t regionLimit, 843 UErrorCode *status) { 844 RegularExpression *regexp = (RegularExpression*)regexp2; 845 if (validateRE(regexp, status) == FALSE) { 846 return; 847 } 848 regexp->fMatcher->region(regionStart, regionLimit, *status); 849 } 850 851 852 //------------------------------------------------------------------------------ 853 // 854 // uregex_setRegionAndStart 855 // 856 //------------------------------------------------------------------------------ 857 U_DRAFT void U_EXPORT2 858 uregex_setRegionAndStart(URegularExpression *regexp2, 859 int64_t regionStart, 860 int64_t regionLimit, 861 int64_t startIndex, 862 UErrorCode *status) { 863 RegularExpression *regexp = (RegularExpression*)regexp2; 864 if (validateRE(regexp, status) == FALSE) { 865 return; 866 } 867 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); 868 } 869 870 //------------------------------------------------------------------------------ 871 // 872 // uregex_regionStart 873 // 874 //------------------------------------------------------------------------------ 875 U_CAPI int32_t U_EXPORT2 876 uregex_regionStart(const URegularExpression *regexp2, 877 UErrorCode *status) { 878 return (int32_t)uregex_regionStart64(regexp2, status); 879 } 880 881 U_CAPI int64_t U_EXPORT2 882 uregex_regionStart64(const URegularExpression *regexp2, 883 UErrorCode *status) { 884 RegularExpression *regexp = (RegularExpression*)regexp2; 885 if (validateRE(regexp, status) == FALSE) { 886 return 0; 887 } 888 return regexp->fMatcher->regionStart(); 889 } 890 891 892 //------------------------------------------------------------------------------ 893 // 894 // uregex_regionEnd 895 // 896 //------------------------------------------------------------------------------ 897 U_CAPI int32_t U_EXPORT2 898 uregex_regionEnd(const URegularExpression *regexp2, 899 UErrorCode *status) { 900 return (int32_t)uregex_regionEnd64(regexp2, status); 901 } 902 903 U_CAPI int64_t U_EXPORT2 904 uregex_regionEnd64(const URegularExpression *regexp2, 905 UErrorCode *status) { 906 RegularExpression *regexp = (RegularExpression*)regexp2; 907 if (validateRE(regexp, status) == FALSE) { 908 return 0; 909 } 910 return regexp->fMatcher->regionEnd(); 911 } 912 913 914 //------------------------------------------------------------------------------ 915 // 916 // uregex_hasTransparentBounds 917 // 918 //------------------------------------------------------------------------------ 919 U_CAPI UBool U_EXPORT2 920 uregex_hasTransparentBounds(const URegularExpression *regexp2, 921 UErrorCode *status) { 922 RegularExpression *regexp = (RegularExpression*)regexp2; 923 if (validateRE(regexp, status) == FALSE) { 924 return FALSE; 925 } 926 return regexp->fMatcher->hasTransparentBounds(); 927 } 928 929 930 //------------------------------------------------------------------------------ 931 // 932 // uregex_useTransparentBounds 933 // 934 //------------------------------------------------------------------------------ 935 U_CAPI void U_EXPORT2 936 uregex_useTransparentBounds(URegularExpression *regexp2, 937 UBool b, 938 UErrorCode *status) { 939 RegularExpression *regexp = (RegularExpression*)regexp2; 940 if (validateRE(regexp, status) == FALSE) { 941 return; 942 } 943 regexp->fMatcher->useTransparentBounds(b); 944 } 945 946 947 //------------------------------------------------------------------------------ 948 // 949 // uregex_hasAnchoringBounds 950 // 951 //------------------------------------------------------------------------------ 952 U_CAPI UBool U_EXPORT2 953 uregex_hasAnchoringBounds(const URegularExpression *regexp2, 954 UErrorCode *status) { 955 RegularExpression *regexp = (RegularExpression*)regexp2; 956 if (validateRE(regexp, status) == FALSE) { 957 return FALSE; 958 } 959 return regexp->fMatcher->hasAnchoringBounds(); 960 } 961 962 963 //------------------------------------------------------------------------------ 964 // 965 // uregex_useAnchoringBounds 966 // 967 //------------------------------------------------------------------------------ 968 U_CAPI void U_EXPORT2 969 uregex_useAnchoringBounds(URegularExpression *regexp2, 970 UBool b, 971 UErrorCode *status) { 972 RegularExpression *regexp = (RegularExpression*)regexp2; 973 if (validateRE(regexp, status) == FALSE) { 974 return; 975 } 976 regexp->fMatcher->useAnchoringBounds(b); 977 } 978 979 980 //------------------------------------------------------------------------------ 981 // 982 // uregex_hitEnd 983 // 984 //------------------------------------------------------------------------------ 985 U_CAPI UBool U_EXPORT2 986 uregex_hitEnd(const URegularExpression *regexp2, 987 UErrorCode *status) { 988 RegularExpression *regexp = (RegularExpression*)regexp2; 989 if (validateRE(regexp, status) == FALSE) { 990 return FALSE; 991 } 992 return regexp->fMatcher->hitEnd(); 993 } 994 995 996 //------------------------------------------------------------------------------ 997 // 998 // uregex_requireEnd 999 // 1000 //------------------------------------------------------------------------------ 1001 U_CAPI UBool U_EXPORT2 1002 uregex_requireEnd(const URegularExpression *regexp2, 1003 UErrorCode *status) { 1004 RegularExpression *regexp = (RegularExpression*)regexp2; 1005 if (validateRE(regexp, status) == FALSE) { 1006 return FALSE; 1007 } 1008 return regexp->fMatcher->requireEnd(); 1009 } 1010 1011 1012 //------------------------------------------------------------------------------ 1013 // 1014 // uregex_setTimeLimit 1015 // 1016 //------------------------------------------------------------------------------ 1017 U_CAPI void U_EXPORT2 1018 uregex_setTimeLimit(URegularExpression *regexp2, 1019 int32_t limit, 1020 UErrorCode *status) { 1021 RegularExpression *regexp = (RegularExpression*)regexp2; 1022 if (validateRE(regexp, status)) { 1023 regexp->fMatcher->setTimeLimit(limit, *status); 1024 } 1025 } 1026 1027 1028 1029 //------------------------------------------------------------------------------ 1030 // 1031 // uregex_getTimeLimit 1032 // 1033 //------------------------------------------------------------------------------ 1034 U_CAPI int32_t U_EXPORT2 1035 uregex_getTimeLimit(const URegularExpression *regexp2, 1036 UErrorCode *status) { 1037 int32_t retVal = 0; 1038 RegularExpression *regexp = (RegularExpression*)regexp2; 1039 if (validateRE(regexp, status)) { 1040 retVal = regexp->fMatcher->getTimeLimit(); 1041 } 1042 return retVal; 1043 } 1044 1045 1046 1047 //------------------------------------------------------------------------------ 1048 // 1049 // uregex_setStackLimit 1050 // 1051 //------------------------------------------------------------------------------ 1052 U_CAPI void U_EXPORT2 1053 uregex_setStackLimit(URegularExpression *regexp2, 1054 int32_t limit, 1055 UErrorCode *status) { 1056 RegularExpression *regexp = (RegularExpression*)regexp2; 1057 if (validateRE(regexp, status)) { 1058 regexp->fMatcher->setStackLimit(limit, *status); 1059 } 1060 } 1061 1062 1063 1064 //------------------------------------------------------------------------------ 1065 // 1066 // uregex_getStackLimit 1067 // 1068 //------------------------------------------------------------------------------ 1069 U_CAPI int32_t U_EXPORT2 1070 uregex_getStackLimit(const URegularExpression *regexp2, 1071 UErrorCode *status) { 1072 int32_t retVal = 0; 1073 RegularExpression *regexp = (RegularExpression*)regexp2; 1074 if (validateRE(regexp, status)) { 1075 retVal = regexp->fMatcher->getStackLimit(); 1076 } 1077 return retVal; 1078 } 1079 1080 1081 //------------------------------------------------------------------------------ 1082 // 1083 // uregex_setMatchCallback 1084 // 1085 //------------------------------------------------------------------------------ 1086 U_CAPI void U_EXPORT2 1087 uregex_setMatchCallback(URegularExpression *regexp2, 1088 URegexMatchCallback *callback, 1089 const void *context, 1090 UErrorCode *status) { 1091 RegularExpression *regexp = (RegularExpression*)regexp2; 1092 if (validateRE(regexp, status)) { 1093 regexp->fMatcher->setMatchCallback(callback, context, *status); 1094 } 1095 } 1096 1097 1098 //------------------------------------------------------------------------------ 1099 // 1100 // uregex_getMatchCallback 1101 // 1102 //------------------------------------------------------------------------------ 1103 U_CAPI void U_EXPORT2 1104 uregex_getMatchCallback(const URegularExpression *regexp2, 1105 URegexMatchCallback **callback, 1106 const void **context, 1107 UErrorCode *status) { 1108 RegularExpression *regexp = (RegularExpression*)regexp2; 1109 if (validateRE(regexp, status)) { 1110 regexp->fMatcher->getMatchCallback(*callback, *context, *status); 1111 } 1112 } 1113 1114 1115 //------------------------------------------------------------------------------ 1116 // 1117 // uregex_setMatchProgressCallback 1118 // 1119 //------------------------------------------------------------------------------ 1120 U_CAPI void U_EXPORT2 1121 uregex_setFindProgressCallback(URegularExpression *regexp2, 1122 URegexFindProgressCallback *callback, 1123 const void *context, 1124 UErrorCode *status) { 1125 RegularExpression *regexp = (RegularExpression*)regexp2; 1126 if (validateRE(regexp, status)) { 1127 regexp->fMatcher->setFindProgressCallback(callback, context, *status); 1128 } 1129 } 1130 1131 1132 //------------------------------------------------------------------------------ 1133 // 1134 // uregex_getMatchCallback 1135 // 1136 //------------------------------------------------------------------------------ 1137 U_CAPI void U_EXPORT2 1138 uregex_getFindProgressCallback(const URegularExpression *regexp2, 1139 URegexFindProgressCallback **callback, 1140 const void **context, 1141 UErrorCode *status) { 1142 RegularExpression *regexp = (RegularExpression*)regexp2; 1143 if (validateRE(regexp, status)) { 1144 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); 1145 } 1146 } 1147 1148 1149 //------------------------------------------------------------------------------ 1150 // 1151 // uregex_replaceAll 1152 // 1153 //------------------------------------------------------------------------------ 1154 U_CAPI int32_t U_EXPORT2 1155 uregex_replaceAll(URegularExpression *regexp2, 1156 const UChar *replacementText, 1157 int32_t replacementLength, 1158 UChar *destBuf, 1159 int32_t destCapacity, 1160 UErrorCode *status) { 1161 RegularExpression *regexp = (RegularExpression*)regexp2; 1162 if (validateRE(regexp, status) == FALSE) { 1163 return 0; 1164 } 1165 if (replacementText == NULL || replacementLength < -1 || 1166 (destBuf == NULL && destCapacity > 0) || 1167 destCapacity < 0) { 1168 *status = U_ILLEGAL_ARGUMENT_ERROR; 1169 return 0; 1170 } 1171 1172 int32_t len = 0; 1173 1174 uregex_reset(regexp2, 0, status); 1175 1176 // Note: Seperate error code variables for findNext() and appendReplacement() 1177 // are used so that destination buffer overflow errors 1178 // in appendReplacement won't stop findNext() from working. 1179 // appendReplacement() and appendTail() special case incoming buffer 1180 // overflow errors, continuing to return the correct length. 1181 UErrorCode findStatus = *status; 1182 while (uregex_findNext(regexp2, &findStatus)) { 1183 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, 1184 &destBuf, &destCapacity, status); 1185 } 1186 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1187 1188 if (U_FAILURE(findStatus)) { 1189 // If anything went wrong with the findNext(), make that error trump 1190 // whatever may have happened with the append() operations. 1191 // Errors in findNext() are not expected. 1192 *status = findStatus; 1193 } 1194 1195 return len; 1196 } 1197 1198 1199 //------------------------------------------------------------------------------ 1200 // 1201 // uregex_replaceAllUText 1202 // 1203 //------------------------------------------------------------------------------ 1204 U_CAPI UText * U_EXPORT2 1205 uregex_replaceAllUText(URegularExpression *regexp2, 1206 UText *replacementText, 1207 UText *dest, 1208 UErrorCode *status) { 1209 RegularExpression *regexp = (RegularExpression*)regexp2; 1210 if (validateRE(regexp, status) == FALSE) { 1211 return 0; 1212 } 1213 if (replacementText == NULL) { 1214 *status = U_ILLEGAL_ARGUMENT_ERROR; 1215 return 0; 1216 } 1217 1218 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); 1219 return dest; 1220 } 1221 1222 1223 //------------------------------------------------------------------------------ 1224 // 1225 // uregex_replaceFirst 1226 // 1227 //------------------------------------------------------------------------------ 1228 U_CAPI int32_t U_EXPORT2 1229 uregex_replaceFirst(URegularExpression *regexp2, 1230 const UChar *replacementText, 1231 int32_t replacementLength, 1232 UChar *destBuf, 1233 int32_t destCapacity, 1234 UErrorCode *status) { 1235 RegularExpression *regexp = (RegularExpression*)regexp2; 1236 if (validateRE(regexp, status) == FALSE) { 1237 return 0; 1238 } 1239 if (replacementText == NULL || replacementLength < -1 || 1240 (destBuf == NULL && destCapacity > 0) || 1241 destCapacity < 0) { 1242 *status = U_ILLEGAL_ARGUMENT_ERROR; 1243 return 0; 1244 } 1245 1246 int32_t len = 0; 1247 UBool findSucceeded; 1248 uregex_reset(regexp2, 0, status); 1249 findSucceeded = uregex_find(regexp2, 0, status); 1250 if (findSucceeded) { 1251 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 1252 &destBuf, &destCapacity, status); 1253 } 1254 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); 1255 1256 return len; 1257 } 1258 1259 1260 //------------------------------------------------------------------------------ 1261 // 1262 // uregex_replaceFirstUText 1263 // 1264 //------------------------------------------------------------------------------ 1265 U_CAPI UText * U_EXPORT2 1266 uregex_replaceFirstUText(URegularExpression *regexp2, 1267 UText *replacementText, 1268 UText *dest, 1269 UErrorCode *status) { 1270 RegularExpression *regexp = (RegularExpression*)regexp2; 1271 if (validateRE(regexp, status) == FALSE) { 1272 return 0; 1273 } 1274 if (replacementText == NULL) { 1275 *status = U_ILLEGAL_ARGUMENT_ERROR; 1276 return 0; 1277 } 1278 1279 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); 1280 return dest; 1281 } 1282 1283 1284 //------------------------------------------------------------------------------ 1285 // 1286 // uregex_appendReplacement 1287 // 1288 //------------------------------------------------------------------------------ 1289 1290 U_NAMESPACE_BEGIN 1291 // 1292 // Dummy class, because these functions need to be friends of class RegexMatcher, 1293 // and stand-alone C functions don't work as friends 1294 // 1295 class RegexCImpl { 1296 public: 1297 inline static int32_t appendReplacement(RegularExpression *regexp, 1298 const UChar *replacementText, 1299 int32_t replacementLength, 1300 UChar **destBuf, 1301 int32_t *destCapacity, 1302 UErrorCode *status); 1303 1304 inline static int32_t appendTail(RegularExpression *regexp, 1305 UChar **destBuf, 1306 int32_t *destCapacity, 1307 UErrorCode *status); 1308 1309 inline static int32_t split(RegularExpression *regexp, 1310 UChar *destBuf, 1311 int32_t destCapacity, 1312 int32_t *requiredCapacity, 1313 UChar *destFields[], 1314 int32_t destFieldsCapacity, 1315 UErrorCode *status); 1316 }; 1317 1318 U_NAMESPACE_END 1319 1320 1321 1322 static const UChar BACKSLASH = 0x5c; 1323 static const UChar DOLLARSIGN = 0x24; 1324 1325 // 1326 // Move a character to an output buffer, with bounds checking on the index. 1327 // Index advances even if capacity is exceeded, for preflight size computations. 1328 // This little sequence is used a LOT. 1329 // 1330 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { 1331 if (*idx < bufCapacity) { 1332 buf[*idx] = c; 1333 } 1334 (*idx)++; 1335 } 1336 1337 1338 // 1339 // appendReplacement, the actual implementation. 1340 // 1341 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, 1342 const UChar *replacementText, 1343 int32_t replacementLength, 1344 UChar **destBuf, 1345 int32_t *destCapacity, 1346 UErrorCode *status) { 1347 1348 // If we come in with a buffer overflow error, don't suppress the operation. 1349 // A series of appendReplacements, appendTail need to correctly preflight 1350 // the buffer size when an overflow happens somewhere in the middle. 1351 UBool pendingBufferOverflow = FALSE; 1352 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1353 pendingBufferOverflow = TRUE; 1354 *status = U_ZERO_ERROR; 1355 } 1356 1357 // 1358 // Validate all paramters 1359 // 1360 if (validateRE(regexp, status) == FALSE) { 1361 return 0; 1362 } 1363 if (replacementText == NULL || replacementLength < -1 || 1364 destCapacity == NULL || destBuf == NULL || 1365 (*destBuf == NULL && *destCapacity > 0) || 1366 *destCapacity < 0) { 1367 *status = U_ILLEGAL_ARGUMENT_ERROR; 1368 return 0; 1369 } 1370 1371 RegexMatcher *m = regexp->fMatcher; 1372 if (m->fMatch == FALSE) { 1373 *status = U_REGEX_INVALID_STATE; 1374 return 0; 1375 } 1376 1377 UChar *dest = *destBuf; 1378 int32_t capacity = *destCapacity; 1379 int32_t destIdx = 0; 1380 int32_t i; 1381 1382 // If it wasn't supplied by the caller, get the length of the replacement text. 1383 // TODO: slightly smarter logic in the copy loop could watch for the NUL on 1384 // the fly and avoid this step. 1385 if (replacementLength == -1) { 1386 replacementLength = u_strlen(replacementText); 1387 } 1388 1389 // Copy input string from the end of previous match to start of current match 1390 if (regexp->fText != NULL) { 1391 int32_t matchStart; 1392 int32_t lastMatchEnd; 1393 if (UTEXT_USES_U16(m->fInputText)) { 1394 lastMatchEnd = (int32_t)m->fLastMatchEnd; 1395 matchStart = (int32_t)m->fMatchStart; 1396 } else { 1397 // !!!: Would like a better way to do this! 1398 UErrorCode status = U_ZERO_ERROR; 1399 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); 1400 status = U_ZERO_ERROR; 1401 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); 1402 } 1403 for (i=lastMatchEnd; i<matchStart; i++) { 1404 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); 1405 } 1406 } else { 1407 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore 1408 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, 1409 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); 1410 } 1411 1412 1413 // scan the replacement text, looking for substitutions ($n) and \escapes. 1414 int32_t replIdx = 0; 1415 while (replIdx < replacementLength) { 1416 UChar c = replacementText[replIdx]; 1417 replIdx++; 1418 if (c != DOLLARSIGN && c != BACKSLASH) { 1419 // Common case, no substitution, no escaping, 1420 // just copy the char to the dest buf. 1421 appendToBuf(c, &destIdx, dest, capacity); 1422 continue; 1423 } 1424 1425 if (c == BACKSLASH) { 1426 // Backslash Escape. Copy the following char out without further checks. 1427 // Note: Surrogate pairs don't need any special handling 1428 // The second half wont be a '$' or a '\', and 1429 // will move to the dest normally on the next 1430 // loop iteration. 1431 if (replIdx >= replacementLength) { 1432 break; 1433 } 1434 c = replacementText[replIdx]; 1435 1436 if (c==0x55/*U*/ || c==0x75/*u*/) { 1437 // We have a \udddd or \Udddddddd escape sequence. 1438 UChar32 escapedChar = 1439 u_unescapeAt(uregex_ucstr_unescape_charAt, 1440 &replIdx, // Index is updated by unescapeAt 1441 replacementLength, // Length of replacement text 1442 (void *)replacementText); 1443 1444 if (escapedChar != (UChar32)0xFFFFFFFF) { 1445 if (escapedChar <= 0xffff) { 1446 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); 1447 } else { 1448 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); 1449 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); 1450 } 1451 continue; 1452 } 1453 // Note: if the \u escape was invalid, just fall through and 1454 // treat it as a plain \<anything> escape. 1455 } 1456 1457 // Plain backslash escape. Just put out the escaped character. 1458 appendToBuf(c, &destIdx, dest, capacity); 1459 1460 replIdx++; 1461 continue; 1462 } 1463 1464 1465 1466 // We've got a $. Pick up a capture group number if one follows. 1467 // Consume at most the number of digits necessary for the largest capture 1468 // number that is valid for this pattern. 1469 1470 int32_t numDigits = 0; 1471 int32_t groupNum = 0; 1472 UChar32 digitC; 1473 for (;;) { 1474 if (replIdx >= replacementLength) { 1475 break; 1476 } 1477 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); 1478 if (u_isdigit(digitC) == FALSE) { 1479 break; 1480 } 1481 1482 U16_FWD_1(replacementText, replIdx, replacementLength); 1483 groupNum=groupNum*10 + u_charDigitValue(digitC); 1484 numDigits++; 1485 if (numDigits >= m->fPattern->fMaxCaptureDigits) { 1486 break; 1487 } 1488 } 1489 1490 1491 if (numDigits == 0) { 1492 // The $ didn't introduce a group number at all. 1493 // Treat it as just part of the substitution text. 1494 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); 1495 continue; 1496 } 1497 1498 // Finally, append the capture group data to the destination. 1499 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); 1500 if (*status == U_BUFFER_OVERFLOW_ERROR) { 1501 // Ignore buffer overflow when extracting the group. We need to 1502 // continue on to get full size of the untruncated result. We will 1503 // raise our own buffer overflow error at the end. 1504 *status = U_ZERO_ERROR; 1505 } 1506 1507 if (U_FAILURE(*status)) { 1508 // Can fail if group number is out of range. 1509 break; 1510 } 1511 1512 } 1513 1514 // 1515 // Nul Terminate the dest buffer if possible. 1516 // Set the appropriate buffer overflow or not terminated error, if needed. 1517 // 1518 if (destIdx < capacity) { 1519 dest[destIdx] = 0; 1520 } else if (destIdx == *destCapacity) { 1521 *status = U_STRING_NOT_TERMINATED_WARNING; 1522 } else { 1523 *status = U_BUFFER_OVERFLOW_ERROR; 1524 } 1525 1526 // 1527 // Return an updated dest buffer and capacity to the caller. 1528 // 1529 if (destIdx > 0 && *destCapacity > 0) { 1530 if (destIdx < capacity) { 1531 *destBuf += destIdx; 1532 *destCapacity -= destIdx; 1533 } else { 1534 *destBuf += capacity; 1535 *destCapacity = 0; 1536 } 1537 } 1538 1539 // If we came in with a buffer overflow, make sure we go out with one also. 1540 // (A zero length match right at the end of the previous match could 1541 // make this function succeed even though a previous call had overflowed the buf) 1542 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1543 *status = U_BUFFER_OVERFLOW_ERROR; 1544 } 1545 1546 return destIdx; 1547 } 1548 1549 // 1550 // appendReplacement the actual API function, 1551 // 1552 U_CAPI int32_t U_EXPORT2 1553 uregex_appendReplacement(URegularExpression *regexp2, 1554 const UChar *replacementText, 1555 int32_t replacementLength, 1556 UChar **destBuf, 1557 int32_t *destCapacity, 1558 UErrorCode *status) { 1559 1560 RegularExpression *regexp = (RegularExpression*)regexp2; 1561 return RegexCImpl::appendReplacement( 1562 regexp, replacementText, replacementLength,destBuf, destCapacity, status); 1563 } 1564 1565 // 1566 // uregex_appendReplacementUText...can just use the normal C++ method 1567 // 1568 U_CAPI void U_EXPORT2 1569 uregex_appendReplacementUText(URegularExpression *regexp2, 1570 UText *replText, 1571 UText *dest, 1572 UErrorCode *status) { 1573 RegularExpression *regexp = (RegularExpression*)regexp2; 1574 regexp->fMatcher->appendReplacement(dest, replText, *status); 1575 } 1576 1577 1578 //------------------------------------------------------------------------------ 1579 // 1580 // uregex_appendTail 1581 // 1582 //------------------------------------------------------------------------------ 1583 int32_t RegexCImpl::appendTail(RegularExpression *regexp, 1584 UChar **destBuf, 1585 int32_t *destCapacity, 1586 UErrorCode *status) 1587 { 1588 1589 // If we come in with a buffer overflow error, don't suppress the operation. 1590 // A series of appendReplacements, appendTail need to correctly preflight 1591 // the buffer size when an overflow happens somewhere in the middle. 1592 UBool pendingBufferOverflow = FALSE; 1593 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { 1594 pendingBufferOverflow = TRUE; 1595 *status = U_ZERO_ERROR; 1596 } 1597 1598 if (validateRE(regexp, status) == FALSE) { 1599 return 0; 1600 } 1601 1602 if (destCapacity == NULL || destBuf == NULL || 1603 (*destBuf == NULL && *destCapacity > 0) || 1604 *destCapacity < 0) 1605 { 1606 *status = U_ILLEGAL_ARGUMENT_ERROR; 1607 return 0; 1608 } 1609 1610 RegexMatcher *m = regexp->fMatcher; 1611 1612 int32_t destIdx = 0; 1613 int32_t destCap = *destCapacity; 1614 UChar *dest = *destBuf; 1615 1616 if (regexp->fText != NULL) { 1617 int32_t srcIdx; 1618 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); 1619 if (nativeIdx == -1) { 1620 srcIdx = 0; 1621 } else if (UTEXT_USES_U16(m->fInputText)) { 1622 srcIdx = (int32_t)nativeIdx; 1623 } else { 1624 UErrorCode status = U_ZERO_ERROR; 1625 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); 1626 } 1627 1628 for (;;) { 1629 if (srcIdx == regexp->fTextLength) { 1630 break; 1631 } 1632 UChar c = regexp->fText[srcIdx]; 1633 if (c == 0 && regexp->fTextLength == -1) { 1634 regexp->fTextLength = srcIdx; 1635 break; 1636 } 1637 if (destIdx < destCap) { 1638 dest[destIdx] = c; 1639 } else { 1640 // We've overflowed the dest buffer. 1641 // If the total input string length is known, we can 1642 // compute the total buffer size needed without scanning through the string. 1643 if (regexp->fTextLength > 0) { 1644 destIdx += (regexp->fTextLength - srcIdx); 1645 break; 1646 } 1647 } 1648 srcIdx++; 1649 destIdx++; 1650 } 1651 } else { 1652 int64_t srcIdx; 1653 if (m->fMatch) { 1654 // The most recent call to find() succeeded. 1655 srcIdx = m->fMatchEnd; 1656 } else { 1657 // The last call to find() on this matcher failed(). 1658 // Look back to the end of the last find() that succeeded for src index. 1659 srcIdx = m->fLastMatchEnd; 1660 if (srcIdx == -1) { 1661 // There has been no successful match with this matcher. 1662 // We want to copy the whole string. 1663 srcIdx = 0; 1664 } 1665 } 1666 1667 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); 1668 } 1669 1670 // 1671 // NUL terminate the output string, if possible, otherwise issue the 1672 // appropriate error or warning. 1673 // 1674 if (destIdx < destCap) { 1675 dest[destIdx] = 0; 1676 } else if (destIdx == destCap) { 1677 *status = U_STRING_NOT_TERMINATED_WARNING; 1678 } else { 1679 *status = U_BUFFER_OVERFLOW_ERROR; 1680 } 1681 1682 // 1683 // Update the user's buffer ptr and capacity vars to reflect the 1684 // amount used. 1685 // 1686 if (destIdx < destCap) { 1687 *destBuf += destIdx; 1688 *destCapacity -= destIdx; 1689 } else { 1690 *destBuf += destCap; 1691 *destCapacity = 0; 1692 } 1693 1694 if (pendingBufferOverflow && U_SUCCESS(*status)) { 1695 *status = U_BUFFER_OVERFLOW_ERROR; 1696 } 1697 1698 return destIdx; 1699 } 1700 1701 1702 // 1703 // appendTail the actual API function 1704 // 1705 U_CAPI int32_t U_EXPORT2 1706 uregex_appendTail(URegularExpression *regexp2, 1707 UChar **destBuf, 1708 int32_t *destCapacity, 1709 UErrorCode *status) { 1710 RegularExpression *regexp = (RegularExpression*)regexp2; 1711 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); 1712 } 1713 1714 1715 // 1716 // uregex_appendTailUText...can just use the normal C++ method 1717 // 1718 U_CAPI UText * U_EXPORT2 1719 uregex_appendTailUText(URegularExpression *regexp2, 1720 UText *dest, 1721 UErrorCode *status) { 1722 RegularExpression *regexp = (RegularExpression*)regexp2; 1723 return regexp->fMatcher->appendTail(dest, *status); 1724 } 1725 1726 1727 //------------------------------------------------------------------------------ 1728 // 1729 // copyString Internal utility to copy a string to an output buffer, 1730 // while managing buffer overflow and preflight size 1731 // computation. NUL termination is added to destination, 1732 // and the NUL is counted in the output size. 1733 // 1734 //------------------------------------------------------------------------------ 1735 #if 0 1736 static void copyString(UChar *destBuffer, // Destination buffer. 1737 int32_t destCapacity, // Total capacity of dest buffer 1738 int32_t *destIndex, // Index into dest buffer. Updated on return. 1739 // Update not clipped to destCapacity. 1740 const UChar *srcPtr, // Pointer to source string 1741 int32_t srcLen) // Source string len. 1742 { 1743 int32_t si; 1744 int32_t di = *destIndex; 1745 UChar c; 1746 1747 for (si=0; si<srcLen; si++) { 1748 c = srcPtr[si]; 1749 if (di < destCapacity) { 1750 destBuffer[di] = c; 1751 di++; 1752 } else { 1753 di += srcLen - si; 1754 break; 1755 } 1756 } 1757 if (di<destCapacity) { 1758 destBuffer[di] = 0; 1759 } 1760 di++; 1761 *destIndex = di; 1762 } 1763 #endif 1764 1765 //------------------------------------------------------------------------------ 1766 // 1767 // uregex_split 1768 // 1769 //------------------------------------------------------------------------------ 1770 int32_t RegexCImpl::split(RegularExpression *regexp, 1771 UChar *destBuf, 1772 int32_t destCapacity, 1773 int32_t *requiredCapacity, 1774 UChar *destFields[], 1775 int32_t destFieldsCapacity, 1776 UErrorCode *status) { 1777 // 1778 // Reset for the input text 1779 // 1780 regexp->fMatcher->reset(); 1781 UText *inputText = regexp->fMatcher->fInputText; 1782 int64_t nextOutputStringStart = 0; 1783 int64_t inputLen = regexp->fMatcher->fInputLength; 1784 if (inputLen == 0) { 1785 return 0; 1786 } 1787 1788 // 1789 // Loop through the input text, searching for the delimiter pattern 1790 // 1791 int32_t i; // Index of the field being processed. 1792 int32_t destIdx = 0; // Next available position in destBuf; 1793 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); 1794 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted 1795 for (i=0; ; i++) { 1796 if (i>=destFieldsCapacity-1) { 1797 // There are one or zero output strings left. 1798 // Fill the last output string with whatever is left from the input, then exit the loop. 1799 // ( i will be == destFieldsCapacity if we filled the output array while processing 1800 // capture groups of the delimiter expression, in which case we will discard the 1801 // last capture group saved in favor of the unprocessed remainder of the 1802 // input string.) 1803 if (inputLen > nextOutputStringStart) { 1804 if (i != destFieldsCapacity-1) { 1805 // No fields are left. Recycle the last one for holding the trailing part of 1806 // the input string. 1807 i = destFieldsCapacity-1; 1808 destIdx = (int32_t)(destFields[i] - destFields[0]); 1809 } 1810 1811 destFields[i] = &destBuf[destIdx]; 1812 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1813 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1814 } 1815 break; 1816 } 1817 1818 if (regexp->fMatcher->find()) { 1819 // We found another delimiter. Move everything from where we started looking 1820 // up until the start of the delimiter into the next output string. 1821 destFields[i] = &destBuf[destIdx]; 1822 1823 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, 1824 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1825 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1826 tStatus = U_ZERO_ERROR; 1827 } else { 1828 *status = tStatus; 1829 } 1830 nextOutputStringStart = regexp->fMatcher->fMatchEnd; 1831 1832 // If the delimiter pattern has capturing parentheses, the captured 1833 // text goes out into the next n destination strings. 1834 int32_t groupNum; 1835 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1836 // If we've run out of output string slots, bail out. 1837 if (i==destFieldsCapacity-1) { 1838 break; 1839 } 1840 i++; 1841 1842 // Set up to extract the capture group contents into the dest buffer. 1843 destFields[i] = &destBuf[destIdx]; 1844 tStatus = U_ZERO_ERROR; 1845 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); 1846 destIdx += t + 1; // Record the space used in the output string buffer. 1847 // +1 for the NUL that terminates the string. 1848 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { 1849 tStatus = U_ZERO_ERROR; 1850 } else { 1851 *status = tStatus; 1852 } 1853 } 1854 1855 if (nextOutputStringStart == inputLen) { 1856 // The delimiter was at the end of the string. We're done. 1857 break; 1858 } 1859 1860 } 1861 else 1862 { 1863 // We ran off the end of the input while looking for the next delimiter. 1864 // All the remaining text goes into the current output string. 1865 destFields[i] = &destBuf[destIdx]; 1866 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, 1867 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); 1868 break; 1869 } 1870 } 1871 1872 // Zero out any unused portion of the destFields array 1873 int j; 1874 for (j=i+1; j<destFieldsCapacity; j++) { 1875 destFields[j] = NULL; 1876 } 1877 1878 if (requiredCapacity != NULL) { 1879 *requiredCapacity = destIdx; 1880 } 1881 if (destIdx > destCapacity) { 1882 *status = U_BUFFER_OVERFLOW_ERROR; 1883 } 1884 return i+1; 1885 } 1886 1887 // 1888 // uregex_split The actual API function 1889 // 1890 U_CAPI int32_t U_EXPORT2 1891 uregex_split(URegularExpression *regexp2, 1892 UChar *destBuf, 1893 int32_t destCapacity, 1894 int32_t *requiredCapacity, 1895 UChar *destFields[], 1896 int32_t destFieldsCapacity, 1897 UErrorCode *status) { 1898 RegularExpression *regexp = (RegularExpression*)regexp2; 1899 if (validateRE(regexp, status) == FALSE) { 1900 return 0; 1901 } 1902 if ((destBuf == NULL && destCapacity > 0) || 1903 destCapacity < 0 || 1904 destFields == NULL || 1905 destFieldsCapacity < 1 ) { 1906 *status = U_ILLEGAL_ARGUMENT_ERROR; 1907 return 0; 1908 } 1909 1910 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); 1911 } 1912 1913 1914 // 1915 // uregex_splitUText...can just use the normal C++ method 1916 // 1917 U_CAPI int32_t U_EXPORT2 1918 uregex_splitUText(URegularExpression *regexp2, 1919 UText *destFields[], 1920 int32_t destFieldsCapacity, 1921 UErrorCode *status) { 1922 RegularExpression *regexp = (RegularExpression*)regexp2; 1923 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); 1924 } 1925 1926 1927 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1928 1929