1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2012, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 01/14/2002 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "tridpars.h" 16 #include "hash.h" 17 #include "mutex.h" 18 #include "ucln_in.h" 19 #include "unicode/parsepos.h" 20 #include "unicode/translit.h" 21 #include "unicode/uchar.h" 22 #include "unicode/uniset.h" 23 #include "unicode/unistr.h" 24 #include "unicode/utrans.h" 25 #include "util.h" 26 #include "uvector.h" 27 28 U_NAMESPACE_BEGIN 29 30 static const UChar ID_DELIM = 0x003B; // ; 31 static const UChar TARGET_SEP = 0x002D; // - 32 static const UChar VARIANT_SEP = 0x002F; // / 33 static const UChar OPEN_REV = 0x0028; // ( 34 static const UChar CLOSE_REV = 0x0029; // ) 35 36 //static const UChar EMPTY[] = {0}; // "" 37 static const UChar ANY[] = {65,110,121,0}; // "Any" 38 static const UChar ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" 39 40 static const int32_t FORWARD = UTRANS_FORWARD; 41 static const int32_t REVERSE = UTRANS_REVERSE; 42 43 static Hashtable* SPECIAL_INVERSES = NULL; 44 45 /** 46 * The mutex controlling access to SPECIAL_INVERSES 47 */ 48 static UMutex LOCK = U_MUTEX_INITIALIZER; 49 50 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, 51 const UnicodeString& v, UBool sawS, 52 const UnicodeString& f) { 53 source = s; 54 target = t; 55 variant = v; 56 sawSource = sawS; 57 filter = f; 58 } 59 60 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, 61 const UnicodeString& f) { 62 canonID = c; 63 basicID = b; 64 filter = f; 65 } 66 67 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) { 68 canonID = c; 69 basicID = b; 70 } 71 72 Transliterator* TransliteratorIDParser::SingleID::createInstance() { 73 Transliterator* t; 74 if (basicID.length() == 0) { 75 t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID); 76 } else { 77 t = createBasicInstance(basicID, &canonID); 78 } 79 if (t != NULL) { 80 if (filter.length() != 0) { 81 UErrorCode ec = U_ZERO_ERROR; 82 UnicodeSet *set = new UnicodeSet(filter, ec); 83 if (U_FAILURE(ec)) { 84 delete set; 85 } else { 86 t->adoptFilter(set); 87 } 88 } 89 } 90 return t; 91 } 92 93 94 /** 95 * Parse a single ID, that is, an ID of the general form 96 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element 97 * optional, the filters optional, and the variants optional. 98 * @param id the id to be parsed 99 * @param pos INPUT-OUTPUT parameter. On input, the position of 100 * the first character to parse. On output, the position after 101 * the last character parsed. 102 * @param dir the direction. If the direction is REVERSE then the 103 * SingleID is constructed for the reverse direction. 104 * @return a SingleID object or NULL 105 */ 106 TransliteratorIDParser::SingleID* 107 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, 108 int32_t dir, UErrorCode& status) { 109 110 int32_t start = pos; 111 112 // The ID will be of the form A, A(), A(B), or (B), where 113 // A and B are filter IDs. 114 Specs* specsA = NULL; 115 Specs* specsB = NULL; 116 UBool sawParen = FALSE; 117 118 // On the first pass, look for (B) or (). If this fails, then 119 // on the second pass, look for A, A(B), or A(). 120 for (int32_t pass=1; pass<=2; ++pass) { 121 if (pass == 2) { 122 specsA = parseFilterID(id, pos, TRUE); 123 if (specsA == NULL) { 124 pos = start; 125 return NULL; 126 } 127 } 128 if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { 129 sawParen = TRUE; 130 if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 131 specsB = parseFilterID(id, pos, TRUE); 132 // Must close with a ')' 133 if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 134 delete specsA; 135 pos = start; 136 return NULL; 137 } 138 } 139 break; 140 } 141 } 142 143 // Assemble return results 144 SingleID* single; 145 if (sawParen) { 146 if (dir == FORWARD) { 147 SingleID* b = specsToID(specsB, FORWARD); 148 single = specsToID(specsA, FORWARD); 149 // Null pointers check 150 if (b == NULL || single == NULL) { 151 delete b; 152 delete single; 153 status = U_MEMORY_ALLOCATION_ERROR; 154 return NULL; 155 } 156 single->canonID.append(OPEN_REV) 157 .append(b->canonID).append(CLOSE_REV); 158 if (specsA != NULL) { 159 single->filter = specsA->filter; 160 } 161 delete b; 162 } else { 163 SingleID* a = specsToID(specsA, FORWARD); 164 single = specsToID(specsB, FORWARD); 165 // Check for null pointer. 166 if (a == NULL || single == NULL) { 167 delete a; 168 delete single; 169 status = U_MEMORY_ALLOCATION_ERROR; 170 return NULL; 171 } 172 single->canonID.append(OPEN_REV) 173 .append(a->canonID).append(CLOSE_REV); 174 if (specsB != NULL) { 175 single->filter = specsB->filter; 176 } 177 delete a; 178 } 179 } else { 180 // assert(specsA != NULL); 181 if (dir == FORWARD) { 182 single = specsToID(specsA, FORWARD); 183 } else { 184 single = specsToSpecialInverse(*specsA, status); 185 if (single == NULL) { 186 single = specsToID(specsA, REVERSE); 187 } 188 } 189 // Check for NULL pointer 190 if (single == NULL) { 191 status = U_MEMORY_ALLOCATION_ERROR; 192 return NULL; 193 } 194 single->filter = specsA->filter; 195 } 196 197 delete specsA; 198 delete specsB; 199 200 return single; 201 } 202 203 /** 204 * Parse a filter ID, that is, an ID of the general form 205 * "[f1] s1-t1/v1", with the filters optional, and the variants optional. 206 * @param id the id to be parsed 207 * @param pos INPUT-OUTPUT parameter. On input, the position of 208 * the first character to parse. On output, the position after 209 * the last character parsed. 210 * @return a SingleID object or null if the parse fails 211 */ 212 TransliteratorIDParser::SingleID* 213 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { 214 215 int32_t start = pos; 216 217 Specs* specs = parseFilterID(id, pos, TRUE); 218 if (specs == NULL) { 219 pos = start; 220 return NULL; 221 } 222 223 // Assemble return results 224 SingleID* single = specsToID(specs, FORWARD); 225 if (single != NULL) { 226 single->filter = specs->filter; 227 } 228 delete specs; 229 return single; 230 } 231 232 /** 233 * Parse a global filter of the form "[f]" or "([f])", depending 234 * on 'withParens'. 235 * @param id the pattern the parse 236 * @param pos INPUT-OUTPUT parameter. On input, the position of 237 * the first character to parse. On output, the position after 238 * the last character parsed. 239 * @param dir the direction. 240 * @param withParens INPUT-OUTPUT parameter. On entry, if 241 * withParens is 0, then parens are disallowed. If it is 1, 242 * then parens are requires. If it is -1, then parens are 243 * optional, and the return result will be set to 0 or 1. 244 * @param canonID OUTPUT parameter. The pattern for the filter 245 * added to the canonID, either at the end, if dir is FORWARD, or 246 * at the start, if dir is REVERSE. The pattern will be enclosed 247 * in parentheses if appropriate, and will be suffixed with an 248 * ID_DELIM character. May be NULL. 249 * @return a UnicodeSet object or NULL. A non-NULL results 250 * indicates a successful parse, regardless of whether the filter 251 * applies to the given direction. The caller should discard it 252 * if withParens != (dir == REVERSE). 253 */ 254 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, 255 int32_t dir, 256 int32_t& withParens, 257 UnicodeString* canonID) { 258 UnicodeSet* filter = NULL; 259 int32_t start = pos; 260 261 if (withParens == -1) { 262 withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0; 263 } else if (withParens == 1) { 264 if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { 265 pos = start; 266 return NULL; 267 } 268 } 269 270 ICU_Utility::skipWhitespace(id, pos, TRUE); 271 272 if (UnicodeSet::resemblesPattern(id, pos)) { 273 ParsePosition ppos(pos); 274 UErrorCode ec = U_ZERO_ERROR; 275 filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec); 276 /* test for NULL */ 277 if (filter == 0) { 278 pos = start; 279 return 0; 280 } 281 if (U_FAILURE(ec)) { 282 delete filter; 283 pos = start; 284 return NULL; 285 } 286 287 UnicodeString pattern; 288 id.extractBetween(pos, ppos.getIndex(), pattern); 289 pos = ppos.getIndex(); 290 291 if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 292 pos = start; 293 return NULL; 294 } 295 296 // In the forward direction, append the pattern to the 297 // canonID. In the reverse, insert it at zero, and invert 298 // the presence of parens ("A" <-> "(A)"). 299 if (canonID != NULL) { 300 if (dir == FORWARD) { 301 if (withParens == 1) { 302 pattern.insert(0, OPEN_REV); 303 pattern.append(CLOSE_REV); 304 } 305 canonID->append(pattern).append(ID_DELIM); 306 } else { 307 if (withParens == 0) { 308 pattern.insert(0, OPEN_REV); 309 pattern.append(CLOSE_REV); 310 } 311 canonID->insert(0, pattern); 312 canonID->insert(pattern.length(), ID_DELIM); 313 } 314 } 315 } 316 317 return filter; 318 } 319 320 U_CDECL_BEGIN 321 static void U_CALLCONV _deleteSingleID(void* obj) { 322 delete (TransliteratorIDParser::SingleID*) obj; 323 } 324 325 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { 326 delete (Transliterator*) obj; 327 } 328 U_CDECL_END 329 330 /** 331 * Parse a compound ID, consisting of an optional forward global 332 * filter, a separator, one or more single IDs delimited by 333 * separators, an an optional reverse global filter. The 334 * separator is a semicolon. The global filters are UnicodeSet 335 * patterns. The reverse global filter must be enclosed in 336 * parentheses. 337 * @param id the pattern the parse 338 * @param dir the direction. 339 * @param canonID OUTPUT parameter that receives the canonical ID, 340 * consisting of canonical IDs for all elements, as returned by 341 * parseSingleID(), separated by semicolons. Previous contents 342 * are discarded. 343 * @param list OUTPUT parameter that receives a list of SingleID 344 * objects representing the parsed IDs. Previous contents are 345 * discarded. 346 * @param globalFilter OUTPUT parameter that receives a pointer to 347 * a newly created global filter for this ID in this direction, or 348 * NULL if there is none. 349 * @return TRUE if the parse succeeds, that is, if the entire 350 * id is consumed without syntax error. 351 */ 352 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, 353 UnicodeString& canonID, 354 UVector& list, 355 UnicodeSet*& globalFilter) { 356 UErrorCode ec = U_ZERO_ERROR; 357 int32_t i; 358 int32_t pos = 0; 359 int32_t withParens = 1; 360 list.removeAllElements(); 361 UnicodeSet* filter; 362 globalFilter = NULL; 363 canonID.truncate(0); 364 365 // Parse leading global filter, if any 366 withParens = 0; // parens disallowed 367 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 368 if (filter != NULL) { 369 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 370 // Not a global filter; backup and resume 371 canonID.truncate(0); 372 pos = 0; 373 } 374 if (dir == FORWARD) { 375 globalFilter = filter; 376 } else { 377 delete filter; 378 } 379 filter = NULL; 380 } 381 382 UBool sawDelimiter = TRUE; 383 for (;;) { 384 SingleID* single = parseSingleID(id, pos, dir, ec); 385 if (single == NULL) { 386 break; 387 } 388 if (dir == FORWARD) { 389 list.addElement(single, ec); 390 } else { 391 list.insertElementAt(single, 0, ec); 392 } 393 if (U_FAILURE(ec)) { 394 goto FAIL; 395 } 396 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 397 sawDelimiter = FALSE; 398 break; 399 } 400 } 401 402 if (list.size() == 0) { 403 goto FAIL; 404 } 405 406 // Construct canonical ID 407 for (i=0; i<list.size(); ++i) { 408 SingleID* single = (SingleID*) list.elementAt(i); 409 canonID.append(single->canonID); 410 if (i != (list.size()-1)) { 411 canonID.append(ID_DELIM); 412 } 413 } 414 415 // Parse trailing global filter, if any, and only if we saw 416 // a trailing delimiter after the IDs. 417 if (sawDelimiter) { 418 withParens = 1; // parens required 419 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 420 if (filter != NULL) { 421 // Don't require trailing ';', but parse it if present 422 ICU_Utility::parseChar(id, pos, ID_DELIM); 423 424 if (dir == REVERSE) { 425 globalFilter = filter; 426 } else { 427 delete filter; 428 } 429 filter = NULL; 430 } 431 } 432 433 // Trailing unparsed text is a syntax error 434 ICU_Utility::skipWhitespace(id, pos, TRUE); 435 if (pos != id.length()) { 436 goto FAIL; 437 } 438 439 return TRUE; 440 441 FAIL: 442 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 443 list.removeAllElements(); 444 list.setDeleter(save); 445 delete globalFilter; 446 globalFilter = NULL; 447 return FALSE; 448 } 449 450 /** 451 * Convert the elements of the 'list' vector, which are SingleID 452 * objects, into actual Transliterator objects. In the course of 453 * this, some (or all) entries may be removed. If all entries 454 * are removed, the NULL transliterator will be added. 455 * 456 * Delete entries with empty basicIDs; these are generated by 457 * elements like "(A)" in the forward direction, or "A()" in 458 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert 459 * SingleID entries to actual transliterators. 460 * 461 * @param list vector of SingleID objects. On exit, vector 462 * of one or more Transliterators. 463 * @return new value of insertIndex. The index will shift if 464 * there are empty items, like "(Lower)", with indices less than 465 * insertIndex. 466 */ 467 void TransliteratorIDParser::instantiateList(UVector& list, 468 UErrorCode& ec) { 469 UVector tlist(ec); 470 if (U_FAILURE(ec)) { 471 goto RETURN; 472 } 473 tlist.setDeleter(_deleteTransliteratorTrIDPars); 474 475 Transliterator* t; 476 int32_t i; 477 for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() 478 // We run the loop too long by one, so we can 479 // do an insert after the last element 480 if (i==list.size()) { 481 break; 482 } 483 484 SingleID* single = (SingleID*) list.elementAt(i); 485 if (single->basicID.length() != 0) { 486 t = single->createInstance(); 487 if (t == NULL) { 488 ec = U_INVALID_ID; 489 goto RETURN; 490 } 491 tlist.addElement(t, ec); 492 if (U_FAILURE(ec)) { 493 delete t; 494 goto RETURN; 495 } 496 } 497 } 498 499 // An empty list is equivalent to a NULL transliterator. 500 if (tlist.size() == 0) { 501 t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL); 502 if (t == NULL) { 503 // Should never happen 504 ec = U_INTERNAL_TRANSLITERATOR_ERROR; 505 } 506 tlist.addElement(t, ec); 507 if (U_FAILURE(ec)) { 508 delete t; 509 } 510 } 511 512 RETURN: 513 514 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 515 list.removeAllElements(); 516 517 if (U_SUCCESS(ec)) { 518 list.setDeleter(_deleteTransliteratorTrIDPars); 519 520 while (tlist.size() > 0) { 521 t = (Transliterator*) tlist.orphanElementAt(0); 522 list.addElement(t, ec); 523 if (U_FAILURE(ec)) { 524 delete t; 525 list.removeAllElements(); 526 break; 527 } 528 } 529 } 530 531 list.setDeleter(save); 532 } 533 534 /** 535 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, 536 * S-T/V, or S/V-T. If the source is missing, return a source of 537 * ANY. 538 * @param id the id string, in any of several forms 539 * @return an array of 4 strings: source, target, variant, and 540 * isSourcePresent. If the source is not present, ANY will be 541 * given as the source, and isSourcePresent will be NULL. Otherwise 542 * isSourcePresent will be non-NULL. The target may be empty if the 543 * id is not well-formed. The variant may be empty. 544 */ 545 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, 546 UnicodeString& source, 547 UnicodeString& target, 548 UnicodeString& variant, 549 UBool& isSourcePresent) { 550 source.setTo(ANY, 3); 551 target.truncate(0); 552 variant.truncate(0); 553 554 int32_t sep = id.indexOf(TARGET_SEP); 555 int32_t var = id.indexOf(VARIANT_SEP); 556 if (var < 0) { 557 var = id.length(); 558 } 559 isSourcePresent = FALSE; 560 561 if (sep < 0) { 562 // Form: T/V or T (or /V) 563 id.extractBetween(0, var, target); 564 id.extractBetween(var, id.length(), variant); 565 } else if (sep < var) { 566 // Form: S-T/V or S-T (or -T/V or -T) 567 if (sep > 0) { 568 id.extractBetween(0, sep, source); 569 isSourcePresent = TRUE; 570 } 571 id.extractBetween(++sep, var, target); 572 id.extractBetween(var, id.length(), variant); 573 } else { 574 // Form: (S/V-T or /V-T) 575 if (var > 0) { 576 id.extractBetween(0, var, source); 577 isSourcePresent = TRUE; 578 } 579 id.extractBetween(var, sep++, variant); 580 id.extractBetween(sep, id.length(), target); 581 } 582 583 if (variant.length() > 0) { 584 variant.remove(0, 1); 585 } 586 } 587 588 /** 589 * Given source, target, and variant strings, concatenate them into a 590 * full ID. If the source is empty, then "Any" will be used for the 591 * source, so the ID will always be of the form s-t/v or s-t. 592 */ 593 void TransliteratorIDParser::STVtoID(const UnicodeString& source, 594 const UnicodeString& target, 595 const UnicodeString& variant, 596 UnicodeString& id) { 597 id = source; 598 if (id.length() == 0) { 599 id.setTo(ANY, 3); 600 } 601 id.append(TARGET_SEP).append(target); 602 if (variant.length() != 0) { 603 id.append(VARIANT_SEP).append(variant); 604 } 605 // NUL-terminate the ID string for getTerminatedBuffer. 606 // This prevents valgrind and Purify warnings. 607 id.append((UChar)0); 608 id.truncate(id.length()-1); 609 } 610 611 /** 612 * Register two targets as being inverses of one another. For 613 * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes 614 * Transliterator to form the following inverse relationships: 615 * 616 * <pre>NFC => NFD 617 * Any-NFC => Any-NFD 618 * NFD => NFC 619 * Any-NFD => Any-NFC</pre> 620 * 621 * (Without the special inverse registration, the inverse of NFC 622 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 623 * that the presence or absence of "Any-" is preserved. 624 * 625 * <p>The relationship is symmetrical; registering (a, b) is 626 * equivalent to registering (b, a). 627 * 628 * <p>The relevant IDs must still be registered separately as 629 * factories or classes. 630 * 631 * <p>Only the targets are specified. Special inverses always 632 * have the form Any-Target1 <=> Any-Target2. The target should 633 * have canonical casing (the casing desired to be produced when 634 * an inverse is formed) and should contain no whitespace or other 635 * extraneous characters. 636 * 637 * @param target the target against which to register the inverse 638 * @param inverseTarget the inverse of target, that is 639 * Any-target.getInverse() => Any-inverseTarget 640 * @param bidirectional if TRUE, register the reverse relation 641 * as well, that is, Any-inverseTarget.getInverse() => Any-target 642 */ 643 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, 644 const UnicodeString& inverseTarget, 645 UBool bidirectional, 646 UErrorCode &status) { 647 init(status); 648 if (U_FAILURE(status)) { 649 return; 650 } 651 652 // If target == inverseTarget then force bidirectional => FALSE 653 if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { 654 bidirectional = FALSE; 655 } 656 657 Mutex lock(&LOCK); 658 659 UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. 660 if (tempus == NULL) { 661 status = U_MEMORY_ALLOCATION_ERROR; 662 return; 663 } 664 SPECIAL_INVERSES->put(target, tempus, status); 665 if (bidirectional) { 666 tempus = new UnicodeString(target); 667 if (tempus == NULL) { 668 status = U_MEMORY_ALLOCATION_ERROR; 669 return; 670 } 671 SPECIAL_INVERSES->put(inverseTarget, tempus, status); 672 } 673 } 674 675 //---------------------------------------------------------------- 676 // Private implementation 677 //---------------------------------------------------------------- 678 679 /** 680 * Parse an ID into component pieces. Take IDs of the form T, 681 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a 682 * source of ANY. 683 * @param id the id string, in any of several forms 684 * @param pos INPUT-OUTPUT parameter. On input, pos is the 685 * offset of the first character to parse in id. On output, 686 * pos is the offset after the last parsed character. If the 687 * parse failed, pos will be unchanged. 688 * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed 689 * at any location between specs or delimiters, and is returned 690 * as the fifth string in the array. 691 * @return a Specs object, or NULL if the parse failed. If 692 * neither source nor target was seen in the parsed id, then the 693 * parse fails. If allowFilter is TRUE, then the parsed filter 694 * pattern is returned in the Specs object, otherwise the returned 695 * filter reference is NULL. If the parse fails for any reason 696 * NULL is returned. 697 */ 698 TransliteratorIDParser::Specs* 699 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos, 700 UBool allowFilter) { 701 UnicodeString first; 702 UnicodeString source; 703 UnicodeString target; 704 UnicodeString variant; 705 UnicodeString filter; 706 UChar delimiter = 0; 707 int32_t specCount = 0; 708 int32_t start = pos; 709 710 // This loop parses one of the following things with each 711 // pass: a filter, a delimiter character (either '-' or '/'), 712 // or a spec (source, target, or variant). 713 for (;;) { 714 ICU_Utility::skipWhitespace(id, pos, TRUE); 715 if (pos == id.length()) { 716 break; 717 } 718 719 // Parse filters 720 if (allowFilter && filter.length() == 0 && 721 UnicodeSet::resemblesPattern(id, pos)) { 722 723 ParsePosition ppos(pos); 724 UErrorCode ec = U_ZERO_ERROR; 725 UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec); 726 if (U_FAILURE(ec)) { 727 pos = start; 728 return NULL; 729 } 730 id.extractBetween(pos, ppos.getIndex(), filter); 731 pos = ppos.getIndex(); 732 continue; 733 } 734 735 if (delimiter == 0) { 736 UChar c = id.charAt(pos); 737 if ((c == TARGET_SEP && target.length() == 0) || 738 (c == VARIANT_SEP && variant.length() == 0)) { 739 delimiter = c; 740 ++pos; 741 continue; 742 } 743 } 744 745 // We are about to try to parse a spec with no delimiter 746 // when we can no longer do so (we can only do so at the 747 // start); break. 748 if (delimiter == 0 && specCount > 0) { 749 break; 750 } 751 752 UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos); 753 if (spec.length() == 0) { 754 // Note that if there was a trailing delimiter, we 755 // consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar- 756 // are legal. 757 break; 758 } 759 760 switch (delimiter) { 761 case 0: 762 first = spec; 763 break; 764 case TARGET_SEP: 765 target = spec; 766 break; 767 case VARIANT_SEP: 768 variant = spec; 769 break; 770 } 771 ++specCount; 772 delimiter = 0; 773 } 774 775 // A spec with no prior character is either source or target, 776 // depending on whether an explicit "-target" was seen. 777 if (first.length() != 0) { 778 if (target.length() == 0) { 779 target = first; 780 } else { 781 source = first; 782 } 783 } 784 785 // Must have either source or target 786 if (source.length() == 0 && target.length() == 0) { 787 pos = start; 788 return NULL; 789 } 790 791 // Empty source or target defaults to ANY 792 UBool sawSource = TRUE; 793 if (source.length() == 0) { 794 source.setTo(ANY, 3); 795 sawSource = FALSE; 796 } 797 if (target.length() == 0) { 798 target.setTo(ANY, 3); 799 } 800 801 return new Specs(source, target, variant, sawSource, filter); 802 } 803 804 /** 805 * Givens a Spec object, convert it to a SingleID object. The 806 * Spec object is a more unprocessed parse result. The SingleID 807 * object contains information about canonical and basic IDs. 808 * @return a SingleID; never returns NULL. Returned object always 809 * has 'filter' field of NULL. 810 */ 811 TransliteratorIDParser::SingleID* 812 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) { 813 UnicodeString canonID; 814 UnicodeString basicID; 815 UnicodeString basicPrefix; 816 if (specs != NULL) { 817 UnicodeString buf; 818 if (dir == FORWARD) { 819 if (specs->sawSource) { 820 buf.append(specs->source).append(TARGET_SEP); 821 } else { 822 basicPrefix = specs->source; 823 basicPrefix.append(TARGET_SEP); 824 } 825 buf.append(specs->target); 826 } else { 827 buf.append(specs->target).append(TARGET_SEP).append(specs->source); 828 } 829 if (specs->variant.length() != 0) { 830 buf.append(VARIANT_SEP).append(specs->variant); 831 } 832 basicID = basicPrefix; 833 basicID.append(buf); 834 if (specs->filter.length() != 0) { 835 buf.insert(0, specs->filter); 836 } 837 canonID = buf; 838 } 839 return new SingleID(canonID, basicID); 840 } 841 842 /** 843 * Given a Specs object, return a SingleID representing the 844 * special inverse of that ID. If there is no special inverse 845 * then return NULL. 846 * @return a SingleID or NULL. Returned object always has 847 * 'filter' field of NULL. 848 */ 849 TransliteratorIDParser::SingleID* 850 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) { 851 if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) { 852 return NULL; 853 } 854 init(status); 855 856 UnicodeString* inverseTarget; 857 858 umtx_lock(&LOCK); 859 inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target); 860 umtx_unlock(&LOCK); 861 862 if (inverseTarget != NULL) { 863 // If the original ID contained "Any-" then make the 864 // special inverse "Any-Foo"; otherwise make it "Foo". 865 // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD". 866 UnicodeString buf; 867 if (specs.filter.length() != 0) { 868 buf.append(specs.filter); 869 } 870 if (specs.sawSource) { 871 buf.append(ANY, 3).append(TARGET_SEP); 872 } 873 buf.append(*inverseTarget); 874 875 UnicodeString basicID(TRUE, ANY, 3); 876 basicID.append(TARGET_SEP).append(*inverseTarget); 877 878 if (specs.variant.length() != 0) { 879 buf.append(VARIANT_SEP).append(specs.variant); 880 basicID.append(VARIANT_SEP).append(specs.variant); 881 } 882 return new SingleID(buf, basicID); 883 } 884 return NULL; 885 } 886 887 /** 888 * Glue method to get around access problems in C++. This would 889 * ideally be inline but we want to avoid a circular header 890 * dependency. 891 */ 892 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 893 return Transliterator::createBasicInstance(id, canonID); 894 } 895 896 /** 897 * Initialize static memory. 898 */ 899 void TransliteratorIDParser::init(UErrorCode &status) { 900 if (SPECIAL_INVERSES != NULL) { 901 return; 902 } 903 904 Hashtable* special_inverses = new Hashtable(TRUE, status); 905 // Null pointer check 906 if (special_inverses == NULL) { 907 status = U_MEMORY_ALLOCATION_ERROR; 908 return; 909 } 910 special_inverses->setValueDeleter(uprv_deleteUObject); 911 912 umtx_lock(&LOCK); 913 if (SPECIAL_INVERSES == NULL) { 914 SPECIAL_INVERSES = special_inverses; 915 special_inverses = NULL; 916 } 917 umtx_unlock(&LOCK); 918 delete special_inverses; /*null instance*/ 919 920 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); 921 } 922 923 /** 924 * Free static memory. 925 */ 926 void TransliteratorIDParser::cleanup() { 927 if (SPECIAL_INVERSES) { 928 delete SPECIAL_INVERSES; 929 SPECIAL_INVERSES = NULL; 930 } 931 } 932 933 U_NAMESPACE_END 934 935 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 936 937 //eof 938