1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2014, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 01/14/2002 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "tridpars.h" 16 #include "hash.h" 17 #include "mutex.h" 18 #include "transreg.h" 19 #include "uassert.h" 20 #include "ucln_in.h" 21 #include "unicode/parsepos.h" 22 #include "unicode/translit.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uniset.h" 25 #include "unicode/unistr.h" 26 #include "unicode/utrans.h" 27 #include "util.h" 28 #include "uvector.h" 29 30 U_NAMESPACE_BEGIN 31 32 static const UChar ID_DELIM = 0x003B; // ; 33 static const UChar TARGET_SEP = 0x002D; // - 34 static const UChar VARIANT_SEP = 0x002F; // / 35 static const UChar OPEN_REV = 0x0028; // ( 36 static const UChar CLOSE_REV = 0x0029; // ) 37 38 //static const UChar EMPTY[] = {0}; // "" 39 static const UChar ANY[] = {65,110,121,0}; // "Any" 40 static const UChar ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" 41 42 static const int32_t FORWARD = UTRANS_FORWARD; 43 static const int32_t REVERSE = UTRANS_REVERSE; 44 45 static Hashtable* SPECIAL_INVERSES = NULL; 46 static UInitOnce gSpecialInversesInitOnce = U_INITONCE_INITIALIZER; 47 48 /** 49 * The mutex controlling access to SPECIAL_INVERSES 50 */ 51 static UMutex LOCK = U_MUTEX_INITIALIZER; 52 53 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, 54 const UnicodeString& v, UBool sawS, 55 const UnicodeString& f) { 56 source = s; 57 target = t; 58 variant = v; 59 sawSource = sawS; 60 filter = f; 61 } 62 63 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, 64 const UnicodeString& f) { 65 canonID = c; 66 basicID = b; 67 filter = f; 68 } 69 70 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) { 71 canonID = c; 72 basicID = b; 73 } 74 75 Transliterator* TransliteratorIDParser::SingleID::createInstance() { 76 Transliterator* t; 77 if (basicID.length() == 0) { 78 t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID); 79 } else { 80 t = createBasicInstance(basicID, &canonID); 81 } 82 if (t != NULL) { 83 if (filter.length() != 0) { 84 UErrorCode ec = U_ZERO_ERROR; 85 UnicodeSet *set = new UnicodeSet(filter, ec); 86 if (U_FAILURE(ec)) { 87 delete set; 88 } else { 89 t->adoptFilter(set); 90 } 91 } 92 } 93 return t; 94 } 95 96 97 /** 98 * Parse a single ID, that is, an ID of the general form 99 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element 100 * optional, the filters optional, and the variants optional. 101 * @param id the id to be parsed 102 * @param pos INPUT-OUTPUT parameter. On input, the position of 103 * the first character to parse. On output, the position after 104 * the last character parsed. 105 * @param dir the direction. If the direction is REVERSE then the 106 * SingleID is constructed for the reverse direction. 107 * @return a SingleID object or NULL 108 */ 109 TransliteratorIDParser::SingleID* 110 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, 111 int32_t dir, UErrorCode& status) { 112 113 int32_t start = pos; 114 115 // The ID will be of the form A, A(), A(B), or (B), where 116 // A and B are filter IDs. 117 Specs* specsA = NULL; 118 Specs* specsB = NULL; 119 UBool sawParen = FALSE; 120 121 // On the first pass, look for (B) or (). If this fails, then 122 // on the second pass, look for A, A(B), or A(). 123 for (int32_t pass=1; pass<=2; ++pass) { 124 if (pass == 2) { 125 specsA = parseFilterID(id, pos, TRUE); 126 if (specsA == NULL) { 127 pos = start; 128 return NULL; 129 } 130 } 131 if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { 132 sawParen = TRUE; 133 if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 134 specsB = parseFilterID(id, pos, TRUE); 135 // Must close with a ')' 136 if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 137 delete specsA; 138 pos = start; 139 return NULL; 140 } 141 } 142 break; 143 } 144 } 145 146 // Assemble return results 147 SingleID* single; 148 if (sawParen) { 149 if (dir == FORWARD) { 150 SingleID* b = specsToID(specsB, FORWARD); 151 single = specsToID(specsA, FORWARD); 152 // Null pointers check 153 if (b == NULL || single == NULL) { 154 delete b; 155 delete single; 156 status = U_MEMORY_ALLOCATION_ERROR; 157 return NULL; 158 } 159 single->canonID.append(OPEN_REV) 160 .append(b->canonID).append(CLOSE_REV); 161 if (specsA != NULL) { 162 single->filter = specsA->filter; 163 } 164 delete b; 165 } else { 166 SingleID* a = specsToID(specsA, FORWARD); 167 single = specsToID(specsB, FORWARD); 168 // Check for null pointer. 169 if (a == NULL || single == NULL) { 170 delete a; 171 delete single; 172 status = U_MEMORY_ALLOCATION_ERROR; 173 return NULL; 174 } 175 single->canonID.append(OPEN_REV) 176 .append(a->canonID).append(CLOSE_REV); 177 if (specsB != NULL) { 178 single->filter = specsB->filter; 179 } 180 delete a; 181 } 182 } else { 183 // assert(specsA != NULL); 184 if (dir == FORWARD) { 185 single = specsToID(specsA, FORWARD); 186 } else { 187 single = specsToSpecialInverse(*specsA, status); 188 if (single == NULL) { 189 single = specsToID(specsA, REVERSE); 190 } 191 } 192 // Check for NULL pointer 193 if (single == NULL) { 194 status = U_MEMORY_ALLOCATION_ERROR; 195 return NULL; 196 } 197 single->filter = specsA->filter; 198 } 199 200 delete specsA; 201 delete specsB; 202 203 return single; 204 } 205 206 /** 207 * Parse a filter ID, that is, an ID of the general form 208 * "[f1] s1-t1/v1", with the filters optional, and the variants optional. 209 * @param id the id to be parsed 210 * @param pos INPUT-OUTPUT parameter. On input, the position of 211 * the first character to parse. On output, the position after 212 * the last character parsed. 213 * @return a SingleID object or null if the parse fails 214 */ 215 TransliteratorIDParser::SingleID* 216 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { 217 218 int32_t start = pos; 219 220 Specs* specs = parseFilterID(id, pos, TRUE); 221 if (specs == NULL) { 222 pos = start; 223 return NULL; 224 } 225 226 // Assemble return results 227 SingleID* single = specsToID(specs, FORWARD); 228 if (single != NULL) { 229 single->filter = specs->filter; 230 } 231 delete specs; 232 return single; 233 } 234 235 /** 236 * Parse a global filter of the form "[f]" or "([f])", depending 237 * on 'withParens'. 238 * @param id the pattern the parse 239 * @param pos INPUT-OUTPUT parameter. On input, the position of 240 * the first character to parse. On output, the position after 241 * the last character parsed. 242 * @param dir the direction. 243 * @param withParens INPUT-OUTPUT parameter. On entry, if 244 * withParens is 0, then parens are disallowed. If it is 1, 245 * then parens are requires. If it is -1, then parens are 246 * optional, and the return result will be set to 0 or 1. 247 * @param canonID OUTPUT parameter. The pattern for the filter 248 * added to the canonID, either at the end, if dir is FORWARD, or 249 * at the start, if dir is REVERSE. The pattern will be enclosed 250 * in parentheses if appropriate, and will be suffixed with an 251 * ID_DELIM character. May be NULL. 252 * @return a UnicodeSet object or NULL. A non-NULL results 253 * indicates a successful parse, regardless of whether the filter 254 * applies to the given direction. The caller should discard it 255 * if withParens != (dir == REVERSE). 256 */ 257 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, 258 int32_t dir, 259 int32_t& withParens, 260 UnicodeString* canonID) { 261 UnicodeSet* filter = NULL; 262 int32_t start = pos; 263 264 if (withParens == -1) { 265 withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0; 266 } else if (withParens == 1) { 267 if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { 268 pos = start; 269 return NULL; 270 } 271 } 272 273 ICU_Utility::skipWhitespace(id, pos, TRUE); 274 275 if (UnicodeSet::resemblesPattern(id, pos)) { 276 ParsePosition ppos(pos); 277 UErrorCode ec = U_ZERO_ERROR; 278 filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec); 279 /* test for NULL */ 280 if (filter == 0) { 281 pos = start; 282 return 0; 283 } 284 if (U_FAILURE(ec)) { 285 delete filter; 286 pos = start; 287 return NULL; 288 } 289 290 UnicodeString pattern; 291 id.extractBetween(pos, ppos.getIndex(), pattern); 292 pos = ppos.getIndex(); 293 294 if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 295 pos = start; 296 return NULL; 297 } 298 299 // In the forward direction, append the pattern to the 300 // canonID. In the reverse, insert it at zero, and invert 301 // the presence of parens ("A" <-> "(A)"). 302 if (canonID != NULL) { 303 if (dir == FORWARD) { 304 if (withParens == 1) { 305 pattern.insert(0, OPEN_REV); 306 pattern.append(CLOSE_REV); 307 } 308 canonID->append(pattern).append(ID_DELIM); 309 } else { 310 if (withParens == 0) { 311 pattern.insert(0, OPEN_REV); 312 pattern.append(CLOSE_REV); 313 } 314 canonID->insert(0, pattern); 315 canonID->insert(pattern.length(), ID_DELIM); 316 } 317 } 318 } 319 320 return filter; 321 } 322 323 U_CDECL_BEGIN 324 static void U_CALLCONV _deleteSingleID(void* obj) { 325 delete (TransliteratorIDParser::SingleID*) obj; 326 } 327 328 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { 329 delete (Transliterator*) obj; 330 } 331 U_CDECL_END 332 333 /** 334 * Parse a compound ID, consisting of an optional forward global 335 * filter, a separator, one or more single IDs delimited by 336 * separators, an an optional reverse global filter. The 337 * separator is a semicolon. The global filters are UnicodeSet 338 * patterns. The reverse global filter must be enclosed in 339 * parentheses. 340 * @param id the pattern the parse 341 * @param dir the direction. 342 * @param canonID OUTPUT parameter that receives the canonical ID, 343 * consisting of canonical IDs for all elements, as returned by 344 * parseSingleID(), separated by semicolons. Previous contents 345 * are discarded. 346 * @param list OUTPUT parameter that receives a list of SingleID 347 * objects representing the parsed IDs. Previous contents are 348 * discarded. 349 * @param globalFilter OUTPUT parameter that receives a pointer to 350 * a newly created global filter for this ID in this direction, or 351 * NULL if there is none. 352 * @return TRUE if the parse succeeds, that is, if the entire 353 * id is consumed without syntax error. 354 */ 355 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, 356 UnicodeString& canonID, 357 UVector& list, 358 UnicodeSet*& globalFilter) { 359 UErrorCode ec = U_ZERO_ERROR; 360 int32_t i; 361 int32_t pos = 0; 362 int32_t withParens = 1; 363 list.removeAllElements(); 364 UnicodeSet* filter; 365 globalFilter = NULL; 366 canonID.truncate(0); 367 368 // Parse leading global filter, if any 369 withParens = 0; // parens disallowed 370 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 371 if (filter != NULL) { 372 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 373 // Not a global filter; backup and resume 374 canonID.truncate(0); 375 pos = 0; 376 } 377 if (dir == FORWARD) { 378 globalFilter = filter; 379 } else { 380 delete filter; 381 } 382 filter = NULL; 383 } 384 385 UBool sawDelimiter = TRUE; 386 for (;;) { 387 SingleID* single = parseSingleID(id, pos, dir, ec); 388 if (single == NULL) { 389 break; 390 } 391 if (dir == FORWARD) { 392 list.addElement(single, ec); 393 } else { 394 list.insertElementAt(single, 0, ec); 395 } 396 if (U_FAILURE(ec)) { 397 goto FAIL; 398 } 399 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 400 sawDelimiter = FALSE; 401 break; 402 } 403 } 404 405 if (list.size() == 0) { 406 goto FAIL; 407 } 408 409 // Construct canonical ID 410 for (i=0; i<list.size(); ++i) { 411 SingleID* single = (SingleID*) list.elementAt(i); 412 canonID.append(single->canonID); 413 if (i != (list.size()-1)) { 414 canonID.append(ID_DELIM); 415 } 416 } 417 418 // Parse trailing global filter, if any, and only if we saw 419 // a trailing delimiter after the IDs. 420 if (sawDelimiter) { 421 withParens = 1; // parens required 422 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 423 if (filter != NULL) { 424 // Don't require trailing ';', but parse it if present 425 ICU_Utility::parseChar(id, pos, ID_DELIM); 426 427 if (dir == REVERSE) { 428 globalFilter = filter; 429 } else { 430 delete filter; 431 } 432 filter = NULL; 433 } 434 } 435 436 // Trailing unparsed text is a syntax error 437 ICU_Utility::skipWhitespace(id, pos, TRUE); 438 if (pos != id.length()) { 439 goto FAIL; 440 } 441 442 return TRUE; 443 444 FAIL: 445 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 446 list.removeAllElements(); 447 list.setDeleter(save); 448 delete globalFilter; 449 globalFilter = NULL; 450 return FALSE; 451 } 452 453 /** 454 * Convert the elements of the 'list' vector, which are SingleID 455 * objects, into actual Transliterator objects. In the course of 456 * this, some (or all) entries may be removed. If all entries 457 * are removed, the NULL transliterator will be added. 458 * 459 * Delete entries with empty basicIDs; these are generated by 460 * elements like "(A)" in the forward direction, or "A()" in 461 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert 462 * SingleID entries to actual transliterators. 463 * 464 * @param list vector of SingleID objects. On exit, vector 465 * of one or more Transliterators. 466 * @return new value of insertIndex. The index will shift if 467 * there are empty items, like "(Lower)", with indices less than 468 * insertIndex. 469 */ 470 void TransliteratorIDParser::instantiateList(UVector& list, 471 UErrorCode& ec) { 472 UVector tlist(ec); 473 if (U_FAILURE(ec)) { 474 goto RETURN; 475 } 476 tlist.setDeleter(_deleteTransliteratorTrIDPars); 477 478 Transliterator* t; 479 int32_t i; 480 for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() 481 // We run the loop too long by one, so we can 482 // do an insert after the last element 483 if (i==list.size()) { 484 break; 485 } 486 487 SingleID* single = (SingleID*) list.elementAt(i); 488 if (single->basicID.length() != 0) { 489 t = single->createInstance(); 490 if (t == NULL) { 491 ec = U_INVALID_ID; 492 goto RETURN; 493 } 494 tlist.addElement(t, ec); 495 if (U_FAILURE(ec)) { 496 delete t; 497 goto RETURN; 498 } 499 } 500 } 501 502 // An empty list is equivalent to a NULL transliterator. 503 if (tlist.size() == 0) { 504 t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL); 505 if (t == NULL) { 506 // Should never happen 507 ec = U_INTERNAL_TRANSLITERATOR_ERROR; 508 } 509 tlist.addElement(t, ec); 510 if (U_FAILURE(ec)) { 511 delete t; 512 } 513 } 514 515 RETURN: 516 517 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 518 list.removeAllElements(); 519 520 if (U_SUCCESS(ec)) { 521 list.setDeleter(_deleteTransliteratorTrIDPars); 522 523 while (tlist.size() > 0) { 524 t = (Transliterator*) tlist.orphanElementAt(0); 525 list.addElement(t, ec); 526 if (U_FAILURE(ec)) { 527 delete t; 528 list.removeAllElements(); 529 break; 530 } 531 } 532 } 533 534 list.setDeleter(save); 535 } 536 537 /** 538 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, 539 * S-T/V, or S/V-T. If the source is missing, return a source of 540 * ANY. 541 * @param id the id string, in any of several forms 542 * @return an array of 4 strings: source, target, variant, and 543 * isSourcePresent. If the source is not present, ANY will be 544 * given as the source, and isSourcePresent will be NULL. Otherwise 545 * isSourcePresent will be non-NULL. The target may be empty if the 546 * id is not well-formed. The variant may be empty. 547 */ 548 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, 549 UnicodeString& source, 550 UnicodeString& target, 551 UnicodeString& variant, 552 UBool& isSourcePresent) { 553 source.setTo(ANY, 3); 554 target.truncate(0); 555 variant.truncate(0); 556 557 int32_t sep = id.indexOf(TARGET_SEP); 558 int32_t var = id.indexOf(VARIANT_SEP); 559 if (var < 0) { 560 var = id.length(); 561 } 562 isSourcePresent = FALSE; 563 564 if (sep < 0) { 565 // Form: T/V or T (or /V) 566 id.extractBetween(0, var, target); 567 id.extractBetween(var, id.length(), variant); 568 } else if (sep < var) { 569 // Form: S-T/V or S-T (or -T/V or -T) 570 if (sep > 0) { 571 id.extractBetween(0, sep, source); 572 isSourcePresent = TRUE; 573 } 574 id.extractBetween(++sep, var, target); 575 id.extractBetween(var, id.length(), variant); 576 } else { 577 // Form: (S/V-T or /V-T) 578 if (var > 0) { 579 id.extractBetween(0, var, source); 580 isSourcePresent = TRUE; 581 } 582 id.extractBetween(var, sep++, variant); 583 id.extractBetween(sep, id.length(), target); 584 } 585 586 if (variant.length() > 0) { 587 variant.remove(0, 1); 588 } 589 } 590 591 /** 592 * Given source, target, and variant strings, concatenate them into a 593 * full ID. If the source is empty, then "Any" will be used for the 594 * source, so the ID will always be of the form s-t/v or s-t. 595 */ 596 void TransliteratorIDParser::STVtoID(const UnicodeString& source, 597 const UnicodeString& target, 598 const UnicodeString& variant, 599 UnicodeString& id) { 600 id = source; 601 if (id.length() == 0) { 602 id.setTo(ANY, 3); 603 } 604 id.append(TARGET_SEP).append(target); 605 if (variant.length() != 0) { 606 id.append(VARIANT_SEP).append(variant); 607 } 608 // NUL-terminate the ID string for getTerminatedBuffer. 609 // This prevents valgrind and Purify warnings. 610 id.append((UChar)0); 611 id.truncate(id.length()-1); 612 } 613 614 /** 615 * Register two targets as being inverses of one another. For 616 * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes 617 * Transliterator to form the following inverse relationships: 618 * 619 * <pre>NFC => NFD 620 * Any-NFC => Any-NFD 621 * NFD => NFC 622 * Any-NFD => Any-NFC</pre> 623 * 624 * (Without the special inverse registration, the inverse of NFC 625 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 626 * that the presence or absence of "Any-" is preserved. 627 * 628 * <p>The relationship is symmetrical; registering (a, b) is 629 * equivalent to registering (b, a). 630 * 631 * <p>The relevant IDs must still be registered separately as 632 * factories or classes. 633 * 634 * <p>Only the targets are specified. Special inverses always 635 * have the form Any-Target1 <=> Any-Target2. The target should 636 * have canonical casing (the casing desired to be produced when 637 * an inverse is formed) and should contain no whitespace or other 638 * extraneous characters. 639 * 640 * @param target the target against which to register the inverse 641 * @param inverseTarget the inverse of target, that is 642 * Any-target.getInverse() => Any-inverseTarget 643 * @param bidirectional if TRUE, register the reverse relation 644 * as well, that is, Any-inverseTarget.getInverse() => Any-target 645 */ 646 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, 647 const UnicodeString& inverseTarget, 648 UBool bidirectional, 649 UErrorCode &status) { 650 umtx_initOnce(gSpecialInversesInitOnce, init, status); 651 if (U_FAILURE(status)) { 652 return; 653 } 654 655 // If target == inverseTarget then force bidirectional => FALSE 656 if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { 657 bidirectional = FALSE; 658 } 659 660 Mutex lock(&LOCK); 661 662 UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. 663 if (tempus == NULL) { 664 status = U_MEMORY_ALLOCATION_ERROR; 665 return; 666 } 667 SPECIAL_INVERSES->put(target, tempus, status); 668 if (bidirectional) { 669 tempus = new UnicodeString(target); 670 if (tempus == NULL) { 671 status = U_MEMORY_ALLOCATION_ERROR; 672 return; 673 } 674 SPECIAL_INVERSES->put(inverseTarget, tempus, status); 675 } 676 } 677 678 //---------------------------------------------------------------- 679 // Private implementation 680 //---------------------------------------------------------------- 681 682 /** 683 * Parse an ID into component pieces. Take IDs of the form T, 684 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a 685 * source of ANY. 686 * @param id the id string, in any of several forms 687 * @param pos INPUT-OUTPUT parameter. On input, pos is the 688 * offset of the first character to parse in id. On output, 689 * pos is the offset after the last parsed character. If the 690 * parse failed, pos will be unchanged. 691 * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed 692 * at any location between specs or delimiters, and is returned 693 * as the fifth string in the array. 694 * @return a Specs object, or NULL if the parse failed. If 695 * neither source nor target was seen in the parsed id, then the 696 * parse fails. If allowFilter is TRUE, then the parsed filter 697 * pattern is returned in the Specs object, otherwise the returned 698 * filter reference is NULL. If the parse fails for any reason 699 * NULL is returned. 700 */ 701 TransliteratorIDParser::Specs* 702 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos, 703 UBool allowFilter) { 704 UnicodeString first; 705 UnicodeString source; 706 UnicodeString target; 707 UnicodeString variant; 708 UnicodeString filter; 709 UChar delimiter = 0; 710 int32_t specCount = 0; 711 int32_t start = pos; 712 713 // This loop parses one of the following things with each 714 // pass: a filter, a delimiter character (either '-' or '/'), 715 // or a spec (source, target, or variant). 716 for (;;) { 717 ICU_Utility::skipWhitespace(id, pos, TRUE); 718 if (pos == id.length()) { 719 break; 720 } 721 722 // Parse filters 723 if (allowFilter && filter.length() == 0 && 724 UnicodeSet::resemblesPattern(id, pos)) { 725 726 ParsePosition ppos(pos); 727 UErrorCode ec = U_ZERO_ERROR; 728 UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec); 729 if (U_FAILURE(ec)) { 730 pos = start; 731 return NULL; 732 } 733 id.extractBetween(pos, ppos.getIndex(), filter); 734 pos = ppos.getIndex(); 735 continue; 736 } 737 738 if (delimiter == 0) { 739 UChar c = id.charAt(pos); 740 if ((c == TARGET_SEP && target.length() == 0) || 741 (c == VARIANT_SEP && variant.length() == 0)) { 742 delimiter = c; 743 ++pos; 744 continue; 745 } 746 } 747 748 // We are about to try to parse a spec with no delimiter 749 // when we can no longer do so (we can only do so at the 750 // start); break. 751 if (delimiter == 0 && specCount > 0) { 752 break; 753 } 754 755 UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos); 756 if (spec.length() == 0) { 757 // Note that if there was a trailing delimiter, we 758 // consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar- 759 // are legal. 760 break; 761 } 762 763 switch (delimiter) { 764 case 0: 765 first = spec; 766 break; 767 case TARGET_SEP: 768 target = spec; 769 break; 770 case VARIANT_SEP: 771 variant = spec; 772 break; 773 } 774 ++specCount; 775 delimiter = 0; 776 } 777 778 // A spec with no prior character is either source or target, 779 // depending on whether an explicit "-target" was seen. 780 if (first.length() != 0) { 781 if (target.length() == 0) { 782 target = first; 783 } else { 784 source = first; 785 } 786 } 787 788 // Must have either source or target 789 if (source.length() == 0 && target.length() == 0) { 790 pos = start; 791 return NULL; 792 } 793 794 // Empty source or target defaults to ANY 795 UBool sawSource = TRUE; 796 if (source.length() == 0) { 797 source.setTo(ANY, 3); 798 sawSource = FALSE; 799 } 800 if (target.length() == 0) { 801 target.setTo(ANY, 3); 802 } 803 804 return new Specs(source, target, variant, sawSource, filter); 805 } 806 807 /** 808 * Givens a Spec object, convert it to a SingleID object. The 809 * Spec object is a more unprocessed parse result. The SingleID 810 * object contains information about canonical and basic IDs. 811 * @return a SingleID; never returns NULL. Returned object always 812 * has 'filter' field of NULL. 813 */ 814 TransliteratorIDParser::SingleID* 815 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) { 816 UnicodeString canonID; 817 UnicodeString basicID; 818 UnicodeString basicPrefix; 819 if (specs != NULL) { 820 UnicodeString buf; 821 if (dir == FORWARD) { 822 if (specs->sawSource) { 823 buf.append(specs->source).append(TARGET_SEP); 824 } else { 825 basicPrefix = specs->source; 826 basicPrefix.append(TARGET_SEP); 827 } 828 buf.append(specs->target); 829 } else { 830 buf.append(specs->target).append(TARGET_SEP).append(specs->source); 831 } 832 if (specs->variant.length() != 0) { 833 buf.append(VARIANT_SEP).append(specs->variant); 834 } 835 basicID = basicPrefix; 836 basicID.append(buf); 837 if (specs->filter.length() != 0) { 838 buf.insert(0, specs->filter); 839 } 840 canonID = buf; 841 } 842 return new SingleID(canonID, basicID); 843 } 844 845 /** 846 * Given a Specs object, return a SingleID representing the 847 * special inverse of that ID. If there is no special inverse 848 * then return NULL. 849 * @return a SingleID or NULL. Returned object always has 850 * 'filter' field of NULL. 851 */ 852 TransliteratorIDParser::SingleID* 853 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) { 854 if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) { 855 return NULL; 856 } 857 umtx_initOnce(gSpecialInversesInitOnce, init, status); 858 if (U_FAILURE(status)) { 859 return NULL; 860 } 861 862 UnicodeString* inverseTarget; 863 864 umtx_lock(&LOCK); 865 inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target); 866 umtx_unlock(&LOCK); 867 868 if (inverseTarget != NULL) { 869 // If the original ID contained "Any-" then make the 870 // special inverse "Any-Foo"; otherwise make it "Foo". 871 // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD". 872 UnicodeString buf; 873 if (specs.filter.length() != 0) { 874 buf.append(specs.filter); 875 } 876 if (specs.sawSource) { 877 buf.append(ANY, 3).append(TARGET_SEP); 878 } 879 buf.append(*inverseTarget); 880 881 UnicodeString basicID(TRUE, ANY, 3); 882 basicID.append(TARGET_SEP).append(*inverseTarget); 883 884 if (specs.variant.length() != 0) { 885 buf.append(VARIANT_SEP).append(specs.variant); 886 basicID.append(VARIANT_SEP).append(specs.variant); 887 } 888 return new SingleID(buf, basicID); 889 } 890 return NULL; 891 } 892 893 /** 894 * Glue method to get around access problems in C++. This would 895 * ideally be inline but we want to avoid a circular header 896 * dependency. 897 */ 898 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 899 return Transliterator::createBasicInstance(id, canonID); 900 } 901 902 /** 903 * Initialize static memory. Called through umtx_initOnce only. 904 */ 905 void TransliteratorIDParser::init(UErrorCode &status) { 906 U_ASSERT(SPECIAL_INVERSES == NULL); 907 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); 908 909 SPECIAL_INVERSES = new Hashtable(TRUE, status); 910 if (SPECIAL_INVERSES == NULL) { 911 status = U_MEMORY_ALLOCATION_ERROR; 912 return; 913 } 914 SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject); 915 } 916 917 /** 918 * Free static memory. 919 */ 920 void TransliteratorIDParser::cleanup() { 921 if (SPECIAL_INVERSES) { 922 delete SPECIAL_INVERSES; 923 SPECIAL_INVERSES = NULL; 924 } 925 gSpecialInversesInitOnce.reset(); 926 } 927 928 U_NAMESPACE_END 929 930 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 931 932 //eof 933