1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2014, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 01/14/2002 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "tridpars.h" 18 #include "hash.h" 19 #include "mutex.h" 20 #include "transreg.h" 21 #include "uassert.h" 22 #include "ucln_in.h" 23 #include "unicode/parsepos.h" 24 #include "unicode/translit.h" 25 #include "unicode/uchar.h" 26 #include "unicode/uniset.h" 27 #include "unicode/unistr.h" 28 #include "unicode/utrans.h" 29 #include "util.h" 30 #include "uvector.h" 31 32 U_NAMESPACE_BEGIN 33 34 static const UChar ID_DELIM = 0x003B; // ; 35 static const UChar TARGET_SEP = 0x002D; // - 36 static const UChar VARIANT_SEP = 0x002F; // / 37 static const UChar OPEN_REV = 0x0028; // ( 38 static const UChar CLOSE_REV = 0x0029; // ) 39 40 //static const UChar EMPTY[] = {0}; // "" 41 static const UChar ANY[] = {65,110,121,0}; // "Any" 42 static const UChar ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" 43 44 static const int32_t FORWARD = UTRANS_FORWARD; 45 static const int32_t REVERSE = UTRANS_REVERSE; 46 47 static Hashtable* SPECIAL_INVERSES = NULL; 48 static UInitOnce gSpecialInversesInitOnce = U_INITONCE_INITIALIZER; 49 50 /** 51 * The mutex controlling access to SPECIAL_INVERSES 52 */ 53 static UMutex LOCK = U_MUTEX_INITIALIZER; 54 55 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, 56 const UnicodeString& v, UBool sawS, 57 const UnicodeString& f) { 58 source = s; 59 target = t; 60 variant = v; 61 sawSource = sawS; 62 filter = f; 63 } 64 65 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, 66 const UnicodeString& f) { 67 canonID = c; 68 basicID = b; 69 filter = f; 70 } 71 72 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) { 73 canonID = c; 74 basicID = b; 75 } 76 77 Transliterator* TransliteratorIDParser::SingleID::createInstance() { 78 Transliterator* t; 79 if (basicID.length() == 0) { 80 t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), &canonID); 81 } else { 82 t = createBasicInstance(basicID, &canonID); 83 } 84 if (t != NULL) { 85 if (filter.length() != 0) { 86 UErrorCode ec = U_ZERO_ERROR; 87 UnicodeSet *set = new UnicodeSet(filter, ec); 88 if (U_FAILURE(ec)) { 89 delete set; 90 } else { 91 t->adoptFilter(set); 92 } 93 } 94 } 95 return t; 96 } 97 98 99 /** 100 * Parse a single ID, that is, an ID of the general form 101 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element 102 * optional, the filters optional, and the variants optional. 103 * @param id the id to be parsed 104 * @param pos INPUT-OUTPUT parameter. On input, the position of 105 * the first character to parse. On output, the position after 106 * the last character parsed. 107 * @param dir the direction. If the direction is REVERSE then the 108 * SingleID is constructed for the reverse direction. 109 * @return a SingleID object or NULL 110 */ 111 TransliteratorIDParser::SingleID* 112 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, 113 int32_t dir, UErrorCode& status) { 114 115 int32_t start = pos; 116 117 // The ID will be of the form A, A(), A(B), or (B), where 118 // A and B are filter IDs. 119 Specs* specsA = NULL; 120 Specs* specsB = NULL; 121 UBool sawParen = FALSE; 122 123 // On the first pass, look for (B) or (). If this fails, then 124 // on the second pass, look for A, A(B), or A(). 125 for (int32_t pass=1; pass<=2; ++pass) { 126 if (pass == 2) { 127 specsA = parseFilterID(id, pos, TRUE); 128 if (specsA == NULL) { 129 pos = start; 130 return NULL; 131 } 132 } 133 if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { 134 sawParen = TRUE; 135 if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 136 specsB = parseFilterID(id, pos, TRUE); 137 // Must close with a ')' 138 if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 139 delete specsA; 140 pos = start; 141 return NULL; 142 } 143 } 144 break; 145 } 146 } 147 148 // Assemble return results 149 SingleID* single; 150 if (sawParen) { 151 if (dir == FORWARD) { 152 SingleID* b = specsToID(specsB, FORWARD); 153 single = specsToID(specsA, FORWARD); 154 // Null pointers check 155 if (b == NULL || single == NULL) { 156 delete b; 157 delete single; 158 status = U_MEMORY_ALLOCATION_ERROR; 159 return NULL; 160 } 161 single->canonID.append(OPEN_REV) 162 .append(b->canonID).append(CLOSE_REV); 163 if (specsA != NULL) { 164 single->filter = specsA->filter; 165 } 166 delete b; 167 } else { 168 SingleID* a = specsToID(specsA, FORWARD); 169 single = specsToID(specsB, FORWARD); 170 // Check for null pointer. 171 if (a == NULL || single == NULL) { 172 delete a; 173 delete single; 174 status = U_MEMORY_ALLOCATION_ERROR; 175 return NULL; 176 } 177 single->canonID.append(OPEN_REV) 178 .append(a->canonID).append(CLOSE_REV); 179 if (specsB != NULL) { 180 single->filter = specsB->filter; 181 } 182 delete a; 183 } 184 } else { 185 // assert(specsA != NULL); 186 if (dir == FORWARD) { 187 single = specsToID(specsA, FORWARD); 188 } else { 189 single = specsToSpecialInverse(*specsA, status); 190 if (single == NULL) { 191 single = specsToID(specsA, REVERSE); 192 } 193 } 194 // Check for NULL pointer 195 if (single == NULL) { 196 status = U_MEMORY_ALLOCATION_ERROR; 197 return NULL; 198 } 199 single->filter = specsA->filter; 200 } 201 202 delete specsA; 203 delete specsB; 204 205 return single; 206 } 207 208 /** 209 * Parse a filter ID, that is, an ID of the general form 210 * "[f1] s1-t1/v1", with the filters optional, and the variants optional. 211 * @param id the id to be parsed 212 * @param pos INPUT-OUTPUT parameter. On input, the position of 213 * the first character to parse. On output, the position after 214 * the last character parsed. 215 * @return a SingleID object or null if the parse fails 216 */ 217 TransliteratorIDParser::SingleID* 218 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { 219 220 int32_t start = pos; 221 222 Specs* specs = parseFilterID(id, pos, TRUE); 223 if (specs == NULL) { 224 pos = start; 225 return NULL; 226 } 227 228 // Assemble return results 229 SingleID* single = specsToID(specs, FORWARD); 230 if (single != NULL) { 231 single->filter = specs->filter; 232 } 233 delete specs; 234 return single; 235 } 236 237 /** 238 * Parse a global filter of the form "[f]" or "([f])", depending 239 * on 'withParens'. 240 * @param id the pattern the parse 241 * @param pos INPUT-OUTPUT parameter. On input, the position of 242 * the first character to parse. On output, the position after 243 * the last character parsed. 244 * @param dir the direction. 245 * @param withParens INPUT-OUTPUT parameter. On entry, if 246 * withParens is 0, then parens are disallowed. If it is 1, 247 * then parens are requires. If it is -1, then parens are 248 * optional, and the return result will be set to 0 or 1. 249 * @param canonID OUTPUT parameter. The pattern for the filter 250 * added to the canonID, either at the end, if dir is FORWARD, or 251 * at the start, if dir is REVERSE. The pattern will be enclosed 252 * in parentheses if appropriate, and will be suffixed with an 253 * ID_DELIM character. May be NULL. 254 * @return a UnicodeSet object or NULL. A non-NULL results 255 * indicates a successful parse, regardless of whether the filter 256 * applies to the given direction. The caller should discard it 257 * if withParens != (dir == REVERSE). 258 */ 259 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, 260 int32_t dir, 261 int32_t& withParens, 262 UnicodeString* canonID) { 263 UnicodeSet* filter = NULL; 264 int32_t start = pos; 265 266 if (withParens == -1) { 267 withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0; 268 } else if (withParens == 1) { 269 if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { 270 pos = start; 271 return NULL; 272 } 273 } 274 275 ICU_Utility::skipWhitespace(id, pos, TRUE); 276 277 if (UnicodeSet::resemblesPattern(id, pos)) { 278 ParsePosition ppos(pos); 279 UErrorCode ec = U_ZERO_ERROR; 280 filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec); 281 /* test for NULL */ 282 if (filter == 0) { 283 pos = start; 284 return 0; 285 } 286 if (U_FAILURE(ec)) { 287 delete filter; 288 pos = start; 289 return NULL; 290 } 291 292 UnicodeString pattern; 293 id.extractBetween(pos, ppos.getIndex(), pattern); 294 pos = ppos.getIndex(); 295 296 if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 297 pos = start; 298 return NULL; 299 } 300 301 // In the forward direction, append the pattern to the 302 // canonID. In the reverse, insert it at zero, and invert 303 // the presence of parens ("A" <-> "(A)"). 304 if (canonID != NULL) { 305 if (dir == FORWARD) { 306 if (withParens == 1) { 307 pattern.insert(0, OPEN_REV); 308 pattern.append(CLOSE_REV); 309 } 310 canonID->append(pattern).append(ID_DELIM); 311 } else { 312 if (withParens == 0) { 313 pattern.insert(0, OPEN_REV); 314 pattern.append(CLOSE_REV); 315 } 316 canonID->insert(0, pattern); 317 canonID->insert(pattern.length(), ID_DELIM); 318 } 319 } 320 } 321 322 return filter; 323 } 324 325 U_CDECL_BEGIN 326 static void U_CALLCONV _deleteSingleID(void* obj) { 327 delete (TransliteratorIDParser::SingleID*) obj; 328 } 329 330 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { 331 delete (Transliterator*) obj; 332 } 333 U_CDECL_END 334 335 /** 336 * Parse a compound ID, consisting of an optional forward global 337 * filter, a separator, one or more single IDs delimited by 338 * separators, an an optional reverse global filter. The 339 * separator is a semicolon. The global filters are UnicodeSet 340 * patterns. The reverse global filter must be enclosed in 341 * parentheses. 342 * @param id the pattern the parse 343 * @param dir the direction. 344 * @param canonID OUTPUT parameter that receives the canonical ID, 345 * consisting of canonical IDs for all elements, as returned by 346 * parseSingleID(), separated by semicolons. Previous contents 347 * are discarded. 348 * @param list OUTPUT parameter that receives a list of SingleID 349 * objects representing the parsed IDs. Previous contents are 350 * discarded. 351 * @param globalFilter OUTPUT parameter that receives a pointer to 352 * a newly created global filter for this ID in this direction, or 353 * NULL if there is none. 354 * @return TRUE if the parse succeeds, that is, if the entire 355 * id is consumed without syntax error. 356 */ 357 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, 358 UnicodeString& canonID, 359 UVector& list, 360 UnicodeSet*& globalFilter) { 361 UErrorCode ec = U_ZERO_ERROR; 362 int32_t i; 363 int32_t pos = 0; 364 int32_t withParens = 1; 365 list.removeAllElements(); 366 UnicodeSet* filter; 367 globalFilter = NULL; 368 canonID.truncate(0); 369 370 // Parse leading global filter, if any 371 withParens = 0; // parens disallowed 372 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 373 if (filter != NULL) { 374 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 375 // Not a global filter; backup and resume 376 canonID.truncate(0); 377 pos = 0; 378 } 379 if (dir == FORWARD) { 380 globalFilter = filter; 381 } else { 382 delete filter; 383 } 384 filter = NULL; 385 } 386 387 UBool sawDelimiter = TRUE; 388 for (;;) { 389 SingleID* single = parseSingleID(id, pos, dir, ec); 390 if (single == NULL) { 391 break; 392 } 393 if (dir == FORWARD) { 394 list.addElement(single, ec); 395 } else { 396 list.insertElementAt(single, 0, ec); 397 } 398 if (U_FAILURE(ec)) { 399 goto FAIL; 400 } 401 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 402 sawDelimiter = FALSE; 403 break; 404 } 405 } 406 407 if (list.size() == 0) { 408 goto FAIL; 409 } 410 411 // Construct canonical ID 412 for (i=0; i<list.size(); ++i) { 413 SingleID* single = (SingleID*) list.elementAt(i); 414 canonID.append(single->canonID); 415 if (i != (list.size()-1)) { 416 canonID.append(ID_DELIM); 417 } 418 } 419 420 // Parse trailing global filter, if any, and only if we saw 421 // a trailing delimiter after the IDs. 422 if (sawDelimiter) { 423 withParens = 1; // parens required 424 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 425 if (filter != NULL) { 426 // Don't require trailing ';', but parse it if present 427 ICU_Utility::parseChar(id, pos, ID_DELIM); 428 429 if (dir == REVERSE) { 430 globalFilter = filter; 431 } else { 432 delete filter; 433 } 434 filter = NULL; 435 } 436 } 437 438 // Trailing unparsed text is a syntax error 439 ICU_Utility::skipWhitespace(id, pos, TRUE); 440 if (pos != id.length()) { 441 goto FAIL; 442 } 443 444 return TRUE; 445 446 FAIL: 447 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 448 list.removeAllElements(); 449 list.setDeleter(save); 450 delete globalFilter; 451 globalFilter = NULL; 452 return FALSE; 453 } 454 455 /** 456 * Convert the elements of the 'list' vector, which are SingleID 457 * objects, into actual Transliterator objects. In the course of 458 * this, some (or all) entries may be removed. If all entries 459 * are removed, the NULL transliterator will be added. 460 * 461 * Delete entries with empty basicIDs; these are generated by 462 * elements like "(A)" in the forward direction, or "A()" in 463 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert 464 * SingleID entries to actual transliterators. 465 * 466 * @param list vector of SingleID objects. On exit, vector 467 * of one or more Transliterators. 468 * @return new value of insertIndex. The index will shift if 469 * there are empty items, like "(Lower)", with indices less than 470 * insertIndex. 471 */ 472 void TransliteratorIDParser::instantiateList(UVector& list, 473 UErrorCode& ec) { 474 UVector tlist(ec); 475 if (U_FAILURE(ec)) { 476 goto RETURN; 477 } 478 tlist.setDeleter(_deleteTransliteratorTrIDPars); 479 480 Transliterator* t; 481 int32_t i; 482 for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() 483 // We run the loop too long by one, so we can 484 // do an insert after the last element 485 if (i==list.size()) { 486 break; 487 } 488 489 SingleID* single = (SingleID*) list.elementAt(i); 490 if (single->basicID.length() != 0) { 491 t = single->createInstance(); 492 if (t == NULL) { 493 ec = U_INVALID_ID; 494 goto RETURN; 495 } 496 tlist.addElement(t, ec); 497 if (U_FAILURE(ec)) { 498 delete t; 499 goto RETURN; 500 } 501 } 502 } 503 504 // An empty list is equivalent to a NULL transliterator. 505 if (tlist.size() == 0) { 506 t = createBasicInstance(UnicodeString(TRUE, ANY_NULL, 8), NULL); 507 if (t == NULL) { 508 // Should never happen 509 ec = U_INTERNAL_TRANSLITERATOR_ERROR; 510 } 511 tlist.addElement(t, ec); 512 if (U_FAILURE(ec)) { 513 delete t; 514 } 515 } 516 517 RETURN: 518 519 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 520 list.removeAllElements(); 521 522 if (U_SUCCESS(ec)) { 523 list.setDeleter(_deleteTransliteratorTrIDPars); 524 525 while (tlist.size() > 0) { 526 t = (Transliterator*) tlist.orphanElementAt(0); 527 list.addElement(t, ec); 528 if (U_FAILURE(ec)) { 529 delete t; 530 list.removeAllElements(); 531 break; 532 } 533 } 534 } 535 536 list.setDeleter(save); 537 } 538 539 /** 540 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, 541 * S-T/V, or S/V-T. If the source is missing, return a source of 542 * ANY. 543 * @param id the id string, in any of several forms 544 * @return an array of 4 strings: source, target, variant, and 545 * isSourcePresent. If the source is not present, ANY will be 546 * given as the source, and isSourcePresent will be NULL. Otherwise 547 * isSourcePresent will be non-NULL. The target may be empty if the 548 * id is not well-formed. The variant may be empty. 549 */ 550 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, 551 UnicodeString& source, 552 UnicodeString& target, 553 UnicodeString& variant, 554 UBool& isSourcePresent) { 555 source.setTo(ANY, 3); 556 target.truncate(0); 557 variant.truncate(0); 558 559 int32_t sep = id.indexOf(TARGET_SEP); 560 int32_t var = id.indexOf(VARIANT_SEP); 561 if (var < 0) { 562 var = id.length(); 563 } 564 isSourcePresent = FALSE; 565 566 if (sep < 0) { 567 // Form: T/V or T (or /V) 568 id.extractBetween(0, var, target); 569 id.extractBetween(var, id.length(), variant); 570 } else if (sep < var) { 571 // Form: S-T/V or S-T (or -T/V or -T) 572 if (sep > 0) { 573 id.extractBetween(0, sep, source); 574 isSourcePresent = TRUE; 575 } 576 id.extractBetween(++sep, var, target); 577 id.extractBetween(var, id.length(), variant); 578 } else { 579 // Form: (S/V-T or /V-T) 580 if (var > 0) { 581 id.extractBetween(0, var, source); 582 isSourcePresent = TRUE; 583 } 584 id.extractBetween(var, sep++, variant); 585 id.extractBetween(sep, id.length(), target); 586 } 587 588 if (variant.length() > 0) { 589 variant.remove(0, 1); 590 } 591 } 592 593 /** 594 * Given source, target, and variant strings, concatenate them into a 595 * full ID. If the source is empty, then "Any" will be used for the 596 * source, so the ID will always be of the form s-t/v or s-t. 597 */ 598 void TransliteratorIDParser::STVtoID(const UnicodeString& source, 599 const UnicodeString& target, 600 const UnicodeString& variant, 601 UnicodeString& id) { 602 id = source; 603 if (id.length() == 0) { 604 id.setTo(ANY, 3); 605 } 606 id.append(TARGET_SEP).append(target); 607 if (variant.length() != 0) { 608 id.append(VARIANT_SEP).append(variant); 609 } 610 // NUL-terminate the ID string for getTerminatedBuffer. 611 // This prevents valgrind and Purify warnings. 612 id.append((UChar)0); 613 id.truncate(id.length()-1); 614 } 615 616 /** 617 * Register two targets as being inverses of one another. For 618 * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes 619 * Transliterator to form the following inverse relationships: 620 * 621 * <pre>NFC => NFD 622 * Any-NFC => Any-NFD 623 * NFD => NFC 624 * Any-NFD => Any-NFC</pre> 625 * 626 * (Without the special inverse registration, the inverse of NFC 627 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 628 * that the presence or absence of "Any-" is preserved. 629 * 630 * <p>The relationship is symmetrical; registering (a, b) is 631 * equivalent to registering (b, a). 632 * 633 * <p>The relevant IDs must still be registered separately as 634 * factories or classes. 635 * 636 * <p>Only the targets are specified. Special inverses always 637 * have the form Any-Target1 <=> Any-Target2. The target should 638 * have canonical casing (the casing desired to be produced when 639 * an inverse is formed) and should contain no whitespace or other 640 * extraneous characters. 641 * 642 * @param target the target against which to register the inverse 643 * @param inverseTarget the inverse of target, that is 644 * Any-target.getInverse() => Any-inverseTarget 645 * @param bidirectional if TRUE, register the reverse relation 646 * as well, that is, Any-inverseTarget.getInverse() => Any-target 647 */ 648 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, 649 const UnicodeString& inverseTarget, 650 UBool bidirectional, 651 UErrorCode &status) { 652 umtx_initOnce(gSpecialInversesInitOnce, init, status); 653 if (U_FAILURE(status)) { 654 return; 655 } 656 657 // If target == inverseTarget then force bidirectional => FALSE 658 if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { 659 bidirectional = FALSE; 660 } 661 662 Mutex lock(&LOCK); 663 664 UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. 665 if (tempus == NULL) { 666 status = U_MEMORY_ALLOCATION_ERROR; 667 return; 668 } 669 SPECIAL_INVERSES->put(target, tempus, status); 670 if (bidirectional) { 671 tempus = new UnicodeString(target); 672 if (tempus == NULL) { 673 status = U_MEMORY_ALLOCATION_ERROR; 674 return; 675 } 676 SPECIAL_INVERSES->put(inverseTarget, tempus, status); 677 } 678 } 679 680 //---------------------------------------------------------------- 681 // Private implementation 682 //---------------------------------------------------------------- 683 684 /** 685 * Parse an ID into component pieces. Take IDs of the form T, 686 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a 687 * source of ANY. 688 * @param id the id string, in any of several forms 689 * @param pos INPUT-OUTPUT parameter. On input, pos is the 690 * offset of the first character to parse in id. On output, 691 * pos is the offset after the last parsed character. If the 692 * parse failed, pos will be unchanged. 693 * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed 694 * at any location between specs or delimiters, and is returned 695 * as the fifth string in the array. 696 * @return a Specs object, or NULL if the parse failed. If 697 * neither source nor target was seen in the parsed id, then the 698 * parse fails. If allowFilter is TRUE, then the parsed filter 699 * pattern is returned in the Specs object, otherwise the returned 700 * filter reference is NULL. If the parse fails for any reason 701 * NULL is returned. 702 */ 703 TransliteratorIDParser::Specs* 704 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos, 705 UBool allowFilter) { 706 UnicodeString first; 707 UnicodeString source; 708 UnicodeString target; 709 UnicodeString variant; 710 UnicodeString filter; 711 UChar delimiter = 0; 712 int32_t specCount = 0; 713 int32_t start = pos; 714 715 // This loop parses one of the following things with each 716 // pass: a filter, a delimiter character (either '-' or '/'), 717 // or a spec (source, target, or variant). 718 for (;;) { 719 ICU_Utility::skipWhitespace(id, pos, TRUE); 720 if (pos == id.length()) { 721 break; 722 } 723 724 // Parse filters 725 if (allowFilter && filter.length() == 0 && 726 UnicodeSet::resemblesPattern(id, pos)) { 727 728 ParsePosition ppos(pos); 729 UErrorCode ec = U_ZERO_ERROR; 730 UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec); 731 if (U_FAILURE(ec)) { 732 pos = start; 733 return NULL; 734 } 735 id.extractBetween(pos, ppos.getIndex(), filter); 736 pos = ppos.getIndex(); 737 continue; 738 } 739 740 if (delimiter == 0) { 741 UChar c = id.charAt(pos); 742 if ((c == TARGET_SEP && target.length() == 0) || 743 (c == VARIANT_SEP && variant.length() == 0)) { 744 delimiter = c; 745 ++pos; 746 continue; 747 } 748 } 749 750 // We are about to try to parse a spec with no delimiter 751 // when we can no longer do so (we can only do so at the 752 // start); break. 753 if (delimiter == 0 && specCount > 0) { 754 break; 755 } 756 757 UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos); 758 if (spec.length() == 0) { 759 // Note that if there was a trailing delimiter, we 760 // consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar- 761 // are legal. 762 break; 763 } 764 765 switch (delimiter) { 766 case 0: 767 first = spec; 768 break; 769 case TARGET_SEP: 770 target = spec; 771 break; 772 case VARIANT_SEP: 773 variant = spec; 774 break; 775 } 776 ++specCount; 777 delimiter = 0; 778 } 779 780 // A spec with no prior character is either source or target, 781 // depending on whether an explicit "-target" was seen. 782 if (first.length() != 0) { 783 if (target.length() == 0) { 784 target = first; 785 } else { 786 source = first; 787 } 788 } 789 790 // Must have either source or target 791 if (source.length() == 0 && target.length() == 0) { 792 pos = start; 793 return NULL; 794 } 795 796 // Empty source or target defaults to ANY 797 UBool sawSource = TRUE; 798 if (source.length() == 0) { 799 source.setTo(ANY, 3); 800 sawSource = FALSE; 801 } 802 if (target.length() == 0) { 803 target.setTo(ANY, 3); 804 } 805 806 return new Specs(source, target, variant, sawSource, filter); 807 } 808 809 /** 810 * Givens a Spec object, convert it to a SingleID object. The 811 * Spec object is a more unprocessed parse result. The SingleID 812 * object contains information about canonical and basic IDs. 813 * @return a SingleID; never returns NULL. Returned object always 814 * has 'filter' field of NULL. 815 */ 816 TransliteratorIDParser::SingleID* 817 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) { 818 UnicodeString canonID; 819 UnicodeString basicID; 820 UnicodeString basicPrefix; 821 if (specs != NULL) { 822 UnicodeString buf; 823 if (dir == FORWARD) { 824 if (specs->sawSource) { 825 buf.append(specs->source).append(TARGET_SEP); 826 } else { 827 basicPrefix = specs->source; 828 basicPrefix.append(TARGET_SEP); 829 } 830 buf.append(specs->target); 831 } else { 832 buf.append(specs->target).append(TARGET_SEP).append(specs->source); 833 } 834 if (specs->variant.length() != 0) { 835 buf.append(VARIANT_SEP).append(specs->variant); 836 } 837 basicID = basicPrefix; 838 basicID.append(buf); 839 if (specs->filter.length() != 0) { 840 buf.insert(0, specs->filter); 841 } 842 canonID = buf; 843 } 844 return new SingleID(canonID, basicID); 845 } 846 847 /** 848 * Given a Specs object, return a SingleID representing the 849 * special inverse of that ID. If there is no special inverse 850 * then return NULL. 851 * @return a SingleID or NULL. Returned object always has 852 * 'filter' field of NULL. 853 */ 854 TransliteratorIDParser::SingleID* 855 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) { 856 if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) { 857 return NULL; 858 } 859 umtx_initOnce(gSpecialInversesInitOnce, init, status); 860 if (U_FAILURE(status)) { 861 return NULL; 862 } 863 864 UnicodeString* inverseTarget; 865 866 umtx_lock(&LOCK); 867 inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target); 868 umtx_unlock(&LOCK); 869 870 if (inverseTarget != NULL) { 871 // If the original ID contained "Any-" then make the 872 // special inverse "Any-Foo"; otherwise make it "Foo". 873 // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD". 874 UnicodeString buf; 875 if (specs.filter.length() != 0) { 876 buf.append(specs.filter); 877 } 878 if (specs.sawSource) { 879 buf.append(ANY, 3).append(TARGET_SEP); 880 } 881 buf.append(*inverseTarget); 882 883 UnicodeString basicID(TRUE, ANY, 3); 884 basicID.append(TARGET_SEP).append(*inverseTarget); 885 886 if (specs.variant.length() != 0) { 887 buf.append(VARIANT_SEP).append(specs.variant); 888 basicID.append(VARIANT_SEP).append(specs.variant); 889 } 890 return new SingleID(buf, basicID); 891 } 892 return NULL; 893 } 894 895 /** 896 * Glue method to get around access problems in C++. This would 897 * ideally be inline but we want to avoid a circular header 898 * dependency. 899 */ 900 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 901 return Transliterator::createBasicInstance(id, canonID); 902 } 903 904 /** 905 * Initialize static memory. Called through umtx_initOnce only. 906 */ 907 void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) { 908 U_ASSERT(SPECIAL_INVERSES == NULL); 909 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); 910 911 SPECIAL_INVERSES = new Hashtable(TRUE, status); 912 if (SPECIAL_INVERSES == NULL) { 913 status = U_MEMORY_ALLOCATION_ERROR; 914 return; 915 } 916 SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject); 917 } 918 919 /** 920 * Free static memory. 921 */ 922 void TransliteratorIDParser::cleanup() { 923 if (SPECIAL_INVERSES) { 924 delete SPECIAL_INVERSES; 925 SPECIAL_INVERSES = NULL; 926 } 927 gSpecialInversesInitOnce.reset(); 928 } 929 930 U_NAMESPACE_END 931 932 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 933 934 //eof 935