/* GENERATED SOURCE. DO NOT MODIFY. */
// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ******************************************************************************
 * Copyright (C) 2003-2011, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

package android.icu.impl;

import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import android.icu.impl.locale.AsciiUtil;

/**
 * Utility class to parse and normalize locale ids (including POSIX style).
 *
 * <p>An instance wraps one locale id. Each public accessor re-parses the id from the start
 * into an internal buffer, so instances carry mutable parse state and make no attempt at
 * thread-safety.
 *
 * @hide Only a subset of ICU is exposed in Android
 */
public final class LocaleIDParser {

    /**
     * Char array representing the locale ID.
     */
    private final char[] id;

    /**
     * Current position in {@link #id} (while parsing).
     */
    private int index;

    /**
     * Temporary buffer for parsed sections of data.
     */
    private StringBuilder buffer;

    /**
     * When true, {@link #setToKeywordStart()} only treats text after '@' as a keyword list
     * if it contains a '='.
     */
    private final boolean canonicalize;

    /**
     * Set by {@link #parseCountry()} when a country was accepted; read by
     * {@link #parseVariant()} to decide how many separators precede the variant.
     */
    private boolean hadCountry;

    // used when canonicalizing
    Map<String, String> keywords;
    String baseName;

    /**
     * Parsing constants.
     */
    private static final char KEYWORD_SEPARATOR = '@';
    private static final char HYPHEN = '-';
    private static final char KEYWORD_ASSIGN = '=';
    private static final char COMMA = ',';
    private static final char ITEM_SEPARATOR = ';';
    private static final char DOT = '.';
    private static final char UNDERSCORE = '_';

    /**
     * Creates a parser for the given locale id with {@code canonicalize} set to false.
     *
     * @param localeID the locale id to parse; must not be null
     */
    public LocaleIDParser(String localeID) {
        this(localeID, false);
    }

    /**
     * Creates a parser for the given locale id.
     *
     * @param localeID the locale id to parse; must not be null
     * @param canonicalize if true, text after '@' is only treated as keywords when it
     *        contains a '='
     */
    public LocaleIDParser(String localeID, boolean canonicalize) {
        id = localeID.toCharArray();
        index = 0;
        buffer = new StringBuilder(id.length + 5);
        this.canonicalize = canonicalize;
    }

    /**
     * Rewinds the parser to the start of the id and discards all per-parse state.
     */
    private void reset() {
        index = 0;
        // Clear the country flag too, so that a parse never observes a value left behind
        // by an earlier call.
        hadCountry = false;
        buffer = new StringBuilder(id.length + 5);
    }

    // utilities for working on text in the buffer

    /**
     * Append c to the buffer.
     */
    private void append(char c) {
        buffer.append(c);
    }

    /**
     * Append the id separator (underscore) to the buffer.
     */
    private void addSeparator() {
        append(UNDERSCORE);
    }

    /**
     * Returns the text in the buffer from start to the end of the buffer as a String.
     */
    private String getString(int start) {
        return buffer.substring(start);
    }

    /**
     * Set the length of the buffer to pos, then append the string.
     */
    private void set(int pos, String s) {
        buffer.delete(pos, buffer.length());
        buffer.insert(pos, s);
    }

    /**
     * Append the string to the buffer.
     */
    private void append(String s) {
        buffer.append(s);
    }

    // utilities for parsing text out of the id

    /**
     * Character to indicate no more text is available in the id.
     */
    private static final char DONE = '\uffff';

    /**
     * Returns the character at index in the id, and advance index. The returned character
     * is DONE if index was at (or beyond) the limit of the id. The index is advanced
     * regardless so that decrementing the index will always 'unget' the last character
     * returned.
     */
    private char next() {
        // '>=' rather than '==': index is pushed past the end whenever DONE is returned,
        // so a second call without an unget must also yield DONE instead of reading
        // outside the array. This matches the bound check in atTerminator().
        if (index >= id.length) {
            index++;
            return DONE;
        }

        return id[index++];
    }

    /**
     * Advance index until the next terminator or id separator, and leave it there.
     */
    private void skipUntilTerminatorOrIDSeparator() {
        while (!isTerminatorOrIDSeparator(next())) {
            // scanning only; nothing to accumulate
        }
        --index; // unget
    }

    /**
     * Returns true if the character at index in the id is a terminator, or if index is at
     * or past the end of the id.
     */
    private boolean atTerminator() {
        return index >= id.length || isTerminator(id[index]);
    }

    /**
     * Returns true if the character is a terminator (keyword separator, dot, or DONE).
     * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
     */
    private boolean isTerminator(char c) {
        // always terminate at DOT, even if not handling POSIX. It's an error...
        return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
    }

    /**
     * Returns true if the character is a terminator or id separator (underscore or hyphen).
     */
    private boolean isTerminatorOrIDSeparator(char c) {
        return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
    }

    /**
     * Returns true if the start of the id has an experimental or private language
     * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
     */
    private boolean haveExperimentalLanguagePrefix() {
        if (id.length > 2) {
            char c = id[1];
            if (c == HYPHEN || c == UNDERSCORE) {
                c = id[0];
                return c == 'x' || c == 'X' || c == 'i' || c == 'I';
            }
        }
        return false;
    }

    /**
     * Returns true if a value separator ('=') occurs at or after index.
     */
    private boolean haveKeywordAssign() {
        // assume it is safe to start from index
        for (int i = index; i < id.length; ++i) {
            if (id[i] == KEYWORD_ASSIGN) {
                return true;
            }
        }
        return false;
    }

    /**
     * Advance index past language, and accumulate normalized (lower-cased) language code
     * in buffer. Index must be at 0 when this is called. Index is left at a terminator or
     * id separator. Returns the start of the language code in the buffer.
     */
    private int parseLanguage() {
        int startLength = buffer.length();

        if (haveExperimentalLanguagePrefix()) {
            append(AsciiUtil.toLower(id[0]));
            append(HYPHEN);
            index = 2;
        }

        char c;
        while (!isTerminatorOrIDSeparator(c = next())) {
            append(AsciiUtil.toLower(c));
        }
        --index; // unget

        // A three-character code is replaced when LocaleIDs supplies a substitute for it.
        // All offsets are relative to startLength so the length test, the lookup, the
        // replacement and the returned position agree with each other.
        if (buffer.length() - startLength == 3) {
            String lang = LocaleIDs.threeToTwoLetterLanguage(getString(startLength));
            if (lang != null) {
                set(startLength, lang);
            }
        }

        return startLength;
    }

    /**
     * Advance index past language. Index must be at 0 when this is called. Index
     * is left at a terminator or id separator.
     */
    private void skipLanguage() {
        if (haveExperimentalLanguagePrefix()) {
            index = 2;
        }
        skipUntilTerminatorOrIDSeparator();
    }

    /**
     * Advance index past script, and accumulate normalized script in buffer (separator,
     * first letter upper-cased, remaining letters lower-cased).
     * Index must be immediately after the language, i.e. on the id separator that
     * precedes the script; that separator is skipped here.
     * If the item at this position is not a script (is not four characters
     * long) leave index and buffer unchanged. Otherwise index is left just after
     * the fourth letter. Returns the start of the script code
     * in the buffer (this may be equal to the buffer length, if there is no
     * script).
     */
    private int parseScript() {
        if (!atTerminator()) {
            int oldIndex = index; // save original index
            ++index; // step over the id separator

            int oldBlen = buffer.length(); // get before appending separator; if we truncate, everything is undone
            char c;
            boolean firstPass = true;
            while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
                if (firstPass) {
                    addSeparator();
                    append(AsciiUtil.toUpper(c));
                    firstPass = false;
                } else {
                    append(AsciiUtil.toLower(c));
                }
            }
            --index; // unget

            /* If it's not exactly 4 characters long, then it's not a script. */
            if (index - oldIndex != 5) { // +1 to account for separator
                index = oldIndex;
                buffer.delete(oldBlen, buffer.length());
            } else {
                oldBlen++; // index past separator, for clients who want to extract just the script
            }

            return oldBlen;
        }
        return buffer.length();
    }

    /**
     * Advance index past script.
     * Index must be immediately after the language, i.e. on the id separator that
     * precedes the script; that separator is skipped here.
     * If the item at this position is not a script (is not four characters
     * long) leave index unchanged. Otherwise index is left just after the fourth letter.
     */
    private void skipScript() {
        if (!atTerminator()) {
            int oldIndex = index;
            ++index; // step over the id separator

            char c;
            while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
                // scanning only; nothing to accumulate
            }
            --index; // unget

            if (index - oldIndex != 5) { // +1 to account for separator
                index = oldIndex;
            }
        }
    }

    /**
     * Advance index past country, and accumulate normalized (upper-cased) country in
     * buffer. Index must be immediately after the script (if there is one, else language),
     * i.e. on the id separator that precedes the country; that separator is skipped here.
     * A candidate of one character or of more than three characters is rejected and index
     * and buffer are restored. Return the start of the country code in the buffer.
     */
    private int parseCountry() {
        if (!atTerminator()) {
            int oldIndex = index;
            ++index; // step over the id separator

            int oldBlen = buffer.length();
            char c;
            boolean firstPass = true;
            while (!isTerminatorOrIDSeparator(c = next())) {
                if (firstPass) { // first, add separator
                    hadCountry = true; // we have a country, let variant parsing know
                    addSeparator();
                    ++oldBlen; // increment past separator
                    firstPass = false;
                }
                append(AsciiUtil.toUpper(c));
            }
            --index; // unget

            int charsAppended = buffer.length() - oldBlen;

            if (charsAppended == 0) {
                // Do nothing.
            }
            else if (charsAppended < 2 || charsAppended > 3) {
                // It's not a country, so return index and blen to
                // their previous values.
                index = oldIndex;
                --oldBlen;
                buffer.delete(oldBlen, buffer.length());
                hadCountry = false;
            }
            else if (charsAppended == 3) {
                // A three-character code is replaced when LocaleIDs supplies a substitute.
                String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
                if (region != null) {
                    set(oldBlen, region);
                }
            }

            return oldBlen;
        }

        return buffer.length();
    }

    /**
     * Advance index past country.
     * Index must be immediately after the script (if there is one, else language);
     * a leading id separator at index is skipped. If the following item is not
     * two or three characters long, index is left just after that separator.
     */
    private void skipCountry() {
        if (!atTerminator()) {
            if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
                ++index;
            }
            /*
             * Save the index point after the separator, since the format
             * requires two separators if the country is not present.
             */
            int oldIndex = index;

            skipUntilTerminatorOrIDSeparator();
            int charsSkipped = index - oldIndex;
            if (charsSkipped < 2 || charsSkipped > 3) {
                index = oldIndex;
            }
        }
    }

    /**
     * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
     * the codepage information from POSIX ids.  Index must be immediately after the country
     * or script.  Index is left at the keyword separator or at the end of the text.  Return
     * the start of the variant code in the buffer.
     *
     * In standard form, we can have the following forms:
     * ll__VVVV
     * ll_CC_VVVV
     * ll_Ssss_VVVV
     * ll_Ssss_CC_VVVV
     *
     * This also handles POSIX ids, which can have the following forms (pppp is code page id):
     * ll_CC.pppp          --> ll_CC
     * ll_CC.pppp@VVVV     --> ll_CC_VVVV
     * ll_CC@VVVV          --> ll_CC_VVVV
     *
     * We identify this use of '@' in POSIX ids by looking for an '=' following
     * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
     * being part of a POSIX id.
     *
     * Note:  since it was decided that we want an option to not handle POSIX ids, this
     * becomes a bit more complex.
     */
    private int parseVariant() {
        int oldBlen = buffer.length();

        boolean start = true;
        boolean needSeparator = true;
        boolean skipping = false;
        char c;
        boolean firstPass = true;

        while ((c = next()) != DONE) {
            if (c == DOT) {
                // codepage follows; drop everything up to the next '@'
                start = false;
                skipping = true;
            } else if (c == KEYWORD_SEPARATOR) {
                if (haveKeywordAssign()) {
                    break;
                }
                skipping = false;
                start = false;
                needSeparator = true; // add another underscore if we have more text
            } else if (start) {
                start = false;
                // the first character is normally the separator; if not, re-read it as text
                if (c != UNDERSCORE && c != HYPHEN) {
                    index--;
                }
            } else if (!skipping) {
                if (needSeparator) {
                    needSeparator = false;
                    if (firstPass && !hadCountry) { // no country, we'll need two
                        addSeparator();
                        ++oldBlen; // for sure
                    }
                    addSeparator();
                    if (firstPass) { // only for the first separator
                        ++oldBlen;
                        firstPass = false;
                    }
                }
                c = AsciiUtil.toUpper(c);
                if (c == HYPHEN || c == COMMA) {
                    c = UNDERSCORE;
                }
                append(c);
            }
        }
        --index; // unget

        return oldBlen;
    }

    // no need for skipvariant, to get the keywords we'll just scan directly for
    // the keyword separator

    /**
     * Returns the normalized language id, or the empty string.
     */
    public String getLanguage() {
        reset();
        return getString(parseLanguage());
    }

    /**
     * Returns the normalized script id, or the empty string.
     */
    public String getScript() {
        reset();
        skipLanguage();
        return getString(parseScript());
    }

    /**
     * Returns the normalized country id, or the empty string.
     */
    public String getCountry() {
        reset();
        skipLanguage();
        skipScript();
        return getString(parseCountry());
    }

    /**
     * Returns the normalized variant id, or the empty string.
     */
    public String getVariant() {
        reset();
        skipLanguage();
        skipScript();
        skipCountry();
        return getString(parseVariant());
    }

    /**
     * Returns the language, script, country, and variant as separate strings,
     * in that order.
     */
    public String[] getLanguageScriptCountryVariant() {
        reset();
        // The initializers run in order, each one parsing from where the previous stopped.
        return new String[] {
                getString(parseLanguage()),
                getString(parseScript()),
                getString(parseCountry()),
                getString(parseVariant())
        };
    }

    /**
     * Overrides the base name; once set to a non-null value it is used verbatim by
     * {@link #parseBaseName()} and {@link #getBaseName()} instead of parsing the id.
     */
    public void setBaseName(String baseName) {
        this.baseName = baseName;
    }

    /**
     * Fills the buffer with the base name: the overriding base name if one was set,
     * otherwise the language, script, country and variant parsed from the id.
     */
    public void parseBaseName() {
        if (baseName != null) {
            set(0, baseName);
        } else {
            reset();
            parseLanguage();
            parseScript();
            parseCountry();
            parseVariant();

            // catch unwanted trailing underscore after country if there was no variant
            int len = buffer.length();
            if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
                buffer.deleteCharAt(len - 1);
            }
        }
    }

    /**
     * Returns the normalized base form of the locale id.  The base
     * form does not include keywords.
     */
    public String getBaseName() {
        if (baseName != null) {
            return baseName;
        }
        parseBaseName();
        return getString(0);
    }

    /**
     * Returns the normalized full form of the locale id.  The full
     * form includes keywords if they are present.
     */
    public String getName() {
        parseBaseName();
        parseKeywords();
        return getString(0);
    }

    // keyword utilities

    /**
     * If we have keywords, advance index to the start of the keywords and return true,
     * otherwise return false. Only the first keyword separator at or after index is
     * considered. When canonicalizing, a '=' must follow the separator; otherwise any
     * text after the separator is enough.
     */
    private boolean setToKeywordStart() {
        for (int i = index; i < id.length; ++i) {
            if (id[i] == KEYWORD_SEPARATOR) {
                if (canonicalize) {
                    for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
                        if (id[j] == KEYWORD_ASSIGN) {
                            index = i;
                            return true;
                        }
                    }
                } else {
                    if (++i < id.length) {
                        index = i;
                        return true;
                    }
                }
                break;
            }
        }
        return false;
    }

    private static boolean isDoneOrKeywordAssign(char c) {
        return c == DONE || c == KEYWORD_ASSIGN;
    }

    private static boolean isDoneOrItemSeparator(char c) {
        return c == DONE || c == ITEM_SEPARATOR;
    }

    /**
     * Reads from index up to the next '=' or the end of the id and returns that text,
     * trimmed and lower-cased. Index is left at the '=' or at the end.
     */
    private String getKeyword() {
        int start = index;
        while (!isDoneOrKeywordAssign(next())) {
            // scanning only
        }
        --index; // unget
        return AsciiUtil.toLowerString(new String(id, start, index - start).trim());
    }

    /**
     * Reads from index up to the next ';' or the end of the id and returns that text,
     * trimmed. Index is left at the ';' or at the end.
     */
    private String getValue() {
        int start = index;
        while (!isDoneOrItemSeparator(next())) {
            // scanning only
        }
        --index; // unget
        return new String(id, start, index - start).trim(); // leave case alone
    }

    /**
     * Returns the comparator used to order keyword keys; it delegates to
     * {@link String#compareTo(String)}.
     */
    private Comparator<String> getKeyComparator() {
        final Comparator<String> comp = new Comparator<String>() {
            @Override
            public int compare(String lhs, String rhs) {
                return lhs.compareTo(rhs);
            }
        };
        return comp;
    }

    /**
     * Returns a map of the keywords and values. The result is never null: when there are
     * no keywords an empty map is returned. The map is parsed on the first call and cached;
     * later calls return the same instance.
     */
    public Map<String, String> getKeywordMap() {
        if (keywords == null) {
            TreeMap<String, String> m = null;
            if (setToKeywordStart()) {
                // Keys are trimmed and lower-cased; values are trimmed but keep their case.
                do {
                    String key = getKeyword();
                    if (key.length() == 0) {
                        break;
                    }
                    char c = next();
                    if (c != KEYWORD_ASSIGN) {
                        // throw new IllegalArgumentException("key '" + key + "' missing a value.");
                        if (c == DONE) {
                            break;
                        } else {
                            continue;
                        }
                    }
                    String value = getValue();
                    if (value.length() == 0) {
                        // throw new IllegalArgumentException("key '" + key + "' missing a value.");
                        continue;
                    }
                    if (m == null) {
                        m = new TreeMap<String, String>(getKeyComparator());
                    } else if (m.containsKey(key)) {
                        // first occurrence of a key wins
                        // throw new IllegalArgumentException("key '" + key + "' already has a value.");
                        continue;
                    }
                    m.put(key, value);
                } while (next() == ITEM_SEPARATOR);
            }
            keywords = m != null ? m : Collections.<String, String>emptyMap();
        }

        return keywords;
    }


    /**
     * Append the keywords, formatted as {@code @key=value;key=value}, to the buffer.
     * Returns the buffer length before the call, plus one (to step over the '@')
     * when any keyword was appended.
     */
    private int parseKeywords() {
        int oldBlen = buffer.length();
        Map<String, String> m = getKeywordMap();
        if (!m.isEmpty()) {
            boolean first = true;
            for (Map.Entry<String, String> e : m.entrySet()) {
                append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
                first = false;
                append(e.getKey());
                append(KEYWORD_ASSIGN);
                append(e.getValue());
            }
            if (!first) {
                ++oldBlen;
            }
        }
        return oldBlen;
    }

    /**
     * Returns an iterator over the keywords, or null if we have an empty map.
     */
    public Iterator<String> getKeywords() {
        Map<String, String> m = getKeywordMap();
        return m.isEmpty() ? null : m.keySet().iterator();
    }

    /**
     * Returns the value for the named keyword, or null if the keyword is not
     * present. The name is trimmed and lower-cased before the lookup.
     */
    public String getKeywordValue(String keywordName) {
        Map<String, String> m = getKeywordMap();
        return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
    }

    /**
     * Set the keyword value only if it is not already set to something else.
     */
    public void defaultKeywordValue(String keywordName, String value) {
        setKeywordValue(keywordName, value, false);
    }

    /**
     * Set the value for the named keyword, or unset it if value is null.  If
     * keywordName itself is null, unset all keywords (value is then ignored).
     *
     * @throws IllegalArgumentException if keywordName, or a non-null value, is empty
     *         after trimming
     */
    public void setKeywordValue(String keywordName, String value) {
        setKeywordValue(keywordName, value, true);
    }

    /**
     * Set the value for the named keyword, or unset it if value is null.  If
     * keywordName itself is null, unset all keywords (value is then ignored).
     * If reset is true, ignore any previous value for
     * the keyword, otherwise do not change the keyword (including removal of
     * one or all keywords).
     *
     * @throws IllegalArgumentException if keywordName, or a non-null value, is empty
     *         after trimming
     */
    private void setKeywordValue(String keywordName, String value, boolean reset) {
        if (keywordName == null) {
            if (reset) {
                // force new map, ignore value
                keywords = Collections.<String, String>emptyMap();
            }
        } else {
            keywordName = AsciiUtil.toLowerString(keywordName.trim());
            if (keywordName.length() == 0) {
                throw new IllegalArgumentException("keyword must not be empty");
            }
            if (value != null) {
                value = value.trim();
                if (value.length() == 0) {
                    throw new IllegalArgumentException("value must not be empty");
                }
            }
            Map<String, String> m = getKeywordMap();
            if (m.isEmpty()) { // it is EMPTY_MAP
                if (value != null) {
                    // force new map
                    keywords = new TreeMap<String, String>(getKeyComparator());
                    keywords.put(keywordName, value); // value was already trimmed above
                }
            } else {
                if (reset || !m.containsKey(keywordName)) {
                    if (value != null) {
                        m.put(keywordName, value);
                    } else {
                        m.remove(keywordName);
                        if (m.isEmpty()) {
                            // force new map
                            keywords = Collections.<String, String>emptyMap();
                        }
                    }
                }
            }
        }
    }
}