package org.unicode.cldr.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import com.google.common.base.Splitter;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Static lookup tables for ISO 639 language codes.
 * <p>
 * The tables are filled lazily, on the first call to any public accessor, from data files obtained through
 * {@link CldrUtility#getUTF8Data(String)}. A load that fails part-way is not marked as complete, so the next
 * accessor call attempts the load again instead of serving half-filled tables.
 * </p>
 * <p>
 * No synchronization is performed by this class.
 * </p>
 */
public class Iso639Data {

    /** Separator used by the ISO 639-2 file: a pipe with optional surrounding whitespace. */
    private static final Pattern PIPE_SEPARATOR = Pattern.compile("\\s*\\|\\s*");

    /** Language subtag (the Part1 code of a row) to its three-letter id. */
    static Map<String, String> toAlpha3;

    /** Three-letter id to language subtag; inverse of {@link #toAlpha3}. */
    static Map<String, String> fromAlpha3;

    /** Language subtag to Part2B (bibliographic) code, recorded only when that code differs from the id. */
    static Map<String, String> toBiblio3;

    /** Part2B (bibliographic) code to language subtag; inverse of {@link #toBiblio3}. */
    static Map<String, String> fromBiblio3;

    /** Language subtag to all names collected for it, in insertion order. */
    static Relation<String, String> toNames;

    /** Retired id to the raw line(s) of the retirements file that mention it. */
    static Relation<String, String> toRetirements;

    /** Retired id to its Change_To value, for rows where that column is not empty. */
    static Map<String, String> toChangeTo;

    /** Language subtag to scope; entries with scope {@link Scope#Individual} are not stored. */
    static Map<String, Scope> toScope;

    /** Code to its hierarchy path as read from the ISO 639-5 table. */
    static Map<String, List<String>> toHeirarchy;

    /** Language subtag to type; entries with type {@link Type#Living} are not stored. */
    static Map<String, Type> toType;

    /** Encompassed language to its macrolanguage. */
    static Map<String, String> encompassed_macro;

    /** Macrolanguage to the languages it encompasses. */
    static Relation<String, String> macro_encompassed;

    /** Language subtag to the standard it was taken from; absent entries are treated as ISO 639-3. */
    static Map<String, Source> toSource;

    private static String version;

    /** Set only after every table has been loaded and frozen successfully. */
    private static boolean loaded;

    /**
     * <h3><a NAME="I">Individual</a> languages</h3>
     * <p>
     * Judgments regarding when two varieties are considered to be the same or different languages are based on a number
     * of factors, including linguistic similarity, intelligibility, a common literature, the views of speakers
     * concerning the relationship between language and identity, and other factors.
     * </p>
     * <h3><a NAME="M">Macrolanguages</a></h3>
     * <p>
     * In various parts of the world, there are clusters of closely-related language varieties that, based on the
     * criteria discussed above, can be considered distinct individual languages, yet in certain usage contexts a single
     * language identity for all is needed.
     * </p>
     * <p>
     * Macrolanguages are distinguished from language collections in that the individual languages that correspond to a
     * macrolanguage must be very closely related, and there must be some domain in which only a single language
     * identity is recognized.
     * </p>
     *
     * <h3><a NAME="C">Collections</a> of languages</h3>
     * <p>
     * A collective language code element is an identifier that represents a group of individual languages that are not
     * deemed to be one language in any usage context.
     * </p>
     * <h3><a NAME="R">Private Use</a></h3>
     * <p>
     * Identifiers <tt>qaa</tt> through <tt>qtz</tt> are reserved for local use, to be used in cases in which there is
     * no suitable existing code in ISO 639. There are no constraints as to scope of denotation. These identifiers may
     * only be used locally, and may not be used in interchange without a private agreement.
     * </p>
     * <h3><a NAME="S">Special situations</a></h3>
     * <p>
     * A few code elements are defined for other special situations.
     * </p>
     * For more information, see http://www.sil.org/iso639-3/scope.asp
     * <p>
     * Note that the casing on these enum values is chosen to match standard usage.
     * </p>
     */
    public enum Scope {
        Individual, Macrolanguage, Special, Collection, PrivateUse, Unknown;

        /**
         * Parses a scope name, ignoring case and any hyphens (so {@code "private-use"} yields {@link #PrivateUse}).
         *
         * @param input the name to parse
         * @return the matching constant
         * @throws IllegalArgumentException if no constant matches
         */
        public static Scope fromString(String input) {
            input = input.replace("-", "");
            for (Scope item : Scope.values()) {
                if (item.toString().equalsIgnoreCase(input)) {
                    return item;
                }
            }
            return Scope.valueOf(input); // no match: valueOf raises the exception for us
        }
    }

    /**
     * <h3><a NAME="L"></a>Living languages</h3>
     * <p>
     * A language is listed as <i>living</i> when there are people still living who learned it as a first language.
     * </p>
     * <h3><a NAME="E"></a>Extinct languages</h3>
     *
     * <p>
     * A language is listed as <i>extinct</i> if it has gone extinct in recent times. (e.g. in the last few centuries).
     * </p>
     * <h3><a NAME="A"></a>Ancient languages</h3>
     * <p>
     * A language is listed as <i>ancient</i> if it went extinct in ancient times (e.g. more than a millennium ago).
     * </p>
     * <h3><a NAME="H"></a>Historic languages</h3>
     * <p>
     * A language is listed as <i>historic</i> when it is considered to be distinct from any modern languages that are
     * descended from it; for instance, Old English and Middle English.
     * </p>
     *
     * <h3><a NAME="C"></a>Constructed languages</h3>
     * <p>
     * Artificial languages are those like Esperanto: it excludes programming languages.
     * </p>
     * <p>
     * Note that the casing on these enum values is chosen to match standard usage. <i>For more information, see
     * http://www.sil.org/iso639-3/scope.asp</i>
     * </p>
     */
    public enum Type {
        Ancient, Constructed, Extinct, Historical, Living, Special, Collection, Unknown
    }

    /**
     * This indicates the source of the language subtag.
     *
     * @author markdavis
     *
     */
    public enum Source {
        ISO_639_1, ISO_639_2, ISO_639_3, BCP47, CLDR
    }

    /**
     * Returns the first line (trimmed) of the ISO 639-3 version file, loading the data if necessary.
     */
    public static String getVersion() {
        ensureLoaded();
        return version;
    }

    /**
     * Returns the standard the subtag was taken from.
     *
     * @return the recorded source, {@link Source#ISO_639_3} for a known subtag with no recorded source, or null
     *         if the subtag is not known
     */
    public static Source getSource(String languageSubtag) {
        ensureLoaded();
        if (!isValid(languageSubtag)) {
            return null;
        }
        Source result = toSource.get(languageSubtag);
        return result == null ? Source.ISO_639_3 : result;
    }

    /**
     * Returns the three-letter id recorded for the subtag, or null if the subtag is unknown or has no separate
     * three-letter id recorded.
     */
    public static String toAlpha3(String languageSubtag) {
        ensureLoaded();
        if (!isValid(languageSubtag)) {
            return null;
        }
        return toAlpha3.get(languageSubtag);
    }

    /**
     * Maps a three-letter id to its language subtag.
     *
     * @return the shorter subtag if one is recorded, otherwise the id itself if it is known, otherwise null
     */
    public static String fromAlpha3(String alpha3) {
        ensureLoaded();
        return lookupFromAlpha3(alpha3);
    }

    /**
     * Same lookup as {@link #fromAlpha3(String)} but without triggering a load, so that the loaders can use it
     * while the tables are still being filled.
     */
    private static String lookupFromAlpha3(String alpha3) {
        String alpha2 = fromAlpha3.get(alpha3);
        if (alpha2 != null) {
            return alpha2;
        }
        // it only exists if it has a name
        return isValid(alpha3) ? alpha3 : null;
    }

    /** A code is considered known exactly when at least one name is recorded for it. Does not trigger a load. */
    private static boolean isValid(String alpha3) {
        return toNames.containsKey(alpha3);
    }

    /**
     * Maps a bibliographic code to its language subtag, falling back to {@link #fromAlpha3(String)} when the code
     * has no bibliographic entry.
     */
    public static String fromBiblio3(String biblio3) {
        ensureLoaded();
        String result = fromBiblio3.get(biblio3);
        if (result != null) {
            return result;
        }
        return fromAlpha3(biblio3);
    }

    /**
     * Maps a language subtag to its bibliographic code, falling back to {@link #toAlpha3(String)} when no
     * distinct bibliographic code is recorded.
     */
    public static String toBiblio3(String languageTag) {
        ensureLoaded();
        String result = toBiblio3.get(languageTag);
        if (result != null) {
            return result;
        }
        return toAlpha3(languageTag);
    }

    /**
     * Returns the subtags that have a bibliographic code distinct from their three-letter id.
     */
    public static Set<String> hasBiblio3() {
        ensureLoaded();
        return toBiblio3.keySet();
    }

    /**
     * Returns all names recorded for the subtag, as returned by the underlying {@link Relation#getAll}.
     */
    public static Set<String> getNames(String languageSubtag) {
        ensureLoaded();
        return toNames.getAll(languageSubtag);
    }

    /**
     * Returns the scope of the subtag: {@link Scope#Unknown} for an unknown subtag, {@link Scope#Individual} for
     * a known subtag with no stored scope.
     */
    public static Scope getScope(String languageSubtag) {
        ensureLoaded();
        if (!isValid(languageSubtag)) {
            return Scope.Unknown;
        }
        Scope result = toScope.get(languageSubtag);
        return result != null ? result : Scope.Individual;
    }

    /**
     * Returns the ISO 639-5 heirarchy if available, otherwise null.
     */
    public static List<String> getHeirarchy(String languageSubtag) {
        ensureLoaded();
        return toHeirarchy.get(languageSubtag);
    }

    /**
     * Returns the type of the subtag: {@link Type#Unknown} for an unknown subtag, {@link Type#Living} for a known
     * subtag with no stored type.
     */
    public static Type getType(String languageSubtag) {
        ensureLoaded();
        if (!isValid(languageSubtag)) {
            return Type.Unknown;
        }
        Type result = toType.get(languageSubtag);
        return result != null ? result : Type.Living;
    }

    /**
     * Columns of the main ISO 639-3 table, in file order:
     * <ul>
     * <li>Id char(3) NOT NULL -- The three-letter 639-3 identifier</li>
     * <li>Part2B char(3) NULL -- Equivalent 639-2 identifier of the bibliographic applications code set, if there
     * is one</li>
     * <li>Part2T char(3) NULL -- Equivalent 639-2 identifier of the terminology applications code set, if there is
     * one</li>
     * <li>Part1 char(2) NULL -- Equivalent 639-1 identifier, if there is one</li>
     * <li>Scope char(1) NOT NULL -- I(ndividual), M(acrolanguage), S(pecial)</li>
     * <li>Type char(1) NOT NULL -- A(ncient), C(onstructed), E(xtinct), H(istorical), L(iving), S(pecial)</li>
     * <li>Ref_Name varchar(150) NOT NULL -- Reference language name</li>
     * </ul>
     * The ordinal of each constant is used as the column index.
     */
    enum IsoColumn {
        Id, Part2B, Part2T, Part1, Scope, Type, Ref_Name
    }

    /**
     * Columns of the ISO 639-3 name index, in file order:
     * <ul>
     * <li>Id char(3) NOT NULL -- The three-letter 639-3 identifier</li>
     * <li>Print_Name varchar(75) NOT NULL -- One of the names associated with this identifier</li>
     * <li>Inverted_Name varchar(75) NOT NULL -- The inverted form of this Print_Name form</li>
     * </ul>
     * The ordinal of each constant is used as the column index.
     */
    enum IsoNamesColumn {
        Id, Print_Name, Inverted_Name
    }

    /** Loads the data on first use; does nothing once a load has completed successfully. */
    private static void ensureLoaded() {
        if (!loaded) {
            getData();
        }
    }

    /**
     * Reads every data file, fills the static tables, and makes them unmodifiable.
     * <p>
     * The order of the loaders matters: later files look up codes and names recorded by earlier ones.
     * </p>
     *
     * @throws ICUUncheckedIOException if any file cannot be read
     */
    private static void getData() {
        try {
            version = readVersion();

            toAlpha3 = new HashMap<String, String>();
            fromAlpha3 = new HashMap<String, String>();
            toBiblio3 = new HashMap<String, String>();
            fromBiblio3 = new HashMap<String, String>();
            toScope = new HashMap<String, Scope>();
            toType = new HashMap<String, Type>();
            toNames = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
            toRetirements = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
            toChangeTo = new TreeMap<String, String>();
            macro_encompassed = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
            encompassed_macro = new HashMap<String, String>();
            toSource = new HashMap<String, Source>();
            toSource.put("sh", Source.ISO_639_1); // add deprecated language

            SplitToArray tabs = new SplitToArray(Splitter.on('\t').trimResults());
            loadIso6393(tabs);
            loadRetirements(tabs);
            loadMacrolanguages(tabs);
            loadNameIndex(tabs);
            loadIso6392();
            toHeirarchy = loadIso6395Hierarchy();

            // make data unmodifiable, just to prevent mistakes
            toAlpha3 = Collections.unmodifiableMap(toAlpha3);
            fromAlpha3 = Collections.unmodifiableMap(fromAlpha3);
            toBiblio3 = Collections.unmodifiableMap(toBiblio3);
            fromBiblio3 = Collections.unmodifiableMap(fromBiblio3);
            toScope = Collections.unmodifiableMap(toScope);
            toType = Collections.unmodifiableMap(toType);
            toHeirarchy = Collections.unmodifiableMap(toHeirarchy);
            encompassed_macro = Collections.unmodifiableMap(encompassed_macro);
            toSource = Collections.unmodifiableMap(toSource);

            toNames.freeze();
            toRetirements.freeze();
            macro_encompassed.freeze();
            toChangeTo = ImmutableMap.copyOf(toChangeTo);

            // Only now is the data complete; a failure above leaves the flag unset so the load is retried.
            loaded = true;
        } catch (IOException e) {
            throw new ICUUncheckedIOException("Cannot parse file", e);
        }
    }

    /** Removes a leading byte-order mark (U+FEFF), if present. */
    private static String stripBom(String line) {
        return line.startsWith("\uFEFF") ? line.substring(1) : line;
    }

    /** Reads the trimmed first line of the version file. */
    private static String readVersion() throws IOException {
        BufferedReader in = CldrUtility.getUTF8Data("iso-639-3-version.tab");
        try {
            String firstLine = in.readLine();
            if (firstLine == null) {
                throw new IllegalArgumentException("iso-639-3-version.tab is empty");
            }
            return firstLine.trim();
        } finally {
            in.close();
        }
    }

    /** Loads the main ISO 639-3 table: code mappings, reference names, scopes and types. */
    private static void loadIso6393(SplitToArray tabs) throws IOException {
        BufferedReader in = CldrUtility.getUTF8Data("iso-639-3.tab");
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripBom(line).trim();
                if (line.isEmpty()) {
                    continue;
                }
                String[] parts = tabs.split(line);
                String alpha3 = parts[IsoColumn.Id.ordinal()];
                if (alpha3.equals("Id")) {
                    continue; // header row
                }

                // The Part1 code, when present, is the preferred subtag and the key for everything below.
                String languageSubtag = alpha3;
                String part1 = parts[IsoColumn.Part1.ordinal()];
                if (part1.length() != 0) {
                    languageSubtag = part1;
                    toAlpha3.put(languageSubtag, alpha3);
                    fromAlpha3.put(alpha3, languageSubtag);
                }

                // Record the bibliographic code only when it differs from the id.
                String biblio = parts[IsoColumn.Part2B.ordinal()];
                if (biblio.length() != 0 && !biblio.equals(alpha3)) {
                    toBiblio3.put(languageSubtag, biblio);
                    fromBiblio3.put(biblio, languageSubtag);
                }

                toNames.put(languageSubtag, parts[IsoColumn.Ref_Name.ordinal()]);

                // Individual and Living are the defaults applied by getScope/getType, so they are not stored.
                Scope scope = findMatchToPrefix(parts[IsoColumn.Scope.ordinal()], Scope.values());
                if (scope != Scope.Individual) {
                    toScope.put(languageSubtag, scope);
                }
                Type type = findMatchToPrefix(parts[IsoColumn.Type.ordinal()], Type.values());
                if (type != Type.Living) {
                    toType.put(languageSubtag, type);
                }
            }
        } finally {
            in.close();
        }
    }

    /** Loads retired ids: their names, replacement codes, and the raw line for each. */
    private static void loadRetirements(SplitToArray tabs) throws IOException {
        // Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective
        // fri Western Frisian C fry 2007-02-01
        BufferedReader in = CldrUtility.getUTF8Data("iso-639-3_Retirements.tab");
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripBom(line);
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] parts = tabs.split(line);
                String alpha3 = parts[0];
                if (alpha3.equals("Id")) {
                    continue; // header row
                }
                toNames.put(alpha3, parts[1]);
                if (!parts[3].isEmpty()) {
                    toChangeTo.put(alpha3, parts[3]);
                }
                toRetirements.put(alpha3, line);
            }
        } finally {
            in.close();
        }
    }

    /** Loads the macrolanguage table into both directions of the macro/encompassed mapping. */
    private static void loadMacrolanguages(SplitToArray tabs) throws IOException {
        // The header row begins with M_Id. Only the first two columns are read:
        // the macrolanguage id and the id of a language it encompasses.
        BufferedReader in = CldrUtility.getUTF8Data("iso-639-3-macrolanguages.tab");
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripBom(line);
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] parts = tabs.split(line);
                if (parts[0].equals("M_Id")) {
                    continue; // header row
                }
                String prefix = lookupFromAlpha3(parts[0]);
                String suffix = lookupFromAlpha3(parts[1]);
                if (suffix == null || prefix == null) {
                    throw new IllegalArgumentException(
                        "Unknown code in iso-639-3-macrolanguages.tab:\t" + parts[0] + "\t" + parts[1]);
                }
                encompassed_macro.put(suffix, prefix);
                macro_encompassed.put(prefix, suffix);
            }
        } finally {
            in.close();
        }
    }

    /** Loads the additional print names from the name index. */
    private static void loadNameIndex(SplitToArray tabs) throws IOException {
        // Id Print_Name Inverted_Name
        BufferedReader in = CldrUtility.getUTF8Data("iso-639-3_Name_Index.tab");
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripBom(line);
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] parts = tabs.split(line);
                String alpha3 = parts[IsoNamesColumn.Id.ordinal()];
                if (alpha3.equals("Id")) {
                    continue; // header row
                }
                String languageSubtag = lookupFromAlpha3(alpha3);
                if (languageSubtag == null) {
                    // The TreeMap behind toNames rejects null keys; fail with the offending id instead.
                    throw new IllegalArgumentException("Unknown code in iso-639-3_Name_Index.tab:\t" + alpha3);
                }
                toNames.put(languageSubtag, parts[IsoNamesColumn.Print_Name.ordinal()]);
                // skip inverted name for now
            }
        } finally {
            in.close();
        }
    }

    /**
     * Loads the ISO 639-2 list: records the source of each code and adds any codes that the earlier files did
     * not already supply.
     */
    private static void loadIso6392() throws IOException {
        // An alpha-3 (bibliographic) code,
        // an alpha-3 (terminologic) code (when given),
        // an alpha-2 code (when given),
        // an English name,
        // and a French name of a language are all separated by pipe (|)
        // characters.
        BufferedReader in = CldrUtility.getUTF8Data("ISO-639-2_values_8bits.txt");
        try {
            String line;
            while ((line = in.readLine()) != null) {
                line = stripBom(line);
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] parts = PIPE_SEPARATOR.split(line);
                String alpha3 = parts[0];
                if (alpha3.equals("qaa-qtz")) {
                    addPrivateUseRange();
                    continue;
                }
                if (parts[1].length() != 0) {
                    alpha3 = parts[1]; // prefer the terminologic code when given
                }
                String languageSubtag = parts[2];
                if (languageSubtag.length() == 0) {
                    languageSubtag = alpha3;
                }
                String[] english = parts[3].split(";");
                toSource.put(languageSubtag, languageSubtag.length() == 2 ? Source.ISO_639_1 : Source.ISO_639_2);
                if (!isValid(languageSubtag)) {
                    // we don't have it already, so add it
                    if (languageSubtag.length() == 2) {
                        toAlpha3.put(languageSubtag, alpha3);
                        fromAlpha3.put(alpha3, languageSubtag);
                    }
                    toScope.put(languageSubtag, Scope.Collection);
                    toType.put(languageSubtag, Type.Special);
                    toNames.putAll(languageSubtag, Arrays.asList(english));
                }
            }
        } finally {
            in.close();
        }
    }

    /** Registers every code from qaa through qtz as a private-use code. */
    private static void addPrivateUseRange() {
        for (char second = 'a'; second <= 't'; ++second) {
            for (char third = 'a'; third <= 'z'; ++third) {
                String languageSubtag = "q" + second + third;
                toScope.put(languageSubtag, Scope.PrivateUse);
                toType.put(languageSubtag, Type.Special);
                toNames.put(languageSubtag, "private-use");
                toSource.put(languageSubtag, Source.ISO_639_2);
            }
        }
    }

    /**
     * Extracts the hierarchy column from the ISO 639-5 HTML table and splits each value on colons.
     *
     * @return a map from code to its hierarchy path
     * @throws IllegalArgumentException if a path element is neither a key of the hierarchy table nor a code with a
     *             stored scope
     */
    private static Map<String, List<String>> loadIso6395Hierarchy() throws IOException {
        Map<String, String> rawHierarchy = new TreeMap<String, String>();
        BufferedReader in = CldrUtility.getUTF8Data("external/Iso639-5.html");
        try {
            String lastCode = null;
            // 0 outside a data row; set to 1 by the scope="row" cell, then advanced once per closed td.
            int column = 0;
            boolean lastAttributeIsScope = false;
            boolean lastElementIsTD = false;
            boolean hadPop = true;
            // if the table level is 1 (we are in the main table), then we look for <td>...</td><td>...</td>. That
            // means that we have column 1 and column 2.

            SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in);
            StringBuilder result = new StringBuilder();

            parse: while (true) {
                SimpleHtmlParser.Type x = simple.next(result);
                switch (x) {
                case ELEMENT_START:
                    hadPop = false;
                    lastElementIsTD = false;
                    break;
                case ELEMENT:
                    if (SimpleHtmlParser.equals("tr", result)) {
                        column = 0;
                    } else if (SimpleHtmlParser.equals("td", result)) {
                        lastElementIsTD = true;
                    }
                    break;
                case ELEMENT_POP:
                    hadPop = true;
                    break;
                case ELEMENT_END:
                    // if we get a POP and a TD, and we have column > 0, we increment
                    if (lastElementIsTD && hadPop && column > 0) {
                        ++column;
                    }
                    break;
                case ELEMENT_CONTENT:
                    /*
                     * <th scope="col">Identifier<br />Indicatif</th>
                     * <th scope="col">English name<br />Nom anglais</th>
                     * <th scope="col">French name<br />Nom français</th>
                     * <th scope="col">639-2</th>
                     * <th scope="col">Hierarchy<br />Hiérarchie</th>
                     * <th scope="col">Notes<br />Notes</th>
                     *
                     * <td scope="row">apa</td>
                     * <td>Apache languages</td>
                     * <td>apaches, langues</td>
                     * <td>language group<br />groupe de langues</td>
                     * <td>nai : xnd : ath : apa</td>
                     * <td>
                     * <br />
                     * </td>
                     */
                    if (column == 1) {
                        lastCode = result.toString();
                    } else if (column == 5) {
                        // Content of one cell may arrive in several pieces; join them with a space.
                        String piece = result.toString().trim();
                        String old = rawHierarchy.get(lastCode);
                        rawHierarchy.put(lastCode, old == null || old.length() == 0 ? piece : old + " " + piece);
                    }
                    // all other columns are ignored
                    break;
                case ATTRIBUTE:
                    lastAttributeIsScope = SimpleHtmlParser.equals("scope", result);
                    break;
                case ATTRIBUTE_CONTENT:
                    if (lastAttributeIsScope && SimpleHtmlParser.equals("row", result)) {
                        column = 1;
                    }
                    break;
                case QUOTE:
                    break;
                case DONE:
                    break parse;
                default:
                    break;
                }
            }
        } finally {
            in.close();
        }

        Pattern splitHierarchy = PatternCache.get("\\s*:\\s*");
        Map<String, List<String>> hierarchy = new TreeMap<String, List<String>>();
        for (Map.Entry<String, String> entry : rawHierarchy.entrySet()) {
            String code = entry.getKey();
            String valueString = entry.getValue();
            String[] values = splitHierarchy.split(valueString);
            for (String value : values) {
                if (toScope.get(value) == null && rawHierarchy.get(value) == null) {
                    throw new IllegalArgumentException("Unexpected value in heirarchy:\t" + value + "\t" + code
                        + "\t" + valueString);
                }
            }
            hierarchy.put(code, Arrays.asList(values));
        }
        return hierarchy;
    }

    /**
     * Returns the first element of {@code values} whose {@code toString()} starts with {@code prefix}.
     * <p>
     * Note that an empty prefix matches the first element, since every string starts with the empty string.
     * </p>
     *
     * @param prefix the leading characters to look for
     * @param values the candidates, searched in array order
     * @return the first matching candidate
     * @throws IllegalArgumentException if no candidate matches
     */
    public static <T> T findMatchToPrefix(String prefix, T[] values) {
        for (T x : values) {
            if (x.toString().startsWith(prefix)) {
                return x;
            }
        }
        throw new IllegalArgumentException("Prefix <" + prefix + "> not found in " + Arrays.asList(values));
    }

    /**
     * Returns every code that has at least one name recorded.
     */
    public static Set<String> getAvailable() {
        ensureLoaded();
        return toNames.keySet();
    }

    /**
     * Returns the macrolanguage for an encompassed language.
     * <p>
     * When no macrolanguage is recorded, a language other than {@code sgn} with a name containing
     * "Sign Language" is reported as encompassed by {@code sgn}.
     * </p>
     *
     * @return the macrolanguage, or null if there is none
     */
    public static String getMacroForEncompassed(String suffix) {
        ensureLoaded();
        String prefix = encompassed_macro.get(suffix);
        if (prefix != null) {
            return prefix;
        }
        if (suffix.equals("sgn")) {
            return null;
        }
        Set<String> names = toNames.getAll(suffix);
        if (names == null) {
            return null;
        }
        for (String name : names) {
            if (name.contains("Sign Language")) {
                return "sgn";
            }
        }
        return null;
    }

    /**
     * Returns the languages encompassed by the given macrolanguage, as returned by the underlying
     * {@link Relation#getAll}.
     */
    public static Set<String> getEncompassedForMacro(String prefix) {
        ensureLoaded();
        return macro_encompassed.getAll(prefix);
    }

    /**
     * Returns all macrolanguages that have at least one encompassed language recorded.
     */
    public static Set<String> getMacros() {
        ensureLoaded();
        return macro_encompassed.keySet();
    }

    /**
     * Returns all languages that are recorded as encompassed by some macrolanguage.
     */
    public static Set<String> getEncompassed() {
        ensureLoaded();
        return encompassed_macro.keySet();
    }

    /**
     * Returns the replacement recorded for a retired code, or null if there is none.
     */
    public static String getChangeTo(String subtag) {
        return getChangeToMap().get(subtag);
    }

    /**
     * Returns the map from retired code to its replacement, loading the data if necessary.
     */
    public static Map<String, String> getChangeToMap() {
        ensureLoaded();
        return toChangeTo;
    }
}