1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts; 17 18 import android.content.ContentValues; 19 import android.provider.ContactsContract.FullNameStyle; 20 import android.provider.ContactsContract.PhoneticNameStyle; 21 import android.provider.ContactsContract.CommonDataKinds.StructuredName; 22 import android.text.TextUtils; 23 24 import java.lang.Character.UnicodeBlock; 25 import java.util.HashSet; 26 import java.util.Locale; 27 import java.util.StringTokenizer; 28 29 /** 30 * The purpose of this class is to split a full name into given names and last 31 * name. The logic only supports having a single last name. If the full name has 32 * multiple last names the output will be incorrect. 33 * <p> 34 * Core algorithm: 35 * <ol> 36 * <li>Remove the suffixes (III, Ph.D., M.D.).</li> 37 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li> 38 * <li>Assign the last remaining token as the last name.</li> 39 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use 40 * this word also as the last name.</li> 41 * <li>Assign the rest of the words as the "given names".</li> 42 * </ol> 43 */ 44 public class NameSplitter { 45 46 public static final int MAX_TOKENS = 10; 47 48 private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase(); 49 private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase(); 50 51 // This includes simplified and traditional Chinese 52 private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase(); 53 54 private final HashSet<String> mPrefixesSet; 55 private final HashSet<String> mSuffixesSet; 56 private final int mMaxSuffixLength; 57 private final HashSet<String> mLastNamePrefixesSet; 58 private final HashSet<String> mConjuctions; 59 private final Locale mLocale; 60 private final String mLanguage; 61 62 /** 63 * Two-Chracter long Korean family names. 64 * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1 65 */ 66 private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = { 67 "\uAC15\uC804", // Gang Jeon 68 "\uB0A8\uAD81", // Nam Goong 69 "\uB3C5\uACE0", // Dok Go 70 "\uB3D9\uBC29", // Dong Bang 71 "\uB9DD\uC808", // Mang Jeol 72 "\uC0AC\uACF5", // Sa Gong 73 "\uC11C\uBB38", // Seo Moon 74 "\uC120\uC6B0", // Seon Woo 75 "\uC18C\uBD09", // So Bong 76 "\uC5B4\uAE08", // Uh Geum 77 "\uC7A5\uACE1", // Jang Gok 78 "\uC81C\uAC08", // Je Gal 79 "\uD669\uBCF4" // Hwang Bo 80 }; 81 82 public static class Name { 83 public String prefix; 84 public String givenNames; 85 public String middleName; 86 public String familyName; 87 public String suffix; 88 89 public int fullNameStyle; 90 91 public String phoneticFamilyName; 92 public String phoneticMiddleName; 93 public String phoneticGivenName; 94 95 public int phoneticNameStyle; 96 97 public Name() { 98 } 99 100 public Name(String prefix, String givenNames, String middleName, String familyName, 101 String suffix) { 102 this.prefix = prefix; 103 this.givenNames = givenNames; 104 this.middleName = middleName; 105 this.familyName = familyName; 106 this.suffix = suffix; 107 } 108 109 public String getPrefix() { 110 return prefix; 111 } 112 113 public String getGivenNames() { 114 return givenNames; 115 } 116 117 public String getMiddleName() { 118 return middleName; 119 } 120 121 public String getFamilyName() { 122 return familyName; 123 } 124 125 public String getSuffix() { 126 return suffix; 127 } 128 129 public int getFullNameStyle() { 130 return fullNameStyle; 131 } 132 133 public String getPhoneticFamilyName() { 134 return phoneticFamilyName; 135 } 136 137 public String getPhoneticMiddleName() { 138 return phoneticMiddleName; 139 } 140 141 public String getPhoneticGivenName() { 142 return phoneticGivenName; 143 } 144 145 public int getPhoneticNameStyle() { 146 return phoneticNameStyle; 147 } 148 149 public void fromValues(ContentValues values) { 150 prefix = values.getAsString(StructuredName.PREFIX); 151 givenNames = values.getAsString(StructuredName.GIVEN_NAME); 152 middleName = values.getAsString(StructuredName.MIDDLE_NAME); 153 familyName = values.getAsString(StructuredName.FAMILY_NAME); 154 suffix = values.getAsString(StructuredName.SUFFIX); 155 156 Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE); 157 fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer; 158 159 phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME); 160 phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME); 161 phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME); 162 163 integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE); 164 phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer; 165 } 166 167 public void toValues(ContentValues values) { 168 putValueIfPresent(values, StructuredName.PREFIX, prefix); 169 putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames); 170 putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName); 171 putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName); 172 putValueIfPresent(values, StructuredName.SUFFIX, suffix); 173 values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle); 174 putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName); 175 putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName); 176 putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName); 177 values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle); 178 } 179 180 private void putValueIfPresent(ContentValues values, String name, String value) { 181 if (value != null) { 182 values.put(name, value); 183 } 184 } 185 186 public void clear() { 187 prefix = null; 188 givenNames = null; 189 middleName = null; 190 familyName = null; 191 suffix = null; 192 fullNameStyle = FullNameStyle.UNDEFINED; 193 phoneticFamilyName = null; 194 phoneticMiddleName = null; 195 phoneticGivenName = null; 196 phoneticNameStyle = PhoneticNameStyle.UNDEFINED; 197 } 198 199 public boolean isEmpty() { 200 return TextUtils.isEmpty(givenNames) 201 && TextUtils.isEmpty(middleName) 202 && TextUtils.isEmpty(familyName) 203 && TextUtils.isEmpty(suffix) 204 && TextUtils.isEmpty(phoneticFamilyName) 205 && TextUtils.isEmpty(phoneticMiddleName) 206 && TextUtils.isEmpty(phoneticGivenName); 207 } 208 209 @Override 210 public String toString() { 211 return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName 212 + " family: " + familyName + " suffix: " + suffix + " ph/given: " 213 + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: " 214 + phoneticFamilyName + "]"; 215 } 216 } 217 218 private static class NameTokenizer extends StringTokenizer { 219 private final String[] mTokens; 220 private int mDotBitmask; 221 private int mCommaBitmask; 222 private int mStartPointer; 223 private int mEndPointer; 224 225 public NameTokenizer(String fullName) { 226 super(fullName, " .,", true); 227 228 mTokens = new String[MAX_TOKENS]; 229 230 // Iterate over tokens, skipping over empty ones and marking tokens that 231 // are followed by dots. 232 while (hasMoreTokens() && mEndPointer < MAX_TOKENS) { 233 final String token = nextToken(); 234 if (token.length() > 0) { 235 final char c = token.charAt(0); 236 if (c == ' ') { 237 continue; 238 } 239 } 240 241 if (mEndPointer > 0 && token.charAt(0) == '.') { 242 mDotBitmask |= (1 << (mEndPointer - 1)); 243 } else if (mEndPointer > 0 && token.charAt(0) == ',') { 244 mCommaBitmask |= (1 << (mEndPointer - 1)); 245 } else { 246 mTokens[mEndPointer] = token; 247 mEndPointer++; 248 } 249 } 250 } 251 252 /** 253 * Returns true if the token is followed by a dot in the original full name. 254 */ 255 public boolean hasDot(int index) { 256 return (mDotBitmask & (1 << index)) != 0; 257 } 258 259 /** 260 * Returns true if the token is followed by a comma in the original full name. 261 */ 262 public boolean hasComma(int index) { 263 return (mCommaBitmask & (1 << index)) != 0; 264 } 265 } 266 267 /** 268 * Constructor. 269 * 270 * @param commonPrefixes comma-separated list of common prefixes, 271 * e.g. "Mr, Ms, Mrs" 272 * @param commonLastNamePrefixes comma-separated list of common last name prefixes, 273 * e.g. "d', st, st., von" 274 * @param commonSuffixes comma-separated list of common suffixes, 275 * e.g. "Jr, M.D., MD, D.D.S." 276 * @param commonConjunctions comma-separated list of common conjuctions, 277 * e.g. "AND, Or" 278 */ 279 public NameSplitter(String commonPrefixes, String commonLastNamePrefixes, 280 String commonSuffixes, String commonConjunctions, Locale locale) { 281 // TODO: refactor this to use <string-array> resources 282 mPrefixesSet = convertToSet(commonPrefixes); 283 mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes); 284 mSuffixesSet = convertToSet(commonSuffixes); 285 mConjuctions = convertToSet(commonConjunctions); 286 mLocale = locale != null ? locale : Locale.getDefault(); 287 mLanguage = mLocale.getLanguage().toLowerCase(); 288 289 int maxLength = 0; 290 for (String suffix : mSuffixesSet) { 291 if (suffix.length() > maxLength) { 292 maxLength = suffix.length(); 293 } 294 } 295 296 mMaxSuffixLength = maxLength; 297 } 298 299 /** 300 * Converts a comma-separated list of Strings to a set of Strings. Trims strings 301 * and converts them to upper case. 302 */ 303 private static HashSet<String> convertToSet(String strings) { 304 HashSet<String> set = new HashSet<String>(); 305 if (strings != null) { 306 String[] split = strings.split(","); 307 for (int i = 0; i < split.length; i++) { 308 set.add(split[i].trim().toUpperCase()); 309 } 310 } 311 return set; 312 } 313 314 /** 315 * Parses a full name and returns components as a list of tokens. 316 */ 317 public int tokenize(String[] tokens, String fullName) { 318 if (fullName == null) { 319 return 0; 320 } 321 322 NameTokenizer tokenizer = new NameTokenizer(fullName); 323 324 if (tokenizer.mStartPointer == tokenizer.mEndPointer) { 325 return 0; 326 } 327 328 String firstToken = tokenizer.mTokens[tokenizer.mStartPointer]; 329 if (mPrefixesSet.contains(firstToken.toUpperCase())) { 330 tokenizer.mStartPointer++; 331 } 332 int count = 0; 333 for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) { 334 tokens[count++] = tokenizer.mTokens[i]; 335 } 336 337 return count; 338 } 339 340 341 /** 342 * Parses a full name and returns parsed components in the Name object. 343 */ 344 public void split(Name name, String fullName) { 345 if (fullName == null) { 346 return; 347 } 348 349 int fullNameStyle = guessFullNameStyle(fullName); 350 if (fullNameStyle == FullNameStyle.CJK) { 351 fullNameStyle = getAdjustedFullNameStyle(fullNameStyle); 352 } 353 354 split(name, fullName, fullNameStyle); 355 } 356 357 /** 358 * Parses a full name and returns parsed components in the Name object 359 * with a given fullNameStyle. 360 */ 361 public void split(Name name, String fullName, int fullNameStyle) { 362 if (fullName == null) { 363 return; 364 } 365 366 name.fullNameStyle = fullNameStyle; 367 368 switch (fullNameStyle) { 369 case FullNameStyle.CHINESE: 370 splitChineseName(name, fullName); 371 break; 372 373 case FullNameStyle.JAPANESE: 374 splitJapaneseName(name, fullName); 375 break; 376 377 case FullNameStyle.KOREAN: 378 splitKoreanName(name, fullName); 379 break; 380 381 default: 382 splitWesternName(name, fullName); 383 } 384 } 385 386 /** 387 * Splits a full name composed according to the Western tradition: 388 * <pre> 389 * [prefix] given name(s) [[middle name] family name] [, suffix] 390 * [prefix] family name, given name [middle name] [,suffix] 391 * </pre> 392 */ 393 private void splitWesternName(Name name, String fullName) { 394 NameTokenizer tokens = new NameTokenizer(fullName); 395 parsePrefix(name, tokens); 396 397 // If the name consists of just one or two tokens, treat them as first/last name, 398 // not as suffix. Example: John Ma; Ma is last name, not "M.A.". 399 if (tokens.mEndPointer > 2) { 400 parseSuffix(name, tokens); 401 } 402 403 if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) { 404 name.givenNames = tokens.mTokens[tokens.mStartPointer]; 405 } else { 406 parseLastName(name, tokens); 407 parseMiddleName(name, tokens); 408 parseGivenNames(name, tokens); 409 } 410 } 411 412 /** 413 * Splits a full name composed according to the Chinese tradition: 414 * <pre> 415 * [family name [middle name]] given name 416 * </pre> 417 */ 418 private void splitChineseName(Name name, String fullName) { 419 StringTokenizer tokenizer = new StringTokenizer(fullName); 420 while (tokenizer.hasMoreTokens()) { 421 String token = tokenizer.nextToken(); 422 if (name.givenNames == null) { 423 name.givenNames = token; 424 } else if (name.familyName == null) { 425 name.familyName = name.givenNames; 426 name.givenNames = token; 427 } else if (name.middleName == null) { 428 name.middleName = name.givenNames; 429 name.givenNames = token; 430 } else { 431 name.middleName = name.middleName + name.givenNames; 432 name.givenNames = token; 433 } 434 } 435 436 // If a single word parse that word up. 437 if (name.givenNames != null && name.familyName == null && name.middleName == null) { 438 int length = fullName.length(); 439 if (length == 2) { 440 name.familyName = fullName.substring(0, 1); 441 name.givenNames = fullName.substring(1); 442 } else if (length == 3) { 443 name.familyName = fullName.substring(0, 1); 444 name.middleName = fullName.substring(1, 2); 445 name.givenNames = fullName.substring(2); 446 } else if (length == 4) { 447 name.familyName = fullName.substring(0, 2); 448 name.middleName = fullName.substring(2, 3); 449 name.givenNames = fullName.substring(3); 450 } 451 452 } 453 } 454 455 /** 456 * Splits a full name composed according to the Japanese tradition: 457 * <pre> 458 * [family name] given name(s) 459 * </pre> 460 */ 461 private void splitJapaneseName(Name name, String fullName) { 462 StringTokenizer tokenizer = new StringTokenizer(fullName); 463 while (tokenizer.hasMoreTokens()) { 464 String token = tokenizer.nextToken(); 465 if (name.givenNames == null) { 466 name.givenNames = token; 467 } else if (name.familyName == null) { 468 name.familyName = name.givenNames; 469 name.givenNames = token; 470 } else { 471 name.givenNames += " " + token; 472 } 473 } 474 } 475 476 /** 477 * Splits a full name composed according to the Korean tradition: 478 * <pre> 479 * [family name] given name(s) 480 * </pre> 481 */ 482 private void splitKoreanName(Name name, String fullName) { 483 StringTokenizer tokenizer = new StringTokenizer(fullName); 484 if (tokenizer.countTokens() > 1) { 485 // Each name can be identified by separators. 486 while (tokenizer.hasMoreTokens()) { 487 String token = tokenizer.nextToken(); 488 if (name.givenNames == null) { 489 name.givenNames = token; 490 } else if (name.familyName == null) { 491 name.familyName = name.givenNames; 492 name.givenNames = token; 493 } else { 494 name.givenNames += " " + token; 495 } 496 } 497 } else { 498 // There is no separator. Try to guess family name. 499 // The length of most family names is 1. 500 int familyNameLength = 1; 501 502 // Compare with 2-length family names. 503 for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) { 504 if (fullName.startsWith(twoLengthFamilyName)) { 505 familyNameLength = 2; 506 break; 507 } 508 } 509 510 name.familyName = fullName.substring(0, familyNameLength); 511 if (fullName.length() > familyNameLength) { 512 name.givenNames = fullName.substring(familyNameLength); 513 } 514 } 515 } 516 517 /** 518 * Concatenates components of a name according to the rules dictated by the name style. 519 * 520 * @param givenNameFirst is ignored for CJK display name styles 521 */ 522 public String join(Name name, boolean givenNameFirst, boolean includePrefix) { 523 String prefix = includePrefix ? name.prefix : null; 524 switch (name.fullNameStyle) { 525 case FullNameStyle.CJK: 526 case FullNameStyle.CHINESE: 527 case FullNameStyle.KOREAN: 528 return join(prefix, name.familyName, name.middleName, name.givenNames, 529 name.suffix, false, false, false); 530 531 case FullNameStyle.JAPANESE: 532 return join(prefix, name.familyName, name.middleName, name.givenNames, 533 name.suffix, true, false, false); 534 535 default: 536 if (givenNameFirst) { 537 return join(prefix, name.givenNames, name.middleName, name.familyName, 538 name.suffix, true, false, true); 539 } else { 540 return join(prefix, name.familyName, name.givenNames, name.middleName, 541 name.suffix, true, true, true); 542 } 543 } 544 } 545 546 /** 547 * Concatenates components of the phonetic name following the CJK tradition: 548 * family name + middle name + given name(s). 549 */ 550 public String joinPhoneticName(Name name) { 551 return join(null, name.phoneticFamilyName, 552 name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false); 553 } 554 555 /** 556 * Concatenates parts of a full name inserting spaces and commas as specified. 557 */ 558 private String join(String prefix, String part1, String part2, String part3, String suffix, 559 boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) { 560 prefix = prefix == null ? null: prefix.trim(); 561 part1 = part1 == null ? null: part1.trim(); 562 part2 = part2 == null ? null: part2.trim(); 563 part3 = part3 == null ? null: part3.trim(); 564 suffix = suffix == null ? null: suffix.trim(); 565 566 boolean hasPrefix = !TextUtils.isEmpty(prefix); 567 boolean hasPart1 = !TextUtils.isEmpty(part1); 568 boolean hasPart2 = !TextUtils.isEmpty(part2); 569 boolean hasPart3 = !TextUtils.isEmpty(part3); 570 boolean hasSuffix = !TextUtils.isEmpty(suffix); 571 572 boolean isSingleWord = true; 573 String singleWord = null; 574 575 if (hasPrefix) { 576 singleWord = prefix; 577 } 578 579 if (hasPart1) { 580 if (singleWord != null) { 581 isSingleWord = false; 582 } else { 583 singleWord = part1; 584 } 585 } 586 587 if (hasPart2) { 588 if (singleWord != null) { 589 isSingleWord = false; 590 } else { 591 singleWord = part2; 592 } 593 } 594 595 if (hasPart3) { 596 if (singleWord != null) { 597 isSingleWord = false; 598 } else { 599 singleWord = part3; 600 } 601 } 602 603 if (hasSuffix) { 604 if (singleWord != null) { 605 isSingleWord = false; 606 } else { 607 singleWord = normalizedSuffix(suffix); 608 } 609 } 610 611 if (isSingleWord) { 612 return singleWord; 613 } 614 615 StringBuilder sb = new StringBuilder(); 616 617 if (hasPrefix) { 618 sb.append(prefix); 619 } 620 621 if (hasPart1) { 622 if (hasPrefix) { 623 sb.append(' '); 624 } 625 sb.append(part1); 626 } 627 628 if (hasPart2) { 629 if (hasPrefix || hasPart1) { 630 if (useCommaAfterPart1) { 631 sb.append(','); 632 } 633 if (useSpace) { 634 sb.append(' '); 635 } 636 } 637 sb.append(part2); 638 } 639 640 if (hasPart3) { 641 if (hasPrefix || hasPart1 || hasPart2) { 642 if (useSpace) { 643 sb.append(' '); 644 } 645 } 646 sb.append(part3); 647 } 648 649 if (hasSuffix) { 650 if (hasPrefix || hasPart1 || hasPart2 || hasPart3) { 651 if (useCommaAfterPart3) { 652 sb.append(','); 653 } 654 if (useSpace) { 655 sb.append(' '); 656 } 657 } 658 sb.append(normalizedSuffix(suffix)); 659 } 660 661 return sb.toString(); 662 } 663 664 /** 665 * Puts a dot after the supplied suffix if that is the accepted form of the suffix, 666 * e.g. "Jr." and "Sr.", but not "I", "II" and "III". 667 */ 668 private String normalizedSuffix(String suffix) { 669 int length = suffix.length(); 670 if (length == 0 || suffix.charAt(length - 1) == '.') { 671 return suffix; 672 } 673 674 String withDot = suffix + '.'; 675 if (mSuffixesSet.contains(withDot.toUpperCase())) { 676 return withDot; 677 } else { 678 return suffix; 679 } 680 } 681 682 /** 683 * If the supplied name style is undefined, returns a default based on the language, 684 * otherwise returns the supplied name style itself. 685 * 686 * @param nameStyle See {@link FullNameStyle}. 687 */ 688 public int getAdjustedFullNameStyle(int nameStyle) { 689 if (nameStyle == FullNameStyle.UNDEFINED) { 690 if (JAPANESE_LANGUAGE.equals(mLanguage)) { 691 return FullNameStyle.JAPANESE; 692 } else if (KOREAN_LANGUAGE.equals(mLanguage)) { 693 return FullNameStyle.KOREAN; 694 } else if (CHINESE_LANGUAGE.equals(mLanguage)) { 695 return FullNameStyle.CHINESE; 696 } else { 697 return FullNameStyle.WESTERN; 698 } 699 } else if (nameStyle == FullNameStyle.CJK) { 700 if (JAPANESE_LANGUAGE.equals(mLanguage)) { 701 return FullNameStyle.JAPANESE; 702 } else if (KOREAN_LANGUAGE.equals(mLanguage)) { 703 return FullNameStyle.KOREAN; 704 } else { 705 return FullNameStyle.CHINESE; 706 } 707 } 708 return nameStyle; 709 } 710 711 /** 712 * Parses the first word from the name if it is a prefix. 713 */ 714 private void parsePrefix(Name name, NameTokenizer tokens) { 715 if (tokens.mStartPointer == tokens.mEndPointer) { 716 return; 717 } 718 719 String firstToken = tokens.mTokens[tokens.mStartPointer]; 720 if (mPrefixesSet.contains(firstToken.toUpperCase())) { 721 if (tokens.hasDot(tokens.mStartPointer)) { 722 firstToken += '.'; 723 } 724 name.prefix = firstToken; 725 tokens.mStartPointer++; 726 } 727 } 728 729 /** 730 * Parses the last word(s) from the name if it is a suffix. 731 */ 732 private void parseSuffix(Name name, NameTokenizer tokens) { 733 if (tokens.mStartPointer == tokens.mEndPointer) { 734 return; 735 } 736 737 String lastToken = tokens.mTokens[tokens.mEndPointer - 1]; 738 739 // Take care of an explicit comma-separated suffix 740 if (tokens.mEndPointer - tokens.mStartPointer > 2 741 && tokens.hasComma(tokens.mEndPointer - 2)) { 742 if (tokens.hasDot(tokens.mEndPointer - 1)) { 743 lastToken += '.'; 744 } 745 name.suffix = lastToken; 746 tokens.mEndPointer--; 747 return; 748 } 749 750 if (lastToken.length() > mMaxSuffixLength) { 751 return; 752 } 753 754 String normalized = lastToken.toUpperCase(); 755 if (mSuffixesSet.contains(normalized)) { 756 name.suffix = lastToken; 757 tokens.mEndPointer--; 758 return; 759 } 760 761 if (tokens.hasDot(tokens.mEndPointer - 1)) { 762 lastToken += '.'; 763 } 764 normalized += "."; 765 766 // Take care of suffixes like M.D. and D.D.S. 767 int pos = tokens.mEndPointer - 1; 768 while (normalized.length() <= mMaxSuffixLength) { 769 770 if (mSuffixesSet.contains(normalized)) { 771 name.suffix = lastToken; 772 tokens.mEndPointer = pos; 773 return; 774 } 775 776 if (pos == tokens.mStartPointer) { 777 break; 778 } 779 780 pos--; 781 if (tokens.hasDot(pos)) { 782 lastToken = tokens.mTokens[pos] + "." + lastToken; 783 } else { 784 lastToken = tokens.mTokens[pos] + " " + lastToken; 785 } 786 787 normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized; 788 } 789 } 790 791 private void parseLastName(Name name, NameTokenizer tokens) { 792 if (tokens.mStartPointer == tokens.mEndPointer) { 793 return; 794 } 795 796 // If the first word is followed by a comma, assume that it's the family name 797 if (tokens.hasComma(tokens.mStartPointer)) { 798 name.familyName = tokens.mTokens[tokens.mStartPointer]; 799 tokens.mStartPointer++; 800 return; 801 } 802 803 // If the second word is followed by a comma and the first word 804 // is a last name prefix as in "de Sade" and "von Cliburn", treat 805 // the first two words as the family name. 806 if (tokens.mStartPointer + 1 < tokens.mEndPointer 807 && tokens.hasComma(tokens.mStartPointer + 1) 808 && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) { 809 String familyNamePrefix = tokens.mTokens[tokens.mStartPointer]; 810 if (tokens.hasDot(tokens.mStartPointer)) { 811 familyNamePrefix += '.'; 812 } 813 name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1]; 814 tokens.mStartPointer += 2; 815 return; 816 } 817 818 // Finally, assume that the last word is the last name 819 name.familyName = tokens.mTokens[tokens.mEndPointer - 1]; 820 tokens.mEndPointer--; 821 822 // Take care of last names like "de Sade" and "von Cliburn" 823 if ((tokens.mEndPointer - tokens.mStartPointer) > 0) { 824 String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1]; 825 if (isFamilyNamePrefix(lastNamePrefix)) { 826 if (tokens.hasDot(tokens.mEndPointer - 1)) { 827 lastNamePrefix += '.'; 828 } 829 name.familyName = lastNamePrefix + " " + name.familyName; 830 tokens.mEndPointer--; 831 } 832 } 833 } 834 835 /** 836 * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de" 837 */ 838 private boolean isFamilyNamePrefix(String word) { 839 final String normalized = word.toUpperCase(); 840 841 return mLastNamePrefixesSet.contains(normalized) 842 || mLastNamePrefixesSet.contains(normalized + "."); 843 } 844 845 846 private void parseMiddleName(Name name, NameTokenizer tokens) { 847 if (tokens.mStartPointer == tokens.mEndPointer) { 848 return; 849 } 850 851 if ((tokens.mEndPointer - tokens.mStartPointer) > 1) { 852 if ((tokens.mEndPointer - tokens.mStartPointer) == 2 853 || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2]. 854 toUpperCase())) { 855 name.middleName = tokens.mTokens[tokens.mEndPointer - 1]; 856 if (tokens.hasDot(tokens.mEndPointer - 1)) { 857 name.middleName += '.'; 858 } 859 tokens.mEndPointer--; 860 } 861 } 862 } 863 864 private void parseGivenNames(Name name, NameTokenizer tokens) { 865 if (tokens.mStartPointer == tokens.mEndPointer) { 866 return; 867 } 868 869 if ((tokens.mEndPointer - tokens.mStartPointer) == 1) { 870 name.givenNames = tokens.mTokens[tokens.mStartPointer]; 871 } else { 872 StringBuilder sb = new StringBuilder(); 873 for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) { 874 if (i != tokens.mStartPointer) { 875 sb.append(' '); 876 } 877 sb.append(tokens.mTokens[i]); 878 if (tokens.hasDot(i)) { 879 sb.append('.'); 880 } 881 } 882 name.givenNames = sb.toString(); 883 } 884 } 885 886 /** 887 * Makes the best guess at the expected full name style based on the character set 888 * used in the supplied name. If the phonetic name is also supplied, tries to 889 * differentiate between Chinese, Japanese and Korean based on the alphabet used 890 * for the phonetic name. 891 */ 892 public void guessNameStyle(Name name) { 893 guessFullNameStyle(name); 894 guessPhoneticNameStyle(name); 895 name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle, 896 name.phoneticNameStyle); 897 } 898 899 /** 900 * Updates the display name style according to the phonetic name style if we 901 * were unsure about display name style based on the name components, but 902 * phonetic name makes it more definitive. 903 */ 904 public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) { 905 if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { 906 if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) { 907 if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) { 908 return FullNameStyle.JAPANESE; 909 } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) { 910 return FullNameStyle.KOREAN; 911 } 912 if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) { 913 return FullNameStyle.CHINESE; 914 } 915 } 916 } 917 return nameStyle; 918 } 919 920 /** 921 * Makes the best guess at the expected full name style based on the character set 922 * used in the supplied name. 923 */ 924 private void guessFullNameStyle(NameSplitter.Name name) { 925 if (name.fullNameStyle != FullNameStyle.UNDEFINED) { 926 return; 927 } 928 929 int bestGuess = guessFullNameStyle(name.givenNames); 930 // A mix of Hanzi and latin chars are common in China, so we have to go through all names 931 // if the name is not JANPANESE or KOREAN. 932 if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK 933 && bestGuess != FullNameStyle.WESTERN) { 934 name.fullNameStyle = bestGuess; 935 return; 936 } 937 938 int guess = guessFullNameStyle(name.familyName); 939 if (guess != FullNameStyle.UNDEFINED) { 940 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 941 name.fullNameStyle = guess; 942 return; 943 } 944 bestGuess = guess; 945 } 946 947 guess = guessFullNameStyle(name.middleName); 948 if (guess != FullNameStyle.UNDEFINED) { 949 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 950 name.fullNameStyle = guess; 951 return; 952 } 953 bestGuess = guess; 954 } 955 956 guess = guessFullNameStyle(name.prefix); 957 if (guess != FullNameStyle.UNDEFINED) { 958 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 959 name.fullNameStyle = guess; 960 return; 961 } 962 bestGuess = guess; 963 } 964 965 guess = guessFullNameStyle(name.suffix); 966 if (guess != FullNameStyle.UNDEFINED) { 967 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 968 name.fullNameStyle = guess; 969 return; 970 } 971 bestGuess = guess; 972 } 973 974 name.fullNameStyle = bestGuess; 975 } 976 977 public int guessFullNameStyle(String name) { 978 if (name == null) { 979 return FullNameStyle.UNDEFINED; 980 } 981 982 int nameStyle = FullNameStyle.UNDEFINED; 983 int length = name.length(); 984 int offset = 0; 985 while (offset < length) { 986 int codePoint = Character.codePointAt(name, offset); 987 if (Character.isLetter(codePoint)) { 988 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 989 990 if (!isLatinUnicodeBlock(unicodeBlock)) { 991 992 if (isCJKUnicodeBlock(unicodeBlock)) { 993 // We don't know if this is Chinese, Japanese or Korean - 994 // trying to figure out by looking at other characters in the name 995 return guessCJKNameStyle(name, offset + Character.charCount(codePoint)); 996 } 997 998 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 999 return FullNameStyle.JAPANESE; 1000 } 1001 1002 if (isKoreanUnicodeBlock(unicodeBlock)) { 1003 return FullNameStyle.KOREAN; 1004 } 1005 } 1006 nameStyle = FullNameStyle.WESTERN; 1007 } 1008 offset += Character.charCount(codePoint); 1009 } 1010 return nameStyle; 1011 } 1012 1013 private int guessCJKNameStyle(String name, int offset) { 1014 int length = name.length(); 1015 while (offset < length) { 1016 int codePoint = Character.codePointAt(name, offset); 1017 if (Character.isLetter(codePoint)) { 1018 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 1019 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 1020 return FullNameStyle.JAPANESE; 1021 } 1022 if (isKoreanUnicodeBlock(unicodeBlock)) { 1023 return FullNameStyle.KOREAN; 1024 } 1025 } 1026 offset += Character.charCount(codePoint); 1027 } 1028 1029 return FullNameStyle.CJK; 1030 } 1031 1032 private void guessPhoneticNameStyle(NameSplitter.Name name) { 1033 if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { 1034 return; 1035 } 1036 1037 int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName); 1038 if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) { 1039 name.phoneticNameStyle = bestGuess; 1040 return; 1041 } 1042 1043 int guess = guessPhoneticNameStyle(name.phoneticGivenName); 1044 if (guess != FullNameStyle.UNDEFINED) { 1045 if (guess != FullNameStyle.CJK) { 1046 name.phoneticNameStyle = guess; 1047 return; 1048 } 1049 bestGuess = guess; 1050 } 1051 1052 guess = guessPhoneticNameStyle(name.phoneticMiddleName); 1053 if (guess != FullNameStyle.UNDEFINED) { 1054 if (guess != FullNameStyle.CJK) { 1055 name.phoneticNameStyle = guess; 1056 return; 1057 } 1058 bestGuess = guess; 1059 } 1060 } 1061 1062 public int guessPhoneticNameStyle(String name) { 1063 if (name == null) { 1064 return PhoneticNameStyle.UNDEFINED; 1065 } 1066 1067 int nameStyle = PhoneticNameStyle.UNDEFINED; 1068 int length = name.length(); 1069 int offset = 0; 1070 while (offset < length) { 1071 int codePoint = Character.codePointAt(name, offset); 1072 if (Character.isLetter(codePoint)) { 1073 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 1074 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 1075 return PhoneticNameStyle.JAPANESE; 1076 } 1077 if (isKoreanUnicodeBlock(unicodeBlock)) { 1078 return PhoneticNameStyle.KOREAN; 1079 } 1080 if (isLatinUnicodeBlock(unicodeBlock)) { 1081 return PhoneticNameStyle.PINYIN; 1082 } 1083 } 1084 offset += Character.charCount(codePoint); 1085 } 1086 1087 return nameStyle; 1088 } 1089 1090 private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) { 1091 return unicodeBlock == UnicodeBlock.BASIC_LATIN || 1092 unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT || 1093 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A || 1094 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B || 1095 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL; 1096 } 1097 1098 private static boolean isCJKUnicodeBlock(UnicodeBlock block) { 1099 return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 1100 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1101 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1102 || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION 1103 || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT 1104 || block == UnicodeBlock.CJK_COMPATIBILITY 1105 || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS 1106 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 1107 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT; 1108 } 1109 1110 private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) { 1111 return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES || 1112 unicodeBlock == UnicodeBlock.HANGUL_JAMO || 1113 unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO; 1114 } 1115 1116 private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) { 1117 return unicodeBlock == UnicodeBlock.KATAKANA || 1118 unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS || 1119 unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS || 1120 unicodeBlock == UnicodeBlock.HIRAGANA; 1121 } 1122 } 1123