1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts; 17 18 import android.content.ContentValues; 19 import android.provider.ContactsContract.CommonDataKinds.StructuredName; 20 import android.provider.ContactsContract.FullNameStyle; 21 import android.provider.ContactsContract.PhoneticNameStyle; 22 import android.text.TextUtils; 23 24 import com.android.providers.contacts.util.NeededForTesting; 25 26 import java.lang.Character.UnicodeBlock; 27 import java.util.HashSet; 28 import java.util.Locale; 29 import java.util.StringTokenizer; 30 31 /** 32 * The purpose of this class is to split a full name into given names and last 33 * name. The logic only supports having a single last name. If the full name has 34 * multiple last names the output will be incorrect. 35 * <p> 36 * Core algorithm: 37 * <ol> 38 * <li>Remove the suffixes (III, Ph.D., M.D.).</li> 39 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li> 40 * <li>Assign the last remaining token as the last name.</li> 41 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use 42 * this word also as the last name.</li> 43 * <li>Assign the rest of the words as the "given names".</li> 44 * </ol> 45 */ 46 public class NameSplitter { 47 48 public static final int MAX_TOKENS = 10; 49 50 private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase(); 51 private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase(); 52 53 // This includes simplified and traditional Chinese 54 private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase(); 55 56 private final HashSet<String> mPrefixesSet; 57 private final HashSet<String> mSuffixesSet; 58 private final int mMaxSuffixLength; 59 private final HashSet<String> mLastNamePrefixesSet; 60 private final HashSet<String> mConjuctions; 61 private final Locale mLocale; 62 private final String mLanguage; 63 64 /** 65 * Two-Chracter long Korean family names. 66 * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1 67 */ 68 private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = { 69 "\uAC15\uC804", // Gang Jeon 70 "\uB0A8\uAD81", // Nam Goong 71 "\uB3C5\uACE0", // Dok Go 72 "\uB3D9\uBC29", // Dong Bang 73 "\uB9DD\uC808", // Mang Jeol 74 "\uC0AC\uACF5", // Sa Gong 75 "\uC11C\uBB38", // Seo Moon 76 "\uC120\uC6B0", // Seon Woo 77 "\uC18C\uBD09", // So Bong 78 "\uC5B4\uAE08", // Uh Geum 79 "\uC7A5\uACE1", // Jang Gok 80 "\uC81C\uAC08", // Je Gal 81 "\uD669\uBCF4" // Hwang Bo 82 }; 83 84 public static class Name { 85 public String prefix; 86 public String givenNames; 87 public String middleName; 88 public String familyName; 89 public String suffix; 90 91 public int fullNameStyle; 92 93 public String phoneticFamilyName; 94 public String phoneticMiddleName; 95 public String phoneticGivenName; 96 97 public int phoneticNameStyle; 98 99 public Name() { 100 } 101 102 public Name(String prefix, String givenNames, String middleName, String familyName, 103 String suffix) { 104 this.prefix = prefix; 105 this.givenNames = givenNames; 106 this.middleName = middleName; 107 this.familyName = familyName; 108 this.suffix = suffix; 109 } 110 111 @NeededForTesting 112 public String getPrefix() { 113 return prefix; 114 } 115 116 public String getGivenNames() { 117 return givenNames; 118 } 119 120 public String getMiddleName() { 121 return middleName; 122 } 123 124 public String getFamilyName() { 125 return familyName; 126 } 127 128 @NeededForTesting 129 public String getSuffix() { 130 return suffix; 131 } 132 133 public int getFullNameStyle() { 134 return fullNameStyle; 135 } 136 137 public String getPhoneticFamilyName() { 138 return phoneticFamilyName; 139 } 140 141 public String getPhoneticMiddleName() { 142 return phoneticMiddleName; 143 } 144 145 public String getPhoneticGivenName() { 146 return phoneticGivenName; 147 } 148 149 public int getPhoneticNameStyle() { 150 return phoneticNameStyle; 151 } 152 153 public void fromValues(ContentValues values) { 154 prefix = values.getAsString(StructuredName.PREFIX); 155 givenNames = values.getAsString(StructuredName.GIVEN_NAME); 156 middleName = values.getAsString(StructuredName.MIDDLE_NAME); 157 familyName = values.getAsString(StructuredName.FAMILY_NAME); 158 suffix = values.getAsString(StructuredName.SUFFIX); 159 160 Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE); 161 fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer; 162 163 phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME); 164 phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME); 165 phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME); 166 167 integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE); 168 phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer; 169 } 170 171 public void toValues(ContentValues values) { 172 putValueIfPresent(values, StructuredName.PREFIX, prefix); 173 putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames); 174 putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName); 175 putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName); 176 putValueIfPresent(values, StructuredName.SUFFIX, suffix); 177 values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle); 178 putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName); 179 putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName); 180 putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName); 181 values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle); 182 } 183 184 private void putValueIfPresent(ContentValues values, String name, String value) { 185 if (value != null) { 186 values.put(name, value); 187 } 188 } 189 190 public void clear() { 191 prefix = null; 192 givenNames = null; 193 middleName = null; 194 familyName = null; 195 suffix = null; 196 fullNameStyle = FullNameStyle.UNDEFINED; 197 phoneticFamilyName = null; 198 phoneticMiddleName = null; 199 phoneticGivenName = null; 200 phoneticNameStyle = PhoneticNameStyle.UNDEFINED; 201 } 202 203 public boolean isEmpty() { 204 return TextUtils.isEmpty(givenNames) 205 && TextUtils.isEmpty(middleName) 206 && TextUtils.isEmpty(familyName) 207 && TextUtils.isEmpty(suffix) 208 && TextUtils.isEmpty(phoneticFamilyName) 209 && TextUtils.isEmpty(phoneticMiddleName) 210 && TextUtils.isEmpty(phoneticGivenName); 211 } 212 213 @Override 214 public String toString() { 215 return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName 216 + " family: " + familyName + " suffix: " + suffix + " ph/given: " 217 + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: " 218 + phoneticFamilyName + "]"; 219 } 220 } 221 222 private static class NameTokenizer extends StringTokenizer { 223 private final String[] mTokens; 224 private int mDotBitmask; 225 private int mCommaBitmask; 226 private int mStartPointer; 227 private int mEndPointer; 228 229 public NameTokenizer(String fullName) { 230 super(fullName, " .,", true); 231 232 mTokens = new String[MAX_TOKENS]; 233 234 // Iterate over tokens, skipping over empty ones and marking tokens that 235 // are followed by dots. 236 while (hasMoreTokens() && mEndPointer < MAX_TOKENS) { 237 final String token = nextToken(); 238 if (token.length() > 0) { 239 final char c = token.charAt(0); 240 if (c == ' ') { 241 continue; 242 } 243 } 244 245 if (mEndPointer > 0 && token.charAt(0) == '.') { 246 mDotBitmask |= (1 << (mEndPointer - 1)); 247 } else if (mEndPointer > 0 && token.charAt(0) == ',') { 248 mCommaBitmask |= (1 << (mEndPointer - 1)); 249 } else { 250 mTokens[mEndPointer] = token; 251 mEndPointer++; 252 } 253 } 254 } 255 256 /** 257 * Returns true if the token is followed by a dot in the original full name. 258 */ 259 public boolean hasDot(int index) { 260 return (mDotBitmask & (1 << index)) != 0; 261 } 262 263 /** 264 * Returns true if the token is followed by a comma in the original full name. 265 */ 266 public boolean hasComma(int index) { 267 return (mCommaBitmask & (1 << index)) != 0; 268 } 269 } 270 271 /** 272 * Constructor. 273 * 274 * @param commonPrefixes comma-separated list of common prefixes, 275 * e.g. "Mr, Ms, Mrs" 276 * @param commonLastNamePrefixes comma-separated list of common last name prefixes, 277 * e.g. "d', st, st., von" 278 * @param commonSuffixes comma-separated list of common suffixes, 279 * e.g. "Jr, M.D., MD, D.D.S." 280 * @param commonConjunctions comma-separated list of common conjuctions, 281 * e.g. "AND, Or" 282 */ 283 public NameSplitter(String commonPrefixes, String commonLastNamePrefixes, 284 String commonSuffixes, String commonConjunctions, Locale locale) { 285 // TODO: refactor this to use <string-array> resources 286 mPrefixesSet = convertToSet(commonPrefixes); 287 mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes); 288 mSuffixesSet = convertToSet(commonSuffixes); 289 mConjuctions = convertToSet(commonConjunctions); 290 mLocale = locale != null ? locale : Locale.getDefault(); 291 mLanguage = mLocale.getLanguage().toLowerCase(); 292 293 int maxLength = 0; 294 for (String suffix : mSuffixesSet) { 295 if (suffix.length() > maxLength) { 296 maxLength = suffix.length(); 297 } 298 } 299 300 mMaxSuffixLength = maxLength; 301 } 302 303 /** 304 * Converts a comma-separated list of Strings to a set of Strings. Trims strings 305 * and converts them to upper case. 306 */ 307 private static HashSet<String> convertToSet(String strings) { 308 HashSet<String> set = new HashSet<String>(); 309 if (strings != null) { 310 String[] split = strings.split(","); 311 for (int i = 0; i < split.length; i++) { 312 set.add(split[i].trim().toUpperCase()); 313 } 314 } 315 return set; 316 } 317 318 /** 319 * Parses a full name and returns components as a list of tokens. 320 */ 321 public int tokenize(String[] tokens, String fullName) { 322 if (fullName == null) { 323 return 0; 324 } 325 326 NameTokenizer tokenizer = new NameTokenizer(fullName); 327 328 if (tokenizer.mStartPointer == tokenizer.mEndPointer) { 329 return 0; 330 } 331 332 String firstToken = tokenizer.mTokens[tokenizer.mStartPointer]; 333 int count = 0; 334 for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) { 335 tokens[count++] = tokenizer.mTokens[i]; 336 } 337 338 return count; 339 } 340 341 342 /** 343 * Parses a full name and returns parsed components in the Name object. 344 */ 345 public void split(Name name, String fullName) { 346 if (fullName == null) { 347 return; 348 } 349 350 int fullNameStyle = guessFullNameStyle(fullName); 351 if (fullNameStyle == FullNameStyle.CJK) { 352 fullNameStyle = getAdjustedFullNameStyle(fullNameStyle); 353 } 354 355 split(name, fullName, fullNameStyle); 356 } 357 358 /** 359 * Parses a full name and returns parsed components in the Name object 360 * with a given fullNameStyle. 361 */ 362 public void split(Name name, String fullName, int fullNameStyle) { 363 if (fullName == null) { 364 return; 365 } 366 367 name.fullNameStyle = fullNameStyle; 368 369 switch (fullNameStyle) { 370 case FullNameStyle.CHINESE: 371 splitChineseName(name, fullName); 372 break; 373 374 case FullNameStyle.JAPANESE: 375 splitJapaneseName(name, fullName); 376 break; 377 378 case FullNameStyle.KOREAN: 379 splitKoreanName(name, fullName); 380 break; 381 382 default: 383 splitWesternName(name, fullName); 384 } 385 } 386 387 /** 388 * Splits a full name composed according to the Western tradition: 389 * <pre> 390 * [prefix] given name(s) [[middle name] family name] [, suffix] 391 * [prefix] family name, given name [middle name] [,suffix] 392 * </pre> 393 */ 394 private void splitWesternName(Name name, String fullName) { 395 NameTokenizer tokens = new NameTokenizer(fullName); 396 parsePrefix(name, tokens); 397 398 // If the name consists of just one or two tokens, treat them as first/last name, 399 // not as suffix. Example: John Ma; Ma is last name, not "M.A.". 400 if (tokens.mEndPointer > 2) { 401 parseSuffix(name, tokens); 402 } 403 404 if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) { 405 name.givenNames = tokens.mTokens[tokens.mStartPointer]; 406 } else { 407 parseLastName(name, tokens); 408 parseMiddleName(name, tokens); 409 parseGivenNames(name, tokens); 410 } 411 } 412 413 /** 414 * Splits a full name composed according to the Chinese tradition: 415 * <pre> 416 * [family name [middle name]] given name 417 * </pre> 418 */ 419 private void splitChineseName(Name name, String fullName) { 420 StringTokenizer tokenizer = new StringTokenizer(fullName); 421 while (tokenizer.hasMoreTokens()) { 422 String token = tokenizer.nextToken(); 423 if (name.givenNames == null) { 424 name.givenNames = token; 425 } else if (name.familyName == null) { 426 name.familyName = name.givenNames; 427 name.givenNames = token; 428 } else if (name.middleName == null) { 429 name.middleName = name.givenNames; 430 name.givenNames = token; 431 } else { 432 name.middleName = name.middleName + name.givenNames; 433 name.givenNames = token; 434 } 435 } 436 437 // If a single word parse that word up. 438 if (name.givenNames != null && name.familyName == null && name.middleName == null) { 439 int length = fullName.length(); 440 if (length == 2) { 441 name.familyName = fullName.substring(0, 1); 442 name.givenNames = fullName.substring(1); 443 } else if (length == 3) { 444 name.familyName = fullName.substring(0, 1); 445 name.middleName = fullName.substring(1, 2); 446 name.givenNames = fullName.substring(2); 447 } else if (length == 4) { 448 name.familyName = fullName.substring(0, 2); 449 name.middleName = fullName.substring(2, 3); 450 name.givenNames = fullName.substring(3); 451 } 452 453 } 454 } 455 456 /** 457 * Splits a full name composed according to the Japanese tradition: 458 * <pre> 459 * [family name] given name(s) 460 * </pre> 461 */ 462 private void splitJapaneseName(Name name, String fullName) { 463 StringTokenizer tokenizer = new StringTokenizer(fullName); 464 while (tokenizer.hasMoreTokens()) { 465 String token = tokenizer.nextToken(); 466 if (name.givenNames == null) { 467 name.givenNames = token; 468 } else if (name.familyName == null) { 469 name.familyName = name.givenNames; 470 name.givenNames = token; 471 } else { 472 name.givenNames += " " + token; 473 } 474 } 475 } 476 477 /** 478 * Splits a full name composed according to the Korean tradition: 479 * <pre> 480 * [family name] given name(s) 481 * </pre> 482 */ 483 private void splitKoreanName(Name name, String fullName) { 484 StringTokenizer tokenizer = new StringTokenizer(fullName); 485 if (tokenizer.countTokens() > 1) { 486 // Each name can be identified by separators. 487 while (tokenizer.hasMoreTokens()) { 488 String token = tokenizer.nextToken(); 489 if (name.givenNames == null) { 490 name.givenNames = token; 491 } else if (name.familyName == null) { 492 name.familyName = name.givenNames; 493 name.givenNames = token; 494 } else { 495 name.givenNames += " " + token; 496 } 497 } 498 } else { 499 // There is no separator. Try to guess family name. 500 // The length of most family names is 1. 501 int familyNameLength = 1; 502 503 // Compare with 2-length family names. 504 for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) { 505 if (fullName.startsWith(twoLengthFamilyName)) { 506 familyNameLength = 2; 507 break; 508 } 509 } 510 511 name.familyName = fullName.substring(0, familyNameLength); 512 if (fullName.length() > familyNameLength) { 513 name.givenNames = fullName.substring(familyNameLength); 514 } 515 } 516 } 517 518 /** 519 * Concatenates components of a name according to the rules dictated by the name style. 520 * 521 * @param givenNameFirst is ignored for CJK display name styles 522 */ 523 public String join(Name name, boolean givenNameFirst, boolean includePrefix) { 524 String prefix = includePrefix ? name.prefix : null; 525 switch (name.fullNameStyle) { 526 case FullNameStyle.CJK: 527 case FullNameStyle.CHINESE: 528 case FullNameStyle.KOREAN: 529 return join(prefix, name.familyName, name.middleName, name.givenNames, 530 name.suffix, false, false, false); 531 532 case FullNameStyle.JAPANESE: 533 return join(prefix, name.familyName, name.middleName, name.givenNames, 534 name.suffix, true, false, false); 535 536 default: 537 if (givenNameFirst) { 538 return join(prefix, name.givenNames, name.middleName, name.familyName, 539 name.suffix, true, false, true); 540 } else { 541 return join(prefix, name.familyName, name.givenNames, name.middleName, 542 name.suffix, true, true, true); 543 } 544 } 545 } 546 547 /** 548 * Concatenates components of the phonetic name following the CJK tradition: 549 * family name + middle name + given name(s). 550 */ 551 public String joinPhoneticName(Name name) { 552 return join(null, name.phoneticFamilyName, 553 name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false); 554 } 555 556 /** 557 * Concatenates parts of a full name inserting spaces and commas as specified. 558 */ 559 private String join(String prefix, String part1, String part2, String part3, String suffix, 560 boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) { 561 prefix = prefix == null ? null: prefix.trim(); 562 part1 = part1 == null ? null: part1.trim(); 563 part2 = part2 == null ? null: part2.trim(); 564 part3 = part3 == null ? null: part3.trim(); 565 suffix = suffix == null ? null: suffix.trim(); 566 567 boolean hasPrefix = !TextUtils.isEmpty(prefix); 568 boolean hasPart1 = !TextUtils.isEmpty(part1); 569 boolean hasPart2 = !TextUtils.isEmpty(part2); 570 boolean hasPart3 = !TextUtils.isEmpty(part3); 571 boolean hasSuffix = !TextUtils.isEmpty(suffix); 572 573 boolean isSingleWord = true; 574 String singleWord = null; 575 576 if (hasPrefix) { 577 singleWord = prefix; 578 } 579 580 if (hasPart1) { 581 if (singleWord != null) { 582 isSingleWord = false; 583 } else { 584 singleWord = part1; 585 } 586 } 587 588 if (hasPart2) { 589 if (singleWord != null) { 590 isSingleWord = false; 591 } else { 592 singleWord = part2; 593 } 594 } 595 596 if (hasPart3) { 597 if (singleWord != null) { 598 isSingleWord = false; 599 } else { 600 singleWord = part3; 601 } 602 } 603 604 if (hasSuffix) { 605 if (singleWord != null) { 606 isSingleWord = false; 607 } else { 608 singleWord = normalizedSuffix(suffix); 609 } 610 } 611 612 if (isSingleWord) { 613 return singleWord; 614 } 615 616 StringBuilder sb = new StringBuilder(); 617 618 if (hasPrefix) { 619 sb.append(prefix); 620 } 621 622 if (hasPart1) { 623 if (hasPrefix) { 624 sb.append(' '); 625 } 626 sb.append(part1); 627 } 628 629 if (hasPart2) { 630 if (hasPrefix || hasPart1) { 631 if (useCommaAfterPart1) { 632 sb.append(','); 633 } 634 if (useSpace) { 635 sb.append(' '); 636 } 637 } 638 sb.append(part2); 639 } 640 641 if (hasPart3) { 642 if (hasPrefix || hasPart1 || hasPart2) { 643 if (useSpace) { 644 sb.append(' '); 645 } 646 } 647 sb.append(part3); 648 } 649 650 if (hasSuffix) { 651 if (hasPrefix || hasPart1 || hasPart2 || hasPart3) { 652 if (useCommaAfterPart3) { 653 sb.append(','); 654 } 655 if (useSpace) { 656 sb.append(' '); 657 } 658 } 659 sb.append(normalizedSuffix(suffix)); 660 } 661 662 return sb.toString(); 663 } 664 665 /** 666 * Puts a dot after the supplied suffix if that is the accepted form of the suffix, 667 * e.g. "Jr." and "Sr.", but not "I", "II" and "III". 668 */ 669 private String normalizedSuffix(String suffix) { 670 int length = suffix.length(); 671 if (length == 0 || suffix.charAt(length - 1) == '.') { 672 return suffix; 673 } 674 675 String withDot = suffix + '.'; 676 if (mSuffixesSet.contains(withDot.toUpperCase())) { 677 return withDot; 678 } else { 679 return suffix; 680 } 681 } 682 683 /** 684 * If the supplied name style is undefined, returns a default based on the language, 685 * otherwise returns the supplied name style itself. 686 * 687 * @param nameStyle See {@link FullNameStyle}. 688 */ 689 public int getAdjustedFullNameStyle(int nameStyle) { 690 if (nameStyle == FullNameStyle.UNDEFINED) { 691 if (JAPANESE_LANGUAGE.equals(mLanguage)) { 692 return FullNameStyle.JAPANESE; 693 } else if (KOREAN_LANGUAGE.equals(mLanguage)) { 694 return FullNameStyle.KOREAN; 695 } else if (CHINESE_LANGUAGE.equals(mLanguage)) { 696 return FullNameStyle.CHINESE; 697 } else { 698 return FullNameStyle.WESTERN; 699 } 700 } else if (nameStyle == FullNameStyle.CJK) { 701 if (JAPANESE_LANGUAGE.equals(mLanguage)) { 702 return FullNameStyle.JAPANESE; 703 } else if (KOREAN_LANGUAGE.equals(mLanguage)) { 704 return FullNameStyle.KOREAN; 705 } else { 706 return FullNameStyle.CHINESE; 707 } 708 } 709 return nameStyle; 710 } 711 712 /** 713 * Parses the first word from the name if it is a prefix. 714 */ 715 private void parsePrefix(Name name, NameTokenizer tokens) { 716 if (tokens.mStartPointer == tokens.mEndPointer) { 717 return; 718 } 719 720 String firstToken = tokens.mTokens[tokens.mStartPointer]; 721 if (mPrefixesSet.contains(firstToken.toUpperCase())) { 722 if (tokens.hasDot(tokens.mStartPointer)) { 723 firstToken += '.'; 724 } 725 name.prefix = firstToken; 726 tokens.mStartPointer++; 727 } 728 } 729 730 /** 731 * Parses the last word(s) from the name if it is a suffix. 732 */ 733 private void parseSuffix(Name name, NameTokenizer tokens) { 734 if (tokens.mStartPointer == tokens.mEndPointer) { 735 return; 736 } 737 738 String lastToken = tokens.mTokens[tokens.mEndPointer - 1]; 739 740 // Take care of an explicit comma-separated suffix 741 if (tokens.mEndPointer - tokens.mStartPointer > 2 742 && tokens.hasComma(tokens.mEndPointer - 2)) { 743 if (tokens.hasDot(tokens.mEndPointer - 1)) { 744 lastToken += '.'; 745 } 746 name.suffix = lastToken; 747 tokens.mEndPointer--; 748 return; 749 } 750 751 if (lastToken.length() > mMaxSuffixLength) { 752 return; 753 } 754 755 String normalized = lastToken.toUpperCase(); 756 if (mSuffixesSet.contains(normalized)) { 757 name.suffix = lastToken; 758 tokens.mEndPointer--; 759 return; 760 } 761 762 if (tokens.hasDot(tokens.mEndPointer - 1)) { 763 lastToken += '.'; 764 } 765 normalized += "."; 766 767 // Take care of suffixes like M.D. and D.D.S. 768 int pos = tokens.mEndPointer - 1; 769 while (normalized.length() <= mMaxSuffixLength) { 770 771 if (mSuffixesSet.contains(normalized)) { 772 name.suffix = lastToken; 773 tokens.mEndPointer = pos; 774 return; 775 } 776 777 if (pos == tokens.mStartPointer) { 778 break; 779 } 780 781 pos--; 782 if (tokens.hasDot(pos)) { 783 lastToken = tokens.mTokens[pos] + "." + lastToken; 784 } else { 785 lastToken = tokens.mTokens[pos] + " " + lastToken; 786 } 787 788 normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized; 789 } 790 } 791 792 private void parseLastName(Name name, NameTokenizer tokens) { 793 if (tokens.mStartPointer == tokens.mEndPointer) { 794 return; 795 } 796 797 // If the first word is followed by a comma, assume that it's the family name 798 if (tokens.hasComma(tokens.mStartPointer)) { 799 name.familyName = tokens.mTokens[tokens.mStartPointer]; 800 tokens.mStartPointer++; 801 return; 802 } 803 804 // If the second word is followed by a comma and the first word 805 // is a last name prefix as in "de Sade" and "von Cliburn", treat 806 // the first two words as the family name. 807 if (tokens.mStartPointer + 1 < tokens.mEndPointer 808 && tokens.hasComma(tokens.mStartPointer + 1) 809 && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) { 810 String familyNamePrefix = tokens.mTokens[tokens.mStartPointer]; 811 if (tokens.hasDot(tokens.mStartPointer)) { 812 familyNamePrefix += '.'; 813 } 814 name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1]; 815 tokens.mStartPointer += 2; 816 return; 817 } 818 819 // Finally, assume that the last word is the last name 820 name.familyName = tokens.mTokens[tokens.mEndPointer - 1]; 821 tokens.mEndPointer--; 822 823 // Take care of last names like "de Sade" and "von Cliburn" 824 if ((tokens.mEndPointer - tokens.mStartPointer) > 0) { 825 String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1]; 826 if (isFamilyNamePrefix(lastNamePrefix)) { 827 if (tokens.hasDot(tokens.mEndPointer - 1)) { 828 lastNamePrefix += '.'; 829 } 830 name.familyName = lastNamePrefix + " " + name.familyName; 831 tokens.mEndPointer--; 832 } 833 } 834 } 835 836 /** 837 * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de" 838 */ 839 private boolean isFamilyNamePrefix(String word) { 840 final String normalized = word.toUpperCase(); 841 842 return mLastNamePrefixesSet.contains(normalized) 843 || mLastNamePrefixesSet.contains(normalized + "."); 844 } 845 846 847 private void parseMiddleName(Name name, NameTokenizer tokens) { 848 if (tokens.mStartPointer == tokens.mEndPointer) { 849 return; 850 } 851 852 if ((tokens.mEndPointer - tokens.mStartPointer) > 1) { 853 if ((tokens.mEndPointer - tokens.mStartPointer) == 2 854 || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2]. 855 toUpperCase())) { 856 name.middleName = tokens.mTokens[tokens.mEndPointer - 1]; 857 if (tokens.hasDot(tokens.mEndPointer - 1)) { 858 name.middleName += '.'; 859 } 860 tokens.mEndPointer--; 861 } 862 } 863 } 864 865 private void parseGivenNames(Name name, NameTokenizer tokens) { 866 if (tokens.mStartPointer == tokens.mEndPointer) { 867 return; 868 } 869 870 if ((tokens.mEndPointer - tokens.mStartPointer) == 1) { 871 name.givenNames = tokens.mTokens[tokens.mStartPointer]; 872 } else { 873 StringBuilder sb = new StringBuilder(); 874 for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) { 875 if (i != tokens.mStartPointer) { 876 sb.append(' '); 877 } 878 sb.append(tokens.mTokens[i]); 879 if (tokens.hasDot(i)) { 880 sb.append('.'); 881 } 882 } 883 name.givenNames = sb.toString(); 884 } 885 } 886 887 /** 888 * Makes the best guess at the expected full name style based on the character set 889 * used in the supplied name. If the phonetic name is also supplied, tries to 890 * differentiate between Chinese, Japanese and Korean based on the alphabet used 891 * for the phonetic name. 892 */ 893 public void guessNameStyle(Name name) { 894 guessFullNameStyle(name); 895 guessPhoneticNameStyle(name); 896 name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle, 897 name.phoneticNameStyle); 898 } 899 900 /** 901 * Updates the display name style according to the phonetic name style if we 902 * were unsure about display name style based on the name components, but 903 * phonetic name makes it more definitive. 904 */ 905 public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) { 906 if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { 907 if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) { 908 if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) { 909 return FullNameStyle.JAPANESE; 910 } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) { 911 return FullNameStyle.KOREAN; 912 } 913 if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) { 914 return FullNameStyle.CHINESE; 915 } 916 } 917 } 918 return nameStyle; 919 } 920 921 /** 922 * Makes the best guess at the expected full name style based on the character set 923 * used in the supplied name. 924 */ 925 private void guessFullNameStyle(NameSplitter.Name name) { 926 if (name.fullNameStyle != FullNameStyle.UNDEFINED) { 927 return; 928 } 929 930 int bestGuess = guessFullNameStyle(name.givenNames); 931 // A mix of Hanzi and latin chars are common in China, so we have to go through all names 932 // if the name is not JANPANESE or KOREAN. 933 if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK 934 && bestGuess != FullNameStyle.WESTERN) { 935 name.fullNameStyle = bestGuess; 936 return; 937 } 938 939 int guess = guessFullNameStyle(name.familyName); 940 if (guess != FullNameStyle.UNDEFINED) { 941 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 942 name.fullNameStyle = guess; 943 return; 944 } 945 bestGuess = guess; 946 } 947 948 guess = guessFullNameStyle(name.middleName); 949 if (guess != FullNameStyle.UNDEFINED) { 950 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 951 name.fullNameStyle = guess; 952 return; 953 } 954 bestGuess = guess; 955 } 956 957 guess = guessFullNameStyle(name.prefix); 958 if (guess != FullNameStyle.UNDEFINED) { 959 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 960 name.fullNameStyle = guess; 961 return; 962 } 963 bestGuess = guess; 964 } 965 966 guess = guessFullNameStyle(name.suffix); 967 if (guess != FullNameStyle.UNDEFINED) { 968 if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) { 969 name.fullNameStyle = guess; 970 return; 971 } 972 bestGuess = guess; 973 } 974 975 name.fullNameStyle = bestGuess; 976 } 977 978 public int guessFullNameStyle(String name) { 979 if (name == null) { 980 return FullNameStyle.UNDEFINED; 981 } 982 983 int nameStyle = FullNameStyle.UNDEFINED; 984 int length = name.length(); 985 int offset = 0; 986 while (offset < length) { 987 int codePoint = Character.codePointAt(name, offset); 988 if (Character.isLetter(codePoint)) { 989 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 990 991 if (!isLatinUnicodeBlock(unicodeBlock)) { 992 993 if (isCJKUnicodeBlock(unicodeBlock)) { 994 // We don't know if this is Chinese, Japanese or Korean - 995 // trying to figure out by looking at other characters in the name 996 return guessCJKNameStyle(name, offset + Character.charCount(codePoint)); 997 } 998 999 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 1000 return FullNameStyle.JAPANESE; 1001 } 1002 1003 if (isKoreanUnicodeBlock(unicodeBlock)) { 1004 return FullNameStyle.KOREAN; 1005 } 1006 } 1007 nameStyle = FullNameStyle.WESTERN; 1008 } 1009 offset += Character.charCount(codePoint); 1010 } 1011 return nameStyle; 1012 } 1013 1014 private int guessCJKNameStyle(String name, int offset) { 1015 int length = name.length(); 1016 while (offset < length) { 1017 int codePoint = Character.codePointAt(name, offset); 1018 if (Character.isLetter(codePoint)) { 1019 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 1020 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 1021 return FullNameStyle.JAPANESE; 1022 } 1023 if (isKoreanUnicodeBlock(unicodeBlock)) { 1024 return FullNameStyle.KOREAN; 1025 } 1026 } 1027 offset += Character.charCount(codePoint); 1028 } 1029 1030 return FullNameStyle.CJK; 1031 } 1032 1033 private void guessPhoneticNameStyle(NameSplitter.Name name) { 1034 if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) { 1035 return; 1036 } 1037 1038 int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName); 1039 if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) { 1040 name.phoneticNameStyle = bestGuess; 1041 return; 1042 } 1043 1044 int guess = guessPhoneticNameStyle(name.phoneticGivenName); 1045 if (guess != FullNameStyle.UNDEFINED) { 1046 if (guess != FullNameStyle.CJK) { 1047 name.phoneticNameStyle = guess; 1048 return; 1049 } 1050 bestGuess = guess; 1051 } 1052 1053 guess = guessPhoneticNameStyle(name.phoneticMiddleName); 1054 if (guess != FullNameStyle.UNDEFINED) { 1055 if (guess != FullNameStyle.CJK) { 1056 name.phoneticNameStyle = guess; 1057 return; 1058 } 1059 bestGuess = guess; 1060 } 1061 } 1062 1063 public int guessPhoneticNameStyle(String name) { 1064 if (name == null) { 1065 return PhoneticNameStyle.UNDEFINED; 1066 } 1067 1068 int nameStyle = PhoneticNameStyle.UNDEFINED; 1069 int length = name.length(); 1070 int offset = 0; 1071 while (offset < length) { 1072 int codePoint = Character.codePointAt(name, offset); 1073 if (Character.isLetter(codePoint)) { 1074 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); 1075 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { 1076 return PhoneticNameStyle.JAPANESE; 1077 } 1078 if (isKoreanUnicodeBlock(unicodeBlock)) { 1079 return PhoneticNameStyle.KOREAN; 1080 } 1081 if (isLatinUnicodeBlock(unicodeBlock)) { 1082 return PhoneticNameStyle.PINYIN; 1083 } 1084 } 1085 offset += Character.charCount(codePoint); 1086 } 1087 1088 return nameStyle; 1089 } 1090 1091 private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) { 1092 return unicodeBlock == UnicodeBlock.BASIC_LATIN || 1093 unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT || 1094 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A || 1095 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B || 1096 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL; 1097 } 1098 1099 private static boolean isCJKUnicodeBlock(UnicodeBlock block) { 1100 return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 1101 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1102 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1103 || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION 1104 || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT 1105 || block == UnicodeBlock.CJK_COMPATIBILITY 1106 || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS 1107 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 1108 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT; 1109 } 1110 1111 private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) { 1112 return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES || 1113 unicodeBlock == UnicodeBlock.HANGUL_JAMO || 1114 unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO; 1115 } 1116 1117 private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) { 1118 return unicodeBlock == UnicodeBlock.KATAKANA || 1119 unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS || 1120 unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS || 1121 unicodeBlock == UnicodeBlock.HIRAGANA; 1122 } 1123 } 1124