Home | History | Annotate | Download | only in contacts
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts;
     17 
     18 import android.content.ContentValues;
     19 import android.provider.ContactsContract.CommonDataKinds.StructuredName;
     20 import android.provider.ContactsContract.FullNameStyle;
     21 import android.provider.ContactsContract.PhoneticNameStyle;
     22 import android.text.TextUtils;
     23 import android.util.ArraySet;
     24 
     25 import com.android.providers.contacts.util.NeededForTesting;
     26 
     27 import java.lang.Character.UnicodeBlock;
     28 import java.util.Locale;
     29 import java.util.StringTokenizer;
     30 
     31 /**
     32  * The purpose of this class is to split a full name into given names and last
     33  * name. The logic only supports having a single last name. If the full name has
     34  * multiple last names the output will be incorrect.
     35  * <p>
     36  * Core algorithm:
     37  * <ol>
     38  * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
     39  * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
     40  * <li>Assign the last remaining token as the last name.</li>
     41  * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
     42  * this word also as the last name.</li>
     43  * <li>Assign the rest of the words as the "given names".</li>
     44  * </ol>
     45  */
     46 public class NameSplitter {
     47 
     48     public static final int MAX_TOKENS = 10;
     49 
     50     private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase();
     51     private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase();
     52 
     53     // This includes simplified and traditional Chinese
     54     private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase();
     55 
     56     private final ArraySet<String> mPrefixesSet;
     57     private final ArraySet<String> mSuffixesSet;
     58     private final int mMaxSuffixLength;
     59     private final ArraySet<String> mLastNamePrefixesSet;
     60     private final ArraySet<String> mConjuctions;
     61     private final Locale mLocale;
     62     private final String mLanguage;
     63 
     64     /**
     * Two-character-long Korean family names.
     66      * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
     67      */
     68     private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = {
     69         "\uAC15\uC804", // Gang Jeon
     70         "\uB0A8\uAD81", // Nam Goong
     71         "\uB3C5\uACE0", // Dok Go
     72         "\uB3D9\uBC29", // Dong Bang
     73         "\uB9DD\uC808", // Mang Jeol
     74         "\uC0AC\uACF5", // Sa Gong
     75         "\uC11C\uBB38", // Seo Moon
     76         "\uC120\uC6B0", // Seon Woo
     77         "\uC18C\uBD09", // So Bong
     78         "\uC5B4\uAE08", // Uh Geum
     79         "\uC7A5\uACE1", // Jang Gok
     80         "\uC81C\uAC08", // Je Gal
     81         "\uD669\uBCF4"  // Hwang Bo
     82     };
     83 
     84     public static class Name {
     85         public String prefix;
     86         public String givenNames;
     87         public String middleName;
     88         public String familyName;
     89         public String suffix;
     90 
     91         public int fullNameStyle;
     92 
     93         public String phoneticFamilyName;
     94         public String phoneticMiddleName;
     95         public String phoneticGivenName;
     96 
     97         public int phoneticNameStyle;
     98 
     99         public Name() {
    100         }
    101 
    102         public Name(String prefix, String givenNames, String middleName, String familyName,
    103                 String suffix) {
    104             this.prefix = prefix;
    105             this.givenNames = givenNames;
    106             this.middleName = middleName;
    107             this.familyName = familyName;
    108             this.suffix = suffix;
    109         }
    110 
    111         @NeededForTesting
    112         public String getPrefix() {
    113             return prefix;
    114         }
    115 
    116         public String getGivenNames() {
    117             return givenNames;
    118         }
    119 
    120         public String getMiddleName() {
    121             return middleName;
    122         }
    123 
    124         public String getFamilyName() {
    125             return familyName;
    126         }
    127 
    128         @NeededForTesting
    129         public String getSuffix() {
    130             return suffix;
    131         }
    132 
    133         public int getFullNameStyle() {
    134             return fullNameStyle;
    135         }
    136 
    137         public String getPhoneticFamilyName() {
    138             return phoneticFamilyName;
    139         }
    140 
    141         public String getPhoneticMiddleName() {
    142             return phoneticMiddleName;
    143         }
    144 
    145         public String getPhoneticGivenName() {
    146             return phoneticGivenName;
    147         }
    148 
    149         public int getPhoneticNameStyle() {
    150             return phoneticNameStyle;
    151         }
    152 
    153         public void fromValues(ContentValues values) {
    154             prefix = values.getAsString(StructuredName.PREFIX);
    155             givenNames = values.getAsString(StructuredName.GIVEN_NAME);
    156             middleName = values.getAsString(StructuredName.MIDDLE_NAME);
    157             familyName = values.getAsString(StructuredName.FAMILY_NAME);
    158             suffix = values.getAsString(StructuredName.SUFFIX);
    159 
    160             Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
    161             fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;
    162 
    163             phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
    164             phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
    165             phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);
    166 
    167             integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
    168             phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
    169         }
    170 
    171         public void toValues(ContentValues values) {
    172             putValueIfPresent(values, StructuredName.PREFIX, prefix);
    173             putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
    174             putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
    175             putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
    176             putValueIfPresent(values, StructuredName.SUFFIX, suffix);
    177             values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
    178             putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
    179             putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
    180             putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
    181             values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
    182         }
    183 
    184         private void putValueIfPresent(ContentValues values, String name, String value) {
    185             if (value != null) {
    186                 values.put(name, value);
    187             }
    188         }
    189 
    190         public void clear() {
    191             prefix = null;
    192             givenNames = null;
    193             middleName = null;
    194             familyName = null;
    195             suffix = null;
    196             fullNameStyle = FullNameStyle.UNDEFINED;
    197             phoneticFamilyName = null;
    198             phoneticMiddleName = null;
    199             phoneticGivenName = null;
    200             phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
    201         }
    202 
    203         public boolean isEmpty() {
    204             return TextUtils.isEmpty(givenNames)
    205                     && TextUtils.isEmpty(middleName)
    206                     && TextUtils.isEmpty(familyName)
    207                     && TextUtils.isEmpty(suffix)
    208                     && TextUtils.isEmpty(phoneticFamilyName)
    209                     && TextUtils.isEmpty(phoneticMiddleName)
    210                     && TextUtils.isEmpty(phoneticGivenName);
    211         }
    212 
    213         @Override
    214         public String toString() {
    215             return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
    216                     + " family: " + familyName + " suffix: " + suffix + " ph/given: "
    217                     + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
    218                     + phoneticFamilyName + "]";
    219         }
    220     }
    221 
    222     private static class NameTokenizer extends StringTokenizer {
    223         private final String[] mTokens;
    224         private int mDotBitmask;
    225         private int mCommaBitmask;
    226         private int mStartPointer;
    227         private int mEndPointer;
    228 
    229         public NameTokenizer(String fullName) {
    230             super(fullName, " .,", true);
    231 
    232             mTokens = new String[MAX_TOKENS];
    233 
    234             // Iterate over tokens, skipping over empty ones and marking tokens that
    235             // are followed by dots.
    236             while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
    237                 final String token = nextToken();
    238                 if (token.length() > 0) {
    239                     final char c = token.charAt(0);
    240                     if (c == ' ') {
    241                         continue;
    242                     }
    243                 }
    244 
    245                 if (mEndPointer > 0 && token.charAt(0) == '.') {
    246                     mDotBitmask |= (1 << (mEndPointer - 1));
    247                 } else if (mEndPointer > 0 && token.charAt(0) == ',') {
    248                     mCommaBitmask |= (1 << (mEndPointer - 1));
    249                 } else {
    250                     mTokens[mEndPointer] = token;
    251                     mEndPointer++;
    252                 }
    253             }
    254         }
    255 
    256         /**
    257          * Returns true if the token is followed by a dot in the original full name.
    258          */
    259         public boolean hasDot(int index) {
    260             return (mDotBitmask & (1 << index)) != 0;
    261         }
    262 
    263         /**
    264          * Returns true if the token is followed by a comma in the original full name.
    265          */
    266         public boolean hasComma(int index) {
    267             return (mCommaBitmask & (1 << index)) != 0;
    268         }
    269     }
    270 
    271     /**
    272      * Constructor.
    273      *
    274      * @param commonPrefixes comma-separated list of common prefixes,
    275      *            e.g. "Mr, Ms, Mrs"
    276      * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
    277      *            e.g. "d', st, st., von"
    278      * @param commonSuffixes comma-separated list of common suffixes,
    279      *            e.g. "Jr, M.D., MD, D.D.S."
    280      * @param commonConjunctions comma-separated list of common conjuctions,
    281      *            e.g. "AND, Or"
    282      */
    283     public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
    284             String commonSuffixes, String commonConjunctions, Locale locale) {
    285         // TODO: refactor this to use <string-array> resources
    286         mPrefixesSet = convertToSet(commonPrefixes);
    287         mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
    288         mSuffixesSet = convertToSet(commonSuffixes);
    289         mConjuctions = convertToSet(commonConjunctions);
    290         mLocale = locale != null ? locale : Locale.getDefault();
    291         mLanguage = mLocale.getLanguage().toLowerCase();
    292 
    293         int maxLength = 0;
    294         for (String suffix : mSuffixesSet) {
    295             if (suffix.length() > maxLength) {
    296                 maxLength = suffix.length();
    297             }
    298         }
    299 
    300         mMaxSuffixLength = maxLength;
    301     }
    302 
    303     /**
    304      * Converts a comma-separated list of Strings to a set of Strings. Trims strings
    305      * and converts them to upper case.
    306      */
    307     private static ArraySet<String> convertToSet(String strings) {
    308         ArraySet<String> set = new ArraySet<>();
    309         if (strings != null) {
    310             String[] split = strings.split(",");
    311             for (int i = 0; i < split.length; i++) {
    312                 set.add(split[i].trim().toUpperCase());
    313             }
    314         }
    315         return set;
    316     }
    317 
    318     /**
    319      * Parses a full name and returns components as a list of tokens.
    320      */
    321     public int tokenize(String[] tokens, String fullName) {
    322         if (fullName == null) {
    323             return 0;
    324         }
    325 
    326         NameTokenizer tokenizer = new NameTokenizer(fullName);
    327 
    328         if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
    329             return 0;
    330         }
    331 
    332         String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
    333         int count = 0;
    334         for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
    335             tokens[count++] = tokenizer.mTokens[i];
    336         }
    337 
    338         return count;
    339     }
    340 
    341 
    342     /**
    343      * Parses a full name and returns parsed components in the Name object.
    344      */
    345     public void split(Name name, String fullName) {
    346         if (fullName == null) {
    347             return;
    348         }
    349 
    350         int fullNameStyle = guessFullNameStyle(fullName);
    351         if (fullNameStyle == FullNameStyle.CJK) {
    352             fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
    353         }
    354 
    355         split(name, fullName, fullNameStyle);
    356     }
    357 
    358     /**
    359      * Parses a full name and returns parsed components in the Name object
    360      * with a given fullNameStyle.
    361      */
    362     public void split(Name name, String fullName, int fullNameStyle) {
    363         if (fullName == null) {
    364             return;
    365         }
    366 
    367         name.fullNameStyle = fullNameStyle;
    368 
    369         switch (fullNameStyle) {
    370             case FullNameStyle.CHINESE:
    371                 splitChineseName(name, fullName);
    372                 break;
    373 
    374             case FullNameStyle.JAPANESE:
    375                 splitJapaneseName(name, fullName);
    376                 break;
    377 
    378             case FullNameStyle.KOREAN:
    379                 splitKoreanName(name, fullName);
    380                 break;
    381 
    382             default:
    383                 splitWesternName(name, fullName);
    384         }
    385     }
    386 
    387     /**
    388      * Splits a full name composed according to the Western tradition:
    389      * <pre>
    390      *   [prefix] given name(s) [[middle name] family name] [, suffix]
    391      *   [prefix] family name, given name [middle name] [,suffix]
    392      * </pre>
    393      */
    394     private void splitWesternName(Name name, String fullName) {
    395         NameTokenizer tokens = new NameTokenizer(fullName);
    396         parsePrefix(name, tokens);
    397 
    398         // If the name consists of just one or two tokens, treat them as first/last name,
    399         // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
    400         if (tokens.mEndPointer > 2) {
    401             parseSuffix(name, tokens);
    402         }
    403 
    404         if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
    405             name.givenNames = tokens.mTokens[tokens.mStartPointer];
    406         } else {
    407             parseLastName(name, tokens);
    408             parseMiddleName(name, tokens);
    409             parseGivenNames(name, tokens);
    410         }
    411     }
    412 
    413     /**
    414      * Splits a full name composed according to the Chinese tradition:
    415      * <pre>
    416      *   [family name [middle name]] given name
    417      * </pre>
    418      */
    419     private void splitChineseName(Name name, String fullName) {
    420         StringTokenizer tokenizer = new StringTokenizer(fullName);
    421         while (tokenizer.hasMoreTokens()) {
    422             String token = tokenizer.nextToken();
    423             if (name.givenNames == null) {
    424                 name.givenNames = token;
    425             } else if (name.familyName == null) {
    426                 name.familyName = name.givenNames;
    427                 name.givenNames = token;
    428             } else if (name.middleName == null) {
    429                 name.middleName = name.givenNames;
    430                 name.givenNames = token;
    431             } else {
    432                 name.middleName = name.middleName + name.givenNames;
    433                 name.givenNames = token;
    434             }
    435         }
    436 
    437         // If a single word parse that word up.
    438         if (name.givenNames != null && name.familyName == null && name.middleName == null) {
    439             int length = fullName.length();
    440             if (length == 2) {
    441                 name.familyName = fullName.substring(0, 1);
    442                 name.givenNames = fullName.substring(1);
    443             } else if (length == 3) {
    444                 name.familyName = fullName.substring(0, 1);
    445                 name.middleName = fullName.substring(1, 2);
    446                 name.givenNames = fullName.substring(2);
    447             } else if (length == 4) {
    448                 name.familyName = fullName.substring(0, 2);
    449                 name.middleName = fullName.substring(2, 3);
    450                 name.givenNames = fullName.substring(3);
    451             }
    452 
    453         }
    454     }
    455 
    456     /**
    457      * Splits a full name composed according to the Japanese tradition:
    458      * <pre>
    459      *   [family name] given name(s)
    460      * </pre>
    461      */
    462     private void splitJapaneseName(Name name, String fullName) {
    463         StringTokenizer tokenizer = new StringTokenizer(fullName);
    464         while (tokenizer.hasMoreTokens()) {
    465             String token = tokenizer.nextToken();
    466             if (name.givenNames == null) {
    467                 name.givenNames = token;
    468             } else if (name.familyName == null) {
    469                 name.familyName = name.givenNames;
    470                 name.givenNames = token;
    471             } else {
    472                 name.givenNames += " " + token;
    473             }
    474         }
    475     }
    476 
    477     /**
    478      * Splits a full name composed according to the Korean tradition:
    479      * <pre>
    480      *   [family name] given name(s)
    481      * </pre>
    482      */
    483     private void splitKoreanName(Name name, String fullName) {
    484         StringTokenizer tokenizer = new StringTokenizer(fullName);
    485         if (tokenizer.countTokens() > 1) {
    486             // Each name can be identified by separators.
    487             while (tokenizer.hasMoreTokens()) {
    488                 String token = tokenizer.nextToken();
    489                 if (name.givenNames == null) {
    490                     name.givenNames = token;
    491                 } else if (name.familyName == null) {
    492                     name.familyName = name.givenNames;
    493                     name.givenNames = token;
    494                 } else {
    495                     name.givenNames += " " + token;
    496                 }
    497             }
    498         } else {
    499             // There is no separator. Try to guess family name.
    500             // The length of most family names is 1.
    501             int familyNameLength = 1;
    502 
    503             // Compare with 2-length family names.
    504             for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) {
    505                 if (fullName.startsWith(twoLengthFamilyName)) {
    506                     familyNameLength = 2;
    507                     break;
    508                 }
    509             }
    510 
    511             name.familyName = fullName.substring(0, familyNameLength);
    512             if (fullName.length() > familyNameLength) {
    513                 name.givenNames = fullName.substring(familyNameLength);
    514             }
    515         }
    516     }
    517 
    518     /**
    519      * Concatenates components of a name according to the rules dictated by the name style.
    520      *
    521      * @param givenNameFirst is ignored for CJK display name styles
    522      */
    523     public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
    524         String prefix = includePrefix ? name.prefix : null;
    525         switch (name.fullNameStyle) {
    526             case FullNameStyle.CJK:
    527             case FullNameStyle.CHINESE:
    528             case FullNameStyle.KOREAN:
    529                 return join(prefix, name.familyName, name.middleName, name.givenNames,
    530                         name.suffix, false, false, false);
    531 
    532             case FullNameStyle.JAPANESE:
    533                 return join(prefix, name.familyName, name.middleName, name.givenNames,
    534                         name.suffix, true, false, false);
    535 
    536             default:
    537                 if (givenNameFirst) {
    538                     return join(prefix, name.givenNames, name.middleName, name.familyName,
    539                             name.suffix, true, false, true);
    540                 } else {
    541                     return join(prefix, name.familyName, name.givenNames, name.middleName,
    542                             name.suffix, true, true, true);
    543                 }
    544         }
    545     }
    546 
    547     /**
    548      * Concatenates components of the phonetic name following the CJK tradition:
    549      * family name + middle name + given name(s).
    550      */
    551     public String joinPhoneticName(Name name) {
    552         return join(null, name.phoneticFamilyName,
    553                 name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
    554     }
    555 
    556     /**
    557      * Concatenates parts of a full name inserting spaces and commas as specified.
    558      */
    559     private String join(String prefix, String part1, String part2, String part3, String suffix,
    560             boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
    561         prefix = prefix == null ? null: prefix.trim();
    562         part1 = part1 == null ? null: part1.trim();
    563         part2 = part2 == null ? null: part2.trim();
    564         part3 = part3 == null ? null: part3.trim();
    565         suffix = suffix == null ? null: suffix.trim();
    566 
    567         boolean hasPrefix = !TextUtils.isEmpty(prefix);
    568         boolean hasPart1 = !TextUtils.isEmpty(part1);
    569         boolean hasPart2 = !TextUtils.isEmpty(part2);
    570         boolean hasPart3 = !TextUtils.isEmpty(part3);
    571         boolean hasSuffix = !TextUtils.isEmpty(suffix);
    572 
    573         boolean isSingleWord = true;
    574         String singleWord = null;
    575 
    576         if (hasPrefix) {
    577             singleWord = prefix;
    578         }
    579 
    580         if (hasPart1) {
    581             if (singleWord != null) {
    582                 isSingleWord = false;
    583             } else {
    584                 singleWord = part1;
    585             }
    586         }
    587 
    588         if (hasPart2) {
    589             if (singleWord != null) {
    590                 isSingleWord = false;
    591             } else {
    592                 singleWord = part2;
    593             }
    594         }
    595 
    596         if (hasPart3) {
    597             if (singleWord != null) {
    598                 isSingleWord = false;
    599             } else {
    600                 singleWord = part3;
    601             }
    602         }
    603 
    604         if (hasSuffix) {
    605             if (singleWord != null) {
    606                 isSingleWord = false;
    607             } else {
    608                 singleWord = normalizedSuffix(suffix);
    609             }
    610         }
    611 
    612         if (isSingleWord) {
    613             return singleWord;
    614         }
    615 
    616         StringBuilder sb = new StringBuilder();
    617 
    618         if (hasPrefix) {
    619             sb.append(prefix);
    620         }
    621 
    622         if (hasPart1) {
    623             if (hasPrefix) {
    624                 sb.append(' ');
    625             }
    626             sb.append(part1);
    627         }
    628 
    629         if (hasPart2) {
    630             if (hasPrefix || hasPart1) {
    631                 if (useCommaAfterPart1) {
    632                     sb.append(',');
    633                 }
    634                 if (useSpace) {
    635                     sb.append(' ');
    636                 }
    637             }
    638             sb.append(part2);
    639         }
    640 
    641         if (hasPart3) {
    642             if (hasPrefix || hasPart1 || hasPart2) {
    643                 if (useSpace) {
    644                     sb.append(' ');
    645                 }
    646             }
    647             sb.append(part3);
    648         }
    649 
    650         if (hasSuffix) {
    651             if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
    652                 if (useCommaAfterPart3) {
    653                     sb.append(',');
    654                 }
    655                 if (useSpace) {
    656                     sb.append(' ');
    657                 }
    658             }
    659             sb.append(normalizedSuffix(suffix));
    660         }
    661 
    662         return sb.toString();
    663     }
    664 
    665     /**
    666      * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
    667      * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
    668      */
    669     private String normalizedSuffix(String suffix) {
    670         int length = suffix.length();
    671         if (length == 0 || suffix.charAt(length - 1) == '.') {
    672             return suffix;
    673         }
    674 
    675         String withDot = suffix + '.';
    676         if (mSuffixesSet.contains(withDot.toUpperCase())) {
    677             return withDot;
    678         } else {
    679             return suffix;
    680         }
    681     }
    682 
    683     /**
    684      * If the supplied name style is undefined, returns a default based on the language,
    685      * otherwise returns the supplied name style itself.
    686      *
    687      * @param nameStyle See {@link FullNameStyle}.
    688      */
    689     public int getAdjustedFullNameStyle(int nameStyle) {
    690         if (nameStyle == FullNameStyle.UNDEFINED) {
    691             if (JAPANESE_LANGUAGE.equals(mLanguage)) {
    692                 return FullNameStyle.JAPANESE;
    693             } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
    694                 return FullNameStyle.KOREAN;
    695             } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
    696                 return FullNameStyle.CHINESE;
    697             } else {
    698                 return FullNameStyle.WESTERN;
    699             }
    700         } else if (nameStyle == FullNameStyle.CJK) {
    701             if (JAPANESE_LANGUAGE.equals(mLanguage)) {
    702                 return FullNameStyle.JAPANESE;
    703             } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
    704                 return FullNameStyle.KOREAN;
    705             } else {
    706                 return FullNameStyle.CHINESE;
    707             }
    708         }
    709         return nameStyle;
    710     }
    711 
    712     /**
    713      * Parses the first word from the name if it is a prefix.
    714      */
    715     private void parsePrefix(Name name, NameTokenizer tokens) {
    716         if (tokens.mStartPointer == tokens.mEndPointer) {
    717             return;
    718         }
    719 
    720         String firstToken = tokens.mTokens[tokens.mStartPointer];
    721         if (mPrefixesSet.contains(firstToken.toUpperCase())) {
    722             if (tokens.hasDot(tokens.mStartPointer)) {
    723                 firstToken += '.';
    724             }
    725             name.prefix = firstToken;
    726             tokens.mStartPointer++;
    727         }
    728     }
    729 
    730     /**
    731      * Parses the last word(s) from the name if it is a suffix.
    732      */
    733     private void parseSuffix(Name name, NameTokenizer tokens) {
    734         if (tokens.mStartPointer == tokens.mEndPointer) {
    735             return;
    736         }
    737 
    738         String lastToken = tokens.mTokens[tokens.mEndPointer - 1];
    739 
    740         // Take care of an explicit comma-separated suffix
    741         if (tokens.mEndPointer - tokens.mStartPointer > 2
    742                 && tokens.hasComma(tokens.mEndPointer - 2)) {
    743             if (tokens.hasDot(tokens.mEndPointer - 1)) {
    744                 lastToken += '.';
    745             }
    746             name.suffix = lastToken;
    747             tokens.mEndPointer--;
    748             return;
    749         }
    750 
    751         if (lastToken.length() > mMaxSuffixLength) {
    752             return;
    753         }
    754 
    755         String normalized = lastToken.toUpperCase();
    756         if (mSuffixesSet.contains(normalized)) {
    757             name.suffix = lastToken;
    758             tokens.mEndPointer--;
    759             return;
    760         }
    761 
    762         if (tokens.hasDot(tokens.mEndPointer - 1)) {
    763             lastToken += '.';
    764         }
    765         normalized += ".";
    766 
    767         // Take care of suffixes like M.D. and D.D.S.
    768         int pos = tokens.mEndPointer - 1;
    769         while (normalized.length() <= mMaxSuffixLength) {
    770 
    771             if (mSuffixesSet.contains(normalized)) {
    772                 name.suffix = lastToken;
    773                 tokens.mEndPointer = pos;
    774                 return;
    775             }
    776 
    777             if (pos == tokens.mStartPointer) {
    778                 break;
    779             }
    780 
    781             pos--;
    782             if (tokens.hasDot(pos)) {
    783                 lastToken = tokens.mTokens[pos] + "." + lastToken;
    784             } else {
    785                 lastToken = tokens.mTokens[pos] + " " + lastToken;
    786             }
    787 
    788             normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
    789         }
    790     }
    791 
    792     private void parseLastName(Name name, NameTokenizer tokens) {
    793         if (tokens.mStartPointer == tokens.mEndPointer) {
    794             return;
    795         }
    796 
    797         // If the first word is followed by a comma, assume that it's the family name
    798         if (tokens.hasComma(tokens.mStartPointer)) {
    799            name.familyName = tokens.mTokens[tokens.mStartPointer];
    800            tokens.mStartPointer++;
    801            return;
    802         }
    803 
    804         // If the second word is followed by a comma and the first word
    805         // is a last name prefix as in "de Sade" and "von Cliburn", treat
    806         // the first two words as the family name.
    807         if (tokens.mStartPointer + 1 < tokens.mEndPointer
    808                 && tokens.hasComma(tokens.mStartPointer + 1)
    809                 && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
    810             String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
    811             if (tokens.hasDot(tokens.mStartPointer)) {
    812                 familyNamePrefix += '.';
    813             }
    814             name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
    815             tokens.mStartPointer += 2;
    816             return;
    817         }
    818 
    819         // Finally, assume that the last word is the last name
    820         name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
    821         tokens.mEndPointer--;
    822 
    823         // Take care of last names like "de Sade" and "von Cliburn"
    824         if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
    825             String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
    826             if (isFamilyNamePrefix(lastNamePrefix)) {
    827                 if (tokens.hasDot(tokens.mEndPointer - 1)) {
    828                     lastNamePrefix += '.';
    829                 }
    830                 name.familyName = lastNamePrefix + " " + name.familyName;
    831                 tokens.mEndPointer--;
    832             }
    833         }
    834     }
    835 
    836     /**
    837      * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
    838      */
    839     private boolean isFamilyNamePrefix(String word) {
    840         final String normalized = word.toUpperCase();
    841 
    842         return mLastNamePrefixesSet.contains(normalized)
    843                 || mLastNamePrefixesSet.contains(normalized + ".");
    844     }
    845 
    846 
    847     private void parseMiddleName(Name name, NameTokenizer tokens) {
    848         if (tokens.mStartPointer == tokens.mEndPointer) {
    849             return;
    850         }
    851 
    852         if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
    853             if ((tokens.mEndPointer - tokens.mStartPointer) == 2
    854                     || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
    855                             toUpperCase())) {
    856                 name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
    857                 if (tokens.hasDot(tokens.mEndPointer - 1)) {
    858                     name.middleName += '.';
    859                 }
    860                 tokens.mEndPointer--;
    861             }
    862         }
    863     }
    864 
    865     private void parseGivenNames(Name name, NameTokenizer tokens) {
    866         if (tokens.mStartPointer == tokens.mEndPointer) {
    867             return;
    868         }
    869 
    870         if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
    871             name.givenNames = tokens.mTokens[tokens.mStartPointer];
    872         } else {
    873             StringBuilder sb = new StringBuilder();
    874             for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
    875                 if (i != tokens.mStartPointer) {
    876                     sb.append(' ');
    877                 }
    878                 sb.append(tokens.mTokens[i]);
    879                 if (tokens.hasDot(i)) {
    880                     sb.append('.');
    881                 }
    882             }
    883             name.givenNames = sb.toString();
    884         }
    885     }
    886 
    887     /**
    888      * Makes the best guess at the expected full name style based on the character set
    889      * used in the supplied name.  If the phonetic name is also supplied, tries to
    890      * differentiate between Chinese, Japanese and Korean based on the alphabet used
    891      * for the phonetic name.
    892      */
    893     public void guessNameStyle(Name name) {
    894         guessFullNameStyle(name);
    895         guessPhoneticNameStyle(name);
    896         name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
    897                 name.phoneticNameStyle);
    898     }
    899 
    900     /**
    901      * Updates the display name style according to the phonetic name style if we
    902      * were unsure about display name style based on the name components, but
    903      * phonetic name makes it more definitive.
    904      */
    905     public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
    906         if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
    907             if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
    908                 if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
    909                     return FullNameStyle.JAPANESE;
    910                 } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
    911                     return FullNameStyle.KOREAN;
    912                 }
    913                 if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
    914                     return FullNameStyle.CHINESE;
    915                 }
    916             }
    917         }
    918         return nameStyle;
    919     }
    920 
    921     /**
    922      * Makes the best guess at the expected full name style based on the character set
    923      * used in the supplied name.
    924      */
    925     private void guessFullNameStyle(NameSplitter.Name name) {
    926         if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
    927             return;
    928         }
    929 
    930         int bestGuess = guessFullNameStyle(name.givenNames);
    931         // A mix of Hanzi and latin chars are common in China, so we have to go through all names
    932         // if the name is not JANPANESE or KOREAN.
    933         if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
    934                 && bestGuess != FullNameStyle.WESTERN) {
    935             name.fullNameStyle = bestGuess;
    936             return;
    937         }
    938 
    939         int guess = guessFullNameStyle(name.familyName);
    940         if (guess != FullNameStyle.UNDEFINED) {
    941             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
    942                 name.fullNameStyle = guess;
    943                 return;
    944             }
    945             bestGuess = guess;
    946         }
    947 
    948         guess = guessFullNameStyle(name.middleName);
    949         if (guess != FullNameStyle.UNDEFINED) {
    950             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
    951                 name.fullNameStyle = guess;
    952                 return;
    953             }
    954             bestGuess = guess;
    955         }
    956 
    957         guess = guessFullNameStyle(name.prefix);
    958         if (guess != FullNameStyle.UNDEFINED) {
    959             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
    960                 name.fullNameStyle = guess;
    961                 return;
    962             }
    963             bestGuess = guess;
    964         }
    965 
    966         guess = guessFullNameStyle(name.suffix);
    967         if (guess != FullNameStyle.UNDEFINED) {
    968             if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
    969                 name.fullNameStyle = guess;
    970                 return;
    971             }
    972             bestGuess = guess;
    973         }
    974 
    975         name.fullNameStyle = bestGuess;
    976     }
    977 
    978     public int guessFullNameStyle(String name) {
    979         if (name == null) {
    980             return FullNameStyle.UNDEFINED;
    981         }
    982 
    983         int nameStyle = FullNameStyle.UNDEFINED;
    984         int length = name.length();
    985         int offset = 0;
    986         while (offset < length) {
    987             int codePoint = Character.codePointAt(name, offset);
    988             if (Character.isLetter(codePoint)) {
    989                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
    990 
    991                 if (!isLatinUnicodeBlock(unicodeBlock)) {
    992 
    993                     if (isCJKUnicodeBlock(unicodeBlock)) {
    994                         // We don't know if this is Chinese, Japanese or Korean -
    995                         // trying to figure out by looking at other characters in the name
    996                         return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
    997                     }
    998 
    999                     if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
   1000                         return FullNameStyle.JAPANESE;
   1001                     }
   1002 
   1003                     if (isKoreanUnicodeBlock(unicodeBlock)) {
   1004                         return FullNameStyle.KOREAN;
   1005                     }
   1006                 }
   1007                 nameStyle = FullNameStyle.WESTERN;
   1008             }
   1009             offset += Character.charCount(codePoint);
   1010         }
   1011         return nameStyle;
   1012     }
   1013 
   1014     private int guessCJKNameStyle(String name, int offset) {
   1015         int length = name.length();
   1016         while (offset < length) {
   1017             int codePoint = Character.codePointAt(name, offset);
   1018             if (Character.isLetter(codePoint)) {
   1019                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
   1020                 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
   1021                     return FullNameStyle.JAPANESE;
   1022                 }
   1023                 if (isKoreanUnicodeBlock(unicodeBlock)) {
   1024                     return FullNameStyle.KOREAN;
   1025                 }
   1026             }
   1027             offset += Character.charCount(codePoint);
   1028         }
   1029 
   1030         return FullNameStyle.CJK;
   1031     }
   1032 
   1033     private void guessPhoneticNameStyle(NameSplitter.Name name) {
   1034         if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
   1035             return;
   1036         }
   1037 
   1038         int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName);
   1039         if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) {
   1040             name.phoneticNameStyle = bestGuess;
   1041             return;
   1042         }
   1043 
   1044         int guess = guessPhoneticNameStyle(name.phoneticGivenName);
   1045         if (guess != FullNameStyle.UNDEFINED) {
   1046             if (guess != FullNameStyle.CJK) {
   1047                 name.phoneticNameStyle = guess;
   1048                 return;
   1049             }
   1050             bestGuess = guess;
   1051         }
   1052 
   1053         guess = guessPhoneticNameStyle(name.phoneticMiddleName);
   1054         if (guess != FullNameStyle.UNDEFINED) {
   1055             if (guess != FullNameStyle.CJK) {
   1056                 name.phoneticNameStyle = guess;
   1057                 return;
   1058             }
   1059             bestGuess = guess;
   1060         }
   1061     }
   1062 
   1063     public int guessPhoneticNameStyle(String name) {
   1064         if (name == null) {
   1065             return PhoneticNameStyle.UNDEFINED;
   1066         }
   1067 
   1068         int nameStyle = PhoneticNameStyle.UNDEFINED;
   1069         int length = name.length();
   1070         int offset = 0;
   1071         while (offset < length) {
   1072             int codePoint = Character.codePointAt(name, offset);
   1073             if (Character.isLetter(codePoint)) {
   1074                 UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
   1075                 if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
   1076                     return PhoneticNameStyle.JAPANESE;
   1077                 }
   1078                 if (isKoreanUnicodeBlock(unicodeBlock)) {
   1079                     return PhoneticNameStyle.KOREAN;
   1080                 }
   1081                 if (isLatinUnicodeBlock(unicodeBlock)) {
   1082                     return PhoneticNameStyle.PINYIN;
   1083                 }
   1084             }
   1085             offset += Character.charCount(codePoint);
   1086         }
   1087 
   1088         return nameStyle;
   1089     }
   1090 
   1091     private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
   1092         return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
   1093                 unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
   1094                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
   1095                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
   1096                 unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
   1097     }
   1098 
   1099     private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
   1100         return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
   1101                 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
   1102                 || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
   1103                 || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
   1104                 || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
   1105                 || block == UnicodeBlock.CJK_COMPATIBILITY
   1106                 || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
   1107                 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
   1108                 || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
   1109     }
   1110 
   1111     private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
   1112         return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
   1113                 unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
   1114                 unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
   1115     }
   1116 
   1117     private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
   1118         return unicodeBlock == UnicodeBlock.KATAKANA ||
   1119                 unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
   1120                 unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
   1121                 unicodeBlock == UnicodeBlock.HIRAGANA;
   1122     }
   1123 }
   1124