Home | History | Annotate | Download | only in contacts
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.providers.contacts;
     18 
     19 import android.text.TextUtils;
     20 import android.util.Log;
     21 
     22 import java.util.ArrayList;
     23 import java.util.Locale;
     24 
     25 import libcore.icu.Transliterator;
     26 
     27 /**
     28  * An object to convert Chinese character to its corresponding pinyin string.
     29  * For characters with multiple possible pinyin string, only one is selected
     30  * according to ICU Transliterator class. Polyphone is not supported in this
     31  * implementation.
     32  */
     33 public class HanziToPinyin {
     34     private static final String TAG = "HanziToPinyin";
     35 
     36     private static HanziToPinyin sInstance;
     37     private Transliterator mPinyinTransliterator;
     38     private Transliterator mAsciiTransliterator;
     39 
     40     public static class Token {
     41         /**
     42          * Separator between target string for each source char
     43          */
     44         public static final String SEPARATOR = " ";
     45 
     46         public static final int LATIN = 1;
     47         public static final int PINYIN = 2;
     48         public static final int UNKNOWN = 3;
     49 
     50         public Token() {
     51         }
     52 
     53         public Token(int type, String source, String target) {
     54             this.type = type;
     55             this.source = source;
     56             this.target = target;
     57         }
     58 
     59         /**
     60          * Type of this token, ASCII, PINYIN or UNKNOWN.
     61          */
     62         public int type;
     63         /**
     64          * Original string before translation.
     65          */
     66         public String source;
     67         /**
     68          * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
     69          * original string in source.
     70          */
     71         public String target;
     72     }
     73 
     74     private HanziToPinyin() {
     75         try {
     76             mPinyinTransliterator = new Transliterator("Han-Latin/Names; Latin-Ascii; Any-Upper");
     77             mAsciiTransliterator = new Transliterator("Latin-Ascii");
     78         } catch (RuntimeException e) {
     79             Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
     80                   + " HanziToPinyin is disabled");
     81         }
     82     }
     83 
     84     public boolean hasChineseTransliterator() {
     85         return mPinyinTransliterator != null;
     86     }
     87 
     88     public static HanziToPinyin getInstance() {
     89         synchronized (HanziToPinyin.class) {
     90             if (sInstance == null) {
     91                 sInstance = new HanziToPinyin();
     92             }
     93             return sInstance;
     94         }
     95     }
     96 
     97     private void tokenize(char character, Token token) {
     98         token.source = Character.toString(character);
     99 
    100         // ASCII
    101         if (character < 128) {
    102             token.type = Token.LATIN;
    103             token.target = token.source;
    104             return;
    105         }
    106 
    107         // Extended Latin. Transcode these to ASCII equivalents
    108         if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
    109             token.type = Token.LATIN;
    110             token.target = mAsciiTransliterator == null ? token.source :
    111                 mAsciiTransliterator.transliterate(token.source);
    112             return;
    113         }
    114 
    115         token.type = Token.PINYIN;
    116         token.target = mPinyinTransliterator.transliterate(token.source);
    117         if (TextUtils.isEmpty(token.target) ||
    118             TextUtils.equals(token.source, token.target)) {
    119             token.type = Token.UNKNOWN;
    120             token.target = token.source;
    121         }
    122     }
    123 
    124     public String transliterate(final String input) {
    125         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
    126             return null;
    127         }
    128         return mPinyinTransliterator.transliterate(input);
    129     }
    130 
    131     /**
    132      * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
    133      * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
    134      * Token. If there is no Chinese transliterator, the empty token array is returned.
    135      */
    136     public ArrayList<Token> getTokens(final String input) {
    137         ArrayList<Token> tokens = new ArrayList<Token>();
    138         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
    139             // return empty tokens.
    140             return tokens;
    141         }
    142 
    143         final int inputLength = input.length();
    144         final StringBuilder sb = new StringBuilder();
    145         int tokenType = Token.LATIN;
    146         Token token = new Token();
    147 
    148         // Go through the input, create a new token when
    149         // a. Token type changed
    150         // b. Get the Pinyin of current charater.
    151         // c. current character is space.
    152         for (int i = 0; i < inputLength; i++) {
    153             final char character = input.charAt(i);
    154             if (Character.isSpaceChar(character)) {
    155                 if (sb.length() > 0) {
    156                     addToken(sb, tokens, tokenType);
    157                 }
    158             } else {
    159                 tokenize(character, token);
    160                 if (token.type == Token.PINYIN) {
    161                     if (sb.length() > 0) {
    162                         addToken(sb, tokens, tokenType);
    163                     }
    164                     tokens.add(token);
    165                     token = new Token();
    166                 } else {
    167                     if (tokenType != token.type && sb.length() > 0) {
    168                         addToken(sb, tokens, tokenType);
    169                     }
    170                     sb.append(token.target);
    171                 }
    172                 tokenType = token.type;
    173             }
    174         }
    175         if (sb.length() > 0) {
    176             addToken(sb, tokens, tokenType);
    177         }
    178         return tokens;
    179     }
    180 
    181     private void addToken(
    182             final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
    183         String str = sb.toString();
    184         tokens.add(new Token(tokenType, str, str));
    185         sb.setLength(0);
    186     }
    187 }
    188