Home | History | Annotate | Download | only in contacts
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.providers.contacts;
     18 
     19 import android.icu.text.Transliterator;
     20 import android.text.TextUtils;
     21 import android.util.Log;
     22 
     23 import java.util.ArrayList;
     24 import java.util.Locale;
     25 
     26 
     27 /**
     28  * An object to convert Chinese character to its corresponding pinyin string.
     29  * For characters with multiple possible pinyin string, only one is selected
     30  * according to ICU Transliterator class. Polyphone is not supported in this
     31  * implementation.
     32  */
     33 public class HanziToPinyin {
     34     private static final String TAG = "HanziToPinyin";
     35 
     36     private static HanziToPinyin sInstance;
     37     private Transliterator mPinyinTransliterator;
     38     private Transliterator mAsciiTransliterator;
     39 
     40     public static class Token {
     41         /**
     42          * Separator between target string for each source char
     43          */
     44         public static final String SEPARATOR = " ";
     45 
     46         public static final int LATIN = 1;
     47         public static final int PINYIN = 2;
     48         public static final int UNKNOWN = 3;
     49 
     50         public Token() {
     51         }
     52 
     53         public Token(int type, String source, String target) {
     54             this.type = type;
     55             this.source = source;
     56             this.target = target;
     57         }
     58 
     59         /**
     60          * Type of this token, ASCII, PINYIN or UNKNOWN.
     61          */
     62         public int type;
     63         /**
     64          * Original string before translation.
     65          */
     66         public String source;
     67         /**
     68          * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
     69          * original string in source.
     70          */
     71         public String target;
     72     }
     73 
     74     private HanziToPinyin() {
     75         try {
     76             mPinyinTransliterator = Transliterator.getInstance(
     77                     "Han-Latin/Names; Latin-Ascii; Any-Upper");
     78             mAsciiTransliterator = Transliterator.getInstance("Latin-Ascii");
     79         } catch (IllegalArgumentException e) {
     80             Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
     81                   + " HanziToPinyin is disabled");
     82         }
     83     }
     84 
     85     public boolean hasChineseTransliterator() {
     86         return mPinyinTransliterator != null;
     87     }
     88 
     89     public static HanziToPinyin getInstance() {
     90         synchronized (HanziToPinyin.class) {
     91             if (sInstance == null) {
     92                 sInstance = new HanziToPinyin();
     93             }
     94             return sInstance;
     95         }
     96     }
     97 
     98     private void tokenize(char character, Token token) {
     99         token.source = Character.toString(character);
    100 
    101         // ASCII
    102         if (character < 128) {
    103             token.type = Token.LATIN;
    104             token.target = token.source;
    105             return;
    106         }
    107 
    108         // Extended Latin. Transcode these to ASCII equivalents
    109         if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
    110             token.type = Token.LATIN;
    111             token.target = mAsciiTransliterator == null ? token.source :
    112                 mAsciiTransliterator.transliterate(token.source);
    113             return;
    114         }
    115 
    116         token.type = Token.PINYIN;
    117         token.target = mPinyinTransliterator.transliterate(token.source);
    118         if (TextUtils.isEmpty(token.target) ||
    119             TextUtils.equals(token.source, token.target)) {
    120             token.type = Token.UNKNOWN;
    121             token.target = token.source;
    122         }
    123     }
    124 
    125     public String transliterate(final String input) {
    126         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
    127             return null;
    128         }
    129         return mPinyinTransliterator.transliterate(input);
    130     }
    131 
    132     /**
    133      * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
    134      * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
    135      * Token. If there is no Chinese transliterator, the empty token array is returned.
    136      */
    137     public ArrayList<Token> getTokens(final String input) {
    138         ArrayList<Token> tokens = new ArrayList<Token>();
    139         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
    140             // return empty tokens.
    141             return tokens;
    142         }
    143 
    144         final int inputLength = input.length();
    145         final StringBuilder sb = new StringBuilder();
    146         int tokenType = Token.LATIN;
    147         Token token = new Token();
    148 
    149         // Go through the input, create a new token when
    150         // a. Token type changed
    151         // b. Get the Pinyin of current charater.
    152         // c. current character is space.
    153         for (int i = 0; i < inputLength; i++) {
    154             final char character = input.charAt(i);
    155             if (Character.isSpaceChar(character)) {
    156                 if (sb.length() > 0) {
    157                     addToken(sb, tokens, tokenType);
    158                 }
    159             } else {
    160                 tokenize(character, token);
    161                 if (token.type == Token.PINYIN) {
    162                     if (sb.length() > 0) {
    163                         addToken(sb, tokens, tokenType);
    164                     }
    165                     tokens.add(token);
    166                     token = new Token();
    167                 } else {
    168                     if (tokenType != token.type && sb.length() > 0) {
    169                         addToken(sb, tokens, tokenType);
    170                     }
    171                     sb.append(token.target);
    172                 }
    173                 tokenType = token.type;
    174             }
    175         }
    176         if (sb.length() > 0) {
    177             addToken(sb, tokens, tokenType);
    178         }
    179         return tokens;
    180     }
    181 
    182     private void addToken(
    183             final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
    184         String str = sb.toString();
    185         tokens.add(new Token(tokenType, str, str));
    186         sb.setLength(0);
    187     }
    188 }
    189