Home | History | Annotate | Download | only in android
      1 /*
      2  * Copyright 2010, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <ctype.h>
     18 #include <string.h>
     19 
     20 #include <unicode/ucol.h>
     21 #include <unicode/uiter.h>
     22 #include <unicode/ustring.h>
     23 #include <unicode/utypes.h>
     24 
     25 #include "PhonebookIndex.h"
     26 #include "PhoneticStringUtils.h"
     27 
     28 #define MIN_OUTPUT_SIZE 6       // Minimum required size for the output buffer (in bytes)
     29 
     30 namespace android {
     31 
     32 // IMPORTANT!  Keep the codes below SORTED. We are doing a binary search on the array
     33 static UChar DEFAULT_CHAR_MAP[] = {
     34     0x00C6,    'A',       // AE
     35     0x00DF,    'S',       // Etzett
     36     0x1100, 0x3131,       // HANGUL LETTER KIYEOK
     37     0x1101, 0x3132,       // HANGUL LETTER SSANGKIYEOK
     38     0x1102, 0x3134,       // HANGUL LETTER NIEUN
     39     0x1103, 0x3137,       // HANGUL LETTER TIKEUT
     40     0x1104, 0x3138,       // HANGUL LETTER SSANGTIKEUT
     41     0x1105, 0x3139,       // HANGUL LETTER RIEUL
     42     0x1106, 0x3141,       // HANGUL LETTER MIEUM
     43     0x1107, 0x3142,       // HANGUL LETTER PIEUP
     44     0x1108, 0x3143,       // HANGUL LETTER SSANGPIEUP
     45     0x1109, 0x3145,       // HANGUL LETTER SIOS
     46     0x110A, 0x3146,       // HANGUL LETTER SSANGSIOS
     47     0x110B, 0x3147,       // HANGUL LETTER IEUNG
     48     0x110C, 0x3148,       // HANGUL LETTER CIEUC
     49     0x110D, 0x3149,       // HANGUL LETTER SSANGCIEUC
     50     0x110E, 0x314A,       // HANGUL LETTER CHIEUCH
     51     0x110F, 0x314B,       // HANGUL LETTER KHIEUKH
     52     0x1110, 0x314C,       // HANGUL LETTER THIEUTH
     53     0x1111, 0x314D,       // HANGUL LETTER PHIEUPH
     54     0x1112, 0x314E,       // HANGUL LETTER HIEUH
     55     0x111A, 0x3140,       // HANGUL LETTER RIEUL-HIEUH
     56     0x1121, 0x3144,       // HANGUL LETTER PIEUP-SIOS
     57     0x1161, 0x314F,       // HANGUL LETTER A
     58     0x1162, 0x3150,       // HANGUL LETTER AE
     59     0x1163, 0x3151,       // HANGUL LETTER YA
     60     0x1164, 0x3152,       // HANGUL LETTER YAE
     61     0x1165, 0x3153,       // HANGUL LETTER EO
     62     0x1166, 0x3154,       // HANGUL LETTER E
     63     0x1167, 0x3155,       // HANGUL LETTER YEO
     64     0x1168, 0x3156,       // HANGUL LETTER YE
     65     0x1169, 0x3157,       // HANGUL LETTER O
     66     0x116A, 0x3158,       // HANGUL LETTER WA
     67     0x116B, 0x3159,       // HANGUL LETTER WAE
     68     0x116C, 0x315A,       // HANGUL LETTER OE
     69     0x116D, 0x315B,       // HANGUL LETTER YO
     70     0x116E, 0x315C,       // HANGUL LETTER U
     71     0x116F, 0x315D,       // HANGUL LETTER WEO
     72     0x1170, 0x315E,       // HANGUL LETTER WE
     73     0x1171, 0x315F,       // HANGUL LETTER WI
     74     0x1172, 0x3160,       // HANGUL LETTER YU
     75     0x1173, 0x3161,       // HANGUL LETTER EU
     76     0x1174, 0x3162,       // HANGUL LETTER YI
     77     0x1175, 0x3163,       // HANGUL LETTER I
     78     0x11AA, 0x3133,       // HANGUL LETTER KIYEOK-SIOS
     79     0x11AC, 0x3135,       // HANGUL LETTER NIEUN-CIEUC
     80     0x11AD, 0x3136,       // HANGUL LETTER NIEUN-HIEUH
     81     0x11B0, 0x313A,       // HANGUL LETTER RIEUL-KIYEOK
     82     0x11B1, 0x313B,       // HANGUL LETTER RIEUL-MIEUM
     83     0x11B3, 0x313D,       // HANGUL LETTER RIEUL-SIOS
     84     0x11B4, 0x313E,       // HANGUL LETTER RIEUL-THIEUTH
     85     0x11B5, 0x313F,       // HANGUL LETTER RIEUL-PHIEUPH
     86 };
     87 
     88 /**
     89  * Binary search to map an individual character to the corresponding phone book index.
     90  */
     91 static UChar map_character(UChar c, UChar * char_map, int32_t length) {
     92   int from = 0, to = length;
     93   while (from < to) {
     94     int m = ((to + from) >> 1) & ~0x1;    // Only consider even positions
     95     UChar cm = char_map[m];
     96     if (cm == c) {
     97       return char_map[m + 1];
     98     } else if (cm < c) {
     99       from = m + 2;
    100     } else {
    101       to = m;
    102     }
    103   }
    104   return 0;
    105 }
    106 
    107 /**
    108  * Returns TRUE if the character belongs to a Hanzi unicode block
    109  */
    110 static bool is_CJK(UChar c) {
    111   return
    112        (0x4e00 <= c && c <= 0x9fff)     // CJK_UNIFIED_IDEOGRAPHS
    113     || (0x3400 <= c && c <= 0x4dbf)     // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
    114     || (0x3000 <= c && c <= 0x303f)     // CJK_SYMBOLS_AND_PUNCTUATION
    115     || (0x2e80 <= c && c <= 0x2eff)     // CJK_RADICALS_SUPPLEMENT
    116     || (0x3300 <= c && c <= 0x33ff)     // CJK_COMPATIBILITY
    117     || (0xfe30 <= c && c <= 0xfe4f)     // CJK_COMPATIBILITY_FORMS
    118     || (0xf900 <= c && c <= 0xfaff);    // CJK_COMPATIBILITY_IDEOGRAPHS
    119 }
    120 
    121 int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
    122         UBool * isError)
    123 {
    124   if (size < MIN_OUTPUT_SIZE) {
    125     *isError = TRUE;
    126     return 0;
    127   }
    128 
    129   *isError = FALSE;
    130 
    131   // Normalize the first character to remove accents using the NFD normalization
    132   UErrorCode errorCode = U_ZERO_ERROR;
    133   int32_t len = unorm_next(iter, out, size, UNORM_NFD,
    134           0 /* options */, TRUE /* normalize */, NULL, &errorCode);
    135   if (U_FAILURE(errorCode)) {
    136     *isError = TRUE;
    137     return 0;
    138   }
    139 
    140   if (len == 0) {   // Empty input string
    141     return 0;
    142   }
    143 
    144   UChar c = out[0];
    145 
    146   if (!u_isalpha(c)) {
    147     // Digits go into a # section. Everything else goes into the empty section
    148     // The unicode function u_isdigit would also identify other characters as digits (arabic),
    149     // but if we caught them here we'd risk having the same section before and after alpha-letters
    150     // which might break the assumption that each section exists only once
    151     if (c >= '0' && c <= '9') {
    152       out[0] = '#';
    153       return 1;
    154     }
    155     return 0;
    156   }
    157 
    158   c = u_toupper(c);
    159 
    160   // Check for explicitly mapped characters
    161   UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
    162   if (c_mapped != 0) {
    163     out[0] = c_mapped;
    164     return 1;
    165   }
    166 
    167   // Convert Kanas to Hiragana
    168   UChar next = len > 2 ? out[1] : 0;
    169   c = android::GetNormalizedCodePoint(c, next, NULL);
    170 
    171   // Traditional grouping of Hiragana characters
    172   if (0x3042 <= c && c <= 0x309F) {
    173     if (c < 0x304B) c = 0x3042;         // a
    174     else if (c < 0x3055) c = 0x304B;    // ka
    175     else if (c < 0x305F) c = 0x3055;    // sa
    176     else if (c < 0x306A) c = 0x305F;    // ta
    177     else if (c < 0x306F) c = 0x306A;    // na
    178     else if (c < 0x307E) c = 0x306F;    // ha
    179     else if (c < 0x3084) c = 0x307E;    // ma
    180     else if (c < 0x3089) c = 0x3084;    // ya
    181     else if (c < 0x308F) c = 0x3089;    // ra
    182     else c = 0x308F;                    // wa
    183     out[0] = c;
    184     return 1;
    185   }
    186 
    187   if (is_CJK(c)) {
    188     if (strncmp(locale, "ja", 2) == 0) {
    189       // Japanese word meaning "misc" or "other"
    190       out[0] = 0x4ED6;
    191       return 1;
    192     } else {
    193       return 0;
    194     }
    195   }
    196 
    197   out[0] = c;
    198   return 1;
    199 }
    200 
    201 }  // namespace android
    202