Home | History | Annotate | Download | only in share
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <assert.h>
     18 #include "../include/splparser.h"
     19 
     20 namespace ime_pinyin {
     21 
     22 SpellingParser::SpellingParser() {
     23   spl_trie_ = SpellingTrie::get_cpinstance();
     24 }
     25 
     26 bool SpellingParser::is_valid_to_parse(char ch) {
     27   return SpellingTrie::is_valid_spl_char(ch);
     28 }
     29 
     30 uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
     31                                       uint16 spl_idx[], uint16 start_pos[],
     32                                       uint16 max_size, bool &last_is_pre) {
     33   if (NULL == splstr || 0 == max_size || 0 == str_len)
     34     return 0;
     35 
     36   if (!SpellingTrie::is_valid_spl_char(splstr[0]))
     37     return 0;
     38 
     39   last_is_pre = false;
     40 
     41   const SpellingNode *node_this = spl_trie_->root_;
     42 
     43   uint16 str_pos = 0;
     44   uint16 idx_num = 0;
     45   if (NULL != start_pos)
     46     start_pos[0] = 0;
     47   bool last_is_splitter = false;
     48 
     49   while (str_pos < str_len) {
     50     char char_this = splstr[str_pos];
     51     // all characters outside of [a, z] are considered as splitters
     52     if (!SpellingTrie::is_valid_spl_char(char_this)) {
     53       // test if the current node is endable
     54       uint16 id_this = node_this->spelling_idx;
     55       if (spl_trie_->if_valid_id_update(&id_this)) {
     56         spl_idx[idx_num] = id_this;
     57 
     58         idx_num++;
     59         str_pos++;
     60         if (NULL != start_pos)
     61           start_pos[idx_num] = str_pos;
     62         if (idx_num >= max_size)
     63           return idx_num;
     64 
     65         node_this = spl_trie_->root_;
     66         last_is_splitter = true;
     67         continue;
     68       } else {
     69         if (last_is_splitter) {
     70           str_pos++;
     71           if (NULL != start_pos)
     72             start_pos[idx_num] = str_pos;
     73           continue;
     74         } else {
     75           return idx_num;
     76         }
     77       }
     78     }
     79 
     80     last_is_splitter = false;
     81 
     82     SpellingNode *found_son = NULL;
     83 
     84     if (0 == str_pos) {
     85       if (char_this >= 'a')
     86         found_son = spl_trie_->level1_sons_[char_this - 'a'];
     87       else
     88         found_son = spl_trie_->level1_sons_[char_this - 'A'];
     89     } else {
     90       SpellingNode *first_son = node_this->first_son;
     91       // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
     92       // frequently used, so we scan from the end.
     93       for (int i = 0; i < node_this->num_of_son; i++) {
     94         SpellingNode *this_son = first_son + i;
     95         if (SpellingTrie::is_same_spl_char(
     96             this_son->char_this_node, char_this)) {
     97           found_son = this_son;
     98           break;
     99         }
    100       }
    101     }
    102 
    103     // found, just move the current node pointer to the the son
    104     if (NULL != found_son) {
    105       node_this = found_son;
    106     } else {
    107       // not found, test if it is endable
    108       uint16 id_this = node_this->spelling_idx;
    109       if (spl_trie_->if_valid_id_update(&id_this)) {
    110         // endable, remember the index
    111         spl_idx[idx_num] = id_this;
    112 
    113         idx_num++;
    114         if (NULL != start_pos)
    115           start_pos[idx_num] = str_pos;
    116         if (idx_num >= max_size)
    117           return idx_num;
    118         node_this = spl_trie_->root_;
    119         continue;
    120       } else {
    121         return idx_num;
    122       }
    123     }
    124 
    125     str_pos++;
    126   }
    127 
    128   uint16 id_this = node_this->spelling_idx;
    129   if (spl_trie_->if_valid_id_update(&id_this)) {
    130     // endable, remember the index
    131     spl_idx[idx_num] = id_this;
    132 
    133     idx_num++;
    134     if (NULL != start_pos)
    135       start_pos[idx_num] = str_pos;
    136   }
    137 
    138   last_is_pre = !last_is_splitter;
    139 
    140   return idx_num;
    141 }
    142 
    143 uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
    144                                         uint16 spl_idx[], uint16 start_pos[],
    145                                         uint16 max_size, bool &last_is_pre) {
    146   uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
    147                                   max_size, last_is_pre);
    148   for (uint16 pos = 0; pos < idx_num; pos++) {
    149     if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
    150       spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
    151       if (pos == idx_num - 1) {
    152         last_is_pre = false;
    153       }
    154     }
    155   }
    156   return idx_num;
    157 }
    158 
    159 uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
    160                                         uint16 spl_idx[], uint16 start_pos[],
    161                                         uint16 max_size, bool &last_is_pre) {
    162   if (NULL == splstr || 0 == max_size || 0 == str_len)
    163     return 0;
    164 
    165   if (!SpellingTrie::is_valid_spl_char(splstr[0]))
    166     return 0;
    167 
    168   last_is_pre = false;
    169 
    170   const SpellingNode *node_this = spl_trie_->root_;
    171 
    172   uint16 str_pos = 0;
    173   uint16 idx_num = 0;
    174   if (NULL != start_pos)
    175     start_pos[0] = 0;
    176   bool last_is_splitter = false;
    177 
    178   while (str_pos < str_len) {
    179     char16 char_this = splstr[str_pos];
    180     // all characters outside of [a, z] are considered as splitters
    181     if (!SpellingTrie::is_valid_spl_char(char_this)) {
    182       // test if the current node is endable
    183       uint16 id_this = node_this->spelling_idx;
    184       if (spl_trie_->if_valid_id_update(&id_this)) {
    185         spl_idx[idx_num] = id_this;
    186 
    187         idx_num++;
    188         str_pos++;
    189         if (NULL != start_pos)
    190           start_pos[idx_num] = str_pos;
    191         if (idx_num >= max_size)
    192           return idx_num;
    193 
    194         node_this = spl_trie_->root_;
    195         last_is_splitter = true;
    196         continue;
    197       } else {
    198         if (last_is_splitter) {
    199           str_pos++;
    200           if (NULL != start_pos)
    201             start_pos[idx_num] = str_pos;
    202           continue;
    203         } else {
    204           return idx_num;
    205         }
    206       }
    207     }
    208 
    209     last_is_splitter = false;
    210 
    211     SpellingNode *found_son = NULL;
    212 
    213     if (0 == str_pos) {
    214       if (char_this >= 'a')
    215         found_son = spl_trie_->level1_sons_[char_this - 'a'];
    216       else
    217         found_son = spl_trie_->level1_sons_[char_this - 'A'];
    218     } else {
    219       SpellingNode *first_son = node_this->first_son;
    220       // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
    221       // frequently used, so we scan from the end.
    222       for (int i = 0; i < node_this->num_of_son; i++) {
    223         SpellingNode *this_son = first_son + i;
    224         if (SpellingTrie::is_same_spl_char(
    225             this_son->char_this_node, char_this)) {
    226           found_son = this_son;
    227           break;
    228         }
    229       }
    230     }
    231 
    232     // found, just move the current node pointer to the the son
    233     if (NULL != found_son) {
    234       node_this = found_son;
    235     } else {
    236       // not found, test if it is endable
    237       uint16 id_this = node_this->spelling_idx;
    238       if (spl_trie_->if_valid_id_update(&id_this)) {
    239         // endable, remember the index
    240         spl_idx[idx_num] = id_this;
    241 
    242         idx_num++;
    243         if (NULL != start_pos)
    244           start_pos[idx_num] = str_pos;
    245         if (idx_num >= max_size)
    246           return idx_num;
    247         node_this = spl_trie_->root_;
    248         continue;
    249       } else {
    250         return idx_num;
    251       }
    252     }
    253 
    254     str_pos++;
    255   }
    256 
    257   uint16 id_this = node_this->spelling_idx;
    258   if (spl_trie_->if_valid_id_update(&id_this)) {
    259     // endable, remember the index
    260     spl_idx[idx_num] = id_this;
    261 
    262     idx_num++;
    263     if (NULL != start_pos)
    264       start_pos[idx_num] = str_pos;
    265   }
    266 
    267   last_is_pre = !last_is_splitter;
    268 
    269   return idx_num;
    270 }
    271 
    272 uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
    273                                           uint16 spl_idx[], uint16 start_pos[],
    274                                           uint16 max_size, bool &last_is_pre) {
    275   uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
    276                                     max_size, last_is_pre);
    277   for (uint16 pos = 0; pos < idx_num; pos++) {
    278     if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
    279       spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
    280       if (pos == idx_num - 1) {
    281         last_is_pre = false;
    282       }
    283     }
    284   }
    285   return idx_num;
    286 }
    287 
    288 uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
    289                                         bool *is_pre) {
    290   if (NULL == is_pre)
    291     return 0;
    292 
    293   uint16 spl_idx[2];
    294   uint16 start_pos[3];
    295 
    296   if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
    297     return 0;
    298 
    299   if (start_pos[1] != str_len)
    300     return 0;
    301   return spl_idx[0];
    302 }
    303 
    304 uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
    305                                           bool *is_pre) {
    306   if (NULL == is_pre)
    307     return 0;
    308 
    309   uint16 spl_idx[2];
    310   uint16 start_pos[3];
    311 
    312   if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
    313     return 0;
    314 
    315   if (start_pos[1] != str_len)
    316     return 0;
    317   if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
    318     spl_trie_->half_to_full(spl_idx[0], spl_idx);
    319     *is_pre = false;
    320   }
    321 
    322   return spl_idx[0];
    323 }
    324 
    325 uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
    326     uint16 splidx[], uint16 max_size,
    327     uint16 &full_id_num, bool &is_pre) {
    328   if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
    329     return 0;
    330 
    331   splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
    332   full_id_num = 0;
    333   if (0 != splidx[0]) {
    334     if (splidx[0] >= kFullSplIdStart)
    335       full_id_num = 1;
    336     return 1;
    337   }
    338   return 0;
    339 }
    340 
    341 }  // namespace ime_pinyin
    342