1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include "../include/splparser.h" 19 20 namespace ime_pinyin { 21 22 SpellingParser::SpellingParser() { 23 spl_trie_ = SpellingTrie::get_cpinstance(); 24 } 25 26 bool SpellingParser::is_valid_to_parse(char ch) { 27 return SpellingTrie::is_valid_spl_char(ch); 28 } 29 30 uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, 31 uint16 spl_idx[], uint16 start_pos[], 32 uint16 max_size, bool &last_is_pre) { 33 if (NULL == splstr || 0 == max_size || 0 == str_len) 34 return 0; 35 36 if (!SpellingTrie::is_valid_spl_char(splstr[0])) 37 return 0; 38 39 last_is_pre = false; 40 41 const SpellingNode *node_this = spl_trie_->root_; 42 43 uint16 str_pos = 0; 44 uint16 idx_num = 0; 45 if (NULL != start_pos) 46 start_pos[0] = 0; 47 bool last_is_splitter = false; 48 49 while (str_pos < str_len) { 50 char char_this = splstr[str_pos]; 51 // all characters outside of [a, z] are considered as splitters 52 if (!SpellingTrie::is_valid_spl_char(char_this)) { 53 // test if the current node is endable 54 uint16 id_this = node_this->spelling_idx; 55 if (spl_trie_->if_valid_id_update(&id_this)) { 56 spl_idx[idx_num] = id_this; 57 58 idx_num++; 59 str_pos++; 60 if (NULL != start_pos) 61 start_pos[idx_num] = str_pos; 62 if (idx_num >= max_size) 63 return idx_num; 64 65 node_this = spl_trie_->root_; 66 last_is_splitter = true; 67 continue; 68 } else { 69 if (last_is_splitter) { 70 str_pos++; 71 if (NULL != start_pos) 72 start_pos[idx_num] = str_pos; 73 continue; 74 } else { 75 return idx_num; 76 } 77 } 78 } 79 80 last_is_splitter = false; 81 82 SpellingNode *found_son = NULL; 83 84 if (0 == str_pos) { 85 if (char_this >= 'a') 86 found_son = spl_trie_->level1_sons_[char_this - 'a']; 87 else 88 found_son = spl_trie_->level1_sons_[char_this - 'A']; 89 } else { 90 SpellingNode *first_son = node_this->first_son; 91 // Because for Zh/Ch/Sh nodes, they are the last in the buffer and 92 // frequently used, so we scan from the end. 93 for (int i = 0; i < node_this->num_of_son; i++) { 94 SpellingNode *this_son = first_son + i; 95 if (SpellingTrie::is_same_spl_char( 96 this_son->char_this_node, char_this)) { 97 found_son = this_son; 98 break; 99 } 100 } 101 } 102 103 // found, just move the current node pointer to the the son 104 if (NULL != found_son) { 105 node_this = found_son; 106 } else { 107 // not found, test if it is endable 108 uint16 id_this = node_this->spelling_idx; 109 if (spl_trie_->if_valid_id_update(&id_this)) { 110 // endable, remember the index 111 spl_idx[idx_num] = id_this; 112 113 idx_num++; 114 if (NULL != start_pos) 115 start_pos[idx_num] = str_pos; 116 if (idx_num >= max_size) 117 return idx_num; 118 node_this = spl_trie_->root_; 119 continue; 120 } else { 121 return idx_num; 122 } 123 } 124 125 str_pos++; 126 } 127 128 uint16 id_this = node_this->spelling_idx; 129 if (spl_trie_->if_valid_id_update(&id_this)) { 130 // endable, remember the index 131 spl_idx[idx_num] = id_this; 132 133 idx_num++; 134 if (NULL != start_pos) 135 start_pos[idx_num] = str_pos; 136 } 137 138 last_is_pre = !last_is_splitter; 139 140 return idx_num; 141 } 142 143 uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, 144 uint16 spl_idx[], uint16 start_pos[], 145 uint16 max_size, bool &last_is_pre) { 146 uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 147 max_size, last_is_pre); 148 for (uint16 pos = 0; pos < idx_num; pos++) { 149 if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { 150 spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); 151 if (pos == idx_num - 1) { 152 last_is_pre = false; 153 } 154 } 155 } 156 return idx_num; 157 } 158 159 uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, 160 uint16 spl_idx[], uint16 start_pos[], 161 uint16 max_size, bool &last_is_pre) { 162 if (NULL == splstr || 0 == max_size || 0 == str_len) 163 return 0; 164 165 if (!SpellingTrie::is_valid_spl_char(splstr[0])) 166 return 0; 167 168 last_is_pre = false; 169 170 const SpellingNode *node_this = spl_trie_->root_; 171 172 uint16 str_pos = 0; 173 uint16 idx_num = 0; 174 if (NULL != start_pos) 175 start_pos[0] = 0; 176 bool last_is_splitter = false; 177 178 while (str_pos < str_len) { 179 char16 char_this = splstr[str_pos]; 180 // all characters outside of [a, z] are considered as splitters 181 if (!SpellingTrie::is_valid_spl_char(char_this)) { 182 // test if the current node is endable 183 uint16 id_this = node_this->spelling_idx; 184 if (spl_trie_->if_valid_id_update(&id_this)) { 185 spl_idx[idx_num] = id_this; 186 187 idx_num++; 188 str_pos++; 189 if (NULL != start_pos) 190 start_pos[idx_num] = str_pos; 191 if (idx_num >= max_size) 192 return idx_num; 193 194 node_this = spl_trie_->root_; 195 last_is_splitter = true; 196 continue; 197 } else { 198 if (last_is_splitter) { 199 str_pos++; 200 if (NULL != start_pos) 201 start_pos[idx_num] = str_pos; 202 continue; 203 } else { 204 return idx_num; 205 } 206 } 207 } 208 209 last_is_splitter = false; 210 211 SpellingNode *found_son = NULL; 212 213 if (0 == str_pos) { 214 if (char_this >= 'a') 215 found_son = spl_trie_->level1_sons_[char_this - 'a']; 216 else 217 found_son = spl_trie_->level1_sons_[char_this - 'A']; 218 } else { 219 SpellingNode *first_son = node_this->first_son; 220 // Because for Zh/Ch/Sh nodes, they are the last in the buffer and 221 // frequently used, so we scan from the end. 222 for (int i = 0; i < node_this->num_of_son; i++) { 223 SpellingNode *this_son = first_son + i; 224 if (SpellingTrie::is_same_spl_char( 225 this_son->char_this_node, char_this)) { 226 found_son = this_son; 227 break; 228 } 229 } 230 } 231 232 // found, just move the current node pointer to the the son 233 if (NULL != found_son) { 234 node_this = found_son; 235 } else { 236 // not found, test if it is endable 237 uint16 id_this = node_this->spelling_idx; 238 if (spl_trie_->if_valid_id_update(&id_this)) { 239 // endable, remember the index 240 spl_idx[idx_num] = id_this; 241 242 idx_num++; 243 if (NULL != start_pos) 244 start_pos[idx_num] = str_pos; 245 if (idx_num >= max_size) 246 return idx_num; 247 node_this = spl_trie_->root_; 248 continue; 249 } else { 250 return idx_num; 251 } 252 } 253 254 str_pos++; 255 } 256 257 uint16 id_this = node_this->spelling_idx; 258 if (spl_trie_->if_valid_id_update(&id_this)) { 259 // endable, remember the index 260 spl_idx[idx_num] = id_this; 261 262 idx_num++; 263 if (NULL != start_pos) 264 start_pos[idx_num] = str_pos; 265 } 266 267 last_is_pre = !last_is_splitter; 268 269 return idx_num; 270 } 271 272 uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, 273 uint16 spl_idx[], uint16 start_pos[], 274 uint16 max_size, bool &last_is_pre) { 275 uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, 276 max_size, last_is_pre); 277 for (uint16 pos = 0; pos < idx_num; pos++) { 278 if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { 279 spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); 280 if (pos == idx_num - 1) { 281 last_is_pre = false; 282 } 283 } 284 } 285 return idx_num; 286 } 287 288 uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, 289 bool *is_pre) { 290 if (NULL == is_pre) 291 return 0; 292 293 uint16 spl_idx[2]; 294 uint16 start_pos[3]; 295 296 if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) 297 return 0; 298 299 if (start_pos[1] != str_len) 300 return 0; 301 return spl_idx[0]; 302 } 303 304 uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, 305 bool *is_pre) { 306 if (NULL == is_pre) 307 return 0; 308 309 uint16 spl_idx[2]; 310 uint16 start_pos[3]; 311 312 if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) 313 return 0; 314 315 if (start_pos[1] != str_len) 316 return 0; 317 if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { 318 spl_trie_->half_to_full(spl_idx[0], spl_idx); 319 *is_pre = false; 320 } 321 322 return spl_idx[0]; 323 } 324 325 uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, 326 uint16 splidx[], uint16 max_size, 327 uint16 &full_id_num, bool &is_pre) { 328 if (max_size <= 0 || !is_valid_to_parse(splstr[0])) 329 return 0; 330 331 splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); 332 full_id_num = 0; 333 if (0 != splidx[0]) { 334 if (splidx[0] >= kFullSplIdStart) 335 full_id_num = 1; 336 return 1; 337 } 338 return 0; 339 } 340 341 } // namespace ime_pinyin 342