1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #ifndef __CSRSBCS_H 9 #define __CSRSBCS_H 10 11 #include "unicode/uobject.h" 12 13 #if !UCONFIG_NO_CONVERSION 14 15 #include "csrecog.h" 16 17 U_NAMESPACE_BEGIN 18 19 class NGramParser : public UMemory 20 { 21 private: 22 int32_t byteIndex; 23 int32_t ngram; 24 25 const int32_t *ngramList; 26 const uint8_t *charMap; 27 28 int32_t ngramCount; 29 int32_t hitCount; 30 31 public: 32 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 33 34 private: 35 /* 36 * Binary search for value in table, which must have exactly 64 entries. 37 */ 38 int32_t search(const int32_t *table, int32_t value); 39 40 void lookup(int32_t thisNgram); 41 void addByte(int32_t b); 42 int32_t nextByte(InputText *det); 43 44 public: 45 int32_t parse(InputText *det); 46 47 }; 48 49 class CharsetRecog_sbcs : public CharsetRecognizer 50 { 51 protected: 52 UBool haveC1Bytes; 53 54 public: 55 CharsetRecog_sbcs(); 56 57 virtual ~CharsetRecog_sbcs(); 58 59 virtual const char *getName() const = 0; 60 61 virtual int32_t match(InputText *det) = 0; 62 63 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]); 64 }; 65 66 class CharsetRecog_8859_1 : public CharsetRecog_sbcs 67 { 68 public: 69 virtual ~CharsetRecog_8859_1(); 70 71 const char *getName() const; 72 }; 73 74 class CharsetRecog_8859_2 : public CharsetRecog_sbcs 75 { 76 public: 77 virtual ~CharsetRecog_8859_2(); 78 79 const char *getName() const; 80 }; 81 82 class CharsetRecog_8859_5 : public CharsetRecog_sbcs 83 { 84 public: 85 virtual ~CharsetRecog_8859_5(); 86 87 const char *getName() const; 88 }; 89 90 class CharsetRecog_8859_6 : public CharsetRecog_sbcs 91 { 92 public: 93 virtual ~CharsetRecog_8859_6(); 94 95 const char *getName() const; 96 }; 97 98 class CharsetRecog_8859_7 : public CharsetRecog_sbcs 99 { 100 public: 101 virtual ~CharsetRecog_8859_7(); 102 103 const char *getName() const; 104 }; 105 106 class CharsetRecog_8859_8 : public CharsetRecog_sbcs 107 { 108 public: 109 virtual ~CharsetRecog_8859_8(); 110 111 virtual const char *getName() const; 112 }; 113 114 class CharsetRecog_8859_9 : public CharsetRecog_sbcs 115 { 116 public: 117 virtual ~CharsetRecog_8859_9(); 118 119 const char *getName() const; 120 }; 121 122 class CharsetRecog_8859_1_en : public CharsetRecog_8859_1 123 { 124 public: 125 virtual ~CharsetRecog_8859_1_en(); 126 127 const char *getLanguage() const; 128 129 int32_t match(InputText *textIn); 130 }; 131 132 class CharsetRecog_8859_1_da : public CharsetRecog_8859_1 133 { 134 public: 135 virtual ~CharsetRecog_8859_1_da(); 136 137 const char *getLanguage() const; 138 139 int32_t match(InputText *textIn); 140 }; 141 142 class CharsetRecog_8859_1_de : public CharsetRecog_8859_1 143 { 144 public: 145 virtual ~CharsetRecog_8859_1_de(); 146 147 const char *getLanguage() const; 148 149 int32_t match(InputText *textIn); 150 }; 151 152 class CharsetRecog_8859_1_es : public CharsetRecog_8859_1 153 { 154 public: 155 virtual ~CharsetRecog_8859_1_es(); 156 157 const char *getLanguage() const; 158 159 int32_t match(InputText *textIn); 160 }; 161 162 class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1 163 { 164 public: 165 virtual ~CharsetRecog_8859_1_fr(); 166 167 const char *getLanguage() const; 168 169 int32_t match(InputText *textIn); 170 }; 171 172 class CharsetRecog_8859_1_it : public CharsetRecog_8859_1 173 { 174 public: 175 virtual ~CharsetRecog_8859_1_it(); 176 177 const char *getLanguage() const; 178 179 int32_t match(InputText *textIn); 180 }; 181 182 class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1 183 { 184 public: 185 virtual ~CharsetRecog_8859_1_nl(); 186 187 const char *getLanguage() const; 188 189 int32_t match(InputText *textIn); 190 }; 191 192 class CharsetRecog_8859_1_no : public CharsetRecog_8859_1 193 { 194 public: 195 virtual ~CharsetRecog_8859_1_no(); 196 197 const char *getLanguage() const; 198 199 int32_t match(InputText *textIn); 200 }; 201 202 class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1 203 { 204 public: 205 virtual ~CharsetRecog_8859_1_pt(); 206 207 const char *getLanguage() const; 208 209 int32_t match(InputText *textIn); 210 }; 211 212 class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1 213 { 214 public: 215 virtual ~CharsetRecog_8859_1_sv(); 216 217 const char *getLanguage() const; 218 219 int32_t match(InputText *textIn); 220 }; 221 222 class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2 223 { 224 public: 225 virtual ~CharsetRecog_8859_2_cs(); 226 227 const char *getLanguage() const; 228 229 int32_t match(InputText *textIn); 230 }; 231 232 class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2 233 { 234 public: 235 virtual ~CharsetRecog_8859_2_hu(); 236 237 const char *getLanguage() const; 238 239 int32_t match(InputText *textIn); 240 }; 241 242 class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2 243 { 244 public: 245 virtual ~CharsetRecog_8859_2_pl(); 246 247 const char *getLanguage() const; 248 249 int32_t match(InputText *textIn); 250 }; 251 252 class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2 253 { 254 public: 255 virtual ~CharsetRecog_8859_2_ro(); 256 257 const char *getLanguage() const; 258 259 int32_t match(InputText *textIn); 260 }; 261 262 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 263 { 264 public: 265 virtual ~CharsetRecog_8859_5_ru(); 266 267 const char *getLanguage() const; 268 269 int32_t match(InputText *textIn); 270 }; 271 272 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 273 { 274 public: 275 virtual ~CharsetRecog_8859_6_ar(); 276 277 const char *getLanguage() const; 278 279 int32_t match(InputText *textIn); 280 }; 281 282 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 283 { 284 public: 285 virtual ~CharsetRecog_8859_7_el(); 286 287 const char *getLanguage() const; 288 289 int32_t match(InputText *textIn); 290 }; 291 292 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 293 { 294 public: 295 virtual ~CharsetRecog_8859_8_I_he(); 296 297 const char *getName() const; 298 299 const char *getLanguage() const; 300 301 int32_t match(InputText *textIn); 302 }; 303 304 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 305 { 306 public: 307 virtual ~CharsetRecog_8859_8_he (); 308 309 const char *getLanguage() const; 310 311 int32_t match(InputText *textIn); 312 }; 313 314 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 315 { 316 public: 317 virtual ~CharsetRecog_8859_9_tr (); 318 319 const char *getLanguage() const; 320 321 int32_t match(InputText *textIn); 322 }; 323 324 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 325 { 326 public: 327 virtual ~CharsetRecog_windows_1256(); 328 329 const char *getName() const; 330 331 const char *getLanguage() const; 332 333 int32_t match(InputText *textIn); 334 }; 335 336 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 337 { 338 public: 339 virtual ~CharsetRecog_windows_1251(); 340 341 const char *getName() const; 342 343 const char *getLanguage() const; 344 345 int32_t match(InputText *textIn); 346 }; 347 348 349 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 350 { 351 public: 352 virtual ~CharsetRecog_KOI8_R(); 353 354 const char *getName() const; 355 356 const char *getLanguage() const; 357 358 int32_t match(InputText *textIn); 359 }; 360 361 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 362 { 363 public: 364 virtual ~CharsetRecog_IBM424_he(); 365 366 const char *getLanguage() const; 367 }; 368 369 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 370 public: 371 virtual ~CharsetRecog_IBM424_he_rtl(); 372 373 const char *getName() const; 374 375 int32_t match(InputText *textIn); 376 }; 377 378 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 379 virtual ~CharsetRecog_IBM424_he_ltr(); 380 381 const char *getName() const; 382 383 int32_t match(InputText *textIn); 384 }; 385 386 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 387 { 388 public: 389 virtual ~CharsetRecog_IBM420_ar(); 390 391 const char *getLanguage() const; 392 393 protected: 394 void matchInit(InputText *textIn); 395 void matchFinish(InputText *textIn); 396 397 private: 398 uint8_t *prev_fInputBytes; 399 int32_t prev_fInputBytesLength; 400 UBool deleteBuffer; 401 402 UBool isLamAlef(uint8_t b); 403 uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); 404 uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); 405 }; 406 407 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 408 public: 409 virtual ~CharsetRecog_IBM420_ar_rtl(); 410 411 const char *getName() const; 412 413 int32_t match(InputText *textIn); 414 }; 415 416 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 417 virtual ~CharsetRecog_IBM420_ar_ltr(); 418 419 const char *getName() const; 420 421 int32_t match(InputText *textIn); 422 }; 423 424 U_NAMESPACE_END 425 426 #endif 427 #endif /* __CSRSBCS_H */ 428