Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2009, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #ifndef __CSRSBCS_H
      9 #define __CSRSBCS_H
     10 
     11 #include "unicode/uobject.h"
     12 
     13 #if !UCONFIG_NO_CONVERSION
     14 
     15 #include "csrecog.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 class NGramParser : public UMemory
     20 {
     21 private:
     22     int32_t byteIndex;
     23     int32_t ngram;
     24 
     25     const int32_t *ngramList;
     26     const uint8_t *charMap;
     27 
     28     int32_t ngramCount;
     29     int32_t hitCount;
     30 
     31 public:
     32     NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
     33 
     34 private:
     35     /*
     36     * Binary search for value in table, which must have exactly 64 entries.
     37     */
     38     int32_t search(const int32_t *table, int32_t value);
     39 
     40     void lookup(int32_t thisNgram);
     41     void addByte(int32_t b);
     42     int32_t nextByte(InputText *det);
     43 
     44 public:
     45     int32_t parse(InputText *det);
     46 
     47 };
     48 
     49 class CharsetRecog_sbcs : public CharsetRecognizer
     50 {
     51 protected:
     52     UBool haveC1Bytes;
     53 
     54 public:
     55     CharsetRecog_sbcs();
     56 
     57     virtual ~CharsetRecog_sbcs();
     58 
     59     virtual const char *getName() const = 0;
     60 
     61     virtual int32_t match(InputText *det) = 0;
     62 
     63     int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]);
     64 };
     65 
     66 class CharsetRecog_8859_1 : public CharsetRecog_sbcs
     67 {
     68 public:
     69     virtual ~CharsetRecog_8859_1();
     70 
     71     const char *getName() const;
     72 };
     73 
     74 class CharsetRecog_8859_2 : public CharsetRecog_sbcs
     75 {
     76 public:
     77     virtual ~CharsetRecog_8859_2();
     78 
     79     const char *getName() const;
     80 };
     81 
     82 class CharsetRecog_8859_5 : public CharsetRecog_sbcs
     83 {
     84 public:
     85     virtual ~CharsetRecog_8859_5();
     86 
     87     const char *getName() const;
     88 };
     89 
     90 class CharsetRecog_8859_6 : public CharsetRecog_sbcs
     91 {
     92 public:
     93     virtual ~CharsetRecog_8859_6();
     94 
     95     const char *getName() const;
     96 };
     97 
     98 class CharsetRecog_8859_7 : public CharsetRecog_sbcs
     99 {
    100 public:
    101     virtual ~CharsetRecog_8859_7();
    102 
    103     const char *getName() const;
    104 };
    105 
    106 class CharsetRecog_8859_8 : public CharsetRecog_sbcs
    107 {
    108 public:
    109     virtual ~CharsetRecog_8859_8();
    110 
    111     virtual const char *getName() const;
    112 };
    113 
    114 class CharsetRecog_8859_9 : public CharsetRecog_sbcs
    115 {
    116 public:
    117     virtual ~CharsetRecog_8859_9();
    118 
    119     const char *getName() const;
    120 };
    121 
    122 class CharsetRecog_8859_1_en : public CharsetRecog_8859_1
    123 {
    124 public:
    125     virtual ~CharsetRecog_8859_1_en();
    126 
    127     const char *getLanguage() const;
    128 
    129     int32_t match(InputText *textIn);
    130 };
    131 
    132 class CharsetRecog_8859_1_da : public CharsetRecog_8859_1
    133 {
    134 public:
    135     virtual ~CharsetRecog_8859_1_da();
    136 
    137     const char *getLanguage() const;
    138 
    139     int32_t match(InputText *textIn);
    140 };
    141 
    142 class CharsetRecog_8859_1_de : public CharsetRecog_8859_1
    143 {
    144 public:
    145     virtual ~CharsetRecog_8859_1_de();
    146 
    147     const char *getLanguage() const;
    148 
    149     int32_t match(InputText *textIn);
    150 };
    151 
    152 class CharsetRecog_8859_1_es : public CharsetRecog_8859_1
    153 {
    154 public:
    155     virtual ~CharsetRecog_8859_1_es();
    156 
    157     const char *getLanguage() const;
    158 
    159     int32_t match(InputText *textIn);
    160 };
    161 
    162 class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1
    163 {
    164 public:
    165     virtual ~CharsetRecog_8859_1_fr();
    166 
    167     const char *getLanguage() const;
    168 
    169     int32_t match(InputText *textIn);
    170 };
    171 
    172 class CharsetRecog_8859_1_it : public CharsetRecog_8859_1
    173 {
    174 public:
    175     virtual ~CharsetRecog_8859_1_it();
    176 
    177     const char *getLanguage() const;
    178 
    179     int32_t match(InputText *textIn);
    180 };
    181 
    182 class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1
    183 {
    184 public:
    185     virtual ~CharsetRecog_8859_1_nl();
    186 
    187     const char *getLanguage() const;
    188 
    189     int32_t match(InputText *textIn);
    190 };
    191 
    192 class CharsetRecog_8859_1_no : public CharsetRecog_8859_1
    193 {
    194 public:
    195     virtual ~CharsetRecog_8859_1_no();
    196 
    197     const char *getLanguage() const;
    198 
    199     int32_t match(InputText *textIn);
    200 };
    201 
    202 class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1
    203 {
    204 public:
    205     virtual ~CharsetRecog_8859_1_pt();
    206 
    207     const char *getLanguage() const;
    208 
    209     int32_t match(InputText *textIn);
    210 };
    211 
    212 class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1
    213 {
    214 public:
    215     virtual ~CharsetRecog_8859_1_sv();
    216 
    217     const char *getLanguage() const;
    218 
    219     int32_t match(InputText *textIn);
    220 };
    221 
    222 class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2
    223 {
    224 public:
    225     virtual ~CharsetRecog_8859_2_cs();
    226 
    227     const char *getLanguage() const;
    228 
    229     int32_t match(InputText *textIn);
    230 };
    231 
    232 class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2
    233 {
    234 public:
    235     virtual ~CharsetRecog_8859_2_hu();
    236 
    237     const char *getLanguage() const;
    238 
    239     int32_t match(InputText *textIn);
    240 };
    241 
    242 class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2
    243 {
    244 public:
    245     virtual ~CharsetRecog_8859_2_pl();
    246 
    247     const char *getLanguage() const;
    248 
    249     int32_t match(InputText *textIn);
    250 };
    251 
    252 class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2
    253 {
    254 public:
    255     virtual ~CharsetRecog_8859_2_ro();
    256 
    257     const char *getLanguage() const;
    258 
    259     int32_t match(InputText *textIn);
    260 };
    261 
    262 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
    263 {
    264 public:
    265     virtual ~CharsetRecog_8859_5_ru();
    266 
    267     const char *getLanguage() const;
    268 
    269     int32_t match(InputText *textIn);
    270 };
    271 
    272 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
    273 {
    274 public:
    275     virtual ~CharsetRecog_8859_6_ar();
    276 
    277     const char *getLanguage() const;
    278 
    279     int32_t match(InputText *textIn);
    280 };
    281 
    282 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
    283 {
    284 public:
    285     virtual ~CharsetRecog_8859_7_el();
    286 
    287     const char *getLanguage() const;
    288 
    289     int32_t match(InputText *textIn);
    290 };
    291 
    292 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
    293 {
    294 public:
    295     virtual ~CharsetRecog_8859_8_I_he();
    296 
    297     const char *getName() const;
    298 
    299     const char *getLanguage() const;
    300 
    301     int32_t match(InputText *textIn);
    302 };
    303 
    304 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
    305 {
    306 public:
    307     virtual ~CharsetRecog_8859_8_he ();
    308 
    309     const char *getLanguage() const;
    310 
    311     int32_t match(InputText *textIn);
    312 };
    313 
    314 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
    315 {
    316 public:
    317     virtual ~CharsetRecog_8859_9_tr ();
    318 
    319     const char *getLanguage() const;
    320 
    321     int32_t match(InputText *textIn);
    322 };
    323 
    324 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
    325 {
    326 public:
    327     virtual ~CharsetRecog_windows_1256();
    328 
    329     const char *getName() const;
    330 
    331     const char *getLanguage() const;
    332 
    333     int32_t match(InputText *textIn);
    334 };
    335 
    336 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
    337 {
    338 public:
    339     virtual ~CharsetRecog_windows_1251();
    340 
    341     const char *getName() const;
    342 
    343     const char *getLanguage() const;
    344 
    345     int32_t match(InputText *textIn);
    346 };
    347 
    348 
    349 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
    350 {
    351 public:
    352     virtual ~CharsetRecog_KOI8_R();
    353 
    354     const char *getName() const;
    355 
    356     const char *getLanguage() const;
    357 
    358     int32_t match(InputText *textIn);
    359 };
    360 
    361 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
    362 {
    363 public:
    364     virtual ~CharsetRecog_IBM424_he();
    365 
    366     const char *getLanguage() const;
    367 };
    368 
    369 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
    370 public:
    371     virtual ~CharsetRecog_IBM424_he_rtl();
    372 
    373     const char *getName() const;
    374 
    375     int32_t match(InputText *textIn);
    376 };
    377 
    378 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
    379     virtual ~CharsetRecog_IBM424_he_ltr();
    380 
    381     const char *getName() const;
    382 
    383     int32_t match(InputText *textIn);
    384 };
    385 
    386 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
    387 {
    388 public:
    389     virtual ~CharsetRecog_IBM420_ar();
    390 
    391     const char *getLanguage() const;
    392 
    393 protected:
    394     void matchInit(InputText *textIn);
    395     void matchFinish(InputText *textIn);
    396 
    397 private:
    398     uint8_t *prev_fInputBytes;
    399     int32_t prev_fInputBytesLength;
    400     UBool deleteBuffer;
    401 
    402     UBool isLamAlef(uint8_t b);
    403     uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
    404     uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
    405 };
    406 
    407 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
    408 public:
    409     virtual ~CharsetRecog_IBM420_ar_rtl();
    410 
    411     const char *getName() const;
    412 
    413     int32_t match(InputText *textIn);
    414 };
    415 
    416 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
    417     virtual ~CharsetRecog_IBM420_ar_ltr();
    418 
    419     const char *getName() const;
    420 
    421     int32_t match(InputText *textIn);
    422 };
    423 
    424 U_NAMESPACE_END
    425 
    426 #endif
    427 #endif /* __CSRSBCS_H */
    428