Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2012, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #ifndef __CSRSBCS_H
      9 #define __CSRSBCS_H
     10 
     11 #include "unicode/uobject.h"
     12 
     13 #if !UCONFIG_NO_CONVERSION
     14 
     15 #include "csrecog.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 class NGramParser : public UMemory
     20 {
     21 private:
     22     int32_t byteIndex;
     23     int32_t ngram;
     24 
     25     const int32_t *ngramList;
     26     const uint8_t *charMap;
     27 
     28     int32_t ngramCount;
     29     int32_t hitCount;
     30 
     31 public:
     32     NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
     33 
     34 private:
     35     /*
     36     * Binary search for value in table, which must have exactly 64 entries.
     37     */
     38     int32_t search(const int32_t *table, int32_t value);
     39 
     40     void lookup(int32_t thisNgram);
     41     void addByte(int32_t b);
     42     int32_t nextByte(InputText *det);
     43 
     44 public:
     45     int32_t parse(InputText *det);
     46 
     47 };
     48 
     49 
     50 class CharsetRecog_sbcs : public CharsetRecognizer
     51 {
     52 public:
     53     CharsetRecog_sbcs();
     54     virtual ~CharsetRecog_sbcs();
     55     virtual const char *getName() const = 0;
     56     virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
     57     virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
     58 };
     59 
     60 class CharsetRecog_8859_1 : public CharsetRecog_sbcs
     61 {
     62 public:
     63     virtual ~CharsetRecog_8859_1();
     64     const char *getName() const;
     65     virtual UBool match(InputText *det, CharsetMatch *results) const;
     66 };
     67 
     68 class CharsetRecog_8859_2 : public CharsetRecog_sbcs
     69 {
     70 public:
     71     virtual ~CharsetRecog_8859_2();
     72     const char *getName() const;
     73     virtual UBool match(InputText *det, CharsetMatch *results) const;
     74 };
     75 
     76 class CharsetRecog_8859_5 : public CharsetRecog_sbcs
     77 {
     78 public:
     79     virtual ~CharsetRecog_8859_5();
     80     const char *getName() const;
     81 };
     82 
     83 class CharsetRecog_8859_6 : public CharsetRecog_sbcs
     84 {
     85 public:
     86     virtual ~CharsetRecog_8859_6();
     87 
     88     const char *getName() const;
     89 };
     90 
     91 class CharsetRecog_8859_7 : public CharsetRecog_sbcs
     92 {
     93 public:
     94     virtual ~CharsetRecog_8859_7();
     95 
     96     const char *getName() const;
     97 };
     98 
     99 class CharsetRecog_8859_8 : public CharsetRecog_sbcs
    100 {
    101 public:
    102     virtual ~CharsetRecog_8859_8();
    103 
    104     virtual const char *getName() const;
    105 };
    106 
    107 class CharsetRecog_8859_9 : public CharsetRecog_sbcs
    108 {
    109 public:
    110     virtual ~CharsetRecog_8859_9();
    111 
    112     const char *getName() const;
    113 };
    114 
    115 
    116 
    117 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
    118 {
    119 public:
    120     virtual ~CharsetRecog_8859_5_ru();
    121 
    122     const char *getLanguage() const;
    123 
    124     virtual UBool match(InputText *det, CharsetMatch *results) const;
    125 };
    126 
    127 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
    128 {
    129 public:
    130     virtual ~CharsetRecog_8859_6_ar();
    131 
    132     const char *getLanguage() const;
    133 
    134     virtual UBool match(InputText *det, CharsetMatch *results) const;
    135 };
    136 
    137 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
    138 {
    139 public:
    140     virtual ~CharsetRecog_8859_7_el();
    141 
    142     const char *getLanguage() const;
    143 
    144     virtual UBool match(InputText *det, CharsetMatch *results) const;
    145 };
    146 
    147 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
    148 {
    149 public:
    150     virtual ~CharsetRecog_8859_8_I_he();
    151 
    152     const char *getName() const;
    153 
    154     const char *getLanguage() const;
    155 
    156     virtual UBool match(InputText *det, CharsetMatch *results) const;
    157 };
    158 
    159 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
    160 {
    161 public:
    162     virtual ~CharsetRecog_8859_8_he ();
    163 
    164     const char *getLanguage() const;
    165 
    166     virtual UBool match(InputText *det, CharsetMatch *results) const;
    167 };
    168 
    169 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
    170 {
    171 public:
    172     virtual ~CharsetRecog_8859_9_tr ();
    173 
    174     const char *getLanguage() const;
    175 
    176     virtual UBool match(InputText *det, CharsetMatch *results) const;
    177 };
    178 
    179 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
    180 {
    181 public:
    182     virtual ~CharsetRecog_windows_1256();
    183 
    184     const char *getName() const;
    185 
    186     const char *getLanguage() const;
    187 
    188     virtual UBool match(InputText *det, CharsetMatch *results) const;
    189 };
    190 
    191 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
    192 {
    193 public:
    194     virtual ~CharsetRecog_windows_1251();
    195 
    196     const char *getName() const;
    197 
    198     const char *getLanguage() const;
    199 
    200     virtual UBool match(InputText *det, CharsetMatch *results) const;
    201 };
    202 
    203 
    204 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
    205 {
    206 public:
    207     virtual ~CharsetRecog_KOI8_R();
    208 
    209     const char *getName() const;
    210 
    211     const char *getLanguage() const;
    212 
    213     virtual UBool match(InputText *det, CharsetMatch *results) const;
    214 };
    215 
    216 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
    217 {
    218 public:
    219     virtual ~CharsetRecog_IBM424_he();
    220 
    221     const char *getLanguage() const;
    222 };
    223 
    224 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
    225 public:
    226     virtual ~CharsetRecog_IBM424_he_rtl();
    227 
    228     const char *getName() const;
    229 
    230     virtual UBool match(InputText *det, CharsetMatch *results) const;
    231 };
    232 
    233 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
    234     virtual ~CharsetRecog_IBM424_he_ltr();
    235 
    236     const char *getName() const;
    237 
    238     virtual UBool match(InputText *det, CharsetMatch *results) const;
    239 };
    240 
    241 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
    242 {
    243 public:
    244     virtual ~CharsetRecog_IBM420_ar();
    245 
    246     const char *getLanguage() const;
    247 
    248 protected:
    249     void matchInit(InputText *textIn);
    250     void matchFinish(InputText *textIn);
    251 
    252 private:
    253     uint8_t *prev_fInputBytes;
    254     int32_t prev_fInputBytesLength;
    255     UBool deleteBuffer;
    256 
    257     UBool isLamAlef(uint8_t b);
    258     uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
    259     uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
    260 };
    261 
    262 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
    263 public:
    264     virtual ~CharsetRecog_IBM420_ar_rtl();
    265 
    266     const char *getName() const;
    267 
    268     virtual UBool match(InputText *det, CharsetMatch *results) const;
    269 };
    270 
    271 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
    272     virtual ~CharsetRecog_IBM420_ar_ltr();
    273 
    274     const char *getName() const;
    275 
    276     virtual UBool match(InputText *det, CharsetMatch *results) const;
    277 };
    278 
    279 U_NAMESPACE_END
    280 
    281 #endif /* !UCONFIG_NO_CONVERSION */
    282 #endif /* __CSRSBCS_H */
    283