Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2015, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #ifndef __CSRSBCS_H
      9 #define __CSRSBCS_H
     10 
     11 #include "unicode/uobject.h"
     12 
     13 #if !UCONFIG_NO_CONVERSION
     14 
     15 #include "csrecog.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 class NGramParser : public UMemory
     20 {
     21 private:
     22     int32_t ngram;
     23     const int32_t *ngramList;
     24 
     25     int32_t ngramCount;
     26     int32_t hitCount;
     27 
     28 protected:
     29 	int32_t byteIndex;
     30     const uint8_t *charMap;
     31 
     32 	void addByte(int32_t b);
     33 
     34 public:
     35     NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
     36     virtual ~NGramParser();
     37 
     38 private:
     39     /*
     40     * Binary search for value in table, which must have exactly 64 entries.
     41     */
     42     int32_t search(const int32_t *table, int32_t value);
     43 
     44     void lookup(int32_t thisNgram);
     45 
     46     virtual int32_t nextByte(InputText *det);
     47 	virtual void parseCharacters(InputText *det);
     48 
     49 public:
     50     int32_t parse(InputText *det);
     51 
     52 };
     53 
     54 #if !UCONFIG_ONLY_HTML_CONVERSION
     55 class NGramParser_IBM420 : public NGramParser
     56 {
     57 public:
     58     NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
     59     ~NGramParser_IBM420();
     60 
     61 private:
     62     int32_t alef;
     63     int32_t isLamAlef(int32_t b);
     64     int32_t nextByte(InputText *det);
     65     void parseCharacters(InputText *det);
     66 };
     67 #endif
     68 
     69 
     70 class CharsetRecog_sbcs : public CharsetRecognizer
     71 {
     72 public:
     73     CharsetRecog_sbcs();
     74     virtual ~CharsetRecog_sbcs();
     75     virtual const char *getName() const = 0;
     76     virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
     77     virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
     78 };
     79 
     80 class CharsetRecog_8859_1 : public CharsetRecog_sbcs
     81 {
     82 public:
     83     virtual ~CharsetRecog_8859_1();
     84     const char *getName() const;
     85     virtual UBool match(InputText *det, CharsetMatch *results) const;
     86 };
     87 
     88 class CharsetRecog_8859_2 : public CharsetRecog_sbcs
     89 {
     90 public:
     91     virtual ~CharsetRecog_8859_2();
     92     const char *getName() const;
     93     virtual UBool match(InputText *det, CharsetMatch *results) const;
     94 };
     95 
     96 class CharsetRecog_8859_5 : public CharsetRecog_sbcs
     97 {
     98 public:
     99     virtual ~CharsetRecog_8859_5();
    100     const char *getName() const;
    101 };
    102 
    103 class CharsetRecog_8859_6 : public CharsetRecog_sbcs
    104 {
    105 public:
    106     virtual ~CharsetRecog_8859_6();
    107 
    108     const char *getName() const;
    109 };
    110 
    111 class CharsetRecog_8859_7 : public CharsetRecog_sbcs
    112 {
    113 public:
    114     virtual ~CharsetRecog_8859_7();
    115 
    116     const char *getName() const;
    117 };
    118 
    119 class CharsetRecog_8859_8 : public CharsetRecog_sbcs
    120 {
    121 public:
    122     virtual ~CharsetRecog_8859_8();
    123 
    124     virtual const char *getName() const;
    125 };
    126 
    127 class CharsetRecog_8859_9 : public CharsetRecog_sbcs
    128 {
    129 public:
    130     virtual ~CharsetRecog_8859_9();
    131 
    132     const char *getName() const;
    133 };
    134 
    135 
    136 
    137 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
    138 {
    139 public:
    140     virtual ~CharsetRecog_8859_5_ru();
    141 
    142     const char *getLanguage() const;
    143 
    144     virtual UBool match(InputText *det, CharsetMatch *results) const;
    145 };
    146 
    147 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
    148 {
    149 public:
    150     virtual ~CharsetRecog_8859_6_ar();
    151 
    152     const char *getLanguage() const;
    153 
    154     virtual UBool match(InputText *det, CharsetMatch *results) const;
    155 };
    156 
    157 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
    158 {
    159 public:
    160     virtual ~CharsetRecog_8859_7_el();
    161 
    162     const char *getLanguage() const;
    163 
    164     virtual UBool match(InputText *det, CharsetMatch *results) const;
    165 };
    166 
    167 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
    168 {
    169 public:
    170     virtual ~CharsetRecog_8859_8_I_he();
    171 
    172     const char *getName() const;
    173 
    174     const char *getLanguage() const;
    175 
    176     virtual UBool match(InputText *det, CharsetMatch *results) const;
    177 };
    178 
    179 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
    180 {
    181 public:
    182     virtual ~CharsetRecog_8859_8_he ();
    183 
    184     const char *getLanguage() const;
    185 
    186     virtual UBool match(InputText *det, CharsetMatch *results) const;
    187 };
    188 
    189 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
    190 {
    191 public:
    192     virtual ~CharsetRecog_8859_9_tr ();
    193 
    194     const char *getLanguage() const;
    195 
    196     virtual UBool match(InputText *det, CharsetMatch *results) const;
    197 };
    198 
    199 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
    200 {
    201 public:
    202     virtual ~CharsetRecog_windows_1256();
    203 
    204     const char *getName() const;
    205 
    206     const char *getLanguage() const;
    207 
    208     virtual UBool match(InputText *det, CharsetMatch *results) const;
    209 };
    210 
    211 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
    212 {
    213 public:
    214     virtual ~CharsetRecog_windows_1251();
    215 
    216     const char *getName() const;
    217 
    218     const char *getLanguage() const;
    219 
    220     virtual UBool match(InputText *det, CharsetMatch *results) const;
    221 };
    222 
    223 
    224 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
    225 {
    226 public:
    227     virtual ~CharsetRecog_KOI8_R();
    228 
    229     const char *getName() const;
    230 
    231     const char *getLanguage() const;
    232 
    233     virtual UBool match(InputText *det, CharsetMatch *results) const;
    234 };
    235 
    236 #if !UCONFIG_ONLY_HTML_CONVERSION
    237 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
    238 {
    239 public:
    240     virtual ~CharsetRecog_IBM424_he();
    241 
    242     const char *getLanguage() const;
    243 };
    244 
    245 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
    246 public:
    247     virtual ~CharsetRecog_IBM424_he_rtl();
    248 
    249     const char *getName() const;
    250 
    251     virtual UBool match(InputText *det, CharsetMatch *results) const;
    252 };
    253 
    254 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
    255     virtual ~CharsetRecog_IBM424_he_ltr();
    256 
    257     const char *getName() const;
    258 
    259     virtual UBool match(InputText *det, CharsetMatch *results) const;
    260 };
    261 
    262 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
    263 {
    264 public:
    265     virtual ~CharsetRecog_IBM420_ar();
    266 
    267     const char *getLanguage() const;
    268 	int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
    269 
    270 };
    271 
    272 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
    273 public:
    274     virtual ~CharsetRecog_IBM420_ar_rtl();
    275 
    276     const char *getName() const;
    277 
    278     virtual UBool match(InputText *det, CharsetMatch *results) const;
    279 };
    280 
    281 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
    282     virtual ~CharsetRecog_IBM420_ar_ltr();
    283 
    284     const char *getName() const;
    285 
    286     virtual UBool match(InputText *det, CharsetMatch *results) const;
    287 };
    288 #endif
    289 
    290 U_NAMESPACE_END
    291 
    292 #endif /* !UCONFIG_NO_CONVERSION */
    293 #endif /* __CSRSBCS_H */
    294