Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2015, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  */
      9 
     10 #ifndef __CSRSBCS_H
     11 #define __CSRSBCS_H
     12 
     13 #include "unicode/uobject.h"
     14 
     15 #if !UCONFIG_NO_CONVERSION
     16 
     17 #include "csrecog.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 class NGramParser : public UMemory
     22 {
     23 private:
     24     int32_t ngram;
     25     const int32_t *ngramList;
     26 
     27     int32_t ngramCount;
     28     int32_t hitCount;
     29 
     30 protected:
     31 	int32_t byteIndex;
     32     const uint8_t *charMap;
     33 
     34 	void addByte(int32_t b);
     35 
     36 public:
     37     NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
     38     virtual ~NGramParser();
     39 
     40 private:
     41     /*
     42     * Binary search for value in table, which must have exactly 64 entries.
     43     */
     44     int32_t search(const int32_t *table, int32_t value);
     45 
     46     void lookup(int32_t thisNgram);
     47 
     48     virtual int32_t nextByte(InputText *det);
     49 	virtual void parseCharacters(InputText *det);
     50 
     51 public:
     52     int32_t parse(InputText *det);
     53 
     54 };
     55 
     56 #if !UCONFIG_ONLY_HTML_CONVERSION
     57 class NGramParser_IBM420 : public NGramParser
     58 {
     59 public:
     60     NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
     61     ~NGramParser_IBM420();
     62 
     63 private:
     64     int32_t alef;
     65     int32_t isLamAlef(int32_t b);
     66     int32_t nextByte(InputText *det);
     67     void parseCharacters(InputText *det);
     68 };
     69 #endif
     70 
     71 
     72 class CharsetRecog_sbcs : public CharsetRecognizer
     73 {
     74 public:
     75     CharsetRecog_sbcs();
     76     virtual ~CharsetRecog_sbcs();
     77     virtual const char *getName() const = 0;
     78     virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
     79     virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
     80 };
     81 
     82 class CharsetRecog_8859_1 : public CharsetRecog_sbcs
     83 {
     84 public:
     85     virtual ~CharsetRecog_8859_1();
     86     const char *getName() const;
     87     virtual UBool match(InputText *det, CharsetMatch *results) const;
     88 };
     89 
     90 class CharsetRecog_8859_2 : public CharsetRecog_sbcs
     91 {
     92 public:
     93     virtual ~CharsetRecog_8859_2();
     94     const char *getName() const;
     95     virtual UBool match(InputText *det, CharsetMatch *results) const;
     96 };
     97 
     98 class CharsetRecog_8859_5 : public CharsetRecog_sbcs
     99 {
    100 public:
    101     virtual ~CharsetRecog_8859_5();
    102     const char *getName() const;
    103 };
    104 
    105 class CharsetRecog_8859_6 : public CharsetRecog_sbcs
    106 {
    107 public:
    108     virtual ~CharsetRecog_8859_6();
    109 
    110     const char *getName() const;
    111 };
    112 
    113 class CharsetRecog_8859_7 : public CharsetRecog_sbcs
    114 {
    115 public:
    116     virtual ~CharsetRecog_8859_7();
    117 
    118     const char *getName() const;
    119 };
    120 
    121 class CharsetRecog_8859_8 : public CharsetRecog_sbcs
    122 {
    123 public:
    124     virtual ~CharsetRecog_8859_8();
    125 
    126     virtual const char *getName() const;
    127 };
    128 
    129 class CharsetRecog_8859_9 : public CharsetRecog_sbcs
    130 {
    131 public:
    132     virtual ~CharsetRecog_8859_9();
    133 
    134     const char *getName() const;
    135 };
    136 
    137 
    138 
    139 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
    140 {
    141 public:
    142     virtual ~CharsetRecog_8859_5_ru();
    143 
    144     const char *getLanguage() const;
    145 
    146     virtual UBool match(InputText *det, CharsetMatch *results) const;
    147 };
    148 
    149 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
    150 {
    151 public:
    152     virtual ~CharsetRecog_8859_6_ar();
    153 
    154     const char *getLanguage() const;
    155 
    156     virtual UBool match(InputText *det, CharsetMatch *results) const;
    157 };
    158 
    159 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
    160 {
    161 public:
    162     virtual ~CharsetRecog_8859_7_el();
    163 
    164     const char *getLanguage() const;
    165 
    166     virtual UBool match(InputText *det, CharsetMatch *results) const;
    167 };
    168 
    169 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
    170 {
    171 public:
    172     virtual ~CharsetRecog_8859_8_I_he();
    173 
    174     const char *getName() const;
    175 
    176     const char *getLanguage() const;
    177 
    178     virtual UBool match(InputText *det, CharsetMatch *results) const;
    179 };
    180 
    181 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
    182 {
    183 public:
    184     virtual ~CharsetRecog_8859_8_he ();
    185 
    186     const char *getLanguage() const;
    187 
    188     virtual UBool match(InputText *det, CharsetMatch *results) const;
    189 };
    190 
    191 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
    192 {
    193 public:
    194     virtual ~CharsetRecog_8859_9_tr ();
    195 
    196     const char *getLanguage() const;
    197 
    198     virtual UBool match(InputText *det, CharsetMatch *results) const;
    199 };
    200 
    201 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
    202 {
    203 public:
    204     virtual ~CharsetRecog_windows_1256();
    205 
    206     const char *getName() const;
    207 
    208     const char *getLanguage() const;
    209 
    210     virtual UBool match(InputText *det, CharsetMatch *results) const;
    211 };
    212 
    213 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
    214 {
    215 public:
    216     virtual ~CharsetRecog_windows_1251();
    217 
    218     const char *getName() const;
    219 
    220     const char *getLanguage() const;
    221 
    222     virtual UBool match(InputText *det, CharsetMatch *results) const;
    223 };
    224 
    225 
    226 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
    227 {
    228 public:
    229     virtual ~CharsetRecog_KOI8_R();
    230 
    231     const char *getName() const;
    232 
    233     const char *getLanguage() const;
    234 
    235     virtual UBool match(InputText *det, CharsetMatch *results) const;
    236 };
    237 
    238 #if !UCONFIG_ONLY_HTML_CONVERSION
    239 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
    240 {
    241 public:
    242     virtual ~CharsetRecog_IBM424_he();
    243 
    244     const char *getLanguage() const;
    245 };
    246 
    247 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
    248 public:
    249     virtual ~CharsetRecog_IBM424_he_rtl();
    250 
    251     const char *getName() const;
    252 
    253     virtual UBool match(InputText *det, CharsetMatch *results) const;
    254 };
    255 
    256 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
    257     virtual ~CharsetRecog_IBM424_he_ltr();
    258 
    259     const char *getName() const;
    260 
    261     virtual UBool match(InputText *det, CharsetMatch *results) const;
    262 };
    263 
    264 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
    265 {
    266 public:
    267     virtual ~CharsetRecog_IBM420_ar();
    268 
    269     const char *getLanguage() const;
    270 	int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
    271 
    272 };
    273 
    274 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
    275 public:
    276     virtual ~CharsetRecog_IBM420_ar_rtl();
    277 
    278     const char *getName() const;
    279 
    280     virtual UBool match(InputText *det, CharsetMatch *results) const;
    281 };
    282 
    283 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
    284     virtual ~CharsetRecog_IBM420_ar_ltr();
    285 
    286     const char *getName() const;
    287 
    288     virtual UBool match(InputText *det, CharsetMatch *results) const;
    289 };
    290 #endif
    291 
    292 U_NAMESPACE_END
    293 
    294 #endif /* !UCONFIG_NO_CONVERSION */
    295 #endif /* __CSRSBCS_H */
    296