1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #ifndef __CSRSBCS_H 9 #define __CSRSBCS_H 10 11 #include "unicode/uobject.h" 12 13 #if !UCONFIG_NO_CONVERSION 14 15 #include "csrecog.h" 16 17 U_NAMESPACE_BEGIN 18 19 class NGramParser : public UMemory 20 { 21 private: 22 int32_t ngram; 23 const int32_t *ngramList; 24 25 int32_t ngramCount; 26 int32_t hitCount; 27 28 protected: 29 int32_t byteIndex; 30 const uint8_t *charMap; 31 32 void addByte(int32_t b); 33 34 public: 35 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 36 virtual ~NGramParser(); 37 38 private: 39 /* 40 * Binary search for value in table, which must have exactly 64 entries. 41 */ 42 int32_t search(const int32_t *table, int32_t value); 43 44 void lookup(int32_t thisNgram); 45 46 virtual int32_t nextByte(InputText *det); 47 virtual void parseCharacters(InputText *det); 48 49 public: 50 int32_t parse(InputText *det); 51 52 }; 53 54 #if !UCONFIG_ONLY_HTML_CONVERSION 55 class NGramParser_IBM420 : public NGramParser 56 { 57 public: 58 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); 59 ~NGramParser_IBM420(); 60 61 private: 62 int32_t alef; 63 int32_t isLamAlef(int32_t b); 64 int32_t nextByte(InputText *det); 65 void parseCharacters(InputText *det); 66 }; 67 #endif 68 69 70 class CharsetRecog_sbcs : public CharsetRecognizer 71 { 72 public: 73 CharsetRecog_sbcs(); 74 virtual ~CharsetRecog_sbcs(); 75 virtual const char *getName() const = 0; 76 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 77 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 78 }; 79 80 class CharsetRecog_8859_1 : public CharsetRecog_sbcs 81 { 82 public: 83 virtual ~CharsetRecog_8859_1(); 84 const char *getName() const; 85 virtual UBool match(InputText *det, CharsetMatch *results) const; 86 }; 87 88 class CharsetRecog_8859_2 : public CharsetRecog_sbcs 89 { 90 public: 91 virtual ~CharsetRecog_8859_2(); 92 const char *getName() const; 93 virtual UBool match(InputText *det, CharsetMatch *results) const; 94 }; 95 96 class CharsetRecog_8859_5 : public CharsetRecog_sbcs 97 { 98 public: 99 virtual ~CharsetRecog_8859_5(); 100 const char *getName() const; 101 }; 102 103 class CharsetRecog_8859_6 : public CharsetRecog_sbcs 104 { 105 public: 106 virtual ~CharsetRecog_8859_6(); 107 108 const char *getName() const; 109 }; 110 111 class CharsetRecog_8859_7 : public CharsetRecog_sbcs 112 { 113 public: 114 virtual ~CharsetRecog_8859_7(); 115 116 const char *getName() const; 117 }; 118 119 class CharsetRecog_8859_8 : public CharsetRecog_sbcs 120 { 121 public: 122 virtual ~CharsetRecog_8859_8(); 123 124 virtual const char *getName() const; 125 }; 126 127 class CharsetRecog_8859_9 : public CharsetRecog_sbcs 128 { 129 public: 130 virtual ~CharsetRecog_8859_9(); 131 132 const char *getName() const; 133 }; 134 135 136 137 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 138 { 139 public: 140 virtual ~CharsetRecog_8859_5_ru(); 141 142 const char *getLanguage() const; 143 144 virtual UBool match(InputText *det, CharsetMatch *results) const; 145 }; 146 147 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 148 { 149 public: 150 virtual ~CharsetRecog_8859_6_ar(); 151 152 const char *getLanguage() const; 153 154 virtual UBool match(InputText *det, CharsetMatch *results) const; 155 }; 156 157 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 158 { 159 public: 160 virtual ~CharsetRecog_8859_7_el(); 161 162 const char *getLanguage() const; 163 164 virtual UBool match(InputText *det, CharsetMatch *results) const; 165 }; 166 167 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 168 { 169 public: 170 virtual ~CharsetRecog_8859_8_I_he(); 171 172 const char *getName() const; 173 174 const char *getLanguage() const; 175 176 virtual UBool match(InputText *det, CharsetMatch *results) const; 177 }; 178 179 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 180 { 181 public: 182 virtual ~CharsetRecog_8859_8_he (); 183 184 const char *getLanguage() const; 185 186 virtual UBool match(InputText *det, CharsetMatch *results) const; 187 }; 188 189 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 190 { 191 public: 192 virtual ~CharsetRecog_8859_9_tr (); 193 194 const char *getLanguage() const; 195 196 virtual UBool match(InputText *det, CharsetMatch *results) const; 197 }; 198 199 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 200 { 201 public: 202 virtual ~CharsetRecog_windows_1256(); 203 204 const char *getName() const; 205 206 const char *getLanguage() const; 207 208 virtual UBool match(InputText *det, CharsetMatch *results) const; 209 }; 210 211 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 212 { 213 public: 214 virtual ~CharsetRecog_windows_1251(); 215 216 const char *getName() const; 217 218 const char *getLanguage() const; 219 220 virtual UBool match(InputText *det, CharsetMatch *results) const; 221 }; 222 223 224 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 225 { 226 public: 227 virtual ~CharsetRecog_KOI8_R(); 228 229 const char *getName() const; 230 231 const char *getLanguage() const; 232 233 virtual UBool match(InputText *det, CharsetMatch *results) const; 234 }; 235 236 #if !UCONFIG_ONLY_HTML_CONVERSION 237 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 238 { 239 public: 240 virtual ~CharsetRecog_IBM424_he(); 241 242 const char *getLanguage() const; 243 }; 244 245 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 246 public: 247 virtual ~CharsetRecog_IBM424_he_rtl(); 248 249 const char *getName() const; 250 251 virtual UBool match(InputText *det, CharsetMatch *results) const; 252 }; 253 254 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 255 virtual ~CharsetRecog_IBM424_he_ltr(); 256 257 const char *getName() const; 258 259 virtual UBool match(InputText *det, CharsetMatch *results) const; 260 }; 261 262 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 263 { 264 public: 265 virtual ~CharsetRecog_IBM420_ar(); 266 267 const char *getLanguage() const; 268 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 269 270 }; 271 272 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 273 public: 274 virtual ~CharsetRecog_IBM420_ar_rtl(); 275 276 const char *getName() const; 277 278 virtual UBool match(InputText *det, CharsetMatch *results) const; 279 }; 280 281 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 282 virtual ~CharsetRecog_IBM420_ar_ltr(); 283 284 const char *getName() const; 285 286 virtual UBool match(InputText *det, CharsetMatch *results) const; 287 }; 288 #endif 289 290 U_NAMESPACE_END 291 292 #endif /* !UCONFIG_NO_CONVERSION */ 293 #endif /* __CSRSBCS_H */ 294