1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #ifndef __CSRSBCS_H 9 #define __CSRSBCS_H 10 11 #include "unicode/uobject.h" 12 13 #if !UCONFIG_NO_CONVERSION 14 15 #include "csrecog.h" 16 17 U_NAMESPACE_BEGIN 18 19 class NGramParser : public UMemory 20 { 21 private: 22 int32_t byteIndex; 23 int32_t ngram; 24 25 const int32_t *ngramList; 26 const uint8_t *charMap; 27 28 int32_t ngramCount; 29 int32_t hitCount; 30 31 public: 32 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 33 34 private: 35 /* 36 * Binary search for value in table, which must have exactly 64 entries. 37 */ 38 int32_t search(const int32_t *table, int32_t value); 39 40 void lookup(int32_t thisNgram); 41 void addByte(int32_t b); 42 int32_t nextByte(InputText *det); 43 44 public: 45 int32_t parse(InputText *det); 46 47 }; 48 49 50 class CharsetRecog_sbcs : public CharsetRecognizer 51 { 52 public: 53 CharsetRecog_sbcs(); 54 virtual ~CharsetRecog_sbcs(); 55 virtual const char *getName() const = 0; 56 virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 57 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 58 }; 59 60 class CharsetRecog_8859_1 : public CharsetRecog_sbcs 61 { 62 public: 63 virtual ~CharsetRecog_8859_1(); 64 const char *getName() const; 65 virtual UBool match(InputText *det, CharsetMatch *results) const; 66 }; 67 68 class CharsetRecog_8859_2 : public CharsetRecog_sbcs 69 { 70 public: 71 virtual ~CharsetRecog_8859_2(); 72 const char *getName() const; 73 virtual UBool match(InputText *det, CharsetMatch *results) const; 74 }; 75 76 class CharsetRecog_8859_5 : public CharsetRecog_sbcs 77 { 78 public: 79 virtual ~CharsetRecog_8859_5(); 80 const char *getName() const; 81 }; 82 83 class CharsetRecog_8859_6 : public CharsetRecog_sbcs 84 { 85 public: 86 virtual ~CharsetRecog_8859_6(); 87 88 const char *getName() const; 89 }; 90 91 class CharsetRecog_8859_7 : public CharsetRecog_sbcs 92 { 93 public: 94 virtual ~CharsetRecog_8859_7(); 95 96 const char *getName() const; 97 }; 98 99 class CharsetRecog_8859_8 : public CharsetRecog_sbcs 100 { 101 public: 102 virtual ~CharsetRecog_8859_8(); 103 104 virtual const char *getName() const; 105 }; 106 107 class CharsetRecog_8859_9 : public CharsetRecog_sbcs 108 { 109 public: 110 virtual ~CharsetRecog_8859_9(); 111 112 const char *getName() const; 113 }; 114 115 116 117 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 118 { 119 public: 120 virtual ~CharsetRecog_8859_5_ru(); 121 122 const char *getLanguage() const; 123 124 virtual UBool match(InputText *det, CharsetMatch *results) const; 125 }; 126 127 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 128 { 129 public: 130 virtual ~CharsetRecog_8859_6_ar(); 131 132 const char *getLanguage() const; 133 134 virtual UBool match(InputText *det, CharsetMatch *results) const; 135 }; 136 137 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 138 { 139 public: 140 virtual ~CharsetRecog_8859_7_el(); 141 142 const char *getLanguage() const; 143 144 virtual UBool match(InputText *det, CharsetMatch *results) const; 145 }; 146 147 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 148 { 149 public: 150 virtual ~CharsetRecog_8859_8_I_he(); 151 152 const char *getName() const; 153 154 const char *getLanguage() const; 155 156 virtual UBool match(InputText *det, CharsetMatch *results) const; 157 }; 158 159 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 160 { 161 public: 162 virtual ~CharsetRecog_8859_8_he (); 163 164 const char *getLanguage() const; 165 166 virtual UBool match(InputText *det, CharsetMatch *results) const; 167 }; 168 169 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 170 { 171 public: 172 virtual ~CharsetRecog_8859_9_tr (); 173 174 const char *getLanguage() const; 175 176 virtual UBool match(InputText *det, CharsetMatch *results) const; 177 }; 178 179 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 180 { 181 public: 182 virtual ~CharsetRecog_windows_1256(); 183 184 const char *getName() const; 185 186 const char *getLanguage() const; 187 188 virtual UBool match(InputText *det, CharsetMatch *results) const; 189 }; 190 191 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 192 { 193 public: 194 virtual ~CharsetRecog_windows_1251(); 195 196 const char *getName() const; 197 198 const char *getLanguage() const; 199 200 virtual UBool match(InputText *det, CharsetMatch *results) const; 201 }; 202 203 204 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 205 { 206 public: 207 virtual ~CharsetRecog_KOI8_R(); 208 209 const char *getName() const; 210 211 const char *getLanguage() const; 212 213 virtual UBool match(InputText *det, CharsetMatch *results) const; 214 }; 215 216 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 217 { 218 public: 219 virtual ~CharsetRecog_IBM424_he(); 220 221 const char *getLanguage() const; 222 }; 223 224 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 225 public: 226 virtual ~CharsetRecog_IBM424_he_rtl(); 227 228 const char *getName() const; 229 230 virtual UBool match(InputText *det, CharsetMatch *results) const; 231 }; 232 233 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 234 virtual ~CharsetRecog_IBM424_he_ltr(); 235 236 const char *getName() const; 237 238 virtual UBool match(InputText *det, CharsetMatch *results) const; 239 }; 240 241 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 242 { 243 public: 244 virtual ~CharsetRecog_IBM420_ar(); 245 246 const char *getLanguage() const; 247 248 protected: 249 void matchInit(InputText *textIn); 250 void matchFinish(InputText *textIn); 251 252 private: 253 uint8_t *prev_fInputBytes; 254 int32_t prev_fInputBytesLength; 255 UBool deleteBuffer; 256 257 UBool isLamAlef(uint8_t b); 258 uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); 259 uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); 260 }; 261 262 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 263 public: 264 virtual ~CharsetRecog_IBM420_ar_rtl(); 265 266 const char *getName() const; 267 268 virtual UBool match(InputText *det, CharsetMatch *results) const; 269 }; 270 271 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 272 virtual ~CharsetRecog_IBM420_ar_ltr(); 273 274 const char *getName() const; 275 276 virtual UBool match(InputText *det, CharsetMatch *results) const; 277 }; 278 279 U_NAMESPACE_END 280 281 #endif /* !UCONFIG_NO_CONVERSION */ 282 #endif /* __CSRSBCS_H */ 283