1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #include "cmemory.h" 11 12 #if !UCONFIG_NO_CONVERSION 13 #include "csrsbcs.h" 14 15 #define N_GRAM_SIZE 3 16 #define N_GRAM_MASK 0xFFFFFF 17 18 U_NAMESPACE_BEGIN 19 20 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) 21 :byteIndex(0), ngram(0) 22 { 23 ngramList = theNgramList; 24 charMap = theCharMap; 25 26 ngramCount = hitCount = 0; 27 } 28 29 /* 30 * Binary search for value in table, which must have exactly 64 entries. 31 */ 32 33 int32_t NGramParser::search(const int32_t *table, int32_t value) 34 { 35 int32_t index = 0; 36 37 if (table[index + 32] <= value) { 38 index += 32; 39 } 40 41 if (table[index + 16] <= value) { 42 index += 16; 43 } 44 45 if (table[index + 8] <= value) { 46 index += 8; 47 } 48 49 if (table[index + 4] <= value) { 50 index += 4; 51 } 52 53 if (table[index + 2] <= value) { 54 index += 2; 55 } 56 57 if (table[index + 1] <= value) { 58 index += 1; 59 } 60 61 if (table[index] > value) { 62 index -= 1; 63 } 64 65 if (index < 0 || table[index] != value) { 66 return -1; 67 } 68 69 return index; 70 } 71 72 void NGramParser::lookup(int32_t thisNgram) 73 { 74 ngramCount += 1; 75 76 if (search(ngramList, thisNgram) >= 0) { 77 hitCount += 1; 78 } 79 80 } 81 82 void NGramParser::addByte(int32_t b) 83 { 84 ngram = ((ngram << 8) + b) & N_GRAM_MASK; 85 lookup(ngram); 86 } 87 88 int32_t NGramParser::nextByte(InputText *det) 89 { 90 if (byteIndex >= det->fInputLen) { 91 return -1; 92 } 93 94 return det->fInputBytes[byteIndex++]; 95 } 96 97 int32_t NGramParser::parse(InputText *det) 98 { 99 int32_t b; 100 bool ignoreSpace = FALSE; 101 102 while ((b = nextByte(det)) >= 0) { 103 uint8_t mb = charMap[b]; 104 105 // TODO: 0x20 might 
not be a space in all character sets... 106 if (mb != 0) { 107 if (!(mb == 0x20 && ignoreSpace)) { 108 addByte(mb); 109 } 110 111 ignoreSpace = (mb == 0x20); 112 } 113 } 114 115 // TODO: Is this OK? The buffer could have ended in the middle of a word... 116 addByte(0x20); 117 118 double rawPercent = (double) hitCount / (double) ngramCount; 119 120 // if (rawPercent <= 2.0) { 121 // return 0; 122 // } 123 124 // TODO - This is a bit of a hack to take care of a case 125 // were we were getting a confidence of 135... 126 if (rawPercent > 0.33) { 127 return 98; 128 } 129 130 return (int32_t) (rawPercent * 300.0); 131 } 132 133 CharsetRecog_sbcs::CharsetRecog_sbcs() 134 : haveC1Bytes(FALSE) 135 { 136 // nothing else to do 137 } 138 139 CharsetRecog_sbcs::~CharsetRecog_sbcs() 140 { 141 // nothing to do 142 } 143 144 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) 145 { 146 NGramParser parser(ngrams, byteMap); 147 int32_t result; 148 149 haveC1Bytes = det->fC1Bytes; 150 result = parser.parse(det); 151 152 return result; 153 } 154 155 static const uint8_t charMap_8859_1[] = { 156 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 157 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 158 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 159 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 160 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 161 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 162 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 163 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 164 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 165 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 166 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 167 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 168 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 169 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 170 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 171 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 172 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};

/*
 * Byte map named for ISO-8859-2. Indexed by raw input byte; the value is
 * what NGramParser::parse() feeds into the n-gram. 0x20 is treated as a
 * space, 0x00 (index 0x27 only) is dropped, and ASCII letters are folded to
 * lower case (0x41-0x5A -> 0x61-0x7A).
 */
static const uint8_t charMap_8859_2[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
    0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
    0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};

/*
 * Byte map named for ISO-8859-5. Same layout as the other charMap_* tables:
 * index = raw input byte, value = byte fed into the n-gram (0x20 = space,
 * 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_8859_5[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
};

/*
 * Byte map named for ISO-8859-6. Same layout as the other charMap_* tables:
 * index = raw input byte, value = byte fed into the n-gram (0x20 = space,
 * 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_8859_6[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
};

/*
 * Byte map named for ISO-8859-7. Same layout as the other charMap_* tables:
 * index = raw input byte, value = byte fed into the n-gram (0x20 = space,
 * 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_8859_7[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
    0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
    0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};

/*
 * Byte map named for ISO-8859-8. Same layout as the other charMap_* tables:
 * index = raw input byte, value = byte fed into the n-gram (0x20 = space,
 * 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_8859_8[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
};

/*
 * Byte map named for ISO-8859-9. Same layout as the other charMap_* tables:
 * index = raw input byte, value = byte fed into the n-gram (0x20 = space,
 * 0x00 at index 0x27 = dropped by NGramParser::parse()).
 * Note that index 0xDD maps to ASCII 0x69 ('i') - presumably the Turkish
 * dotted capital I; confirm against the ISO-8859-9 code chart.
 */
static const uint8_t charMap_8859_9[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};

/*
 * Trigram table (64 entries). Each entry packs three mapped bytes into
 * 24 bits, first byte most significant, matching what
 * NGramParser::addByte() builds. NGramParser::search() is a binary search
 * over exactly 64 entries, so the table must stay sorted ascending and
 * keep its size.
 */
static const int32_t ngrams_windows_1251[] = {
    0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
    0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
    0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
    0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
};

/*
 * Byte map named for windows-1251. Same layout as the other charMap_*
 * tables: index = raw input byte, value = byte fed into the n-gram
 * (0x20 = space, 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_windows_1251[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
    0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
    0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
    0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_windows_1256[] = {
    0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
    0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
    0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
    0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
};

/*
 * Byte map named for windows-1256. Same layout as the other charMap_*
 * tables: index = raw input byte, value = byte fed into the n-gram
 * (0x20 = space, 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_windows_1256[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
    0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
    0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_KOI8_R[] = {
    0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
    0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
    0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
    0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
};

/*
 * Byte map named for KOI8-R. Same layout as the other charMap_* tables:
 * index = raw input byte, value = byte fed into the n-gram (0x20 = space,
 * 0x00 at index 0x27 = dropped by NGramParser::parse()).
 */
static const uint8_t charMap_KOI8_R[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_IBM424_he_rtl[] = {
    0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_IBM424_he_ltr[] = {
    0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};

/*
 * Byte map named for IBM424 (Hebrew). Indexed by raw input byte, one row
 * per high nibble. The filler value in this table is 0x40 rather than 0x20,
 * and the single 0x00 entry sits at index 0x7D.
 * NOTE(review): NGramParser::parse() only collapses runs of 0x20, so runs
 * of 0x40 are not collapsed by that routine - confirm how these EBCDIC
 * maps are meant to be parsed.
 */
static const uint8_t charMap_IBM424_he[] = {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 5- */    0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 7- */    0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
/* 8- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 9- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* B- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_IBM420_ar_rtl[] = {
    0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_IBM420_ar_ltr[] = {
    0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};

/*
 * Byte map named for IBM420 (Arabic). Indexed by raw input byte, one row
 * per high nibble. The filler value in this table is 0x40 rather than 0x20,
 * and no entry is 0x00.
 * NOTE(review): NGramParser::parse() only collapses runs of 0x20, so runs
 * of 0x40 are not collapsed by that routine - confirm how these EBCDIC
 * maps are meant to be parsed.
 */
static const uint8_t charMap_IBM420_ar[]= {
/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */    0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 5- */    0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
/* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
/* B- */    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
/* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
/* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
/* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
};

//ISO-8859-1,2,5,6,7,8,9 Ngrams
/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_8859_1_en[] = {
    0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
    0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
    0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
    0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_8859_1_da[] = {
    0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
    0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
    0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
    0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_8859_1_de[] = {
    0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
    0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
    0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
    0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_8859_1_es[] = {
    0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
    0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
    0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
    0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_8859_1_fr[] = {
    0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
    0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
    0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
    0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
};

/*
 * Trigram table (64 entries, three mapped bytes packed into 24 bits, first
 * byte most significant). Must stay sorted ascending with exactly 64
 * entries for NGramParser::search().
 */
static const int32_t ngrams_8859_1_it[] = {
    0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
    0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
    0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
    0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
};

static const int32_t ngrams_8859_1_nl[] = {
    0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
    0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
    0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
    0x6E2076, 0x6E6465, 0x6E6720,
0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F, 642 }; 643 644 static const int32_t ngrams_8859_1_no[] = { 645 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469, 646 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474, 647 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65, 648 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572, 649 }; 650 651 static const int32_t ngrams_8859_1_pt[] = { 652 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, 653 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20, 654 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065, 655 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F, 656 }; 657 658 static const int32_t ngrams_8859_1_sv[] = { 659 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469, 660 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220, 661 0x657420, 0x66F672, 0x67656E, 0x696C6C, 
0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20, 662 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220, 663 }; 664 665 static const int32_t ngrams_8859_2_cs[] = { 666 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F, 667 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465, 668 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865, 669 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564, 670 }; 671 672 static const int32_t ngrams_8859_2_hu[] = { 673 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69, 674 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20, 675 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061, 676 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320, 677 }; 678 679 static const int32_t ngrams_8859_2_pl[] = { 680 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779, 681 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 
0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20, 682 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769, 683 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720, 684 }; 685 686 static const int32_t ngrams_8859_2_ro[] = { 687 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69, 688 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070, 689 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72, 690 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20, 691 }; 692 693 static const int32_t ngrams_8859_5_ru[] = { 694 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE, 695 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD, 696 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2, 697 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520, 698 }; 699 700 static const int32_t ngrams_8859_6_ar[] = { 701 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 
0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8, 702 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1, 703 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20, 704 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620, 705 }; 706 707 static const int32_t ngrams_8859_7_el[] = { 708 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7, 709 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120, 710 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5, 711 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20, 712 }; 713 714 static const int32_t ngrams_8859_8_I_he[] = { 715 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0, 716 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4, 717 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE, 718 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 
0xFA20EC, 0xFA20EE, 0xFA20F9, 719 }; 720 721 static const int32_t ngrams_8859_8_he[] = { 722 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0, 723 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC, 724 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920, 725 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9, 726 }; 727 728 static const int32_t ngrams_8859_9_tr[] = { 729 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961, 730 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062, 731 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062, 732 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD, 733 }; 734 735 CharsetRecog_8859_1::~CharsetRecog_8859_1() 736 { 737 // nothing to do 738 } 739 740 const char *CharsetRecog_8859_1::getName() const 741 { 742 return haveC1Bytes? 
"windows-1252" : "ISO-8859-1"; 743 } 744 745 const char *CharsetRecog_8859_1_en::getLanguage() const 746 { 747 return "en"; 748 } 749 750 CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en() 751 { 752 // nothing to do 753 } 754 755 int32_t CharsetRecog_8859_1_en::match(InputText *textIn) 756 { 757 int32_t result = match_sbcs(textIn, ngrams_8859_1_en, charMap_8859_1); 758 759 // printf("8859_1_en: result = %d\n", result); 760 return result; //match_sbcs(textIn, ngrams, charMap); 761 } 762 763 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da() 764 { 765 // nothing to do 766 } 767 768 const char *CharsetRecog_8859_1_da::getLanguage() const 769 { 770 return "da"; 771 } 772 773 int32_t CharsetRecog_8859_1_da::match(InputText *textIn) 774 { 775 return match_sbcs(textIn, ngrams_8859_1_da, charMap_8859_1); 776 } 777 778 CharsetRecog_8859_1_de::~CharsetRecog_8859_1_de() {} 779 780 const char *CharsetRecog_8859_1_de::getLanguage() const 781 { 782 return "de"; 783 } 784 785 int32_t CharsetRecog_8859_1_de::match(InputText *textIn) 786 { 787 return match_sbcs(textIn, ngrams_8859_1_de, charMap_8859_1); 788 } 789 790 CharsetRecog_8859_1_es::~CharsetRecog_8859_1_es() 791 { 792 // nothing to do 793 } 794 795 const char *CharsetRecog_8859_1_es::getLanguage() const 796 { 797 return "es"; 798 } 799 800 int32_t CharsetRecog_8859_1_es::match(InputText *textIn) 801 { 802 return match_sbcs(textIn, ngrams_8859_1_es, charMap_8859_1); 803 } 804 805 CharsetRecog_8859_1_fr::~CharsetRecog_8859_1_fr() 806 { 807 // nothing to do 808 } 809 810 const char *CharsetRecog_8859_1_fr::getLanguage() const 811 { 812 return "fr"; 813 } 814 815 int32_t CharsetRecog_8859_1_fr::match(InputText *textIn) 816 { 817 return match_sbcs(textIn, ngrams_8859_1_fr, charMap_8859_1); 818 } 819 820 CharsetRecog_8859_1_it::~CharsetRecog_8859_1_it() 821 { 822 // nothing to do 823 } 824 825 const char *CharsetRecog_8859_1_it::getLanguage() const 826 { 827 return "it"; 828 } 829 830 int32_t CharsetRecog_8859_1_it::match(InputText 
*textIn) 831 { 832 return match_sbcs(textIn, ngrams_8859_1_it, charMap_8859_1); 833 } 834 835 CharsetRecog_8859_1_nl::~CharsetRecog_8859_1_nl() 836 { 837 // nothing to do 838 } 839 840 const char *CharsetRecog_8859_1_nl::getLanguage() const 841 { 842 return "nl"; 843 } 844 845 int32_t CharsetRecog_8859_1_nl::match(InputText *textIn) 846 { 847 return match_sbcs(textIn, ngrams_8859_1_nl, charMap_8859_1); 848 } 849 850 CharsetRecog_8859_1_no::~CharsetRecog_8859_1_no() {} 851 852 const char *CharsetRecog_8859_1_no::getLanguage() const 853 { 854 return "no"; 855 } 856 857 int32_t CharsetRecog_8859_1_no::match(InputText *textIn) 858 { 859 return match_sbcs(textIn, ngrams_8859_1_no, charMap_8859_1); 860 } 861 862 CharsetRecog_8859_1_pt::~CharsetRecog_8859_1_pt() 863 { 864 // nothing to do 865 } 866 867 const char *CharsetRecog_8859_1_pt::getLanguage() const 868 { 869 return "pt"; 870 } 871 872 int32_t CharsetRecog_8859_1_pt::match(InputText *textIn) 873 { 874 return match_sbcs(textIn, ngrams_8859_1_pt, charMap_8859_1); 875 } 876 877 CharsetRecog_8859_1_sv::~CharsetRecog_8859_1_sv() {} 878 879 const char *CharsetRecog_8859_1_sv::getLanguage() const 880 { 881 return "sv"; 882 } 883 884 int32_t CharsetRecog_8859_1_sv::match(InputText *textIn) 885 { 886 return match_sbcs(textIn, ngrams_8859_1_sv, charMap_8859_1); 887 } 888 889 CharsetRecog_8859_2::~CharsetRecog_8859_2() 890 { 891 // nothing to do 892 } 893 894 const char *CharsetRecog_8859_2::getName() const 895 { 896 return haveC1Bytes? 
"windows-1250" : "ISO-8859-2"; 897 } 898 899 CharsetRecog_8859_2_cs::~CharsetRecog_8859_2_cs() 900 { 901 // nothing to do 902 } 903 904 const char *CharsetRecog_8859_2_cs::getLanguage() const 905 { 906 return "cs"; 907 } 908 909 int32_t CharsetRecog_8859_2_cs::match(InputText *textIn) 910 { 911 return match_sbcs(textIn, ngrams_8859_2_cs, charMap_8859_2); 912 } 913 914 CharsetRecog_8859_2_hu::~CharsetRecog_8859_2_hu() 915 { 916 // nothing to do 917 } 918 919 const char *CharsetRecog_8859_2_hu::getLanguage() const 920 { 921 return "hu"; 922 } 923 924 int32_t CharsetRecog_8859_2_hu::match(InputText *textIn) 925 { 926 return match_sbcs(textIn, ngrams_8859_2_hu, charMap_8859_2); 927 } 928 929 CharsetRecog_8859_2_pl::~CharsetRecog_8859_2_pl() 930 { 931 // nothing to do 932 } 933 934 const char *CharsetRecog_8859_2_pl::getLanguage() const 935 { 936 return "pl"; 937 } 938 939 int32_t CharsetRecog_8859_2_pl::match(InputText *textIn) 940 { 941 return match_sbcs(textIn, ngrams_8859_2_pl, charMap_8859_2); 942 } 943 944 CharsetRecog_8859_2_ro::~CharsetRecog_8859_2_ro() 945 { 946 // nothing to do 947 } 948 949 const char *CharsetRecog_8859_2_ro::getLanguage() const 950 { 951 return "ro"; 952 } 953 954 int32_t CharsetRecog_8859_2_ro::match(InputText *textIn) 955 { 956 return match_sbcs(textIn, ngrams_8859_2_ro, charMap_8859_2); 957 } 958 959 CharsetRecog_8859_5::~CharsetRecog_8859_5() 960 { 961 // nothing to do 962 } 963 964 const char *CharsetRecog_8859_5::getName() const 965 { 966 return "ISO-8859-5"; 967 } 968 969 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru() 970 { 971 // nothing to do 972 } 973 974 const char *CharsetRecog_8859_5_ru::getLanguage() const 975 { 976 return "ru"; 977 } 978 979 int32_t CharsetRecog_8859_5_ru::match(InputText *textIn) 980 { 981 return match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5); 982 } 983 984 CharsetRecog_8859_6::~CharsetRecog_8859_6() 985 { 986 // nothing to do 987 } 988 989 const char *CharsetRecog_8859_6::getName() const 990 { 991 
return "ISO-8859-6"; 992 } 993 994 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar() 995 { 996 // nothing to do 997 } 998 999 const char *CharsetRecog_8859_6_ar::getLanguage() const 1000 { 1001 return "ar"; 1002 } 1003 1004 int32_t CharsetRecog_8859_6_ar::match(InputText *textIn) 1005 { 1006 return match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6); 1007 } 1008 1009 CharsetRecog_8859_7::~CharsetRecog_8859_7() 1010 { 1011 // nothing to do 1012 } 1013 1014 const char *CharsetRecog_8859_7::getName() const 1015 { 1016 return haveC1Bytes? "windows-1253" : "ISO-8859-7"; 1017 } 1018 1019 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el() 1020 { 1021 // nothing to do 1022 } 1023 1024 const char *CharsetRecog_8859_7_el::getLanguage() const 1025 { 1026 return "el"; 1027 } 1028 1029 int32_t CharsetRecog_8859_7_el::match(InputText *textIn) 1030 { 1031 return match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7); 1032 } 1033 1034 CharsetRecog_8859_8::~CharsetRecog_8859_8() 1035 { 1036 // nothing to do 1037 } 1038 1039 const char *CharsetRecog_8859_8::getName() const 1040 { 1041 return haveC1Bytes? "windows-1255" : "ISO-8859-8"; 1042 } 1043 1044 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he () 1045 { 1046 // nothing to do 1047 } 1048 1049 const char *CharsetRecog_8859_8_I_he::getName() const 1050 { 1051 return haveC1Bytes? 
"windows-1255" : "ISO-8859-8-I"; 1052 } 1053 1054 const char *CharsetRecog_8859_8_I_he::getLanguage() const 1055 { 1056 return "he"; 1057 } 1058 1059 int32_t CharsetRecog_8859_8_I_he::match(InputText *textIn) 1060 { 1061 return match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8); 1062 } 1063 1064 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he() 1065 { 1066 // od ot gnihton 1067 } 1068 1069 const char *CharsetRecog_8859_8_he::getLanguage() const 1070 { 1071 return "he"; 1072 } 1073 1074 int32_t CharsetRecog_8859_8_he::match(InputText *textIn) 1075 { 1076 return match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8); 1077 } 1078 1079 CharsetRecog_8859_9::~CharsetRecog_8859_9() 1080 { 1081 // nothing to do 1082 } 1083 1084 const char *CharsetRecog_8859_9::getName() const 1085 { 1086 return haveC1Bytes? "windows-1254" : "ISO-8859-9"; 1087 } 1088 1089 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr () 1090 { 1091 // nothing to do 1092 } 1093 1094 const char *CharsetRecog_8859_9_tr::getLanguage() const 1095 { 1096 return "tr"; 1097 } 1098 1099 int32_t CharsetRecog_8859_9_tr::match(InputText *textIn) 1100 { 1101 return match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9); 1102 } 1103 1104 CharsetRecog_windows_1256::~CharsetRecog_windows_1256() 1105 { 1106 // nothing to do 1107 } 1108 1109 const char *CharsetRecog_windows_1256::getName() const 1110 { 1111 return "windows-1256"; 1112 } 1113 1114 const char *CharsetRecog_windows_1256::getLanguage() const 1115 { 1116 return "ar"; 1117 } 1118 1119 int32_t CharsetRecog_windows_1256::match(InputText *textIn) 1120 { 1121 return match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256); 1122 } 1123 1124 CharsetRecog_windows_1251::~CharsetRecog_windows_1251() 1125 { 1126 // nothing to do 1127 } 1128 1129 const char *CharsetRecog_windows_1251::getName() const 1130 { 1131 return "windows-1251"; 1132 } 1133 1134 const char *CharsetRecog_windows_1251::getLanguage() const 1135 { 1136 return "ru"; 1137 } 1138 1139 int32_t 
CharsetRecog_windows_1251::match(InputText *textIn) 1140 { 1141 return match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251); 1142 } 1143 1144 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R() 1145 { 1146 // nothing to do 1147 } 1148 1149 const char *CharsetRecog_KOI8_R::getName() const 1150 { 1151 return "KOI8-R"; 1152 } 1153 1154 const char *CharsetRecog_KOI8_R::getLanguage() const 1155 { 1156 return "ru"; 1157 } 1158 1159 int32_t CharsetRecog_KOI8_R::match(InputText *textIn) 1160 { 1161 return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); 1162 } 1163 1164 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() 1165 { 1166 // nothing to do 1167 } 1168 1169 const char *CharsetRecog_IBM424_he::getLanguage() const 1170 { 1171 return "he"; 1172 } 1173 1174 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl() 1175 { 1176 // nothing to do 1177 } 1178 1179 const char *CharsetRecog_IBM424_he_rtl::getName() const 1180 { 1181 return "IBM424_rtl"; 1182 } 1183 1184 int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn) 1185 { 1186 return match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he); 1187 } 1188 1189 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr() 1190 { 1191 // nothing to do 1192 } 1193 1194 const char *CharsetRecog_IBM424_he_ltr::getName() const 1195 { 1196 return "IBM424_ltr"; 1197 } 1198 1199 int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn) 1200 { 1201 return match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he); 1202 } 1203 1204 static const uint8_t unshapeMap_IBM420[] = { 1205 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ 1206 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 1207 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 1208 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 1209 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 1210 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 1211 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 1212 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 1213 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 1214 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, 1215 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, 1216 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, 1217 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, 1218 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, 1219 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, 1220 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 1221 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 1222 }; 1223 1224 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar() 1225 { 1226 // nothing to do 1227 } 1228 1229 const char *CharsetRecog_IBM420_ar::getLanguage() const 1230 { 1231 return "ar"; 1232 } 1233 1234 void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) { 1235 prev_fInputBytesLength = textIn->fInputLen; 1236 prev_fInputBytes = textIn->fInputBytes; 1237 1238 int32_t length = 0; 1239 uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length); 1240 1241 if (bb != NULL) { 1242 textIn->fInputBytes = bb; 1243 textIn->fInputLen = length; 1244 1245 deleteBuffer = TRUE; 1246 } else { 1247 deleteBuffer = 
FALSE; 1248 } 1249 } 1250 1251 uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { 1252 uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length); 1253 1254 if (resultArray != NULL) { 1255 for (int32_t i = 0; i < inputBytesLength; i++) { 1256 resultArray[i] = unshapeMap_IBM420[resultArray[i]]; 1257 } 1258 } 1259 1260 return resultArray; 1261 } 1262 1263 uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) { 1264 int32_t bigBufferLength = inputBytesLength * 2; 1265 uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength); 1266 uint8_t *resultBuffer = NULL; 1267 1268 if (bigBuffer != NULL) { 1269 int32_t bufferIndex; 1270 uint8_t unshapedLamAlef[] = { 0xb1, 0x56 }; 1271 1272 for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) { 1273 if (isLamAlef(inputBytes[i])) { 1274 bigBuffer[bufferIndex++] = unshapedLamAlef[0]; 1275 bigBuffer[bufferIndex++] = unshapedLamAlef[1]; 1276 } else { 1277 bigBuffer[bufferIndex++] = inputBytes[i]; 1278 } 1279 } 1280 1281 length = bufferIndex; 1282 resultBuffer = (uint8_t *)uprv_malloc(length); 1283 if (resultBuffer != NULL) { 1284 uprv_memcpy(resultBuffer, bigBuffer, length); 1285 } 1286 } 1287 1288 if (bigBuffer != NULL) { 1289 uprv_free(bigBuffer); 1290 } 1291 1292 return resultBuffer; 1293 } 1294 1295 void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) { 1296 if (deleteBuffer) { 1297 uprv_free(textIn->fInputBytes); 1298 1299 textIn->fInputBytes = prev_fInputBytes; 1300 textIn->fInputLen = prev_fInputBytesLength; 1301 } 1302 } 1303 1304 UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) { 1305 uint8_t shapedLamAlef[] = { 1306 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8 1307 }; 1308 1309 for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) { 1310 if (b == shapedLamAlef[i]) { 1311 return TRUE; 1312 } 1313 } 1314 1315 return FALSE; 1316 } 1317 1318 
CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl() 1319 { 1320 // nothing to do 1321 } 1322 1323 const char *CharsetRecog_IBM420_ar_rtl::getName() const 1324 { 1325 return "IBM420_rtl"; 1326 } 1327 1328 int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn) 1329 { 1330 return match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar); 1331 } 1332 1333 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr() 1334 { 1335 // nothing to do 1336 } 1337 1338 const char *CharsetRecog_IBM420_ar_ltr::getName() const 1339 { 1340 return "IBM420_ltr"; 1341 } 1342 1343 int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn) 1344 { 1345 return match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar); 1346 } 1347 1348 U_NAMESPACE_END 1349 #endif 1350 1351