1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2005-2016, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.io.IOException; 12 import java.io.InputStream; 13 import java.io.Reader; 14 import java.util.ArrayList; 15 import java.util.Arrays; 16 import java.util.Collections; 17 import java.util.List; 18 19 20 /** 21 * <code>CharsetDetector</code> provides a facility for detecting the 22 * charset or encoding of character data in an unknown format. 23 * The input data can either be from an input stream or an array of bytes. 24 * The result of the detection operation is a list of possibly matching 25 * charsets, or, for simple use, you can just ask for a Java Reader that 26 * will will work over the input data. 27 * <p> 28 * Character set detection is at best an imprecise operation. The detection 29 * process will attempt to identify the charset that best matches the characteristics 30 * of the byte data, but the process is partly statistical in nature, and 31 * the results can not be guaranteed to always be correct. 32 * <p> 33 * For best accuracy in charset detection, the input data should be primarily 34 * in a single language, and a minimum of a few hundred bytes worth of plain text 35 * in the language are needed. The detection process will attempt to 36 * ignore html or xml style markup that could otherwise obscure the content. 37 * <p> 38 * @stable ICU 3.4 39 */ 40 public class CharsetDetector { 41 42 // Question: Should we have getters corresponding to the setters for input text 43 // and declared encoding? 44 45 // A thought: If we were to create our own type of Java Reader, we could defer 46 // figuring out an actual charset for data that starts out with too much English 47 // only ASCII until the user actually read through to something that didn't look 48 // like 7 bit English. If nothing else ever appeared, we would never need to 49 // actually choose the "real" charset. All assuming that the application just 50 // wants the data, and doesn't care about a char set name. 51 52 /** 53 * Constructor 54 * 55 * @stable ICU 3.4 56 */ 57 public CharsetDetector() { 58 } 59 60 /** 61 * Set the declared encoding for charset detection. 62 * The declared encoding of an input text is an encoding obtained 63 * from an http header or xml declaration or similar source that 64 * can be provided as additional information to the charset detector. 65 * A match between a declared encoding and a possible detected encoding 66 * will raise the quality of that detected encoding by a small delta, 67 * and will also appear as a "reason" for the match. 68 * <p> 69 * A declared encoding that is incompatible with the input data being 70 * analyzed will not be added to the list of possible encodings. 71 * 72 * @param encoding The declared encoding 73 * 74 * @stable ICU 3.4 75 */ 76 public CharsetDetector setDeclaredEncoding(String encoding) { 77 fDeclaredEncoding = encoding; 78 return this; 79 } 80 81 /** 82 * Set the input text (byte) data whose charset is to be detected. 83 * 84 * @param in the input text of unknown encoding 85 * 86 * @return This CharsetDetector 87 * 88 * @stable ICU 3.4 89 */ 90 public CharsetDetector setText(byte [] in) { 91 fRawInput = in; 92 fRawLength = in.length; 93 94 return this; 95 } 96 97 private static final int kBufSize = 8000; 98 99 /** 100 * Set the input text (byte) data whose charset is to be detected. 101 * <p> 102 * The input stream that supplies the character data must have markSupported() 103 * == true; the charset detection process will read a small amount of data, 104 * then return the stream to its original position via 105 * the InputStream.reset() operation. The exact amount that will 106 * be read depends on the characteristics of the data itself. 107 * 108 * @param in the input text of unknown encoding 109 * 110 * @return This CharsetDetector 111 * 112 * @stable ICU 3.4 113 */ 114 115 public CharsetDetector setText(InputStream in) throws IOException { 116 fInputStream = in; 117 fInputStream.mark(kBufSize); 118 fRawInput = new byte[kBufSize]; // Always make a new buffer because the 119 // previous one may have come from the caller, 120 // in which case we can't touch it. 121 fRawLength = 0; 122 int remainingLength = kBufSize; 123 while (remainingLength > 0 ) { 124 // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. 125 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); 126 if (bytesRead <= 0) { 127 break; 128 } 129 fRawLength += bytesRead; 130 remainingLength -= bytesRead; 131 } 132 fInputStream.reset(); 133 134 return this; 135 } 136 137 138 /** 139 * Return the charset that best matches the supplied input data. 140 * 141 * Note though, that because the detection 142 * only looks at the start of the input data, 143 * there is a possibility that the returned charset will fail to handle 144 * the full set of input data. 145 * <p> 146 * Raise an exception if 147 * <ul> 148 * <li>no charset appears to match the data.</li> 149 * <li>no input text has been provided</li> 150 * </ul> 151 * 152 * @return a CharsetMatch object representing the best matching charset, or 153 * <code>null</code> if there are no matches. 154 * 155 * @stable ICU 3.4 156 */ 157 public CharsetMatch detect() { 158 // TODO: A better implementation would be to copy the detect loop from 159 // detectAll(), and cut it short as soon as a match with a high confidence 160 // is found. This is something to be done later, after things are otherwise 161 // working. 162 CharsetMatch matches[] = detectAll(); 163 164 if (matches == null || matches.length == 0) { 165 return null; 166 } 167 168 return matches[0]; 169 } 170 171 /** 172 * Return an array of all charsets that appear to be plausible 173 * matches with the input data. The array is ordered with the 174 * best quality match first. 175 * <p> 176 * Raise an exception if 177 * <ul> 178 * <li>no charsets appear to match the input data.</li> 179 * <li>no input text has been provided</li> 180 * </ul> 181 * 182 * @return An array of CharsetMatch objects representing possibly matching charsets. 183 * 184 * @stable ICU 3.4 185 */ 186 public CharsetMatch[] detectAll() { 187 ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>(); 188 189 MungeInput(); // Strip html markup, collect byte stats. 190 191 // Iterate over all possible charsets, remember all that 192 // give a match quality > 0. 193 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 194 CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 195 boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; 196 if (active) { 197 CharsetMatch m = rcinfo.recognizer.match(this); 198 if (m != null) { 199 matches.add(m); 200 } 201 } 202 } 203 Collections.sort(matches); // CharsetMatch compares on confidence 204 Collections.reverse(matches); // Put best match first. 205 CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; 206 resultArray = matches.toArray(resultArray); 207 return resultArray; 208 } 209 210 211 /** 212 * Autodetect the charset of an inputStream, and return a Java Reader 213 * to access the converted input data. 214 * <p> 215 * This is a convenience method that is equivalent to 216 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code> 217 * <p> 218 * For the input stream that supplies the character data, markSupported() 219 * must be true; the charset detection will read a small amount of data, 220 * then return the stream to its original position via 221 * the InputStream.reset() operation. The exact amount that will 222 * be read depends on the characteristics of the data itself. 223 *<p> 224 * Raise an exception if no charsets appear to match the input data. 225 * 226 * @param in The source of the byte data in the unknown charset. 227 * 228 * @param declaredEncoding A declared encoding for the data, if available, 229 * or null or an empty string if none is available. 230 * 231 * @stable ICU 3.4 232 */ 233 public Reader getReader(InputStream in, String declaredEncoding) { 234 fDeclaredEncoding = declaredEncoding; 235 236 try { 237 setText(in); 238 239 CharsetMatch match = detect(); 240 241 if (match == null) { 242 return null; 243 } 244 245 return match.getReader(); 246 } catch (IOException e) { 247 return null; 248 } 249 } 250 251 /** 252 * Autodetect the charset of an inputStream, and return a String 253 * containing the converted input data. 254 * <p> 255 * This is a convenience method that is equivalent to 256 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code> 257 *<p> 258 * Raise an exception if no charsets appear to match the input data. 259 * 260 * @param in The source of the byte data in the unknown charset. 261 * 262 * @param declaredEncoding A declared encoding for the data, if available, 263 * or null or an empty string if none is available. 264 * 265 * @stable ICU 3.4 266 */ 267 public String getString(byte[] in, String declaredEncoding) 268 { 269 fDeclaredEncoding = declaredEncoding; 270 271 try { 272 setText(in); 273 274 CharsetMatch match = detect(); 275 276 if (match == null) { 277 return null; 278 } 279 280 return match.getString(-1); 281 } catch (IOException e) { 282 return null; 283 } 284 } 285 286 287 /** 288 * Get the names of all charsets supported by <code>CharsetDetector</code> class. 289 * <p> 290 * <b>Note:</b> Multiple different charset encodings in a same family may use 291 * a single shared name in this implementation. For example, this method returns 292 * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 293 * (Windows Latin 1). However, actual detection result could be "windows-1252" 294 * when the input data matches Latin 1 code points with any points only available 295 * in "windows-1252". 296 * 297 * @return an array of the names of all charsets supported by 298 * <code>CharsetDetector</code> class. 299 * 300 * @stable ICU 3.4 301 */ 302 public static String[] getAllDetectableCharsets() { 303 String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()]; 304 for (int i = 0; i < allCharsetNames.length; i++) { 305 allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName(); 306 } 307 return allCharsetNames; 308 } 309 310 /** 311 * Test whether or not input filtering is enabled. 312 * 313 * @return <code>true</code> if input text will be filtered. 314 * 315 * @see #enableInputFilter 316 * 317 * @stable ICU 3.4 318 */ 319 public boolean inputFilterEnabled() 320 { 321 return fStripTags; 322 } 323 324 /** 325 * Enable filtering of input text. If filtering is enabled, 326 * text within angle brackets ("<" and ">") will be removed 327 * before detection. 328 * 329 * @param filter <code>true</code> to enable input text filtering. 330 * 331 * @return The previous setting. 332 * 333 * @stable ICU 3.4 334 */ 335 public boolean enableInputFilter(boolean filter) 336 { 337 boolean previous = fStripTags; 338 339 fStripTags = filter; 340 341 return previous; 342 } 343 344 /* 345 * MungeInput - after getting a set of raw input data to be analyzed, preprocess 346 * it by removing what appears to be html markup. 347 */ 348 private void MungeInput() { 349 int srci = 0; 350 int dsti = 0; 351 byte b; 352 boolean inMarkup = false; 353 int openTags = 0; 354 int badTags = 0; 355 356 // 357 // html / xml markup stripping. 358 // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 359 // discard everything within < brackets > 360 // Count how many total '<' and illegal (nested) '<' occur, so we can make some 361 // guess as to whether the input was actually marked up at all. 362 if (fStripTags) { 363 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { 364 b = fRawInput[srci]; 365 if (b == (byte)'<') { 366 if (inMarkup) { 367 badTags++; 368 } 369 inMarkup = true; 370 openTags++; 371 } 372 373 if (! inMarkup) { 374 fInputBytes[dsti++] = b; 375 } 376 377 if (b == (byte)'>') { 378 inMarkup = false; 379 } 380 } 381 382 fInputLen = dsti; 383 } 384 385 // 386 // If it looks like this input wasn't marked up, or if it looks like it's 387 // essentially nothing but markup abandon the markup stripping. 388 // Detection will have to work on the unstripped input. 389 // 390 if (openTags<5 || openTags/5 < badTags || 391 (fInputLen < 100 && fRawLength>600)) { 392 int limit = fRawLength; 393 394 if (limit > kBufSize) { 395 limit = kBufSize; 396 } 397 398 for (srci=0; srci<limit; srci++) { 399 fInputBytes[srci] = fRawInput[srci]; 400 } 401 fInputLen = srci; 402 } 403 404 // 405 // Tally up the byte occurence statistics. 406 // These are available for use by the various detectors. 407 // 408 Arrays.fill(fByteStats, (short)0); 409 for (srci=0; srci<fInputLen; srci++) { 410 int val = fInputBytes[srci] & 0x00ff; 411 fByteStats[val]++; 412 } 413 414 fC1Bytes = false; 415 for (int i = 0x80; i <= 0x9F; i += 1) { 416 if (fByteStats[i] != 0) { 417 fC1Bytes = true; 418 break; 419 } 420 } 421 } 422 423 /* 424 * The following items are accessed by individual CharsetRecongizers during 425 * the recognition process 426 * 427 */ 428 byte[] fInputBytes = // The text to be checked. Markup will have been 429 new byte[kBufSize]; // removed if appropriate. 430 431 int fInputLen; // Length of the byte data in fInputBytes. 432 433 short fByteStats[] = // byte frequency statistics for the input text. 434 new short[256]; // Value is percent, not absolute. 435 // Value is rounded up, so zero really means zero occurences. 436 437 boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input; 438 false; 439 440 String fDeclaredEncoding; 441 442 443 byte[] fRawInput; // Original, untouched input bytes. 444 // If user gave us a byte array, this is it. 445 // If user gave us a stream, it's read to a 446 // buffer here. 447 int fRawLength; // Length of data in fRawInput array. 448 449 InputStream fInputStream; // User's input stream, or null if the user 450 // gave us a byte array. 451 452 // 453 // Stuff private to CharsetDetector 454 // 455 private boolean fStripTags = // If true, setText() will strip tags from input text. 456 false; 457 458 private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers had 459 // been changed from the default. The array index is 460 // corresponding to ALL_RECOGNIZER. See setDetectableCharset(). 461 462 private static class CSRecognizerInfo { 463 CharsetRecognizer recognizer; 464 boolean isDefaultEnabled; 465 466 CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) { 467 this.recognizer = recognizer; 468 this.isDefaultEnabled = isDefaultEnabled; 469 } 470 } 471 472 /* 473 * List of recognizers for all charsets known to the implementation. 474 */ 475 private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS; 476 477 static { 478 List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>(); 479 480 list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true)); 481 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true)); 482 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true)); 483 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true)); 484 list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true)); 485 486 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true)); 487 list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true)); 488 list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true)); 489 list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true)); 490 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true)); 491 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true)); 492 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true)); 493 list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true)); 494 495 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true)); 496 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true)); 497 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true)); 498 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true)); 499 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true)); 500 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true)); 501 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true)); 502 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true)); 503 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true)); 504 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true)); 505 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true)); 506 507 // IBM 420/424 recognizers are disabled by default 508 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); 509 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); 510 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); 511 list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); 512 513 ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); 514 } 515 516 /** 517 * Get the names of charsets that can be recognized by this CharsetDetector instance. 518 * 519 * @return an array of the names of charsets that can be recognized by this CharsetDetector 520 * instance. 521 * 522 * @internal 523 * @deprecated This API is ICU internal only. 524 */ 525 @Deprecated 526 public String[] getDetectableCharsets() { 527 List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size()); 528 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 529 CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); 530 boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i]; 531 if (active) { 532 csnames.add(rcinfo.recognizer.getName()); 533 } 534 } 535 return csnames.toArray(new String[csnames.size()]); 536 } 537 538 /** 539 * Enable or disable individual charset encoding. 540 * A name of charset encoding must be included in the names returned by 541 * {@link #getAllDetectableCharsets()}. 542 * 543 * @param encoding the name of charset encoding. 544 * @param enabled <code>true</code> to enable, or <code>false</code> to disable the 545 * charset encoding. 546 * @return A reference to this <code>CharsetDetector</code>. 547 * @throws IllegalArgumentException when the name of charset encoding is 548 * not supported. 549 * 550 * @internal 551 * @deprecated This API is ICU internal only. 552 */ 553 @Deprecated 554 public CharsetDetector setDetectableCharset(String encoding, boolean enabled) { 555 int modIdx = -1; 556 boolean isDefaultVal = false; 557 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 558 CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i); 559 if (csrinfo.recognizer.getName().equals(encoding)) { 560 modIdx = i; 561 isDefaultVal = (csrinfo.isDefaultEnabled == enabled); 562 break; 563 } 564 } 565 if (modIdx < 0) { 566 // No matching encoding found 567 throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\""); 568 } 569 570 if (fEnabledRecognizers == null && !isDefaultVal) { 571 // Create an array storing the non default setting 572 fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()]; 573 574 // Initialize the array with default info 575 for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { 576 fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled; 577 } 578 } 579 580 if (fEnabledRecognizers != null) { 581 fEnabledRecognizers[modIdx] = enabled; 582 } 583 584 return this; 585 } 586 } 587