Home | History | Annotate | Download | only in text
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 2005-2016, International Business Machines Corporation and    *
      6 * others. All Rights Reserved.                                                *
      7 *******************************************************************************
      8 */
      9 package com.ibm.icu.text;
     10 
     11 import java.io.IOException;
     12 import java.io.InputStream;
     13 import java.io.Reader;
     14 import java.util.ArrayList;
     15 import java.util.Arrays;
     16 import java.util.Collections;
     17 import java.util.List;
     18 
     19 
     20 /**
     21  * <code>CharsetDetector</code> provides a facility for detecting the
     22  * charset or encoding of character data in an unknown format.
     23  * The input data can either be from an input stream or an array of bytes.
     24  * The result of the detection operation is a list of possibly matching
     25  * charsets, or, for simple use, you can just ask for a Java Reader that
 * will work over the input data.
     27  * <p>
     28  * Character set detection is at best an imprecise operation.  The detection
     29  * process will attempt to identify the charset that best matches the characteristics
     30  * of the byte data, but the process is partly statistical in nature, and
     31  * the results can not be guaranteed to always be correct.
     32  * <p>
     33  * For best accuracy in charset detection, the input data should be primarily
     34  * in a single language, and a minimum of a few hundred bytes worth of plain text
     35  * in the language are needed.  The detection process will attempt to
     36  * ignore html or xml style markup that could otherwise obscure the content.
     37  * <p>
     38  * @stable ICU 3.4
     39  */
     40 public class CharsetDetector {
     41 
     42 //   Question: Should we have getters corresponding to the setters for input text
     43 //   and declared encoding?
     44 
     45 //   A thought: If we were to create our own type of Java Reader, we could defer
     46 //   figuring out an actual charset for data that starts out with too much English
     47 //   only ASCII until the user actually read through to something that didn't look
     48 //   like 7 bit English.  If  nothing else ever appeared, we would never need to
     49 //   actually choose the "real" charset.  All assuming that the application just
     50 //   wants the data, and doesn't care about a char set name.
     51 
    /**
     * Constructs a detector with no input text and no declared encoding.
     * Supply the data to analyze with one of the <code>setText</code> methods
     * before calling <code>detect()</code> or <code>detectAll()</code>.
     *
     * @stable ICU 3.4
     */
    public CharsetDetector() {
    }
     59 
    /**
     * Set the declared encoding for charset detection.
     *  The declared encoding of an input text is an encoding obtained
     *  from an http header or xml declaration or similar source that
     *  can be provided as additional information to the charset detector.
     *  A match between a declared encoding and a possible detected encoding
     *  will raise the quality of that detected encoding by a small delta,
     *  and will also appear as a "reason" for the match.
     * <p>
     * A declared encoding that is incompatible with the input data being
     * analyzed will not be added to the list of possible encodings.
     * <p>
     * This method only records the value; it is stored as given, without
     * validation or normalization.
     *
     * @param encoding The declared encoding
     *
     * @return This CharsetDetector
     *
     * @stable ICU 3.4
     */
    public CharsetDetector setDeclaredEncoding(String encoding) {
        fDeclaredEncoding = encoding;
        return this;
    }
     80 
     81     /**
     82      * Set the input text (byte) data whose charset is to be detected.
     83      *
     84      * @param in the input text of unknown encoding
     85      *
     86      * @return This CharsetDetector
     87      *
     88      * @stable ICU 3.4
     89      */
     90     public CharsetDetector setText(byte [] in) {
     91         fRawInput  = in;
     92         fRawLength = in.length;
     93 
     94         return this;
     95     }
     96 
    // Maximum number of input bytes examined by the detector. It is used as the
    // mark() read limit and raw buffer size for stream input, and as the size of
    // the fInputBytes working buffer.
    private static final int kBufSize = 8000;
     98 
     99     /**
    100      * Set the input text (byte) data whose charset is to be detected.
    101      *  <p>
    102      *   The input stream that supplies the character data must have markSupported()
    103      *   == true; the charset detection process will read a small amount of data,
    104      *   then return the stream to its original position via
    105      *   the InputStream.reset() operation.  The exact amount that will
    106      *   be read depends on the characteristics of the data itself.
    107      *
    108      * @param in the input text of unknown encoding
    109      *
    110      * @return This CharsetDetector
    111      *
    112      * @stable ICU 3.4
    113      */
    114 
    115     public CharsetDetector setText(InputStream in) throws IOException {
    116         fInputStream = in;
    117         fInputStream.mark(kBufSize);
    118         fRawInput = new byte[kBufSize];   // Always make a new buffer because the
    119                                           //   previous one may have come from the caller,
    120                                           //   in which case we can't touch it.
    121         fRawLength = 0;
    122         int remainingLength = kBufSize;
    123         while (remainingLength > 0 ) {
    124             // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
    125             int  bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
    126             if (bytesRead <= 0) {
    127                  break;
    128             }
    129             fRawLength += bytesRead;
    130             remainingLength -= bytesRead;
    131         }
    132         fInputStream.reset();
    133 
    134         return this;
    135     }
    136 
    137 
    138     /**
    139      * Return the charset that best matches the supplied input data.
    140      *
    141      * Note though, that because the detection
    142      * only looks at the start of the input data,
    143      * there is a possibility that the returned charset will fail to handle
    144      * the full set of input data.
    145      * <p>
    146      * Raise an exception if
    147      *  <ul>
    148      *    <li>no charset appears to match the data.</li>
    149      *    <li>no input text has been provided</li>
    150      *  </ul>
    151      *
    152      * @return a CharsetMatch object representing the best matching charset, or
    153      *         <code>null</code> if there are no matches.
    154      *
    155      * @stable ICU 3.4
    156      */
    157     public CharsetMatch detect() {
    158 //   TODO:  A better implementation would be to copy the detect loop from
    159 //          detectAll(), and cut it short as soon as a match with a high confidence
    160 //          is found.  This is something to be done later, after things are otherwise
    161 //          working.
    162         CharsetMatch matches[] = detectAll();
    163 
    164         if (matches == null || matches.length == 0) {
    165             return null;
    166         }
    167 
    168         return matches[0];
    169      }
    170 
    171     /**
    172      *  Return an array of all charsets that appear to be plausible
    173      *  matches with the input data.  The array is ordered with the
    174      *  best quality match first.
    175      * <p>
    176      * Raise an exception if
    177      *  <ul>
    178      *    <li>no charsets appear to match the input data.</li>
    179      *    <li>no input text has been provided</li>
    180      *  </ul>
    181      *
    182      * @return An array of CharsetMatch objects representing possibly matching charsets.
    183      *
    184      * @stable ICU 3.4
    185      */
    186     public CharsetMatch[] detectAll() {
    187         ArrayList<CharsetMatch>         matches = new ArrayList<CharsetMatch>();
    188 
    189         MungeInput();  // Strip html markup, collect byte stats.
    190 
    191         //  Iterate over all possible charsets, remember all that
    192         //    give a match quality > 0.
    193         for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
    194             CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
    195             boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
    196             if (active) {
    197                 CharsetMatch m = rcinfo.recognizer.match(this);
    198                 if (m != null) {
    199                     matches.add(m);
    200                 }
    201             }
    202         }
    203         Collections.sort(matches);      // CharsetMatch compares on confidence
    204         Collections.reverse(matches);   //  Put best match first.
    205         CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
    206         resultArray = matches.toArray(resultArray);
    207         return resultArray;
    208     }
    209 
    210 
    211     /**
    212      * Autodetect the charset of an inputStream, and return a Java Reader
    213      * to access the converted input data.
    214      * <p>
    215      * This is a convenience method that is equivalent to
    216      *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
    217      * <p>
    218      *   For the input stream that supplies the character data, markSupported()
    219      *   must be true; the  charset detection will read a small amount of data,
    220      *   then return the stream to its original position via
    221      *   the InputStream.reset() operation.  The exact amount that will
    222      *    be read depends on the characteristics of the data itself.
    223      *<p>
    224      * Raise an exception if no charsets appear to match the input data.
    225      *
    226      * @param in The source of the byte data in the unknown charset.
    227      *
    228      * @param declaredEncoding  A declared encoding for the data, if available,
    229      *           or null or an empty string if none is available.
    230      *
    231      * @stable ICU 3.4
    232      */
    233     public Reader getReader(InputStream in, String declaredEncoding) {
    234         fDeclaredEncoding = declaredEncoding;
    235 
    236         try {
    237             setText(in);
    238 
    239             CharsetMatch match = detect();
    240 
    241             if (match == null) {
    242                 return null;
    243             }
    244 
    245             return match.getReader();
    246         } catch (IOException e) {
    247             return null;
    248         }
    249     }
    250 
    251     /**
    252      * Autodetect the charset of an inputStream, and return a String
    253      * containing the converted input data.
    254      * <p>
    255      * This is a convenience method that is equivalent to
    256      *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
    257      *<p>
    258      * Raise an exception if no charsets appear to match the input data.
    259      *
    260      * @param in The source of the byte data in the unknown charset.
    261      *
    262      * @param declaredEncoding  A declared encoding for the data, if available,
    263      *           or null or an empty string if none is available.
    264      *
    265      * @stable ICU 3.4
    266      */
    267     public String getString(byte[] in, String declaredEncoding)
    268     {
    269         fDeclaredEncoding = declaredEncoding;
    270 
    271         try {
    272             setText(in);
    273 
    274             CharsetMatch match = detect();
    275 
    276             if (match == null) {
    277                 return null;
    278             }
    279 
    280             return match.getString(-1);
    281         } catch (IOException e) {
    282             return null;
    283         }
    284     }
    285 
    286 
    287     /**
    288      * Get the names of all charsets supported by <code>CharsetDetector</code> class.
    289      * <p>
    290      * <b>Note:</b> Multiple different charset encodings in a same family may use
    291      * a single shared name in this implementation. For example, this method returns
    292      * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
    293      * (Windows Latin 1). However, actual detection result could be "windows-1252"
    294      * when the input data matches Latin 1 code points with any points only available
    295      * in "windows-1252".
    296      *
    297      * @return an array of the names of all charsets supported by
    298      * <code>CharsetDetector</code> class.
    299      *
    300      * @stable ICU 3.4
    301      */
    302     public static String[] getAllDetectableCharsets() {
    303         String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
    304         for (int i = 0; i < allCharsetNames.length; i++) {
    305             allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
    306         }
    307         return allCharsetNames;
    308     }
    309 
    /**
     * Test whether or not input filtering is enabled.
     * Filtering is disabled on a newly constructed detector.
     *
     * @return <code>true</code> if input text will be filtered.
     *
     * @see #enableInputFilter
     *
     * @stable ICU 3.4
     */
    public boolean inputFilterEnabled()
    {
        return fStripTags;
    }
    323 
    /**
     * Enable filtering of input text. If filtering is enabled,
     * text within angle brackets ("&lt;" and "&gt;") will be removed
     * before detection.
     * <p>
     * The setting takes effect the next time detection runs; this method
     * itself does not touch the input data.
     *
     * @param filter <code>true</code> to enable input text filtering.
     *
     * @return The previous setting.
     *
     * @see #inputFilterEnabled
     *
     * @stable ICU 3.4
     */
    public boolean enableInputFilter(boolean filter)
    {
        // Remember the old value so it can be handed back to the caller.
        boolean previous = fStripTags;

        fStripTags = filter;

        return previous;
    }
    343 
    344     /*
    345      *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
    346      *               it by removing what appears to be html markup.
    347      */
    348     private void MungeInput() {
    349         int srci = 0;
    350         int dsti = 0;
    351         byte b;
    352         boolean  inMarkup = false;
    353         int      openTags = 0;
    354         int      badTags  = 0;
    355 
    356         //
    357         //  html / xml markup stripping.
    358         //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    359         //     discard everything within < brackets >
    360         //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    361         //     guess as to whether the input was actually marked up at all.
    362         if (fStripTags) {
    363             for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
    364                 b = fRawInput[srci];
    365                 if (b == (byte)'<') {
    366                     if (inMarkup) {
    367                         badTags++;
    368                     }
    369                     inMarkup = true;
    370                     openTags++;
    371                 }
    372 
    373                 if (! inMarkup) {
    374                     fInputBytes[dsti++] = b;
    375                 }
    376 
    377                 if (b == (byte)'>') {
    378                     inMarkup = false;
    379                 }
    380             }
    381 
    382             fInputLen = dsti;
    383         }
    384 
    385         //
    386         //  If it looks like this input wasn't marked up, or if it looks like it's
    387         //    essentially nothing but markup abandon the markup stripping.
    388         //    Detection will have to work on the unstripped input.
    389         //
    390         if (openTags<5 || openTags/5 < badTags ||
    391                 (fInputLen < 100 && fRawLength>600)) {
    392             int limit = fRawLength;
    393 
    394             if (limit > kBufSize) {
    395                 limit = kBufSize;
    396             }
    397 
    398             for (srci=0; srci<limit; srci++) {
    399                 fInputBytes[srci] = fRawInput[srci];
    400             }
    401             fInputLen = srci;
    402         }
    403 
    404         //
    405         // Tally up the byte occurence statistics.
    406         //   These are available for use by the various detectors.
    407         //
    408         Arrays.fill(fByteStats, (short)0);
    409         for (srci=0; srci<fInputLen; srci++) {
    410             int val = fInputBytes[srci] & 0x00ff;
    411             fByteStats[val]++;
    412         }
    413 
    414         fC1Bytes = false;
    415         for (int i = 0x80; i <= 0x9F; i += 1) {
    416             if (fByteStats[i] != 0) {
    417                 fC1Bytes = true;
    418                 break;
    419             }
    420         }
    421      }
    422 
    /*
     *  The following items are accessed by individual CharsetRecognizers during
     *     the recognition process
     *
     */
    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
                   new byte[kBufSize];  //   removed if appropriate.

    int         fInputLen;          // Length of the byte data in fInputBytes.

    short       fByteStats[] =      // byte frequency statistics for the input text.
                   new short[256];  //   Indexed by unsigned byte value; each entry is the
                                    //   absolute number of occurrences counted by MungeInput(),
                                    //   so zero really means zero occurrences.

    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
                   false;

    String      fDeclaredEncoding;  // Encoding declared by the caller, or null if none was given.


    byte[]               fRawInput;     // Original, untouched input bytes.
                                        //  If user gave us a byte array, this is it.
                                        //  If user gave us a stream, it's read to a
                                        //  buffer here.
    int                  fRawLength;    // Length of data in fRawInput array.

    InputStream          fInputStream;  // User's input stream, or null if the user
                                        //   gave us a byte array.

    //
    //  Stuff private to CharsetDetector
    //
    private boolean      fStripTags =   // If true, MungeInput() strips markup from the input
                           false;       //   text before detection.  See enableInputFilter().

    private boolean[]    fEnabledRecognizers;   // If not null, active set of charset recognizers had
                                                // been changed from the default. The array index is
                                                // corresponding to ALL_CS_RECOGNIZERS. See setDetectableCharset().
    461 
    462     private static class CSRecognizerInfo {
    463         CharsetRecognizer recognizer;
    464         boolean isDefaultEnabled;
    465 
    466         CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
    467             this.recognizer = recognizer;
    468             this.isDefaultEnabled = isDefaultEnabled;
    469         }
    470     }
    471 
    /*
     * List of recognizers for all charsets known to the implementation.
     * The list is unmodifiable.  The order is significant: positions in this
     * list are the indexes used by the per-instance fEnabledRecognizers table.
     */
    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;

    static {
        CSRecognizerInfo[] table = {
            // Unicode encodings
            new CSRecognizerInfo(new CharsetRecog_UTF8(), true),
            new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true),
            new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true),
            new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true),
            new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true),

            // Multi-byte and ISO-2022 encodings
            new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true),
            new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true),
            new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true),
            new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true),
            new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true),
            new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true),
            new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true),
            new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true),

            // Single-byte encodings
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true),

            // IBM 420/424 recognizers are disabled by default
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false),
            new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false),
        };

        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(
                new ArrayList<CSRecognizerInfo>(Arrays.asList(table)));
    }
    515 
    516     /**
    517      * Get the names of charsets that can be recognized by this CharsetDetector instance.
    518      *
    519      * @return an array of the names of charsets that can be recognized by this CharsetDetector
    520      * instance.
    521      *
    522      * @internal
    523      * @deprecated This API is ICU internal only.
    524      */
    525     @Deprecated
    526     public String[] getDetectableCharsets() {
    527         List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size());
    528         for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
    529             CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
    530             boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
    531             if (active) {
    532                 csnames.add(rcinfo.recognizer.getName());
    533             }
    534         }
    535         return csnames.toArray(new String[csnames.size()]);
    536     }
    537 
    538     /**
    539      * Enable or disable individual charset encoding.
    540      * A name of charset encoding must be included in the names returned by
    541      * {@link #getAllDetectableCharsets()}.
    542      *
    543      * @param encoding the name of charset encoding.
    544      * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
    545      * charset encoding.
    546      * @return A reference to this <code>CharsetDetector</code>.
    547      * @throws IllegalArgumentException when the name of charset encoding is
    548      * not supported.
    549      *
    550      * @internal
    551      * @deprecated This API is ICU internal only.
    552      */
    553     @Deprecated
    554     public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
    555         int modIdx = -1;
    556         boolean isDefaultVal = false;
    557         for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
    558             CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
    559             if (csrinfo.recognizer.getName().equals(encoding)) {
    560                 modIdx = i;
    561                 isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
    562                 break;
    563             }
    564         }
    565         if (modIdx < 0) {
    566             // No matching encoding found
    567             throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
    568         }
    569 
    570         if (fEnabledRecognizers == null && !isDefaultVal) {
    571             // Create an array storing the non default setting
    572             fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
    573 
    574             // Initialize the array with default info
    575             for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
    576                 fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
    577             }
    578         }
    579 
    580         if (fEnabledRecognizers != null) {
    581             fEnabledRecognizers[modIdx] = enabled;
    582         }
    583 
    584         return this;
    585     }
    586 }
    587