Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 2005-2016, International Business Machines Corporation and    *
      6 * others. All Rights Reserved.                                                *
      7 *******************************************************************************
      8 */
      9 package com.ibm.icu.text;
     10 
     11 import java.io.ByteArrayInputStream;
     12 import java.io.IOException;
     13 import java.io.InputStream;
     14 import java.io.InputStreamReader;
     15 import java.io.Reader;
     16 
     17 
     18 /**
     19  * This class represents a charset that has been identified by a CharsetDetector
     20  * as a possible encoding for a set of input data.  From an instance of this
     21  * class, you can ask for a confidence level in the charset identification,
     22  * or for Java Reader or String to access the original byte data in Unicode form.
     23  * <p>
     24  * Instances of this class are created only by CharsetDetectors.
     25  * <p>
     26  * Note:  this class has a natural ordering that is inconsistent with equals.
     27  *        The natural ordering is based on the match confidence value.
     28  *
     29  * @stable ICU 3.4
     30  */
     31 public class CharsetMatch implements Comparable<CharsetMatch> {
     32 
     33 
     34     /**
     35      * Create a java.io.Reader for reading the Unicode character data corresponding
     36      * to the original byte data supplied to the Charset detect operation.
     37      * <p>
     38      * CAUTION:  if the source of the byte data was an InputStream, a Reader
     39      * can be created for only one matching char set using this method.  If more
     40      * than one charset needs to be tried, the caller will need to reset
     41      * the InputStream and create InputStreamReaders itself, based on the charset name.
     42      *
     43      * @return the Reader for the Unicode character data.
     44      *
     45      * @stable ICU 3.4
     46      */
     47     public Reader getReader() {
     48         InputStream inputStream = fInputStream;
     49 
     50         if (inputStream == null) {
     51             inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
     52         }
     53 
     54         try {
     55             inputStream.reset();
     56             return new InputStreamReader(inputStream, getName());
     57         } catch (IOException e) {
     58             return null;
     59         }
     60     }
     61 
     62     /**
     63      * Create a Java String from Unicode character data corresponding
     64      * to the original byte data supplied to the Charset detect operation.
     65      *
     66      * @return a String created from the converted input data.
     67      *
     68      * @stable ICU 3.4
     69      */
     70     public String getString()  throws java.io.IOException {
     71         return getString(-1);
     72 
     73     }
     74 
     75     /**
     76      * Create a Java String from Unicode character data corresponding
     77      * to the original byte data supplied to the Charset detect operation.
     78      * The length of the returned string is limited to the specified size;
     79      * the string will be trunctated to this length if necessary.  A limit value of
     80      * zero or less is ignored, and treated as no limit.
     81      *
     82      * @param maxLength The maximium length of the String to be created when the
     83      *                  source of the data is an input stream, or -1 for
     84      *                  unlimited length.
     85      * @return a String created from the converted input data.
     86      *
     87      * @stable ICU 3.4
     88      */
     89     public String getString(int maxLength) throws java.io.IOException {
     90         String result = null;
     91         if (fInputStream != null) {
     92             StringBuilder sb = new StringBuilder();
     93             char[] buffer = new char[1024];
     94             Reader reader = getReader();
     95             int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
     96             int bytesRead = 0;
     97 
     98             while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
     99                 sb.append(buffer, 0, bytesRead);
    100                 max -= bytesRead;
    101             }
    102 
    103             reader.close();
    104 
    105             return sb.toString();
    106         } else {
    107             String name = getName();
    108             /*
    109              * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
    110              * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
    111              * should be stripped off before creating the string.
    112              */
    113             int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
    114             if (startSuffix > 0) {
    115                 name = name.substring(0, startSuffix);
    116             }
    117             result = new String(fRawInput, name);
    118         }
    119         return result;
    120 
    121     }
    122 
    123     /**
    124      * Get an indication of the confidence in the charset detected.
    125      * Confidence values range from 0-100, with larger numbers indicating
    126      * a better match of the input data to the characteristics of the
    127      * charset.
    128      *
    129      * @return the confidence in the charset match
    130      *
    131      * @stable ICU 3.4
    132      */
    133     public int getConfidence() {
    134         return fConfidence;
    135     }
    136 
    137     /**
    138      * Get the name of the detected charset.
    139      * The name will be one that can be used with other APIs on the
    140      * platform that accept charset names.  It is the "Canonical name"
    141      * as defined by the class java.nio.charset.Charset; for
    142      * charsets that are registered with the IANA charset registry,
    143      * this is the MIME-preferred registerd name.
    144      *
    145      * @see java.nio.charset.Charset
    146      * @see java.io.InputStreamReader
    147      *
    148      * @return The name of the charset.
    149      *
    150      * @stable ICU 3.4
    151      */
    152     public String getName() {
    153         return fCharsetName;
    154     }
    155 
    156     /**
    157      * Get the ISO code for the language of the detected charset.
    158      *
    159      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
    160      *
    161      * @stable ICU 3.4
    162      */
    163     public String getLanguage() {
    164         return fLang;
    165     }
    166 
    167     /**
    168      * Compare to other CharsetMatch objects.
    169      * Comparison is based on the match confidence value, which
    170      *   allows CharsetDetector.detectAll() to order its results.
    171      *
    172      * @param other the CharsetMatch object to compare against.
    173      * @return  a negative integer, zero, or a positive integer as the
    174      *          confidence level of this CharsetMatch
    175      *          is less than, equal to, or greater than that of
    176      *          the argument.
    177      * @throws ClassCastException if the argument is not a CharsetMatch.
    178      * @stable ICU 4.4
    179      */
    180     @Override
    181     public int compareTo (CharsetMatch other) {
    182         int compareResult = 0;
    183         if (this.fConfidence > other.fConfidence) {
    184             compareResult = 1;
    185         } else if (this.fConfidence < other.fConfidence) {
    186             compareResult = -1;
    187         }
    188         return compareResult;
    189     }
    190 
    191     /*
    192      *  Constructor.  Implementation internal
    193      */
    194     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
    195         fConfidence = conf;
    196 
    197         // The references to the original application input data must be copied out
    198         //   of the charset recognizer to here, in case the application resets the
    199         //   recognizer before using this CharsetMatch.
    200         if (det.fInputStream == null) {
    201             // We only want the existing input byte data if it came straight from the user,
    202             //   not if is just the head of a stream.
    203             fRawInput    = det.fRawInput;
    204             fRawLength   = det.fRawLength;
    205         }
    206         fInputStream = det.fInputStream;
    207         fCharsetName = rec.getName();
    208         fLang = rec.getLanguage();
    209     }
    210 
    211     /*
    212      *  Constructor.  Implementation internal
    213      */
    214     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
    215         fConfidence = conf;
    216 
    217         // The references to the original application input data must be copied out
    218         //   of the charset recognizer to here, in case the application resets the
    219         //   recognizer before using this CharsetMatch.
    220         if (det.fInputStream == null) {
    221             // We only want the existing input byte data if it came straight from the user,
    222             //   not if is just the head of a stream.
    223             fRawInput    = det.fRawInput;
    224             fRawLength   = det.fRawLength;
    225         }
    226         fInputStream = det.fInputStream;
    227         fCharsetName = csName;
    228         fLang = lang;
    229     }
    230 
    231 
    232     //
    233     //   Private Data
    234     //
    235     private int                 fConfidence;
    236     private byte[]              fRawInput = null;     // Original, untouched input bytes.
    237                                                       //  If user gave us a byte array, this is it.
    238     private int                 fRawLength;           // Length of data in fRawInput array.
    239 
    240     private InputStream         fInputStream = null;  // User's input stream, or null if the user
    241                                                       //   gave us a byte array.
    242 
    243     private String              fCharsetName;         // The name of the charset this CharsetMatch
    244                                                       //   represents.  Filled in by the recognizer.
    245     private String              fLang;                // The language, if one was determined by
    246                                                       //   the recognizer during the detect operation.
    247 }
    248