// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
*******************************************************************************
* Copyright (C) 2005-2016, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package com.ibm.icu.text;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;


/**
 * This class represents a charset that has been identified by a CharsetDetector
 * as a possible encoding for a set of input data.  From an instance of this
 * class, you can ask for a confidence level in the charset identification,
 * or for Java Reader or String to access the original byte data in Unicode form.
 * <p>
 * Instances of this class are created only by CharsetDetectors.
 * <p>
 * Note:  this class has a natural ordering that is inconsistent with equals.
 *        The natural ordering is based on the match confidence value.
 *
 * @stable ICU 3.4
 */
public class CharsetMatch implements Comparable<CharsetMatch> {

    /** Size, in chars, of the buffer used to drain a Reader in {@link #getString(int)}. */
    private static final int READ_BUFFER_SIZE = 1024;

    /**
     * Create a java.io.Reader for reading the Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     * <p>
     * CAUTION:  if the source of the byte data was an InputStream, a Reader
     * can be created for only one matching char set using this method.  If more
     * than one charset needs to be tried, the caller will need to reset
     * the InputStream and create InputStreamReaders itself, based on the charset name.
     * <p>
     * A trailing directionality marker in the detected name ("_rtl" or "_ltr")
     * is removed before the charset is looked up, because such names cannot be
     * used to open a Java charset.
     *
     * @return the Reader for the Unicode character data, or <code>null</code> if the
     *         input could not be reset or the charset is not supported by this
     *         Java runtime.
     *
     * @stable ICU 3.4
     */
    public Reader getReader() {
        InputStream inputStream = fInputStream;

        if (inputStream == null) {
            inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
        }

        try {
            inputStream.reset();
            return new InputStreamReader(inputStream, javaCharsetName());
        } catch (IOException e) {
            // Deliberate best effort: this method has always reported failure
            // (reset not possible, unsupported encoding) by returning null.
            return null;
        }
    }

    /**
     * Create a Java String from Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     *
     * @return a String created from the converted input data.
     * @throws java.io.IOException if the input cannot be read or the charset
     *         is not supported.
     *
     * @stable ICU 3.4
     */
    public String getString() throws java.io.IOException {
        return getString(-1);
    }

    /**
     * Create a Java String from Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     * The length of the returned string is limited to the specified size;
     * the string will be truncated to this length if necessary.  A limit value of
     * zero or less is ignored, and treated as no limit.
     * <p>
     * When the input was an InputStream, the stream is closed by this call,
     * whether or not the conversion succeeds.
     *
     * @param maxLength The maximum length of the String to be created when the
     *                  source of the data is an input stream, or -1 for
     *                  unlimited length.  The limit is not applied when the
     *                  source of the data is a byte array.
     * @return a String created from the converted input data.
     * @throws java.io.IOException if the input cannot be read, a Reader cannot be
     *         created for the input stream, or the charset is not supported.
     *
     * @stable ICU 3.4
     */
    public String getString(int maxLength) throws java.io.IOException {
        if (fInputStream == null) {
            // Byte-array input: decode exactly the bytes the detector recorded.
            return new String(fRawInput, 0, fRawLength, javaCharsetName());
        }

        Reader reader = getReader();
        if (reader == null) {
            // getReader() signals failure with null; surface it as the checked
            // exception this method already declares instead of a NullPointerException.
            throw new IOException("Unable to create a Reader for charset " + getName());
        }

        try {
            StringBuilder sb = new StringBuilder();
            char[] buffer = new char[READ_BUFFER_SIZE];
            int remaining = maxLength <= 0 ? Integer.MAX_VALUE : maxLength;
            int charsRead;

            // Stop as soon as the budget is used up: a zero-length read returns 0
            // rather than -1, so looping on the read result alone would never end.
            while (remaining > 0
                    && (charsRead = reader.read(buffer, 0, Math.min(remaining, buffer.length))) >= 0) {
                sb.append(buffer, 0, charsRead);
                remaining -= charsRead;
            }
            return sb.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Get an indication of the confidence in the charset detected.
     * Confidence values range from 0-100, with larger numbers indicating
     * a better match of the input data to the characteristics of the
     * charset.
     *
     * @return the confidence in the charset match
     *
     * @stable ICU 3.4
     */
    public int getConfidence() {
        return fConfidence;
    }

    /**
     * Get the name of the detected charset.
     * The name will be one that can be used with other APIs on the
     * platform that accept charset names.  It is the "Canonical name"
     * as defined by the class java.nio.charset.Charset; for
     * charsets that are registered with the IANA charset registry,
     * this is the MIME-preferred registered name.
     *
     * @see java.nio.charset.Charset
     * @see java.io.InputStreamReader
     *
     * @return The name of the charset.
     *
     * @stable ICU 3.4
     */
    public String getName() {
        return fCharsetName;
    }

    /**
     * Get the ISO code for the language of the detected charset.
     *
     * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
     *
     * @stable ICU 3.4
     */
    public String getLanguage() {
        return fLang;
    }

    /**
     * Compare to other CharsetMatch objects.
     * Comparison is based on the match confidence value, which
     * allows CharsetDetector.detectAll() to order its results.
     *
     * @param other the CharsetMatch object to compare against.
     * @return  a negative integer, zero, or a positive integer as the
     *          confidence level of this CharsetMatch
     *          is less than, equal to, or greater than that of
     *          the argument.
     * @throws ClassCastException if the argument is not a CharsetMatch.
     * @stable ICU 4.4
     */
    @Override
    public int compareTo (CharsetMatch other) {
        if (fConfidence > other.fConfidence) {
            return 1;
        }
        if (fConfidence < other.fConfidence) {
            return -1;
        }
        return 0;
    }

    /*
     *  Constructor.  Implementation internal.
     *  Takes the charset name and language from the recognizer itself.
     */
    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
        this(det, rec, conf, rec.getName(), rec.getLanguage());
    }

    /*
     *  Constructor.  Implementation internal.
     *  Uses the explicitly supplied charset name and language.
     */
    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
        fConfidence = conf;

        // The references to the original application input data must be copied out
        //   of the charset recognizer to here, in case the application resets the
        //   recognizer before using this CharsetMatch.
        if (det.fInputStream == null) {
            // We only want the existing input byte data if it came straight from the user,
            //   not if is just the head of a stream.
            fRawInput  = det.fRawInput;
            fRawLength = det.fRawLength;
        } else {
            fRawInput  = null;
            fRawLength = 0;
        }
        fInputStream = det.fInputStream;
        fCharsetName = csName;
        fLang = lang;
    }

    /*
     * Returns the detected charset name with any "_rtl" / "_ltr" suffix removed.
     * getName() may return a name such as IBM424_rtl, which cannot be used to
     * open a charset; the suffix has to be stripped before decoding.
     */
    private String javaCharsetName() {
        String name = getName();
        int startSuffix = name.indexOf("_rtl");
        if (startSuffix < 0) {
            startSuffix = name.indexOf("_ltr");
        }
        return startSuffix > 0 ? name.substring(0, startSuffix) : name;
    }


    //
    //   Private Data
    //
    private final int          fConfidence;
    private final byte[]       fRawInput;      // Original, untouched input bytes.
                                               //  If user gave us a byte array, this is it;
                                               //  null when the input was a stream.
    private final int          fRawLength;     // Length of data in fRawInput array.

    private final InputStream  fInputStream;   // User's input stream, or null if the user
                                               //   gave us a byte array.

    private final String       fCharsetName;   // The name of the charset this CharsetMatch
                                               //   represents.  Filled in by the recognizer.
    private final String       fLang;          // The language, if one was determined by
                                               //   the recognizer during the detect operation.
}