1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2009-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.text; 12 13 import java.io.IOException; 14 import java.io.InputStream; 15 import java.nio.ByteBuffer; 16 17 import android.icu.impl.ICUBinary; 18 import android.icu.impl.Norm2AllModes; 19 import android.icu.util.ICUUncheckedIOException; 20 21 /** 22 * Unicode normalization functionality for standard Unicode normalization or 23 * for using custom mapping tables. 24 * All instances of this class are unmodifiable/immutable. 25 * The Normalizer2 class is not intended for public subclassing. 26 * <p> 27 * The primary functions are to produce a normalized string and to detect whether 28 * a string is already normalized. 29 * The most commonly used normalization forms are those defined in 30 * http://www.unicode.org/unicode/reports/tr15/ 31 * However, this API supports additional normalization forms for specialized purposes. 32 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 33 * and can be used in implementations of UTS #46. 34 * <p> 35 * Not only are the standard compose and decompose modes supplied, 36 * but additional modes are provided as documented in the Mode enum. 37 * <p> 38 * Some of the functions in this class identify normalization boundaries. 39 * At a normalization boundary, the portions of the string 40 * before it and starting from it do not interact and can be handled independently. 41 * <p> 42 * The spanQuickCheckYes() stops at a normalization boundary. 43 * When the goal is a normalized string, then the text before the boundary 44 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 45 * <p> 46 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 47 * a character is guaranteed to be at a normalization boundary, 48 * regardless of context. 49 * This is used for moving from one normalization boundary to the next 50 * or preceding boundary, and for performing iterative normalization. 51 * <p> 52 * Iterative normalization is useful when only a small portion of a 53 * longer string needs to be processed. 54 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 55 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 56 * (to process only the substring for which sort key bytes are computed). 57 * <p> 58 * The set of normalization boundaries returned by these functions may not be 59 * complete: There may be more boundaries that could be returned. 60 * Different functions may return different boundaries. 61 * @author Markus W. Scherer 62 */ 63 public abstract class Normalizer2 { 64 /** 65 * Constants for normalization modes. 66 * For details about standard Unicode normalization forms 67 * and about the algorithms which are also used with custom mapping tables 68 * see http://www.unicode.org/unicode/reports/tr15/ 69 */ 70 public enum Mode { 71 /** 72 * Decomposition followed by composition. 73 * Same as standard NFC when using an "nfc" instance. 74 * Same as standard NFKC when using an "nfkc" instance. 75 * For details about standard Unicode normalization forms 76 * see http://www.unicode.org/unicode/reports/tr15/ 77 */ 78 COMPOSE, 79 /** 80 * Map, and reorder canonically. 81 * Same as standard NFD when using an "nfc" instance. 82 * Same as standard NFKD when using an "nfkc" instance. 83 * For details about standard Unicode normalization forms 84 * see http://www.unicode.org/unicode/reports/tr15/ 85 */ 86 DECOMPOSE, 87 /** 88 * "Fast C or D" form. 89 * If a string is in this form, then further decomposition <i>without reordering</i> 90 * would yield the same form as DECOMPOSE. 91 * Text in "Fast C or D" form can be processed efficiently with data tables 92 * that are "canonically closed", that is, that provide equivalent data for 93 * equivalent text, without having to be fully normalized.<br> 94 * Not a standard Unicode normalization form.<br> 95 * Not a unique form: Different FCD strings can be canonically equivalent.<br> 96 * For details see http://www.unicode.org/notes/tn5/#FCD 97 */ 98 FCD, 99 /** 100 * Compose only contiguously. 101 * Also known as "FCC" or "Fast C Contiguous". 102 * The result will often but not always be in NFC. 103 * The result will conform to FCD which is useful for processing.<br> 104 * Not a standard Unicode normalization form.<br> 105 * For details see http://www.unicode.org/notes/tn5/#FCC 106 */ 107 COMPOSE_CONTIGUOUS 108 }; 109 110 /** 111 * Returns a Normalizer2 instance for Unicode NFC normalization. 112 * Same as getInstance(null, "nfc", Mode.COMPOSE). 113 * Returns an unmodifiable singleton instance. 114 * @return the requested Normalizer2, if successful 115 */ 116 public static Normalizer2 getNFCInstance() { 117 return Norm2AllModes.getNFCInstance().comp; 118 } 119 120 /** 121 * Returns a Normalizer2 instance for Unicode NFD normalization. 122 * Same as getInstance(null, "nfc", Mode.DECOMPOSE). 123 * Returns an unmodifiable singleton instance. 124 * @return the requested Normalizer2, if successful 125 */ 126 public static Normalizer2 getNFDInstance() { 127 return Norm2AllModes.getNFCInstance().decomp; 128 } 129 130 /** 131 * Returns a Normalizer2 instance for Unicode NFKC normalization. 132 * Same as getInstance(null, "nfkc", Mode.COMPOSE). 133 * Returns an unmodifiable singleton instance. 134 * @return the requested Normalizer2, if successful 135 */ 136 public static Normalizer2 getNFKCInstance() { 137 return Norm2AllModes.getNFKCInstance().comp; 138 } 139 140 /** 141 * Returns a Normalizer2 instance for Unicode NFKD normalization. 142 * Same as getInstance(null, "nfkc", Mode.DECOMPOSE). 143 * Returns an unmodifiable singleton instance. 144 * @return the requested Normalizer2, if successful 145 */ 146 public static Normalizer2 getNFKDInstance() { 147 return Norm2AllModes.getNFKCInstance().decomp; 148 } 149 150 /** 151 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. 152 * Same as getInstance(null, "nfkc_cf", Mode.COMPOSE). 153 * Returns an unmodifiable singleton instance. 154 * @return the requested Normalizer2, if successful 155 */ 156 public static Normalizer2 getNFKCCasefoldInstance() { 157 return Norm2AllModes.getNFKC_CFInstance().comp; 158 } 159 160 /** 161 * Returns a Normalizer2 instance which uses the specified data file 162 * (an ICU data file if data=null, or else custom binary data) 163 * and which composes or decomposes text according to the specified mode. 164 * Returns an unmodifiable singleton instance. 165 * <ul> 166 * <li>Use data=null for data files that are part of ICU's own data. 167 * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD. 168 * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD. 169 * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 170 * </ul> 171 * If data!=null, then the binary data is read once and cached using the provided 172 * name as the key. 173 * If you know or expect the data to be cached already, you can use data!=null 174 * for non-ICU data as well. 175 * <p>Any {@link java.io.IOException} is wrapped into a {@link android.icu.util.ICUUncheckedIOException}. 176 * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data 177 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file 178 * @param mode normalization mode (compose or decompose etc.) 179 * @return the requested Normalizer2, if successful 180 */ 181 public static Normalizer2 getInstance(InputStream data, String name, Mode mode) { 182 // TODO: If callers really use this API, then we should add an overload that takes a ByteBuffer. 183 ByteBuffer bytes = null; 184 if (data != null) { 185 try { 186 bytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(data); 187 } catch (IOException e) { 188 throw new ICUUncheckedIOException(e); 189 } 190 } 191 Norm2AllModes all2Modes=Norm2AllModes.getInstance(bytes, name); 192 switch(mode) { 193 case COMPOSE: return all2Modes.comp; 194 case DECOMPOSE: return all2Modes.decomp; 195 case FCD: return all2Modes.fcd; 196 case COMPOSE_CONTIGUOUS: return all2Modes.fcc; 197 default: return null; // will not occur 198 } 199 } 200 201 /** 202 * Returns the normalized form of the source string. 203 * @param src source string 204 * @return normalized src 205 */ 206 public String normalize(CharSequence src) { 207 if(src instanceof String) { 208 // Fastpath: Do not construct a new String if the src is a String 209 // and is already normalized. 210 int spanLength=spanQuickCheckYes(src); 211 if(spanLength==src.length()) { 212 return (String)src; 213 } 214 if (spanLength != 0) { 215 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); 216 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); 217 } 218 } 219 return normalize(src, new StringBuilder(src.length())).toString(); 220 } 221 222 /** 223 * Writes the normalized form of the source string to the destination string 224 * (replacing its contents) and returns the destination string. 225 * The source and destination strings must be different objects. 226 * @param src source string 227 * @param dest destination string; its contents is replaced with normalized src 228 * @return dest 229 */ 230 public abstract StringBuilder normalize(CharSequence src, StringBuilder dest); 231 232 /** 233 * Writes the normalized form of the source string to the destination Appendable 234 * and returns the destination Appendable. 235 * The source and destination strings must be different objects. 236 * 237 * <p>Any {@link java.io.IOException} is wrapped into a {@link android.icu.util.ICUUncheckedIOException}. 238 * 239 * @param src source string 240 * @param dest destination Appendable; gets normalized src appended 241 * @return dest 242 */ 243 public abstract Appendable normalize(CharSequence src, Appendable dest); 244 245 /** 246 * Appends the normalized form of the second string to the first string 247 * (merging them at the boundary) and returns the first string. 248 * The result is normalized if the first string was normalized. 249 * The first and second strings must be different objects. 250 * @param first string, should be normalized 251 * @param second string, will be normalized 252 * @return first 253 */ 254 public abstract StringBuilder normalizeSecondAndAppend( 255 StringBuilder first, CharSequence second); 256 257 /** 258 * Appends the second string to the first string 259 * (merging them at the boundary) and returns the first string. 260 * The result is normalized if both the strings were normalized. 261 * The first and second strings must be different objects. 262 * @param first string, should be normalized 263 * @param second string, should be normalized 264 * @return first 265 */ 266 public abstract StringBuilder append(StringBuilder first, CharSequence second); 267 268 /** 269 * Gets the decomposition mapping of c. 270 * Roughly equivalent to normalizing the String form of c 271 * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function 272 * returns null if c does not have a decomposition mapping in this instance's data. 273 * This function is independent of the mode of the Normalizer2. 274 * @param c code point 275 * @return c's decomposition mapping, if any; otherwise null 276 */ 277 public abstract String getDecomposition(int c); 278 279 /** 280 * Gets the raw decomposition mapping of c. 281 * 282 * <p>This is similar to the getDecomposition() method but returns the 283 * raw decomposition mapping as specified in UnicodeData.txt or 284 * (for custom data) in the mapping files processed by the gennorm2 tool. 285 * By contrast, getDecomposition() returns the processed, 286 * recursively-decomposed version of this mapping. 287 * 288 * <p>When used on a standard NFKC Normalizer2 instance, 289 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 290 * 291 * <p>When used on a standard NFC Normalizer2 instance, 292 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 293 * in this case, the result contains either one or two code points (=1..4 Java chars). 294 * 295 * <p>This function is independent of the mode of the Normalizer2. 296 * The default implementation returns null. 297 * @param c code point 298 * @return c's raw decomposition mapping, if any; otherwise null 299 */ 300 public String getRawDecomposition(int c) { return null; } 301 302 /** 303 * Performs pairwise composition of a & b and returns the composite if there is one. 304 * 305 * <p>Returns a composite code point c only if c has a two-way mapping to a+b. 306 * In standard Unicode normalization, this means that 307 * c has a canonical decomposition to a+b 308 * and c does not have the Full_Composition_Exclusion property. 309 * 310 * <p>This function is independent of the mode of the Normalizer2. 311 * The default implementation returns a negative value. 312 * @param a A (normalization starter) code point. 313 * @param b Another code point. 314 * @return The non-negative composite code point if there is one; otherwise a negative value. 315 */ 316 public int composePair(int a, int b) { return -1; } 317 318 /** 319 * Gets the combining class of c. 320 * The default implementation returns 0 321 * but all standard implementations return the Unicode Canonical_Combining_Class value. 322 * @param c code point 323 * @return c's combining class 324 */ 325 public int getCombiningClass(int c) { return 0; } 326 327 /** 328 * Tests if the string is normalized. 329 * Internally, in cases where the quickCheck() method would return "maybe" 330 * (which is only possible for the two COMPOSE modes) this method 331 * resolves to "yes" or "no" to provide a definitive result, 332 * at the cost of doing more work in those cases. 333 * @param s input string 334 * @return true if s is normalized 335 */ 336 public abstract boolean isNormalized(CharSequence s); 337 338 /** 339 * Tests if the string is normalized. 340 * For the two COMPOSE modes, the result could be "maybe" in cases that 341 * would take a little more work to resolve definitively. 342 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 343 * combination of quick check + normalization, to avoid 344 * re-checking the "yes" prefix. 345 * @param s input string 346 * @return the quick check result 347 */ 348 public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s); 349 350 /** 351 * Returns the end of the normalized substring of the input string. 352 * In other words, with <code>end=spanQuickCheckYes(s);</code> 353 * the substring <code>s.subSequence(0, end)</code> 354 * will pass the quick check with a "yes" result. 355 * <p> 356 * The returned end index is usually one or more characters before the 357 * "no" or "maybe" character: The end index is at a normalization boundary. 358 * (See the class documentation for more about normalization boundaries.) 359 * <p> 360 * When the goal is a normalized string and most input strings are expected 361 * to be normalized already, then call this method, 362 * and if it returns a prefix shorter than the input string, 363 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 364 * @param s input string 365 * @return "yes" span end index 366 */ 367 public abstract int spanQuickCheckYes(CharSequence s); 368 369 /** 370 * Tests if the character always has a normalization boundary before it, 371 * regardless of context. 372 * If true, then the character does not normalization-interact with 373 * preceding characters. 374 * In other words, a string containing this character can be normalized 375 * by processing portions before this character and starting from this 376 * character independently. 377 * This is used for iterative normalization. See the class documentation for details. 378 * @param c character to test 379 * @return true if c has a normalization boundary before it 380 */ 381 public abstract boolean hasBoundaryBefore(int c); 382 383 /** 384 * Tests if the character always has a normalization boundary after it, 385 * regardless of context. 386 * If true, then the character does not normalization-interact with 387 * following characters. 388 * In other words, a string containing this character can be normalized 389 * by processing portions up to this character and after this 390 * character independently. 391 * This is used for iterative normalization. See the class documentation for details. 392 * <p> 393 * Note that this operation may be significantly slower than hasBoundaryBefore(). 394 * @param c character to test 395 * @return true if c has a normalization boundary after it 396 */ 397 public abstract boolean hasBoundaryAfter(int c); 398 399 /** 400 * Tests if the character is normalization-inert. 401 * If true, then the character does not change, nor normalization-interact with 402 * preceding or following characters. 403 * In other words, a string containing this character can be normalized 404 * by processing portions before this character and after this 405 * character independently. 406 * This is used for iterative normalization. See the class documentation for details. 407 * <p> 408 * Note that this operation may be significantly slower than hasBoundaryBefore(). 409 * @param c character to test 410 * @return true if c is normalization-inert 411 */ 412 public abstract boolean isInert(int c); 413 414 /** 415 * Sole constructor. (For invocation by subclass constructors, 416 * typically implicit.) 417 * @deprecated This API is ICU internal only. 418 * @hide original deprecated declaration 419 * @hide draft / provisional / internal are hidden on Android 420 */ 421 @Deprecated 422 protected Normalizer2() { 423 } 424 } 425