// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 **********************************************************************
 *   Copyright (C) 2001-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   06/08/01    aliu        Creation.
 **********************************************************************
 */

package com.ibm.icu.text;
import java.util.HashMap;
import java.util.Map;

import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;

/**
 * A transliterator that rewrites text by running it, chunk by chunk,
 * through a {@link Normalizer2} instance.
 *
 * @author Alan Liu, Markus Scherer
 */
final class NormalizationTransliterator extends Transliterator {
    /** The normalizer applied to each chunk of text. */
    private final Normalizer2 norm2;

    /**
     * System registration hook.
     *
     * <p>Registers one factory for each of the IDs "Any-NFC", "Any-NFD",
     * "Any-NFKC", "Any-NFKD", "Any-FCD" and "Any-FCC", followed by four
     * special-inverse registrations. Each factory obtains its normalizer
     * only when {@code getInstance} is invoked, not at registration time.
     */
    static void register() {
        Transliterator.Factory nfcFactory = new Transliterator.Factory() {
            @Override
            public Transliterator getInstance(String ID) {
                Normalizer2 nfc = Normalizer2.getNFCInstance();
                return new NormalizationTransliterator("NFC", nfc);
            }
        };
        Transliterator.registerFactory("Any-NFC", nfcFactory);

        Transliterator.Factory nfdFactory = new Transliterator.Factory() {
            @Override
            public Transliterator getInstance(String ID) {
                Normalizer2 nfd = Normalizer2.getNFDInstance();
                return new NormalizationTransliterator("NFD", nfd);
            }
        };
        Transliterator.registerFactory("Any-NFD", nfdFactory);

        Transliterator.Factory nfkcFactory = new Transliterator.Factory() {
            @Override
            public Transliterator getInstance(String ID) {
                Normalizer2 nfkc = Normalizer2.getNFKCInstance();
                return new NormalizationTransliterator("NFKC", nfkc);
            }
        };
        Transliterator.registerFactory("Any-NFKC", nfkcFactory);

        Transliterator.Factory nfkdFactory = new Transliterator.Factory() {
            @Override
            public Transliterator getInstance(String ID) {
                Normalizer2 nfkd = Normalizer2.getNFKDInstance();
                return new NormalizationTransliterator("NFKD", nfkd);
            }
        };
        Transliterator.registerFactory("Any-NFKD", nfkdFactory);

        Transliterator.Factory fcdFactory = new Transliterator.Factory() {
            @Override
            public Transliterator getInstance(String ID) {
                Normalizer2 fcd = Norm2AllModes.getFCDNormalizer2();
                return new NormalizationTransliterator("FCD", fcd);
            }
        };
        Transliterator.registerFactory("Any-FCD", fcdFactory);

        Transliterator.Factory fccFactory = new Transliterator.Factory() {
            @Override
            public Transliterator getInstance(String ID) {
                Normalizer2 fcc = Norm2AllModes.getNFCInstance().fcc;
                return new NormalizationTransliterator("FCC", fcc);
            }
        };
        Transliterator.registerFactory("Any-FCC", fccFactory);

        // Inverse pairings; the meaning of the trailing flag is defined by
        // Transliterator.registerSpecialInverse (not visible in this file).
        Transliterator.registerSpecialInverse("NFC", "NFD", true);
        Transliterator.registerSpecialInverse("NFKC", "NFKD", true);
        Transliterator.registerSpecialInverse("FCC", "NFD", false);
        Transliterator.registerSpecialInverse("FCD", "FCD", false);
    }

    /**
     * Creates a transliterator with the given ID that normalizes with
     * the given normalizer. No filter is passed to the superclass.
     *
     * @param id the ID handed to the {@link Transliterator} constructor
     * @param n2 the normalizer this instance applies
     */
    private NormalizationTransliterator(String id, Normalizer2 n2) {
        super(id, null);
        norm2 = n2;
    }

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     *
     * <p>Walks the range {@code [offsets.start, offsets.limit)} and
     * normalizes it one chunk at a time, where a chunk extends up to the
     * next code point that the normalizer reports a boundary before.
     * A chunk is written back to {@code text} only when its normalized
     * form differs from the original. On return, {@code offsets.start},
     * {@code offsets.limit} and {@code offsets.contextLimit} reflect the
     * processed position and any change in text length.
     *
     * @param text the text being modified in place
     * @param offsets the range to process; updated on return
     * @param isIncremental if true, a final chunk that reaches the limit
     *     without a normalization boundary after it is left untouched
     */
    @Override
    protected void handleTransliterate(Replaceable text,
            Position offsets, boolean isIncremental) {
        // Current position and end of the input range.
        int pos = offsets.start;
        int end = offsets.limit;
        if (pos >= end) {
            return;
        }

        /*
         * Work on chunks that are as short as possible, even in bulk
         * mode, so that styled text is disturbed as little as possible.
         * In incremental mode a chunk ending at offsets.limit must not
         * be normalized.
         *
         * If the input were known to be unstyled, a bulk normalization
         * could be used instead.
         * (For details, see the comment in the C++ version.)
         */
        StringBuilder chunk = new StringBuilder();
        StringBuilder result = new StringBuilder();
        int cp = text.char32At(pos);
        do {
            int chunkStart = pos;
            chunk.setLength(0);

            // Always take at least one code point so the loop advances;
            // cp holds the code point at pos on entry.
            for (;;) {
                chunk.appendCodePoint(cp);
                pos += Character.charCount(cp);
                if (pos >= end) {
                    // cp still holds the last code point of the chunk.
                    break;
                }
                cp = text.char32At(pos);
                if (norm2.hasBoundaryBefore(cp)) {
                    // cp is the first code point of the next chunk.
                    break;
                }
            }

            if (pos == end && isIncremental && !norm2.hasBoundaryAfter(cp)) {
                // Incremental mode and the input limit was reached: more
                // characters may arrive that change the result, so roll
                // back to the beginning of this chunk and stop.
                pos = chunkStart;
                break;
            }

            norm2.normalize(chunk, result);
            if (!Normalizer2Impl.UTF16Plus.equal(chunk, result)) {
                // Swap the original chunk for its normalized form ...
                text.replace(chunkStart, pos, result.toString());

                // ... and shift the indexes by the change in length.
                int growth = result.length() - (pos - chunkStart);
                pos += growth;
                end += growth;
            }
        } while (pos < end);

        offsets.start = pos;
        // Uses the old offsets.limit, so it must precede the assignment below.
        offsets.contextLimit += end - offsets.limit;
        offsets.limit = end;
    }

    /**
     * One {@link SourceTargetUtility} per normalizer, created on first use.
     * Accessed only while synchronized on the map itself.
     */
    static final Map<Normalizer2, SourceTargetUtility> SOURCE_CACHE = new HashMap<Normalizer2, SourceTargetUtility>();

    // TODO Get rid of this if Normalizer2 becomes a Transform
    /** Exposes a {@link Normalizer2} through the {@link Transform} interface. */
    static class NormalizingTransform implements Transform<String,String> {
        final Normalizer2 norm2;

        public NormalizingTransform(Normalizer2 norm2) {
            this.norm2 = norm2;
        }

        /** Returns the result of {@code norm2.normalize(source)}. */
        @Override
        public String transform(String source) {
            return norm2.normalize(source);
        }
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        sourceTargetUtility().addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
    }

    /**
     * Returns the cached {@link SourceTargetUtility} for this instance's
     * normalizer, creating and storing it first if the cache has none.
     */
    private SourceTargetUtility sourceTargetUtility() {
        synchronized (SOURCE_CACHE) {
            SourceTargetUtility utility = SOURCE_CACHE.get(norm2);
            if (utility == null) {
                utility = new SourceTargetUtility(new NormalizingTransform(norm2), norm2);
                SOURCE_CACHE.put(norm2, utility);
            }
            return utility;
        }
    }
}