Home | History | Annotate | Download | only in text
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2001-2014, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  *   Date        Name        Description
      9  *   06/08/01    aliu        Creation.
     10  **********************************************************************
     11  */
     12 
     13 package com.ibm.icu.text;
     14 import java.util.HashMap;
     15 import java.util.Map;
     16 
     17 import com.ibm.icu.impl.Norm2AllModes;
     18 import com.ibm.icu.impl.Normalizer2Impl;
     19 
     20 /**
     21  * @author Alan Liu, Markus Scherer
     22  */
     23 final class NormalizationTransliterator extends Transliterator {
     24     private final Normalizer2 norm2;
     25 
     26     /**
     27      * System registration hook.
     28      */
     29     static void register() {
     30         Transliterator.registerFactory("Any-NFC", new Transliterator.Factory() {
     31             @Override
     32             public Transliterator getInstance(String ID) {
     33                 return new NormalizationTransliterator("NFC", Normalizer2.getNFCInstance());
     34             }
     35         });
     36         Transliterator.registerFactory("Any-NFD", new Transliterator.Factory() {
     37             @Override
     38             public Transliterator getInstance(String ID) {
     39                 return new NormalizationTransliterator("NFD", Normalizer2.getNFDInstance());
     40             }
     41         });
     42         Transliterator.registerFactory("Any-NFKC", new Transliterator.Factory() {
     43             @Override
     44             public Transliterator getInstance(String ID) {
     45                 return new NormalizationTransliterator("NFKC", Normalizer2.getNFKCInstance());
     46             }
     47         });
     48         Transliterator.registerFactory("Any-NFKD", new Transliterator.Factory() {
     49             @Override
     50             public Transliterator getInstance(String ID) {
     51                 return new NormalizationTransliterator("NFKD", Normalizer2.getNFKDInstance());
     52             }
     53         });
     54         Transliterator.registerFactory("Any-FCD", new Transliterator.Factory() {
     55             @Override
     56             public Transliterator getInstance(String ID) {
     57                 return new NormalizationTransliterator("FCD", Norm2AllModes.getFCDNormalizer2());
     58             }
     59         });
     60         Transliterator.registerFactory("Any-FCC", new Transliterator.Factory() {
     61             @Override
     62             public Transliterator getInstance(String ID) {
     63                 return new NormalizationTransliterator("FCC", Norm2AllModes.getNFCInstance().fcc);
     64             }
     65         });
     66         Transliterator.registerSpecialInverse("NFC", "NFD", true);
     67         Transliterator.registerSpecialInverse("NFKC", "NFKD", true);
     68         Transliterator.registerSpecialInverse("FCC", "NFD", false);
     69         Transliterator.registerSpecialInverse("FCD", "FCD", false);
     70     }
     71 
     72     /**
     73      * Constructs a transliterator.
     74      */
     75     private NormalizationTransliterator(String id, Normalizer2 n2) {
     76         super(id, null);
     77         norm2 = n2;
     78     }
     79 
     80     /**
     81      * Implements {@link Transliterator#handleTransliterate}.
     82      */
     83     @Override
     84     protected void handleTransliterate(Replaceable text,
     85             Position offsets, boolean isIncremental) {
     86         // start and limit of the input range
     87         int start = offsets.start;
     88         int limit = offsets.limit;
     89         if(start >= limit) {
     90             return;
     91         }
     92 
     93         /*
     94          * Normalize as short chunks at a time as possible even in
     95          * bulk mode, so that styled text is minimally disrupted.
     96          * In incremental mode, a chunk that ends with offsets.limit
     97          * must not be normalized.
     98          *
     99          * If it was known that the input text is not styled, then
    100          * a bulk mode normalization could be used.
    101          * (For details, see the comment in the C++ version.)
    102          */
    103         StringBuilder segment = new StringBuilder();
    104         StringBuilder normalized = new StringBuilder();
    105         int c = text.char32At(start);
    106         do {
    107             int prev = start;
    108             // Skip at least one character so we make progress.
    109             // c holds the character at start.
    110             segment.setLength(0);
    111             do {
    112                 segment.appendCodePoint(c);
    113                 start += Character.charCount(c);
    114             } while(start < limit && !norm2.hasBoundaryBefore(c = text.char32At(start)));
    115             if(start == limit && isIncremental && !norm2.hasBoundaryAfter(c)) {
    116                 // stop in incremental mode when we reach the input limit
    117                 // in case there are additional characters that could change the
    118                 // normalization result
    119                 start=prev;
    120                 break;
    121             }
    122             norm2.normalize(segment, normalized);
    123             if(!Normalizer2Impl.UTF16Plus.equal(segment, normalized)) {
    124                 // replace the input chunk with its normalized form
    125                 text.replace(prev, start, normalized.toString());
    126 
    127                 // update all necessary indexes accordingly
    128                 int delta = normalized.length() - (start - prev);
    129                 start += delta;
    130                 limit += delta;
    131             }
    132         } while(start < limit);
    133 
    134         offsets.start = start;
    135         offsets.contextLimit += limit - offsets.limit;
    136         offsets.limit = limit;
    137     }
    138 
    139     static final Map<Normalizer2, SourceTargetUtility> SOURCE_CACHE = new HashMap<Normalizer2, SourceTargetUtility>();
    140 
    141     // TODO Get rid of this if Normalizer2 becomes a Transform
    142     static class NormalizingTransform implements Transform<String,String> {
    143         final Normalizer2 norm2;
    144         public NormalizingTransform(Normalizer2 norm2) {
    145             this.norm2 = norm2;
    146         }
    147         @Override
    148         public String transform(String source) {
    149             return norm2.normalize(source);
    150         }
    151     }
    152 
    153     /* (non-Javadoc)
    154      * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
    155      */
    156     @Override
    157     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    158         SourceTargetUtility cache;
    159         synchronized (SOURCE_CACHE) {
    160             //String id = getID();
    161             cache = SOURCE_CACHE.get(norm2);
    162             if (cache == null) {
    163                 cache = new SourceTargetUtility(new NormalizingTransform(norm2), norm2);
    164                 SOURCE_CACHE.put(norm2, cache);
    165             }
    166         }
    167         cache.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
    168     }
    169 }
    170