Home | History | Annotate | Download | only in normalizer
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  * Copyright (C) 1998-2007 International Business Machines Corporation and
      6  * Unicode, Inc. All Rights Reserved.<br>
      7  * The Unicode Consortium makes no expressed or implied warranty of any
      8  * kind, and assumes no liability for errors or omissions.
      9  * No liability is assumed for incidental and consequential damages
     10  * in connection with or arising out of the use of the information here.
     11  */
     12 
     13 package android.icu.dev.test.normalizer;
     14 
     15 import android.icu.dev.test.UTF16Util;
     16 import android.icu.testsharding.MainTestShard;
     17 
     18 /**
     19  * Implements Unicode Normalization Forms C, D, KC, KD.<br>
     20  * See UTR#15 for details.<br>
     21  * @author Mark Davis
     22  * Updates for supplementary code points:
     23  * Vladimir Weinstein & Markus Scherer
     24  */
     25 @MainTestShard
     26 public class UnicodeNormalizer {
     27 //    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
     28 
     29     /**
     30      * Create a normalizer for a given form.
     31      */
     32     public UnicodeNormalizer(byte form, boolean fullData) {
     33         this.form = form;
     34         if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
     35     }
     36 
     37     /**
     38     * Masks for the form selector
     39     */
     40     static final byte
     41         COMPATIBILITY_MASK = 1,
     42         COMPOSITION_MASK = 2;
     43 
     44     /**
     45     * Normalization Form Selector
     46     */
     47     public static final byte
     48         D = 0 ,
     49         C = COMPOSITION_MASK,
     50         KD = COMPATIBILITY_MASK,
     51         KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
     52 
     53     /**
     54     * Normalizes text according to the chosen form,
     55     * replacing contents of the target buffer.
     56     * @param   source      the original text, unnormalized
     57     * @param   target      the resulting normalized text
     58     */
     59     public StringBuffer normalize(String source, StringBuffer target) {
     60 
     61         // First decompose the source into target,
     62         // then compose if the form requires.
     63 
     64         if (source.length() != 0) {
     65             internalDecompose(source, target);
     66             if ((form & COMPOSITION_MASK) != 0) {
     67                 internalCompose(target);
     68             }
     69         }
     70         return target;
     71     }
     72 
     73     /**
     74     * Normalizes text according to the chosen form
     75     * @param   source      the original text, unnormalized
     76     * @return  target      the resulting normalized text
     77     */
     78     public String normalize(String source) {
     79         return normalize(source, new StringBuffer()).toString();
     80     }
     81 
     82     // ======================================
     83     //                  PRIVATES
     84     // ======================================
     85 
     86     /**
     87      * The current form.
     88      */
     89     private byte form;
     90 
     91     /**
     92     * Decomposes text, either canonical or compatibility,
     93     * replacing contents of the target buffer.
     94     * @param   form        the normalization form. If COMPATIBILITY_MASK
     95     *                      bit is on in this byte, then selects the recursive
     96     *                      compatibility decomposition, otherwise selects
     97     *                      the recursive canonical decomposition.
     98     * @param   source      the original text, unnormalized
     99     * @param   target      the resulting normalized text
    100     */
    101     private void internalDecompose(String source, StringBuffer target) {
    102         StringBuffer buffer = new StringBuffer();
    103         boolean canonical = (form & COMPATIBILITY_MASK) == 0;
    104         int ch;
    105         for (int i = 0; i < source.length();) {
    106             buffer.setLength(0);
    107             ch = UTF16Util.nextCodePoint(source, i);
    108             i+=UTF16Util.codePointLength(ch);
    109             data.getRecursiveDecomposition(canonical, ch, buffer);
    110 
    111             // add all of the characters in the decomposition.
    112             // (may be just the original character, if there was
    113             // no decomposition mapping)
    114 
    115             for (int j = 0; j < buffer.length();) {
    116                 ch = UTF16Util.nextCodePoint(buffer, j);
    117                 j+=UTF16Util.codePointLength(ch);
    118                 int chClass = data.getCanonicalClass(ch);
    119                 int k = target.length(); // insertion point
    120                 if (chClass != 0) {
    121 
    122                     // bubble-sort combining marks as necessary
    123 
    124                     int ch2;
    125                     for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
    126                         ch2 = UTF16Util.prevCodePoint(target, k);
    127                         if (data.getCanonicalClass(ch2) <= chClass) break;
    128                     }
    129                 }
    130                 UTF16Util.insertCodePoint(target, k, ch);
    131             }
    132         }
    133     }
    134 
    135     /**
    136     * Composes text in place. Target must already
    137     * have been decomposed.
    138     * @param   target      input: decomposed text.
    139     *                      output: the resulting normalized text.
    140     */
    141     private void internalCompose(StringBuffer target) {
    142 
    143         int starterPos = 0;
    144         int starterCh = UTF16Util.nextCodePoint(target,0);
    145         int compPos = UTF16Util.codePointLength(starterCh);
    146         int lastClass = data.getCanonicalClass(starterCh);
    147         if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
    148 
    149         // Loop on the decomposed characters, combining where possible
    150 
    151         for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
    152             int ch = UTF16Util.nextCodePoint(target, decompPos);
    153             decompPos += UTF16Util.codePointLength(ch);
    154             int chClass = data.getCanonicalClass(ch);
    155             int composite = data.getPairwiseComposition(starterCh, ch);
    156             if (composite != NormalizerData.NOT_COMPOSITE
    157             && (lastClass < chClass || lastClass == 0)) {
    158                 UTF16Util.setCodePointAt(target, starterPos, composite);
    159                 starterCh = composite;
    160             } else {
    161                 if (chClass == 0) {
    162                     starterPos = compPos;
    163                     starterCh  = ch;
    164                 }
    165                 lastClass = chClass;
    166                 decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
    167                 compPos += UTF16Util.codePointLength(ch);
    168             }
    169         }
    170         target.setLength(compPos);
    171     }
    172 
    173     /**
    174     * Contains normalization data from the Unicode Character Database.
    175     * use false for the minimal set, true for the real set.
    176     */
    177     private static NormalizerData data = null;
    178 
    179     /**
    180     * Just accessible for testing.
    181     */
    182     boolean getExcluded (char ch) {
    183         return data.getExcluded(ch);
    184     }
    185 
    186     /**
    187     * Just accessible for testing.
    188     */
    189     String getRawDecompositionMapping (char ch) {
    190         return data.getRawDecompositionMapping(ch);
    191     }
    192 }