Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 **********************************************************************
      6 *   Copyright (c) 2001-2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 **********************************************************************
      9 *   Date        Name        Description
     10 *   11/19/2001  aliu        Creation.
     11 **********************************************************************
     12 */
     13 package android.icu.text;
     14 import android.icu.impl.Utility;
     15 import android.icu.lang.UCharacter;
     16 
     17 /**
     18  * A transliterator that converts Unicode escape forms to the
     19  * characters they represent.  Escape forms have a prefix, a suffix, a
     20  * radix, and minimum and maximum digit counts.
     21  *
     22  * <p>This class is package private.  It registers several standard
     23  * variants with the system which are then accessed via their IDs.
     24  *
     25  * @author Alan Liu
     26  */
     27 class UnescapeTransliterator extends Transliterator {
     28 
     29     /**
     30      * The encoded pattern specification.  The pattern consists of
     31      * zero or more forms.  Each form consists of a prefix, suffix,
     32      * radix, minimum digit count, and maximum digit count.  These
     33      * values are stored as a five character header.  That is, their
     34      * numeric values are cast to 16-bit characters and stored in the
     35      * string.  Following these five characters, the prefix
     36      * characters, then suffix characters are stored.  Each form thus
     37      * takes n+5 characters, where n is the total length of the prefix
     38      * and suffix.  The end is marked by a header of length one
     39      * consisting of the character END.
     40      */
     41     private char spec[];
     42 
     43     /**
     44      * Special character marking the end of the spec[] array.
     45      */
     46     private static final char END = 0xFFFF;
     47 
     48     /**
     49      * Registers standard variants with the system.  Called by
     50      * Transliterator during initialization.
     51      */
     52     static void register() {
     53         // Unicode: "U+10FFFF" hex, min=4, max=6
     54         Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
     55             @Override
     56             public Transliterator getInstance(String ID) {
     57                 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
     58                     2, 0, 16, 4, 6, 'U', '+',
     59                     END
     60                 });
     61             }
     62         });
     63 
     64         // Java: "\\uFFFF" hex, min=4, max=4
     65         Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
     66             @Override
     67             public Transliterator getInstance(String ID) {
     68                 return new UnescapeTransliterator("Hex-Any/Java", new char[] {
     69                     2, 0, 16, 4, 4, '\\', 'u',
     70                     END
     71                 });
     72             }
     73         });
     74 
     75         // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
     76         Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
     77             @Override
     78             public Transliterator getInstance(String ID) {
     79                 return new UnescapeTransliterator("Hex-Any/C", new char[] {
     80                     2, 0, 16, 4, 4, '\\', 'u',
     81                     2, 0, 16, 8, 8, '\\', 'U',
     82                     END
     83                 });
     84             }
     85         });
     86 
     87         // XML: "&#x10FFFF;" hex, min=1, max=6
     88         Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
     89             @Override
     90             public Transliterator getInstance(String ID) {
     91                 return new UnescapeTransliterator("Hex-Any/XML", new char[] {
     92                     3, 1, 16, 1, 6, '&', '#', 'x', ';',
     93                     END
     94                 });
     95             }
     96         });
     97 
     98         // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")
     99         Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
    100             @Override
    101             public Transliterator getInstance(String ID) {
    102                 return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
    103                     2, 1, 10, 1, 7, '&', '#', ';',
    104                     END
    105                 });
    106             }
    107         });
    108 
    109         // Perl: "\\x{263A}" hex, min=1, max=6
    110         Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
    111             @Override
    112             public Transliterator getInstance(String ID) {
    113                 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
    114                     3, 1, 16, 1, 6, '\\', 'x', '{', '}',
    115                     END
    116                 });
    117             }
    118         });
    119 
    120         // All: Java, C, Perl, XML, XML10, Unicode
    121         Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
    122             @Override
    123             public Transliterator getInstance(String ID) {
    124                 return new UnescapeTransliterator("Hex-Any", new char[] {
    125                     2, 0, 16, 4, 6, 'U', '+',            // Unicode
    126                     2, 0, 16, 4, 4, '\\', 'u',           // Java
    127                     2, 0, 16, 8, 8, '\\', 'U',           // C (surrogates)
    128                     3, 1, 16, 1, 6, '&', '#', 'x', ';',  // XML
    129                     2, 1, 10, 1, 7, '&', '#', ';',       // XML10
    130                     3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
    131                     END
    132                 });
    133             }
    134         });
    135     }
    136 
    137     /**
    138      * Package private constructor.  Takes the encoded spec array.
    139      */
    140     UnescapeTransliterator(String ID, char spec[]) {
    141         super(ID, null);
    142         this.spec = spec;
    143     }
    144 
    145     /**
    146      * Implements {@link Transliterator#handleTransliterate}.
    147      */
    148     @Override
    149     protected void handleTransliterate(Replaceable text,
    150                                        Position pos, boolean isIncremental) {
    151         int start = pos.start;
    152         int limit = pos.limit;
    153         int i, ipat;
    154 
    155       loop:
    156         while (start < limit) {
    157             // Loop over the forms in spec[].  Exit this loop when we
    158             // match one of the specs.  Exit the outer loop if a
    159             // partial match is detected and isIncremental is true.
    160             for (ipat = 0; spec[ipat] != END;) {
    161 
    162                 // Read the header
    163                 int prefixLen = spec[ipat++];
    164                 int suffixLen = spec[ipat++];
    165                 int radix     = spec[ipat++];
    166                 int minDigits = spec[ipat++];
    167                 int maxDigits = spec[ipat++];
    168 
    169                 // s is a copy of start that is advanced over the
    170                 // characters as we parse them.
    171                 int s = start;
    172                 boolean match = true;
    173 
    174                 for (i=0; i<prefixLen; ++i) {
    175                     if (s >= limit) {
    176                         if (i > 0) {
    177                             // We've already matched a character.  This is
    178                             // a partial match, so we return if in
    179                             // incremental mode.  In non-incremental mode,
    180                             // go to the next spec.
    181                             if (isIncremental) {
    182                                 break loop;
    183                             }
    184                             match = false;
    185                             break;
    186                         }
    187                     }
    188                     char c = text.charAt(s++);
    189                     if (c != spec[ipat + i]) {
    190                         match = false;
    191                         break;
    192                     }
    193                 }
    194 
    195                 if (match) {
    196                     int u = 0;
    197                     int digitCount = 0;
    198                     for (;;) {
    199                         if (s >= limit) {
    200                             // Check for partial match in incremental mode.
    201                             if (s > start && isIncremental) {
    202                                 break loop;
    203                             }
    204                             break;
    205                         }
    206                         int ch = text.char32At(s);
    207                         int digit = UCharacter.digit(ch, radix);
    208                         if (digit < 0) {
    209                             break;
    210                         }
    211                         s += UTF16.getCharCount(ch);
    212                         u = (u * radix) + digit;
    213                         if (++digitCount == maxDigits) {
    214                             break;
    215                         }
    216                     }
    217 
    218                     match = (digitCount >= minDigits);
    219 
    220                     if (match) {
    221                         for (i=0; i<suffixLen; ++i) {
    222                             if (s >= limit) {
    223                                 // Check for partial match in incremental mode.
    224                                 if (s > start && isIncremental) {
    225                                     break loop;
    226                                 }
    227                                 match = false;
    228                                 break;
    229                             }
    230                             char c = text.charAt(s++);
    231                             if (c != spec[ipat + prefixLen + i]) {
    232                                 match = false;
    233                                 break;
    234                             }
    235                         }
    236 
    237                         if (match) {
    238                             // At this point, we have a match
    239                             String str = UTF16.valueOf(u);
    240                             text.replace(start, s, str);
    241                             limit -= s - start - str.length();
    242                             // The following break statement leaves the
    243                             // loop that is traversing the forms in
    244                             // spec[].  We then parse the next input
    245                             // character.
    246                             break;
    247                         }
    248                     }
    249                 }
    250 
    251                 ipat += prefixLen + suffixLen;
    252             }
    253 
    254             if (start < limit) {
    255                 start += UTF16.getCharCount(text.char32At(start));
    256             }
    257         }
    258 
    259         pos.contextLimit += limit - pos.limit;
    260         pos.limit = limit;
    261         pos.start = start;
    262     }
    263 
    264     /* (non-Javadoc)
    265      * @see android.icu.text.Transliterator#addSourceTargetSet(android.icu.text.UnicodeSet, android.icu.text.UnicodeSet, android.icu.text.UnicodeSet)
    266      */
    267     @Override
    268     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    269         // Each form consists of a prefix, suffix,
    270         // * radix, minimum digit count, and maximum digit count.  These
    271         // * values are stored as a five character header. ...
    272         UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
    273         UnicodeSet items = new UnicodeSet();
    274         StringBuilder buffer = new StringBuilder();
    275         for (int i = 0; spec[i] != END;) {
    276             // first 5 items are header
    277             int end = i + spec[i] + spec[i+1] + 5;
    278             int radix = spec[i+2];
    279             for (int j = 0; j < radix; ++j) {
    280                 Utility.appendNumber(buffer, j, radix, 0);
    281             }
    282             // then add the characters
    283             for (int j = i + 5; j < end; ++j) {
    284                 items.add(spec[j]);
    285             }
    286             // and go to next block
    287             i = end;
    288         }
    289         items.addAll(buffer.toString());
    290         items.retainAll(myFilter);
    291 
    292         if (items.size() > 0) {
    293             sourceSet.addAll(items);
    294             targetSet.addAll(0,0x10FFFF); // assume we can produce any character
    295         }
    296     }
    297 }
    298