Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/19/2001  aliu        Creation.
     10 **********************************************************************
     11 */
     12 package com.ibm.icu.text;
     13 import com.ibm.icu.impl.Utility;
     14 import com.ibm.icu.lang.UCharacter;
     15 
     16 /**
     17  * A transliterator that converts Unicode escape forms to the
     18  * characters they represent.  Escape forms have a prefix, a suffix, a
     19  * radix, and minimum and maximum digit counts.
     20  *
     21  * <p>This class is package private.  It registers several standard
     22  * variants with the system which are then accessed via their IDs.
     23  *
     24  * @author Alan Liu
     25  */
     26 class UnescapeTransliterator extends Transliterator {
     27 
     28     /**
     29      * The encoded pattern specification.  The pattern consists of
     30      * zero or more forms.  Each form consists of a prefix, suffix,
     31      * radix, minimum digit count, and maximum digit count.  These
     32      * values are stored as a five character header.  That is, their
     33      * numeric values are cast to 16-bit characters and stored in the
     34      * string.  Following these five characters, the prefix
     35      * characters, then suffix characters are stored.  Each form thus
     36      * takes n+5 characters, where n is the total length of the prefix
     37      * and suffix.  The end is marked by a header of length one
     38      * consisting of the character END.
     39      */
     40     private char spec[];
     41 
     42     /**
     43      * Special character marking the end of the spec[] array.
     44      */
     45     private static final char END = 0xFFFF;
     46 
     47     /**
     48      * Registers standard variants with the system.  Called by
     49      * Transliterator during initialization.
     50      */
     51     static void register() {
     52         // Unicode: "U+10FFFF" hex, min=4, max=6
     53         Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
     54             @Override
     55             public Transliterator getInstance(String ID) {
     56                 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
     57                     2, 0, 16, 4, 6, 'U', '+',
     58                     END
     59                 });
     60             }
     61         });
     62 
     63         // Java: "\\uFFFF" hex, min=4, max=4
     64         Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
     65             @Override
     66             public Transliterator getInstance(String ID) {
     67                 return new UnescapeTransliterator("Hex-Any/Java", new char[] {
     68                     2, 0, 16, 4, 4, '\\', 'u',
     69                     END
     70                 });
     71             }
     72         });
     73 
     74         // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
     75         Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
     76             @Override
     77             public Transliterator getInstance(String ID) {
     78                 return new UnescapeTransliterator("Hex-Any/C", new char[] {
     79                     2, 0, 16, 4, 4, '\\', 'u',
     80                     2, 0, 16, 8, 8, '\\', 'U',
     81                     END
     82                 });
     83             }
     84         });
     85 
     86         // XML: "&#x10FFFF;" hex, min=1, max=6
     87         Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
     88             @Override
     89             public Transliterator getInstance(String ID) {
     90                 return new UnescapeTransliterator("Hex-Any/XML", new char[] {
     91                     3, 1, 16, 1, 6, '&', '#', 'x', ';',
     92                     END
     93                 });
     94             }
     95         });
     96 
     97         // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")
     98         Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
     99             @Override
    100             public Transliterator getInstance(String ID) {
    101                 return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
    102                     2, 1, 10, 1, 7, '&', '#', ';',
    103                     END
    104                 });
    105             }
    106         });
    107 
    108         // Perl: "\\x{263A}" hex, min=1, max=6
    109         Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
    110             @Override
    111             public Transliterator getInstance(String ID) {
    112                 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
    113                     3, 1, 16, 1, 6, '\\', 'x', '{', '}',
    114                     END
    115                 });
    116             }
    117         });
    118 
    119         // All: Java, C, Perl, XML, XML10, Unicode
    120         Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
    121             @Override
    122             public Transliterator getInstance(String ID) {
    123                 return new UnescapeTransliterator("Hex-Any", new char[] {
    124                     2, 0, 16, 4, 6, 'U', '+',            // Unicode
    125                     2, 0, 16, 4, 4, '\\', 'u',           // Java
    126                     2, 0, 16, 8, 8, '\\', 'U',           // C (surrogates)
    127                     3, 1, 16, 1, 6, '&', '#', 'x', ';',  // XML
    128                     2, 1, 10, 1, 7, '&', '#', ';',       // XML10
    129                     3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
    130                     END
    131                 });
    132             }
    133         });
    134     }
    135 
    136     /**
    137      * Package private constructor.  Takes the encoded spec array.
    138      */
    139     UnescapeTransliterator(String ID, char spec[]) {
    140         super(ID, null);
    141         this.spec = spec;
    142     }
    143 
    144     /**
    145      * Implements {@link Transliterator#handleTransliterate}.
    146      */
    147     @Override
    148     protected void handleTransliterate(Replaceable text,
    149                                        Position pos, boolean isIncremental) {
    150         int start = pos.start;
    151         int limit = pos.limit;
    152         int i, ipat;
    153 
    154       loop:
    155         while (start < limit) {
    156             // Loop over the forms in spec[].  Exit this loop when we
    157             // match one of the specs.  Exit the outer loop if a
    158             // partial match is detected and isIncremental is true.
    159             for (ipat = 0; spec[ipat] != END;) {
    160 
    161                 // Read the header
    162                 int prefixLen = spec[ipat++];
    163                 int suffixLen = spec[ipat++];
    164                 int radix     = spec[ipat++];
    165                 int minDigits = spec[ipat++];
    166                 int maxDigits = spec[ipat++];
    167 
    168                 // s is a copy of start that is advanced over the
    169                 // characters as we parse them.
    170                 int s = start;
    171                 boolean match = true;
    172 
    173                 for (i=0; i<prefixLen; ++i) {
    174                     if (s >= limit) {
    175                         if (i > 0) {
    176                             // We've already matched a character.  This is
    177                             // a partial match, so we return if in
    178                             // incremental mode.  In non-incremental mode,
    179                             // go to the next spec.
    180                             if (isIncremental) {
    181                                 break loop;
    182                             }
    183                             match = false;
    184                             break;
    185                         }
    186                     }
    187                     char c = text.charAt(s++);
    188                     if (c != spec[ipat + i]) {
    189                         match = false;
    190                         break;
    191                     }
    192                 }
    193 
    194                 if (match) {
    195                     int u = 0;
    196                     int digitCount = 0;
    197                     for (;;) {
    198                         if (s >= limit) {
    199                             // Check for partial match in incremental mode.
    200                             if (s > start && isIncremental) {
    201                                 break loop;
    202                             }
    203                             break;
    204                         }
    205                         int ch = text.char32At(s);
    206                         int digit = UCharacter.digit(ch, radix);
    207                         if (digit < 0) {
    208                             break;
    209                         }
    210                         s += UTF16.getCharCount(ch);
    211                         u = (u * radix) + digit;
    212                         if (++digitCount == maxDigits) {
    213                             break;
    214                         }
    215                     }
    216 
    217                     match = (digitCount >= minDigits);
    218 
    219                     if (match) {
    220                         for (i=0; i<suffixLen; ++i) {
    221                             if (s >= limit) {
    222                                 // Check for partial match in incremental mode.
    223                                 if (s > start && isIncremental) {
    224                                     break loop;
    225                                 }
    226                                 match = false;
    227                                 break;
    228                             }
    229                             char c = text.charAt(s++);
    230                             if (c != spec[ipat + prefixLen + i]) {
    231                                 match = false;
    232                                 break;
    233                             }
    234                         }
    235 
    236                         if (match) {
    237                             // At this point, we have a match
    238                             String str = UTF16.valueOf(u);
    239                             text.replace(start, s, str);
    240                             limit -= s - start - str.length();
    241                             // The following break statement leaves the
    242                             // loop that is traversing the forms in
    243                             // spec[].  We then parse the next input
    244                             // character.
    245                             break;
    246                         }
    247                     }
    248                 }
    249 
    250                 ipat += prefixLen + suffixLen;
    251             }
    252 
    253             if (start < limit) {
    254                 start += UTF16.getCharCount(text.char32At(start));
    255             }
    256         }
    257 
    258         pos.contextLimit += limit - pos.limit;
    259         pos.limit = limit;
    260         pos.start = start;
    261     }
    262 
    263     /* (non-Javadoc)
    264      * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
    265      */
    266     @Override
    267     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    268         // Each form consists of a prefix, suffix,
    269         // * radix, minimum digit count, and maximum digit count.  These
    270         // * values are stored as a five character header. ...
    271         UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
    272         UnicodeSet items = new UnicodeSet();
    273         StringBuilder buffer = new StringBuilder();
    274         for (int i = 0; spec[i] != END;) {
    275             // first 5 items are header
    276             int end = i + spec[i] + spec[i+1] + 5;
    277             int radix = spec[i+2];
    278             for (int j = 0; j < radix; ++j) {
    279                 Utility.appendNumber(buffer, j, radix, 0);
    280             }
    281             // then add the characters
    282             for (int j = i + 5; j < end; ++j) {
    283                 items.add(spec[j]);
    284             }
    285             // and go to next block
    286             i = end;
    287         }
    288         items.addAll(buffer.toString());
    289         items.retainAll(myFilter);
    290 
    291         if (items.size() > 0) {
    292             sourceSet.addAll(items);
    293             targetSet.addAll(0,0x10FFFF); // assume we can produce any character
    294         }
    295     }
    296 }
    297