Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
      7  * and others. All Rights Reserved.                                            *
      8  *******************************************************************************
      9  */
     10 package android.icu.impl;
     11 
     12 import android.icu.text.UTF16;
     13 import android.icu.text.UnicodeSet;
     14 
     15 /**
     16  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
     17  * The '' (two quotes) is treated as a single quote, inside or outside a quote
     18  * <ul>
     19  * <li>Any ignorable characters are ignored in parsing.</li>
     20  * <li>Any syntax characters are broken into separate tokens</li>
     21  * <li>Quote characters can be specified: '...', "...", and \x </li>
     22  * <li>Other characters are treated as literals</li>
     23  * </ul>
     24  * @hide Only a subset of ICU is exposed in Android
     25  */
     26 public class PatternTokenizer {
     27     // settings used in the interpretation of the pattern
     28     private UnicodeSet ignorableCharacters = new UnicodeSet();
     29     private UnicodeSet syntaxCharacters = new UnicodeSet();
     30     private UnicodeSet extraQuotingCharacters = new UnicodeSet();
     31     private UnicodeSet escapeCharacters = new UnicodeSet();
     32     private boolean usingSlash = false;
     33     private boolean usingQuote = false;
     34 
     35     // transient data, set when needed. Null it out for any changes in the above fields.
     36     private transient UnicodeSet needingQuoteCharacters = null;
     37 
     38     // data about the current pattern being parsed. start gets moved as we go along.
     39     private int start;
     40     private int limit;
     41     private String pattern;
     42 
     43     public UnicodeSet getIgnorableCharacters() {
     44         return (UnicodeSet) ignorableCharacters.clone();
     45     }
     46     /**
     47      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
     48      * @param ignorableCharacters Characters to be ignored.
     49      * @return A PatternTokenizer object in which characters are specified as ignored characters.
     50      */
     51     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
     52         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
     53         needingQuoteCharacters = null;
     54         return this;
     55     }
     56     public UnicodeSet getSyntaxCharacters() {
     57         return (UnicodeSet) syntaxCharacters.clone();
     58     }
     59     public UnicodeSet getExtraQuotingCharacters() {
     60         return (UnicodeSet) extraQuotingCharacters.clone();
     61     }
     62     /**
     63      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
     64      * @param syntaxCharacters Characters to be set as syntax characters.
     65      * @return A PatternTokenizer object in which characters are specified as syntax characters.
     66      */
     67     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
     68         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
     69         needingQuoteCharacters = null;
     70         return this;
     71     }
     72     /**
     73      *  Sets the extra characters to be quoted in literals
     74      * @param syntaxCharacters Characters to be set as extra quoting characters.
     75      * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
     76      */
     77     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
     78         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
     79         needingQuoteCharacters = null;
     80         return this;
     81     }
     82 
     83     public UnicodeSet getEscapeCharacters() {
     84         return (UnicodeSet) escapeCharacters.clone();
     85     }
     86     /**
     87      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
     88      * @param escapeCharacters Characters to be set as escape characters.
     89      * @return A PatternTokenizer object in which characters are specified as escape characters.
     90      */
     91     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
     92         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
     93         return this;
     94     }
     95     public boolean isUsingQuote() {
     96         return usingQuote;
     97     }
     98     public PatternTokenizer setUsingQuote(boolean usingQuote) {
     99         this.usingQuote = usingQuote;
    100         needingQuoteCharacters = null;
    101         return this;
    102     }
    103     public boolean isUsingSlash() {
    104         return usingSlash;
    105     }
    106     public PatternTokenizer setUsingSlash(boolean usingSlash) {
    107         this.usingSlash = usingSlash;
    108         needingQuoteCharacters = null;
    109         return this;
    110     }
    111     //    public UnicodeSet getQuoteCharacters() {
    112 //  return (UnicodeSet) quoteCharacters.clone();
    113 //  }
    114 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
    115 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
    116 //  needingQuoteCharacters = null;
    117 //  return this;
    118 //  }
    119     public int getLimit() {
    120         return limit;
    121     }
    122     public PatternTokenizer setLimit(int limit) {
    123         this.limit = limit;
    124         return this;
    125     }
    126     public int getStart() {
    127         return start;
    128     }
    129     public PatternTokenizer setStart(int start) {
    130         this.start = start;
    131         return this;
    132     }
    133 
    134     public PatternTokenizer setPattern(CharSequence pattern) {
    135         return setPattern(pattern.toString());
    136     }
    137 
    138     public PatternTokenizer setPattern(String pattern) {
    139         if (pattern == null) {
    140             throw new IllegalArgumentException("Inconsistent arguments");
    141         }
    142         this.start = 0;
    143         this.limit = pattern.length();
    144         this.pattern = pattern;
    145         return this;
    146     }
    147 
    148     public static final char SINGLE_QUOTE = '\'';
    149     public static final char BACK_SLASH = '\\';
    150     private static int NO_QUOTE = -1, IN_QUOTE = -2;
    151 
    152     public String quoteLiteral(CharSequence string) {
    153         return quoteLiteral(string.toString());
    154     }
    155 
    156     /**
    157      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
    158      * @param string String passed to quote a literal string.
    159      * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
    160      */
    161     public String quoteLiteral(String string) {
    162         if (needingQuoteCharacters == null) {
    163             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
    164             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
    165             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
    166         }
    167         StringBuffer result = new StringBuffer();
    168         int quotedChar = NO_QUOTE;
    169         int cp;
    170         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
    171             cp = UTF16.charAt(string, i);
    172             if (escapeCharacters.contains(cp)) {
    173                 // we may have to fix up previous characters
    174                 if (quotedChar == IN_QUOTE) {
    175                     result.append(SINGLE_QUOTE);
    176                     quotedChar = NO_QUOTE;
    177                 }
    178                 appendEscaped(result, cp);
    179                 continue;
    180             }
    181 
    182             if (needingQuoteCharacters.contains(cp)) {
    183                 // if we have already started a quote
    184                 if (quotedChar == IN_QUOTE) {
    185                     UTF16.append(result, cp);
    186                     if (usingQuote && cp == SINGLE_QUOTE) { // double it
    187                         result.append(SINGLE_QUOTE);
    188                     }
    189                     continue;
    190                 }
    191                 // otherwise not already in quote
    192                 if (usingSlash) {
    193                     result.append(BACK_SLASH);
    194                     UTF16.append(result, cp);
    195                     continue;
    196                 }
    197                 if (usingQuote) {
    198                     if (cp == SINGLE_QUOTE) { // double it and continue
    199                         result.append(SINGLE_QUOTE);
    200                         result.append(SINGLE_QUOTE);
    201                         continue;
    202                     }
    203                     result.append(SINGLE_QUOTE);
    204                     UTF16.append(result, cp);
    205                     quotedChar = IN_QUOTE;
    206                     continue;
    207                 }
    208                 // we have no choice but to use \\u or \\U
    209                 appendEscaped(result, cp);
    210                 continue;
    211             }
    212             // otherwise cp doesn't need quoting
    213             // we may have to fix up previous characters
    214             if (quotedChar == IN_QUOTE) {
    215                 result.append(SINGLE_QUOTE);
    216                 quotedChar = NO_QUOTE;
    217             }
    218             UTF16.append(result, cp);
    219         }
    220         // all done.
    221         // we may have to fix up previous characters
    222         if (quotedChar == IN_QUOTE) {
    223             result.append(SINGLE_QUOTE);
    224         }
    225         return result.toString();
    226     }
    227 
    228     private void appendEscaped(StringBuffer result, int cp) {
    229         if (cp <= 0xFFFF) {
    230             result.append("\\u").append(Utility.hex(cp,4));
    231         } else {
    232             result.append("\\U").append(Utility.hex(cp,8));
    233         }
    234     }
    235 
    236     public String normalize() {
    237         int oldStart = start;
    238         StringBuffer result = new StringBuffer();
    239         StringBuffer buffer = new StringBuffer();
    240         while (true) {
    241             buffer.setLength(0);
    242             int status = next(buffer);
    243             if (status == DONE) {
    244                 start = oldStart;
    245                 return result.toString();
    246             }
    247             if (status != SYNTAX) {
    248                 result.append(quoteLiteral(buffer));
    249             } else {
    250                 result.append(buffer);
    251             }
    252         }
    253     }
    254 
    255     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
    256 
    257     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
    258 
    259     public int next(StringBuffer buffer) {
    260         if (start >= limit) return DONE;
    261         int status = UNKNOWN;
    262         int lastQuote = UNKNOWN;
    263         int quoteStatus = NONE;
    264         int hexCount = 0;
    265         int hexValue = 0;
    266         int cp;
    267         main:
    268             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
    269                 cp = UTF16.charAt(pattern, i);
    270                 // if we are in a quote, then handle it.
    271                 switch (quoteStatus) {
    272                 case SLASH_START:
    273                     switch (cp) {
    274                     case 'u':
    275                         quoteStatus = HEX;
    276                         hexCount = 4;
    277                         hexValue = 0;
    278                         continue main;
    279                     case 'U':
    280                         quoteStatus = HEX;
    281                         hexCount = 8;
    282                         hexValue = 0;
    283                         continue main;
    284                     default:
    285                         if (usingSlash) {
    286                             UTF16.append(buffer, cp);
    287                             quoteStatus = NONE;
    288                             continue main;
    289                         } else {
    290                             buffer.append(BACK_SLASH);
    291                             quoteStatus = NONE;
    292                         }
    293                     }
    294                     break; // fall through to NONE
    295                 case HEX:
    296                     hexValue <<= 4;
    297                     hexValue += cp;
    298                     switch (cp) {
    299                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
    300                         hexValue -= '0'; break;
    301                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    302                         hexValue -= 'a' - 10; break;
    303                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    304                         hexValue -= 'A' - 10; break;
    305                     default:
    306                         start = i;
    307                     return BROKEN_ESCAPE;
    308                     }
    309                     --hexCount;
    310                     if (hexCount == 0) {
    311                         quoteStatus = NONE;
    312                         UTF16.append(buffer, hexValue);
    313                     }
    314                     continue main;
    315                 case AFTER_QUOTE:
    316                     // see if we get another quote character
    317                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
    318                     if (cp == lastQuote) {
    319                         UTF16.append(buffer, cp);
    320                         quoteStatus = NORMAL_QUOTE;
    321                         continue main;
    322                     }
    323                     quoteStatus = NONE;
    324                     break; // fall through to NONE
    325                 case START_QUOTE:
    326                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
    327                     if (cp == lastQuote) {
    328                         UTF16.append(buffer, cp);
    329                         quoteStatus = NONE; // get out of quote, with no trace remaining
    330                         continue;
    331                     }
    332                     // otherwise get into quote
    333                     UTF16.append(buffer, cp);
    334                     quoteStatus = NORMAL_QUOTE;
    335                     continue main;
    336                 case NORMAL_QUOTE:
    337                     if (cp == lastQuote) {
    338                         quoteStatus = AFTER_QUOTE; // get out of quote
    339                         continue main;
    340                     }
    341                     UTF16.append(buffer, cp);
    342                     continue main;
    343                 }
    344 
    345                 if (ignorableCharacters.contains(cp)) {
    346                     continue;
    347                 }
    348                 // do syntax characters
    349                 if (syntaxCharacters.contains(cp)) {
    350                     if (status == UNKNOWN) {
    351                         UTF16.append(buffer, cp);
    352                         start = i + UTF16.getCharCount(cp);
    353                         return SYNTAX;
    354                     } else { // LITERAL, so back up and break
    355                         start = i;
    356                         return status;
    357                     }
    358                 }
    359                 // otherwise it is a literal; keep on going
    360                 status = LITERAL;
    361                 if (cp == BACK_SLASH) {
    362                     quoteStatus = SLASH_START;
    363                     continue;
    364                 } else if (usingQuote && cp == SINGLE_QUOTE) {
    365                     lastQuote = cp;
    366                     quoteStatus = START_QUOTE;
    367                     continue;
    368                 }
    369                 // normal literals
    370                 UTF16.append(buffer, cp);
    371             }
    372         // handle final cleanup
    373         start = limit;
    374         switch (quoteStatus) {
    375         case HEX:
    376             status = BROKEN_ESCAPE;
    377             break;
    378         case SLASH_START:
    379             if (usingSlash) {
    380                 status = BROKEN_ESCAPE;
    381             } else {
    382                 buffer.append(BACK_SLASH);
    383             }
    384             break;
    385         case START_QUOTE: case NORMAL_QUOTE:
    386             status = BROKEN_QUOTE;
    387             break;
    388         }
    389         return status;
    390     }
    391 
    392 
    393 }
    394 //eof
    395