Home | History | Annotate | Download | only in util
      1 package org.unicode.cldr.util;
      2 
      3 import java.io.IOException;
      4 import java.io.Reader;
      5 
      6 /**
      7  * Extremely simple class for parsing HTML. Extremely lenient. Call next() until
      8  * DONE is returned.
      9  * <p>
     10  * Element content will be returned in the following sequence:
     11  *
     12  * <pre>
     13  *  ELEMENT_START
     14  *  ELEMENT strong
     15  *  ELEMENT_END
     16  *  ELEMENT_CONTENT Alphabetic code
     17  *  ELEMENT_START
     18  *  ELEMENT_POP
     19  *  ELEMENT strong
     20  *  ELEMENT_END
     21  * </pre>
     22  *
     23  * while attributes will be returned as:
     24  *
     25  * <pre>
     26  *  ELEMENT_START
     27  *  ELEMENT div
     28  *  ATTRIBUTE id
     29  *  ATTRIBUTE_CONTENT mainContent
     30  *  ELEMENT_END
     31  * </pre>
     32  *
     33  *
     34  * @author markdavis
     35  *
     36  */
     37 public class SimpleHtmlParser {
     38     public enum Type {
     39         DONE,
     40         /**
     41          * No contents, set when we hit <
     42          */
     43         ELEMENT_START,
     44         /**
     45          * '&lt;' contents/b
     46          */
     47         ELEMENT,
     48         /**
     49          * '&lt;element/bcontents(=...)
     50          */
     51         ATTRIBUTE,
     52         /**
     53          * attribute=['"]contents['"]
     54          */
     55         ATTRIBUTE_CONTENT,
     56         /**
     57          * No contents, set when we hit '&gt'
     58          */
     59         ELEMENT_END,
     60         /**
     61          * No contents, set when we hit '/' after '&lt;'
     62          */
     63         ELEMENT_POP,
     64         /**
     65          * '&lt;!--' contents '--&gt;'
     66          */
     67         QUOTE,
     68         /**
     69          * '&lt;element&gt;' contents '&lt;/element&gt;'
     70          */
     71         ELEMENT_CONTENT
     72     };
     73 
     74     private enum State {
     75         BASE, IN_ELEMENT, AFTER_ELEMENT, IN_CONTENT, IN_ATTRIBUTE, IN_ATTRIBUTE_CONTENT, IN_ATTRIBUTE_CONTENT1, IN_ATTRIBUTE_CONTENT2, ELEMENT_STOP, IN_QUOTE
     76     };
     77 
     78     private Reader input;
     79 
     80     private State state;
     81 
     82     private Type bufferedReturn;
     83 
     84     private int lineCount;
     85 
     86     public SimpleHtmlParser setReader(Reader input) {
     87         this.input = input;
     88         state = State.IN_CONTENT;
     89         bufferedReturn = null;
     90         lineCount = 0;
     91         return this;
     92     }
     93 
     94     public int getLineCount() {
     95         return lineCount;
     96     }
     97 
     98     public Type next(StringBuilder result) throws IOException {
     99         result.setLength(0);
    100         if (bufferedReturn != null) {
    101             if (bufferedReturn == Type.DONE) { // once DONE, stay DONE
    102                 return Type.DONE;
    103             }
    104             Type temp = bufferedReturn;
    105             bufferedReturn = null;
    106             return temp;
    107         }
    108         while (true) {
    109             char ch;
    110             {
    111                 int chi = input.read();
    112                 if (chi < 0) {
    113                     bufferedReturn = Type.DONE;
    114                     chi = 0;
    115                 }
    116                 ch = (char) chi;
    117                 if (ch == '\n') {
    118                     ++lineCount;
    119                 }
    120             }
    121 
    122             switch (state) {
    123             case BASE:
    124                 if (ch == 0xFEFF)
    125                     break;
    126                 // fall through!
    127 
    128             case IN_CONTENT:
    129                 if (ch == '<') {
    130                     state = State.IN_ELEMENT;
    131                     bufferedReturn = Type.ELEMENT_START;
    132                     return Type.ELEMENT_CONTENT;
    133                 }
    134                 if (ch == 0) {
    135                     return Type.ELEMENT_CONTENT;
    136                 }
    137                 result.append(ch);
    138                 break;
    139 
    140             case IN_ELEMENT:
    141                 if (ch <= ' ') {
    142                     if (equals(result, "!--")) {
    143                         state = State.IN_QUOTE;
    144                         result.setLength(0);
    145                         break;
    146                     }
    147                     state = State.AFTER_ELEMENT;
    148                     return Type.ELEMENT;
    149                 }
    150                 if (ch == '>') {
    151                     state = State.IN_CONTENT;
    152                     bufferedReturn = Type.ELEMENT_END;
    153                     return Type.ELEMENT;
    154                 }
    155                 if (ch == '/') {
    156                     return Type.ELEMENT_POP;
    157                 }
    158                 result.append(ch);
    159                 break;
    160 
    161             case AFTER_ELEMENT:
    162                 if (ch <= ' ')
    163                     break;
    164                 if (ch == '>') {
    165                     state = State.IN_CONTENT;
    166                     return Type.ELEMENT_END;
    167                 }
    168                 result.append(ch);
    169                 state = State.IN_ATTRIBUTE;
    170                 break;
    171 
    172             case IN_ATTRIBUTE:
    173                 if (ch <= ' ') {
    174                     state = State.AFTER_ELEMENT;
    175                     return Type.ATTRIBUTE;
    176                 }
    177                 if (ch == '>') {
    178                     state = State.IN_CONTENT;
    179                     bufferedReturn = Type.ELEMENT_END;
    180                     return Type.ATTRIBUTE;
    181                 }
    182                 if (ch == '=') {
    183                     state = State.IN_ATTRIBUTE_CONTENT;
    184                     return Type.ATTRIBUTE;
    185                 }
    186                 result.append(ch);
    187                 break;
    188 
    189             case IN_ATTRIBUTE_CONTENT:
    190                 if (ch <= ' ') {
    191                     break;
    192                 }
    193                 if (ch == '>') {
    194                     state = State.IN_CONTENT;
    195                     bufferedReturn = Type.ELEMENT_END;
    196                     return Type.ATTRIBUTE_CONTENT;
    197                 }
    198                 if (ch == '\'') {
    199                     state = State.IN_ATTRIBUTE_CONTENT1;
    200                     break;
    201                 }
    202                 if (ch == '"') {
    203                     state = State.IN_ATTRIBUTE_CONTENT2;
    204                     break;
    205                 }
    206                 result.append(ch);
    207                 break;
    208 
    209             case IN_ATTRIBUTE_CONTENT1:
    210                 if (ch == 0 || ch == '\'') {
    211                     state = State.AFTER_ELEMENT;
    212                     return Type.ATTRIBUTE_CONTENT;
    213                 }
    214                 result.append(ch);
    215                 break;
    216 
    217             case IN_ATTRIBUTE_CONTENT2:
    218                 if (ch == 0 || ch == '"') {
    219                     state = State.AFTER_ELEMENT;
    220                     return Type.ATTRIBUTE_CONTENT;
    221                 }
    222                 result.append(ch);
    223                 break;
    224 
    225             case IN_QUOTE:
    226                 if (ch == 0) {
    227                     state = State.IN_CONTENT;
    228                     return Type.QUOTE;
    229                 }
    230                 if (ch == '>' && endsWith(result, "--")) {
    231                     result.setLength(result.length() - 2);
    232                     state = State.IN_CONTENT;
    233                     return Type.QUOTE;
    234                 }
    235                 result.append(ch);
    236                 break;
    237             default:
    238             }
    239         }
    240     }
    241 
    242     public static final boolean endsWith(CharSequence a, CharSequence b) {
    243         int aStart = a.length() - b.length();
    244         if (aStart < 0) {
    245             return false;
    246         }
    247         return regionEquals(a, aStart, b, 0, b.length());
    248     }
    249 
    250     public static final boolean equals(CharSequence a, CharSequence b) {
    251         int len = a.length();
    252         if (len != b.length()) {
    253             return false;
    254         }
    255         return regionEquals(a, 0, b, 0, len);
    256     }
    257 
    258     public static boolean regionEquals(CharSequence a, int i, CharSequence b, int j, int len) {
    259         for (; --len >= 0; ++i, ++j) {
    260             if (a.charAt(i) != b.charAt(j)) {
    261                 return false;
    262             }
    263         }
    264         return true;
    265     }
    266 
    267     public static void writeResult(Type type, StringBuilder result, Appendable writer) throws IOException {
    268         switch (type) {
    269         case ELEMENT:
    270             writer.append(result);
    271             break;
    272         case ELEMENT_START:
    273             writer.append('<');
    274             break;
    275         case ELEMENT_END:
    276             writer.append('>');
    277             break;
    278         case ATTRIBUTE:
    279             writer.append(' ').append(result);
    280             break;
    281         case ATTRIBUTE_CONTENT:
    282             writer.append("=\"").append(result).append('"');
    283             break;
    284         case ELEMENT_CONTENT:
    285             writer.append(result);
    286             break;
    287         case ELEMENT_POP:
    288             writer.append('/');
    289             break;
    290         case QUOTE:
    291             writer.append(result);
    292             break;
    293         case DONE:
    294             break;
    295         default:
    296             throw new IllegalArgumentException("Missing case: " + type);
    297         }
    298     }
    299 }
    300