Home | History | Annotate | Download | only in html
      1 // Copyright (c) 2011, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html;
     30 
     31 import com.google.common.collect.ImmutableSet;
     32 import com.google.common.collect.Lists;
     33 import java.util.LinkedList;
     34 import java.util.NoSuchElementException;
     35 import java.util.Set;
     36 
     37 import javax.annotation.concurrent.NotThreadSafe;
     38 
     39 /**
     40  * A flexible lexer for HTML.
     41  * This is hairy code, but it is outside the TCB for the HTML sanitizer.
     42  *
     43  * @author Mike Samuel <mikesamuel (at) gmail.com>
     44  */
     45 @NotThreadSafe
     46 final class HtmlLexer extends AbstractTokenStream {
     47   private final String input;
     48   private final HtmlInputSplitter splitter;
     49   private State state = State.OUTSIDE_TAG;
     50 
     51   public HtmlLexer(String input) {
     52     this.input = input;
     53     this.splitter = new HtmlInputSplitter(input);
     54   }
     55 
     56   /**
     57    * Normalize case of names that are not name-spaced.  This lower-cases HTML
     58    * element and attribute names, but not ones for embedded SVG or MATHML.
     59    */
     60   static String canonicalName(String elementOrAttribName) {
     61     return elementOrAttribName.indexOf(':') >= 0
     62         ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
     63   }
     64 
     65   /**
     66    * An FSM that lets us reclassify text tokens inside tags as attribute
     67    * names/values
     68    */
     69   private static enum State {
     70     OUTSIDE_TAG,
     71     IN_TAG,
     72     SAW_NAME,
     73     SAW_EQ,
     74     ;
     75   }
     76 
     77   /**
     78    * Makes sure that this.token contains a token if one is available.
     79    * This may require fetching and combining multiple tokens from the underlying
     80    * splitter.
     81    */
     82   @Override
     83   protected HtmlToken produce() {
     84     HtmlToken token = readToken();
     85     if (token == null) { return null; }
     86 
     87     switch (token.type) {
     88 
     89       // Keep track of whether we're inside a tag or not.
     90       case TAGBEGIN:
     91         state = State.IN_TAG;
     92         break;
     93       case TAGEND:
     94         if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
     95           // Distinguish <input type=checkbox checked=> from
     96           // <input type=checkbox checked>
     97           pushbackToken(token);
     98           state = State.IN_TAG;
     99           return HtmlToken.instance(
    100               token.start, token.start, HtmlTokenType.ATTRVALUE);
    101         }
    102 
    103         state = State.OUTSIDE_TAG;
    104         break;
    105 
    106       // Drop ignorable tokens by zeroing out the one received and recursing
    107       case IGNORABLE:
    108         return produce();
    109 
    110       // collapse adjacent text nodes if we're outside a tag, or otherwise,
    111       // Recognize attribute names and values.
    112       default:
    113         switch (state) {
    114           case OUTSIDE_TAG:
    115             if (HtmlTokenType.TEXT == token.type
    116                 || HtmlTokenType.UNESCAPED == token.type) {
    117               token = collapseSubsequent(token);
    118             }
    119             break;
    120           case IN_TAG:
    121             if (HtmlTokenType.TEXT == token.type
    122                 && !token.tokenInContextMatches(input, "=")) {
    123               // Reclassify as attribute name
    124               token = HtmlInputSplitter.reclassify(
    125                   token, HtmlTokenType.ATTRNAME);
    126               state = State.SAW_NAME;
    127             }
    128             break;
    129           case SAW_NAME:
    130             if (HtmlTokenType.TEXT == token.type) {
    131               if (token.tokenInContextMatches(input, "=")) {
    132                 state = State.SAW_EQ;
    133                 // Skip the '=' token
    134                 return produce();
    135               } else {
    136                 // Reclassify as attribute name
    137                 token = HtmlInputSplitter.reclassify(
    138                     token, HtmlTokenType.ATTRNAME);
    139               }
    140             } else {
    141               state = State.IN_TAG;
    142             }
    143             break;
    144           case SAW_EQ:
    145             if (HtmlTokenType.TEXT == token.type
    146                 || HtmlTokenType.QSTRING == token.type) {
    147               if (HtmlTokenType.TEXT == token.type) {
    148                 // Collapse adjacent text nodes to properly handle
    149                 //   <a onclick=this.clicked=true>
    150                 //   <a title=foo bar>
    151                 token = collapseAttributeName(token);
    152               }
    153               // Reclassify as value
    154               token = HtmlInputSplitter.reclassify(
    155                   token, HtmlTokenType.ATTRVALUE);
    156               state = State.IN_TAG;
    157             }
    158             break;
    159         }
    160         break;
    161     }
    162 
    163     return token;
    164   }
    165 
    166   /**
    167    * Collapses all the following tokens of the same type into this.token.
    168    */
    169   private HtmlToken collapseSubsequent(HtmlToken token) {
    170     HtmlToken collapsed = token;
    171     for (HtmlToken next;
    172          (next= peekToken(0)) != null && next.type == token.type;
    173          readToken()) {
    174       collapsed = join(collapsed, next);
    175     }
    176     return collapsed;
    177   }
    178 
    179   private HtmlToken collapseAttributeName(HtmlToken token) {
    180     // We want to collapse tokens into the value that are not parts of an
    181     // attribute value.  We should include any space or text adjacent to the
    182     // value, but should stop at any of the following constructions:
    183     //   space end-of-file              e.g. name=foo_
    184     //   space valueless-attrib-name    e.g. name=foo checked
    185     //   space tag-end                  e.g. name=foo />
    186     //   space text space? '='          e.g. name=foo bar=
    187     int nToMerge = 0;
    188     for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
    189       if (t.type == HtmlTokenType.IGNORABLE) {
    190         HtmlToken tok = peekToken(nToMerge + 1);
    191         if (tok == null) { break; }
    192         if (tok.type != HtmlTokenType.TEXT) { break; }
    193         if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
    194           break;
    195         }
    196         HtmlToken eq = peekToken(nToMerge + 2);
    197         if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
    198           eq = peekToken(nToMerge + 3);
    199         }
    200         if (eq == null || eq.tokenInContextMatches(input, "=")) {
    201           break;
    202         }
    203       } else if (t.type != HtmlTokenType.TEXT) {
    204         break;
    205       }
    206       ++nToMerge;
    207     }
    208     if (nToMerge == 0) { return token; }
    209 
    210     int end = token.end;
    211     do {
    212       end = readToken().end;
    213     } while (--nToMerge > 0);
    214 
    215     return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
    216   }
    217 
    218   private static HtmlToken join(HtmlToken a, HtmlToken b) {
    219     return HtmlToken.instance(a.start, b.end, a.type);
    220   }
    221 
    222   private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
    223   private HtmlToken readToken() {
    224     if (!lookahead.isEmpty()) {
    225       return lookahead.remove();
    226     } else if (splitter.hasNext()) {
    227       return splitter.next();
    228     } else {
    229       return null;
    230     }
    231   }
    232 
    233   private HtmlToken peekToken(int i) {
    234     while (lookahead.size() <= i && splitter.hasNext()) {
    235       lookahead.add(splitter.next());
    236     }
    237     return lookahead.size() > i ? lookahead.get(i) : null;
    238   }
    239 
    240   private void pushbackToken(HtmlToken token) {
    241     lookahead.addFirst(token);
    242   }
    243 
    244   /** Can the attribute appear in HTML without a value. */
    245   private static boolean isValuelessAttribute(String attribName) {
    246     boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
    247         Strings.toLowerCase(attribName));
    248     return valueless;
    249   }
    250 
    251   // From http://issues.apache.org/jira/browse/XALANC-519
    252   private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
    253       "checked", "compact", "declare", "defer", "disabled",
    254       "ismap", "multiple", "nohref", "noresize", "noshade",
    255       "nowrap", "readonly", "selected");
    256 }
    257 
    258 /**
    259  * A token stream that breaks a character stream into <tt>
    260  * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt>
    261  * tokens.  The matching of attribute names and values is done in a later step.
    262  */
    263 final class HtmlInputSplitter extends AbstractTokenStream {
    264   /** The source of HTML character data. */
    265   private final String input;
    266   /** An offset into input. */
    267   private int offset;
    268   /** True iff the current character is inside a tag. */
    269   private boolean inTag;
    270   /**
    271    * True if inside a script, xmp, listing, or similar tag whose content does
    272    * not follow the normal escaping rules.
    273    */
    274   private boolean inEscapeExemptBlock;
    275 
    276   /**
    277    * Null or the name of the close tag required to end the current escape exempt
    278    * block.
    279    * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
    280    * contain unescaped HTML input.
    281    */
    282   private String escapeExemptTagName = null;
    283 
    284   private HtmlTextEscapingMode textEscapingMode;
    285 
    286   public HtmlInputSplitter(String input) {
    287     this.input = input;
    288   }
    289 
    290   /**
    291    * Make sure that there is a token ready to yield in this.token.
    292    */
    293   @Override
    294   protected HtmlToken produce() {
    295     HtmlToken token = parseToken();
    296     if (null == token) { return null; }
    297 
    298     // Handle escape-exempt blocks.
    299     // The parse() method is only dimly aware of escape-excempt blocks, so
    300     // here we detect the beginning and ends of escape exempt blocks, and
    301     // reclassify as UNESCAPED, any tokens that appear in the middle.
    302     if (inEscapeExemptBlock) {
    303       if (token.type != HtmlTokenType.SERVERCODE) {
    304         // classify RCDATA as text since it can contain entities
    305         token = reclassify(
    306             token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
    307                     ? HtmlTokenType.TEXT
    308                     : HtmlTokenType.UNESCAPED));
    309       }
    310     } else {
    311       switch (token.type) {
    312         case TAGBEGIN:
    313           {
    314             String canonTagName = canonicalName(
    315                 token.start + 1, token.end);
    316             if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
    317                     canonTagName)) {
    318               this.escapeExemptTagName = canonTagName;
    319               this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
    320                   canonTagName);
    321             }
    322             break;
    323           }
    324         case TAGEND:
    325           this.inEscapeExemptBlock = null != this.escapeExemptTagName;
    326           break;
    327         default:
    328           break;
    329       }
    330     }
    331     return token;
    332   }
    333 
    334   /**
    335    * States for a state machine for optimistically identifying tags and other
    336    * html/xml/phpish structures.
    337    */
    338   private static enum State {
    339     TAGNAME,
    340     SLASH,
    341     BANG,
    342     BANG_DASH,
    343     COMMENT,
    344     COMMENT_DASH,
    345     COMMENT_DASH_DASH,
    346     DIRECTIVE,
    347     DONE,
    348     BOGUS_COMMENT,
    349     SERVER_CODE,
    350     SERVER_CODE_PCT,
    351 
    352     // From HTML 5 section 8.1.2.6
    353 
    354     // The text in CDATA and RCDATA elements must not contain any
    355     // occurrences of the string "</" followed by characters that
    356     // case-insensitively match the tag name of the element followed
    357     // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
    358     // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
    359     // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
    360     // that string is part of an escaping text span.
    361 
    362     // An escaping text span is a span of text (in CDATA and RCDATA
    363     // elements) and character entity references (in RCDATA elements)
    364     // that starts with an escaping text span start that is not itself
    365     // in an escaping text span, and ends at the next escaping text
    366     // span end.
    367 
    368     // An escaping text span start is a part of text that consists of
    369     // the four character sequence "<!--".
    370 
    371     // An escaping text span end is a part of text that consists of
    372     // the three character sequence "-->".
    373 
    374     // An escaping text span start may share its U+002D HYPHEN-MINUS characters
    375     // with its corresponding escaping text span end.
    376     UNESCAPED_LT_BANG,             // <!
    377     UNESCAPED_LT_BANG_DASH,        // <!-
    378     ESCAPING_TEXT_SPAN,            // Inside an escaping text span
    379     ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
    380     ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
    381     ;
    382   }
    383 
    384   private HtmlToken lastNonIgnorable = null;
    385   /**
    386    * Breaks the character stream into tokens.
    387    * This method returns a stream of tokens such that each token starts where
    388    * the last token ended.
    389    *
    390    * <p>This property is useful as it allows fetch to collapse and reclassify
    391    * ranges of tokens based on state that is easy to maintain there.
    392    *
    393    * <p>Later passes are responsible for throwing away useless tokens.
    394    */
    395   private HtmlToken parseToken() {
    396     int start = offset;
    397     int limit = input.length();
    398     if (start == limit) { return null; }
    399 
    400     int end = start + 1;
    401     HtmlTokenType type;
    402 
    403     char ch = input.charAt(start);
    404     if (inTag) {
    405       if ('>' == ch) {
    406         type = HtmlTokenType.TAGEND;
    407         inTag = false;
    408       } else if ('/' == ch) {
    409         if (end != limit && '>' == input.charAt(end)) {
    410           type = HtmlTokenType.TAGEND;
    411           inTag = false;
    412           ++end;
    413         } else {
    414           type = HtmlTokenType.TEXT;
    415         }
    416       } else if ('=' == ch) {
    417         type = HtmlTokenType.TEXT;
    418       } else if ('"' == ch || '\'' == ch) {
    419         type = HtmlTokenType.QSTRING;
    420         int delim = ch;
    421         for (; end < limit; ++end) {
    422           if (input.charAt(end) == delim) {
    423             ++end;
    424             break;
    425           }
    426         }
    427       } else if (!Character.isWhitespace(ch)) {
    428         type = HtmlTokenType.TEXT;
    429         for (; end < limit; ++end) {
    430           ch = input.charAt(end);
    431           // End a text chunk before />
    432           if ((lastNonIgnorable == null
    433                || !lastNonIgnorable.tokenInContextMatches(input, "="))
    434               && '/' == ch && end + 1 < limit
    435               && '>' == input.charAt(end + 1)) {
    436             break;
    437           } else if ('>' == ch || '=' == ch
    438                      || Character.isWhitespace(ch)) {
    439             break;
    440           } else if ('"' == ch || '\'' == ch) {
    441             if (end + 1 < limit) {
    442               char ch2 = input.charAt(end + 1);
    443               if (ch2 >= 0 && Character.isWhitespace(ch2)
    444                   || ch2 == '>' || ch2 == '/') {
    445                 ++end;
    446                 break;
    447               }
    448             }
    449           }
    450         }
    451       } else {
    452         // We skip whitespace tokens inside tag bodies.
    453         type = HtmlTokenType.IGNORABLE;
    454         while (end < limit && Character.isWhitespace(input.charAt(end))) {
    455           ++end;
    456         }
    457       }
    458     } else {
    459       if (ch == '<') {
    460         if (end == limit) {
    461           type = HtmlTokenType.TEXT;
    462         } else {
    463           ch = input.charAt(end);
    464           type = null;
    465           State state = null;
    466           switch (ch) {
    467             case '/':  // close tag?
    468               state = State.SLASH;
    469               ++end;
    470               break;
    471             case '!':  // Comment or declaration
    472               if (!this.inEscapeExemptBlock) {
    473                 state = State.BANG;
    474               } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
    475                              escapeExemptTagName)) {
    476                 // Directives, and cdata suppressed in escape
    477                 // exempt mode as they could obscure the close of the
    478                 // escape exempty block, but comments are similar to escaping
    479                 // text spans, and are significant in all CDATA and RCDATA
    480                 // blocks except those inside <xmp> tags.
    481                 // See "Escaping text spans" in section 8.1.2.6 of HTML5.
    482                 // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
    483                 state = State.UNESCAPED_LT_BANG;
    484               }
    485               ++end;
    486               break;
    487             case '?':
    488               if (!this.inEscapeExemptBlock) {
    489                 state = State.BOGUS_COMMENT;
    490               }
    491               ++end;
    492               break;
    493             case '%':
    494               state = State.SERVER_CODE;
    495               ++end;
    496               break;
    497             default:
    498               if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
    499                 state = State.TAGNAME;
    500                 ++end;
    501               } else if ('<' == ch) {
    502                 type = HtmlTokenType.TEXT;
    503               } else {
    504                 ++end;
    505               }
    506               break;
    507           }
    508           if (null != state) {
    509             charloop:
    510             while (end < limit) {
    511               ch = input.charAt(end);
    512               switch (state) {
    513                 case TAGNAME:
    514                   if (Character.isWhitespace(ch)
    515                       || '>' == ch || '/' == ch || '<' == ch) {
    516                     // End processing of an escape exempt block when we see
    517                     // a corresponding end tag.
    518                     if (this.inEscapeExemptBlock
    519                         && '/' == input.charAt(start + 1)
    520                         && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
    521                         && canonicalName(start + 2, end)
    522                             .equals(escapeExemptTagName)) {
    523                       this.inEscapeExemptBlock = false;
    524                       this.escapeExemptTagName = null;
    525                       this.textEscapingMode = null;
    526                     }
    527                     type = HtmlTokenType.TAGBEGIN;
    528                     // Don't process content as attributes if we're inside
    529                     // an escape exempt block.
    530                     inTag = !this.inEscapeExemptBlock;
    531                     state = State.DONE;
    532                     break charloop;
    533                   }
    534                   break;
    535                 case SLASH:
    536                   if (Character.isLetter(ch)) {
    537                     state = State.TAGNAME;
    538                   } else {
    539                     if ('<' == ch) {
    540                       type = HtmlTokenType.TEXT;
    541                     } else {
    542                       ++end;
    543                     }
    544                     break charloop;
    545                   }
    546                   break;
    547                 case BANG:
    548                   if ('-' == ch) {
    549                     state = State.BANG_DASH;
    550                   } else {
    551                     state = State.DIRECTIVE;
    552                   }
    553                   break;
    554                 case BANG_DASH:
    555                   if ('-' == ch) {
    556                     state = State.COMMENT;
    557                   } else {
    558                     state = State.DIRECTIVE;
    559                   }
    560                   break;
    561                 case COMMENT:
    562                   if ('-' == ch) {
    563                     state = State.COMMENT_DASH;
    564                   }
    565                   break;
    566                 case COMMENT_DASH:
    567                   state = ('-' == ch)
    568                       ? State.COMMENT_DASH_DASH
    569                       : State.COMMENT_DASH;
    570                   break;
    571                 case COMMENT_DASH_DASH:
    572                   if ('>' == ch) {
    573                     state = State.DONE;
    574                     type = HtmlTokenType.COMMENT;
    575                   } else if ('-' == ch) {
    576                     state = State.COMMENT_DASH_DASH;
    577                   } else {
    578                     state = State.COMMENT_DASH;
    579                   }
    580                   break;
    581                 case DIRECTIVE:
    582                   if ('>' == ch) {
    583                     type = HtmlTokenType.DIRECTIVE;
    584                     state = State.DONE;
    585                   }
    586                   break;
    587                 case BOGUS_COMMENT:
    588                   if ('>' == ch) {
    589                     type = HtmlTokenType.QMARKMETA;
    590                     state = State.DONE;
    591                   }
    592                   break;
    593                 case SERVER_CODE:
    594                   if ('%' == ch) {
    595                     state = State.SERVER_CODE_PCT;
    596                   }
    597                   break;
    598                 case SERVER_CODE_PCT:
    599                   if ('>' == ch) {
    600                     type = HtmlTokenType.SERVERCODE;
    601                     state = State.DONE;
    602                   } else if ('%' != ch) {
    603                     state = State.SERVER_CODE;
    604                   }
    605                   break;
    606                 case UNESCAPED_LT_BANG:
    607                   if ('-' == ch) {
    608                     state = State.UNESCAPED_LT_BANG_DASH;
    609                   } else {
    610                     type = HtmlTokenType.TEXT;
    611                     state = State.DONE;
    612                   }
    613                   break;
    614                 case UNESCAPED_LT_BANG_DASH:
    615                   if ('-' == ch) {
    616                     // According to HTML 5 section 8.1.2.6
    617 
    618                     // An escaping text span start may share its
    619                     // U+002D HYPHEN-MINUS characters with its
    620                     // corresponding escaping text span end.
    621                     state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
    622                   } else {
    623                     type = HtmlTokenType.TEXT;
    624                     state = State.DONE;
    625                   }
    626                   break;
    627                 case ESCAPING_TEXT_SPAN:
    628                   if ('-' == ch) {
    629                     state = State.ESCAPING_TEXT_SPAN_DASH;
    630                   }
    631                   break;
    632                 case ESCAPING_TEXT_SPAN_DASH:
    633                   if ('-' == ch) {
    634                     state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
    635                   } else {
    636                     state = State.ESCAPING_TEXT_SPAN;
    637                   }
    638                   break;
    639                 case ESCAPING_TEXT_SPAN_DASH_DASH:
    640                   if ('>' == ch) {
    641                     type = HtmlTokenType.TEXT;
    642                     state = State.DONE;
    643                   } else if ('-' != ch) {
    644                     state = State.ESCAPING_TEXT_SPAN;
    645                   }
    646                   break;
    647                 case DONE:
    648                   throw new AssertionError(
    649                       "Unexpectedly DONE while lexing HTML token stream");
    650               }
    651               ++end;
    652               if (State.DONE == state) { break; }
    653             }
    654             if (end == limit) {
    655               switch (state) {
    656                 case DONE:
    657                   break;
    658                 case BOGUS_COMMENT:
    659                   type = HtmlTokenType.QMARKMETA;
    660                   break;
    661                 case COMMENT:
    662                 case COMMENT_DASH:
    663                 case COMMENT_DASH_DASH:
    664                   type = HtmlTokenType.COMMENT;
    665                   break;
    666                 case DIRECTIVE:
    667                 case SERVER_CODE:
    668                 case SERVER_CODE_PCT:
    669                   type = HtmlTokenType.SERVERCODE;
    670                   break;
    671                 case TAGNAME:
    672                   type = HtmlTokenType.TAGBEGIN;
    673                   break;
    674                 default:
    675                   type = HtmlTokenType.TEXT;
    676                   break;
    677               }
    678             }
    679           }
    680         }
    681       } else {
    682         type = null;
    683       }
    684     }
    685     if (null == type) {
    686       while (end < limit && '<' != input.charAt(end)) { ++end; }
    687       type = HtmlTokenType.TEXT;
    688     }
    689 
    690     offset = end;
    691     HtmlToken result = HtmlToken.instance(start, end, type);
    692     if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
    693     return result;
    694   }
    695 
    696   private String canonicalName(int start, int end) {
    697     return HtmlLexer.canonicalName(input.substring(start, end));
    698   }
    699 
    700   private static boolean isIdentStart(char ch) {
    701     return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
    702   }
    703 
    704   static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
    705     return HtmlToken.instance(token.start, token.end, type);
    706   }
    707 }
    708 
    709 
    710 /**
    711  * A TokenStream that lazily fetches one token at a time.
    712  *
    713  * @author Mike Samuel <mikesamuel (at) gmail.com>
    714  */
    715 abstract class AbstractTokenStream implements TokenStream {
    716   private HtmlToken tok;
    717 
    718   public final boolean hasNext() {
    719     if (tok == null) { tok = produce(); }
    720     return tok != null;
    721   }
    722 
    723   public HtmlToken next() {
    724     if (this.tok == null) { this.tok = produce(); }
    725     HtmlToken t = this.tok;
    726     if (t == null) { throw new NoSuchElementException(); }
    727     this.tok = null;
    728     return t;
    729   }
    730 
    731   protected abstract HtmlToken produce();
    732 }
    733