Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.google.android.mail.common.html.parser;
     17 
     18 import com.google.android.mail.common.base.CharEscapers;
     19 import com.google.android.mail.common.base.CharMatcher;
     20 import com.google.android.mail.common.base.Preconditions;
     21 import com.google.android.mail.common.base.StringUtil;
     22 import com.google.android.mail.common.base.X;
     23 import com.google.common.collect.Lists;
     24 import com.google.common.collect.Maps;
     25 import com.google.common.io.ByteStreams;
     26 
     27 import java.io.IOException;
     28 import java.util.ArrayList;
     29 import java.util.HashMap;
     30 import java.util.LinkedList;
     31 import java.util.List;
     32 import java.util.ListIterator;
     33 import java.util.regex.Matcher;
     34 import java.util.regex.Pattern;
     35 
     36 /**
     37  * HtmlParser is a simple but efficient html parser.
     38  * - It's simple because it does not do incremental parsing like some other
     39  * parser. It assumes that the entire html text is available.
     40  * - It offers 3 levels of aggressiveness in correcting errors in HTML (see
     41  * HtmlParser.ParseStyle).
     42  * - HTML comments are ignored unless initialized with ParseStyle.PRESERVE_ALL.
     43  */
     44 public class HtmlParser {
     45 
     46   // States
     47   private enum State {
     48     IN_TEXT, IN_TAG, IN_COMMENT, IN_CDATA
     49   }
     50 
     51   // The current state
     52   private State state;
     53 
     54   private int clipLength = Integer.MAX_VALUE;
     55   private boolean clipped;
     56 
     57   // The html text
     58   private String html;
     59 
     60   // The entire array of nodes
     61   private List<HtmlDocument.Node> nodes;
     62 
     63   // Turn on for debug information.
     64   private static boolean DEBUG = false;
     65 
     66   // Default whitelist
     67   public static final HtmlWhitelist DEFAULT_WHITELIST = HTML4.getWhitelist();
     68 
     69   // Whitelists for looking up accepted HTML tags and attributes
     70   private List<HtmlWhitelist> whitelists = Lists.newArrayList(DEFAULT_WHITELIST);
     71 
     72   /**
     73    * This setting controls how much of the original HTML is preserved.  In
     74    * ascending order of aggressiveness:
     75    * - PRESERVE_ALL: Preserves all of original content.
     76    * *** Warning - PRESERVE_ALL mode preserves invalid and unsafe HTML. ***
     77    * - PRESERVE_VALID: Preserves only valid visible HTML handled by
     78    * most browsers.  Discards comments, unknown tags and attributes, and
     79    * nameless end tags.  Encodes all '<' characters that aren't part of a tag.
     80    * - NORMALIZE: In addition to the changes made by PRESERVE_VALID, also
     81    *   - unescapes then reescapes text to normalize entities
     82    *   - normalizes whitespace and quotes around tag attributes
     83    */
     84   public enum ParseStyle { NORMALIZE, PRESERVE_VALID, PRESERVE_ALL }
     85 
     86   /**
     87    * True only in PRESERVE_ALL mode.
     88    * @see HtmlParser.ParseStyle
     89    */
     90   private final boolean preserveAll;
     91 
     92   /**
     93    * True in either PRESERVE_ALL or PRESERVE_VALID mode.
     94    * @see HtmlParser.ParseStyle
     95    */
     96   private final boolean preserveValidHtml;
     97 
     98   /**
     99    * @see HtmlParser#HtmlParser(HtmlParser.ParseStyle)
    100    */
    101   public HtmlParser() {
    102     this(ParseStyle.NORMALIZE);
    103   }
    104 
    105   /**
    106    * @param parseStyle Level of aggressiveness for how different
    107    * toHTML()/toXHTML() are from original.
    108    * @see HtmlParser.ParseStyle
    109    */
    110   public HtmlParser(ParseStyle parseStyle) {
    111     preserveAll = (parseStyle == ParseStyle.PRESERVE_ALL);
    112     preserveValidHtml = preserveAll || (parseStyle == ParseStyle.PRESERVE_VALID);
    113   }
    114 
    115   /**
    116    * Sets the maximum length, in characters, of an HTML message.
    117    *
    118    * @param clipLength must be greater than zero.
    119    * (It starts as Integer.MAX_VALUE)
    120    */
    121   public void setClipLength(int clipLength) {
    122     if (clipLength <= 0) {
    123       throw new IllegalArgumentException(
    124         "clipLength '" + clipLength + "' <= 0");
    125     }
    126     this.clipLength = clipLength;
    127   }
    128 
    129   public boolean isClipped() {
    130     return clipped;
    131   }
    132 
    133   /**
    134    * Sets the HTML whitelist. Calling this overrides any whitelist(s) that
    135    * the parser is configured to use. By default, the parser uses the standard
    136    * HTML4 whitelist.
    137    *
    138    * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode.
    139    *
    140    * @param whitelist The whitelist to use. Must not be null.
    141    */
    142   public void setWhitelist(HtmlWhitelist whitelist) {
    143     Preconditions.checkNotNull(whitelist);
    144     whitelists = Lists.newArrayList(whitelist);
    145   }
    146 
    147   /**
    148    * Adds an HTML whitelist to the list of whitelists consulted when
    149    * processing an element or attribute. By default, the parser only uses
    150    * the standard HTML4 whitelist.
    151    *
    152    * Whitelists are consulted in reverse chronological order (starting from
    153    * the most recently added whitelist). The last whitelist consulted will
    154    * always be the standard HTML4 whitelist, unless this was overridden by
    155    * a call to {@link #setWhitelist}.
    156    *
    157    * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode.
    158    *
    159    * @param whitelist The whitelist to use.
    160    */
    161   public void addWhitelist(HtmlWhitelist whitelist) {
    162     whitelists.add(whitelist);
    163   }
    164 
    165   /**
    166    * These are characters that we don't want to allow unquoted in an attribute
    167    * value because they might be interpreted by the browser as HTML control
    168    * characters. They are the 5 characters that are escaped by
    169    * com.google.common.base.CharEscapers.HTML_ESCAPE, plus '=' and whitespace.
    170    * Note that it shouldn't be possible for '>' or whitespace to be parsed as
    171    * part of an unquoted attribute value, but we leave them here for
    172    * completeness.
    173    * Package visibility for unit tests.
    174    */
    175   static Pattern NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX = Pattern.compile("[\"\'&<>=\\s]");
    176 
    177   //------------------------------------------------------------------------
    178   // Parsing
    179   //------------------------------------------------------------------------
    180 
    181   /**
    182    * Parses a String as HTML.
    183    *
    184    * @param html String to parse
    185    * @return an Html document
    186    */
    187   public HtmlDocument parse(String html) {
    188     this.html = html;
    189     // Use a LinkedList because we don't know the number of nodes ahead of
    190     // time. This will be compacted into an ArrayList in coalesceTextNodes().
    191     nodes = Lists.newLinkedList();
    192     state = State.IN_TEXT;
    193 
    194     clipped = false;
    195     int end = html.length();
    196     int clipEnd = Math.min(clipLength, end);
    197 
    198     for (int i = 0; i < end && !clipped;) {
    199 
    200       // At any one time, the parser is in one of these states:
    201       int pos;
    202       switch (state) {
    203         case IN_TEXT:
    204           // text will not attempt to parse beyond the clipping length
    205           pos = scanText(i, clipEnd);
    206           X.assertTrue(pos > i || state != State.IN_TEXT); // Must make progress.
    207           break;
    208 
    209         case IN_TAG:
    210           pos = scanTag(i, end);
    211           X.assertTrue(pos > i);        // Must make progress
    212           break;
    213 
    214         case IN_COMMENT:
    215           pos = scanComment(i, end);
    216           state = State.IN_TEXT;
    217           X.assertTrue(pos > i);        // Must make progress
    218           break;
    219 
    220         case IN_CDATA:
    221           pos = scanCDATA(i, end);
    222           X.assertTrue(pos > i || state != State.IN_CDATA); // Must make progress
    223           break;
    224 
    225         default:
    226           throw new Error("Unknown state!");
    227       }
    228 
    229       i = pos;
    230 
    231       // If we've reached or gone beyond the clipping length, stop.
    232       clipped = pos >= clipLength;
    233     }
    234 
    235     nodes = coalesceTextNodes(nodes);
    236 
    237     HtmlDocument doc = new HtmlDocument(nodes);
    238     nodes = null;
    239     html = null;
    240     return doc;
    241   }
    242 
    243   /**
    244    * During the course of parsing, we may have multiple adjacent Text nodes,
    245    * due to the sanitizer stripping out nodes between Text nodes. It is
    246    * important to coalesce them so that later steps in the pipeline can
    247    * treat the text as a single block (e.g. the step that inserts <wbr> tags).
    248    * @param nodes Original nodes.
    249    * @return Nodes with text nodes changed.
    250    */
    251   static List<HtmlDocument.Node> coalesceTextNodes(
    252       List<HtmlDocument.Node> nodes) {
    253     List<HtmlDocument.Node> out =
    254         new ArrayList<HtmlDocument.Node>(nodes.size());
    255     LinkedList<HtmlDocument.Text> textNodes = Lists.newLinkedList();
    256 
    257     for (HtmlDocument.Node node : nodes) {
    258       if (node instanceof HtmlDocument.Text) {
    259         textNodes.add((HtmlDocument.Text) node);
    260       } else {
    261         mergeTextNodes(textNodes, out);
    262         out.add(node);
    263       }
    264     }
    265     mergeTextNodes(textNodes, out);
    266     return out;
    267   }
    268 
    269   /**
    270    * Flushes any Text nodes in {@code textNodes} into a single Text node
    271    * in {@code output}. {@code textNodes} is guaranteed to be empty when
    272    * the function returns.
    273    * @param textNodes Text nodes.
    274    * @param output Destination to which results are added.
    275    */
    276   private static void mergeTextNodes(LinkedList<HtmlDocument.Text> textNodes,
    277                                      List<HtmlDocument.Node> output) {
    278     if (!textNodes.isEmpty()) {
    279       if (textNodes.size() == 1) {
    280         output.add(textNodes.removeFirst());
    281       } else {
    282         int combinedTextLen = 0;
    283         int combinedInputLen = 0;
    284         for (HtmlDocument.Text text : textNodes) {
    285           combinedTextLen += text.getText().length();
    286           if (text.getOriginalHTML() != null) {
    287             combinedInputLen += text.getOriginalHTML().length();
    288           }
    289         }
    290         StringBuilder combinedText = new StringBuilder(combinedTextLen);
    291         StringBuilder combinedInput = new StringBuilder(combinedInputLen);
    292         while (!textNodes.isEmpty()) {
    293           HtmlDocument.Text text = textNodes.removeFirst();
    294           combinedText.append(text.getText());
    295           if (text.getOriginalHTML() != null) {
    296             combinedInput.append(text.getOriginalHTML());
    297           }
    298         }
    299         String originalInput = combinedInputLen > 0 ? combinedInput.toString() : null;
    300         output.add(HtmlDocument.createText(combinedText.toString(), originalInput));
    301       }
    302     }
    303   }
    304 
    305   //------------------------------------------------------------------------
    306   // Text scanning
    307   //------------------------------------------------------------------------
    308   /**
    309    * A truncated entity is something like <pre>&nbs or &#1a3</pre>.
    310    * We only want to find these at the end of a clipped text.
    311    */
    312   private static final Pattern TRUNCATED_ENTITY =
    313     Pattern.compile("\\& \\#? [0-9a-zA-Z]{0,8} $", Pattern.COMMENTS);
    314 
    315   /**
    316    * In a text mode, scan for a tag
    317    * @param start Position in original html.
    318    * @param end Position in original html.
    319    * @return End position of scanned content.
    320    */
    321   int scanText(final int start, final int end) {
    322     int pos;
    323     for (pos = start; pos < end; pos++) {
    324       char ch = html.charAt(pos);
    325       if (ch == '<' && pos + 1 < end) {
    326         // Check the next char
    327         ch = html.charAt(pos + 1);
    328         if (ch == '/' || Character.isLetter(ch) || ch == '!' || ch == '?') {
    329 
    330           // Check if it's an html comment or tag
    331           if (html.regionMatches(pos + 1, "!--", 0, 3)) {
    332             state = State.IN_COMMENT;
    333           } else {
    334             state = State.IN_TAG;
    335           }
    336           break;
    337         }
    338       }
    339     }
    340 
    341     if (pos > start) {
    342       int finalPos = pos;
    343       String htmlTail = this.html.substring(start, finalPos);
    344 
    345       if ((pos == clipLength) && (clipLength < html.length())) {
    346         // We're clipping this HTML, not running off the end.
    347         // If we're ending with what looks like a truncated entity,
    348         // then clip that part off, too.
    349         // If it really was a truncated entity, great.
    350         // If it was a false positive, the user won't notice that we clipped
    351         // an additional handful of characters.
    352         Matcher matcher = TRUNCATED_ENTITY.matcher(htmlTail);
    353         if (matcher.find()) {
    354           int matchStart = matcher.start();
    355           // The matcher matched in htmlTail, not html.
    356           // htmlTail starts at html[start]
    357           finalPos = start + matchStart;
    358           htmlTail = htmlTail.substring(0, matchStart);
    359         }
    360       }
    361 
    362       if (finalPos > start) {
    363         String originalHtml = null;
    364         if (preserveAll) {
    365           originalHtml = htmlTail;
    366         } else if (preserveValidHtml) {
    367           // the only way htmlTail can start with '<' is if it's the last character
    368           // in html; otherwise, we would have entered State.IN_TAG or
    369           // State.IN_COMMENT above
    370 
    371           // officially a '<' can be valid in a text node, but to be safe we
    372           // always escape them
    373           originalHtml = CharMatcher.is('<').replaceFrom(htmlTail, "&lt;");
    374         }
    375 
    376         HtmlDocument.Text textnode = HtmlDocument.createEscapedText(htmlTail, originalHtml);
    377         nodes.add(textnode);
    378       }
    379     }
    380     return pos;
    381   }
    382 
    383   //------------------------------------------------------------------------
    384   // Tag name scanning utility class
    385   //------------------------------------------------------------------------
    386   private static class TagNameScanner {
    387     private final String html;
    388     private String tagName;
    389     private int startNamePos = -1;
    390     private int endNamePos = -1;
    391 
    392     public TagNameScanner(String html) {
    393       this.html = html;
    394     }
    395 
    396     /**
    397      * Scans for a tag name. Sets #startNamePos and #endNamePos.
    398      * @param start Position in original html.
    399      * @param end Position in original html.
    400      * @return End position of scanned content.
    401      */
    402     public int scanName(final int start, final int end) {
    403       int pos;
    404       for (pos = start; pos < end; pos++) {
    405         char ch = html.charAt(pos);
    406 
    407         // End of tag or end of name.
    408         if ((ch == '>') || (ch == '/') || Character.isWhitespace(ch)) {
    409           break;
    410         }
    411       }
    412       if (pos > start) {
    413         startNamePos = start;
    414         endNamePos = pos;
    415       }
    416       return pos;
    417     }
    418 
    419     /**
    420      * @return Tag name.
    421      */
    422     public String getTagName() {
    423       if (tagName == null && startNamePos != -1 && endNamePos != -1) {
    424         tagName = html.substring(startNamePos, endNamePos);
    425       }
    426       return tagName;
    427     }
    428   }
    429 
    430   //------------------------------------------------------------------------
    431   // Attribute scanning utility class
    432   //------------------------------------------------------------------------
    433   private static class AttributeScanner {
    434     private final String html;
    435     private String name;
    436     private String value;
    437 
    438     // The following have package visibility because they are accessed from
    439     // HtmlParser.addAttribute() to handle preservation of original content
    440     // around the attribute value, but quoting and escaping of the value itself.
    441     int startNamePos = -1;
    442     int endNamePos = -1;
    443     int startValuePos = -1;
    444     int endValuePos = -1;
    445     boolean attrValueIsQuoted = false;
    446 
    447     public AttributeScanner(String html) {
    448       this.html = html;
    449     }
    450 
    451     /**
    452      * Reset to scan another attribute.
    453      */
    454     public void reset() {
    455       startNamePos = -1;
    456       endNamePos = -1;
    457       startValuePos = -1;
    458       endValuePos = -1;
    459       attrValueIsQuoted = false;
    460       name = null;
    461       value = null;
    462     }
    463 
    464     /**
    465      * Scans for a tag attribute name. Sets startNamePos and endNamePos. Sets
    466      * 'attrName'.
    467      *
    468      * @param start Position in original html
    469      * @param end Position in original html
    470      * @return End position of scanned content
    471      */
    472     int scanName(final int start, final int end) {
    473       X.assertTrue(html.charAt(start) != '>');
    474       if (start == end) {
    475         // No attribute name
    476         return start;
    477       }
    478 
    479       int pos;
    480       for (pos = start + 1; pos < end; pos++) {
    481         char ch = html.charAt(pos);
    482 
    483         // End of tag or end of name.
    484         if ((ch == '>') || (ch == '=') || (ch == '/') || Character.isWhitespace(ch)) {
    485           break;
    486         }
    487       }
    488       startNamePos = start;
    489       endNamePos = pos;
    490       return pos;
    491     }
    492 
    493     /**
    494      * Scans for a tag attribute value. Sets startValuePos_ and endValuePos_.
    495      *
    496      * @param start Position in original html
    497      * @param end Position in original html
    498      * @return End position of scanned content
    499      */
    500     int scanValue(final int start, final int end) {
    501       // Skip whitespace before '='.
    502       int pos = skipSpaces(start, end);
    503 
    504       // Handle cases with no attribute value.
    505       if ((pos == end) || (html.charAt(pos) != '=')) {
    506         // Return start so spaces will be parsed as part of next attribute,
    507         // or end of tag.
    508         return start;
    509       }
    510 
    511       // Skip '=' and whitespace after it.
    512       pos++;
    513       pos = skipSpaces(pos, end);
    514 
    515       // Handle cases with no attribute value.
    516       if (pos == end) {
    517         return pos;
    518       }
    519 
    520       // Check for quote character ' or "
    521       char ch = html.charAt(pos);
    522       if (ch == '\'' || ch == '\"') {
    523         attrValueIsQuoted = true;
    524         pos++;
    525         int valueStart = pos;
    526         while (pos < end && html.charAt(pos) != ch) {
    527           pos++;
    528         }
    529         startValuePos = valueStart;
    530         endValuePos = pos;
    531         if (pos < end) {
    532           pos++;                        // Skip the ending quote char
    533         }
    534       } else {
    535         int valueStart = pos;
    536         for (; pos < end; pos++) {
    537           ch = html.charAt(pos);
    538 
    539           // End of tag or end of value. Not that '/' is included in the value
    540           // even if it is the '/>' at the end of the tag.
    541           if ((ch == '>') || Character.isWhitespace(ch)) {
    542             break;
    543           }
    544         }
    545         startValuePos = valueStart;
    546         endValuePos = pos;
    547       }
    548 
    549       X.assertTrue(startValuePos > -1);
    550       X.assertTrue(endValuePos > -1);
    551       X.assertTrue(startValuePos <= endValuePos);
    552       X.assertTrue(pos <= end);
    553 
    554       return pos;
    555     }
    556 
    557     /**
    558      * Skips white spaces.
    559      *
    560      * @param start Position in original html
    561      * @param end Position in original html
    562      * @return End position of scanned content
    563      */
    564     private int skipSpaces(final int start, final int end) {
    565       int pos;
    566       for (pos = start; pos < end; pos++) {
    567         if (!Character.isWhitespace(html.charAt(pos))) {
    568           break;
    569         }
    570       }
    571       return pos;
    572     }
    573 
    574     public String getName() {
    575       if (name == null && startNamePos != -1 && endNamePos != -1) {
    576         name = html.substring(startNamePos, endNamePos);
    577       }
    578       return name;
    579     }
    580 
    581     public String getValue() {
    582       if (value == null && startValuePos != -1 && endValuePos != -1) {
    583         value = html.substring(startValuePos, endValuePos);
    584       }
    585       return value;
    586     }
    587   }
    588 
    589   /**
    590    * Holds any unrecognized elements we encounter.  Only applicable in
    591    * PRESERVE_ALL mode.
    592    */
    593   private final HashMap<String,HTML.Element> unknownElements = Maps.newHashMap();
    594 
    595   /**
    596    * Holds any unrecognized attributes we encounter.  Only applicable in
    597    * PRESERVE_ALL mode.
    598    */
    599   private final HashMap<String,HTML.Attribute> unknownAttributes = Maps.newHashMap();
    600 
    601   /**
    602    * @param name Element name.
    603    * @return "Dummy" element.  Not useful for any real HTML processing, but
    604    * gives us a placeholder for tracking original HTML contents.
    605    */
    606   private HTML.Element lookupUnknownElement(String name) {
    607     name = name.toLowerCase();
    608     HTML.Element result = unknownElements.get(name);
    609     if (result == null) {
    610       result = new HTML.Element(name,
    611           HTML.Element.NO_TYPE,
    612           /* empty */ false,
    613           /* optional end tag */ true,
    614           /* breaks flow*/ false,
    615           HTML.Element.Flow.NONE);
    616       unknownElements.put(name, result);
    617     }
    618     return result;
    619   }
    620 
    621   /**
    622    * @param name Attribute name.
    623    * @return "Dummy" attribute. Not useful for any real HTML processing, but
    624    *         gives us a placeholder for tracking original HTML contents.
    625    */
    626   private HTML.Attribute lookupUnknownAttribute(String name) {
    627     name = name.toLowerCase();
    628     HTML.Attribute result = unknownAttributes.get(name);
    629     if (result == null) {
    630       result = new HTML.Attribute(name, HTML.Attribute.NO_TYPE);
    631       unknownAttributes.put(name, result);
    632     }
    633     return result;
    634   }
    635 
    636   /**
    637    * Scans for an HTML tag.
    638    *
    639    * @param start Position in original html.
    640    * @param end Position in original html.
    641    * @return End position of scanned content.
    642    */
    643   int scanTag(final int start, final int end) {
    644     X.assertTrue(html.charAt(start) == '<');
    645 
    646     // nameStart is where we begin scanning for the tag name and attributes,
    647     // so we skip '<'.
    648     int nameStart = start + 1;
    649 
    650     // Next state is Text, except the case when we see a STYLE/SCRIPT tag. See
    651     // code below.
    652     state = State.IN_TEXT;
    653 
    654     // End tag?
    655     boolean isEndTag = false;
    656     if (html.charAt(nameStart) == '/') {
    657       isEndTag = true;
    658       ++nameStart;
    659     }
    660 
    661     // Tag name and element
    662     TagNameScanner tagNameScanner = new TagNameScanner(html);
    663     int pos = tagNameScanner.scanName(nameStart, end);
    664     String tagName = tagNameScanner.getTagName();
    665     HTML.Element element = null;
    666     if (tagName == null) {
    667       // For some reason, browsers treat start and end tags differently
    668       // when they don't have a valid tag name - end tags are swallowed
    669       // (e.g., "</ >"), start tags treated as text (e.g., "< >")
    670       if (!isEndTag) {
    671         // This is not really a tag, treat the '<' as text.
    672         HtmlDocument.Text text = HtmlDocument.createText("<", preserveAll ? "<" : null);
    673         nodes.add(text);
    674         state = State.IN_TEXT;
    675         return nameStart;
    676       }
    677 
    678       if (preserveAll) {
    679         element = lookupUnknownElement("");
    680       }
    681     } else {
    682       element = lookupElement(tagName);
    683       if (element == null) {
    684         if (DEBUG) {
    685           // Unknown element
    686           debug("Unknown element: " + tagName);
    687         }
    688         if (preserveAll) {
    689           element = lookupUnknownElement(tagName);
    690         }
    691       }
    692     }
    693 
    694     // Attributes
    695     boolean isSingleTag = false;
    696     ArrayList<HtmlDocument.TagAttribute> attributes = null;
    697     int allAttributesStartPos = pos;
    698     int nextAttributeStartPos = pos;
    699     AttributeScanner attributeScanner = new AttributeScanner(html);
    700     while (pos < end) {
    701       int startPos = pos;
    702       char ch = html.charAt(pos);
    703 
    704       // Are we at the end of the tag?
    705       if ((pos + 1 < end) && (ch == '/') && (html.charAt(pos + 1) == '>')) {
    706         isSingleTag = true;
    707         ++pos;
    708         break;                          // Done
    709       }
    710       if (ch == '>') {
    711         break;                          // Done
    712       }
    713 
    714       // See bug 870742 (Buganizer).
    715       if (isEndTag && ('<' == ch)) {
    716         // '<' not allowed in end tag, so we finish processing this tag and
    717         // return to State.IN_TEXT. We mimic Safari & Firefox, which both
    718         // terminate the tag when it contains a '<'.
    719         if (element != null) {
    720           addEndTag(element, start, allAttributesStartPos, pos);
    721         }
    722         state = State.IN_TEXT;
    723         return pos;
    724       }
    725 
    726       if (Character.isWhitespace(ch)) {
    727         // White space, skip it.
    728         ++pos;
    729       } else {
    730         // Scan for attribute
    731         attributeScanner.reset();
    732         pos = attributeScanner.scanName(pos, end);
    733         X.assertTrue(pos > startPos);
    734 
    735         // If it's a valid attribute, scan attribute values
    736         if (attributeScanner.getName() != null) {
    737           pos = attributeScanner.scanValue(pos, end);
    738 
    739           // Add the attribute to the list
    740           if (element != null) {
    741             if (attributes == null) {
    742               attributes = new ArrayList<HtmlDocument.TagAttribute>();
    743             }
    744             addAttribute(attributes, attributeScanner, nextAttributeStartPos, pos);
    745           }
    746           nextAttributeStartPos = pos;
    747         }
    748       }
    749 
    750       // Make sure that we make progress!
    751       X.assertTrue(pos > startPos);
    752     }
    753 
    754     // Cannot find the close tag, so we treat this as text
    755     if (pos == end) {
    756       X.assertTrue(start < end);
    757       String textNodeContent = html.substring(start, end);
    758       String originalContent = null;
    759       if (preserveAll) {
    760         originalContent = textNodeContent;
    761       } else if (preserveValidHtml) {
    762         // Officially a '<' can be valid in a text node, but to be safe we
    763         // always escape them.
    764         originalContent =
    765             CharMatcher.is('<').replaceFrom(html.substring(start, end), "&lt;");
    766       }
    767       nodes.add(HtmlDocument.createEscapedText(textNodeContent, originalContent));
    768       return end;
    769     }
    770 
    771     // Skip '>'
    772     X.assertTrue(html.charAt(pos) == '>');
    773     pos++;
    774 
    775     // Check if it's an element we're keeping (either an HTML4 element, or an
    776     // unknown element we're preserving). If not, ignore the tag.
    777     if (element != null) {
    778       if (isEndTag) {
    779         addEndTag(element, start, allAttributesStartPos, pos);
    780       } else {
    781         // Special case: if it's a STYLE/SCRIPT element, we go to into
    782         // CDATA state.
    783         if (HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element)) {
    784           state = State.IN_CDATA;
    785         }
    786 
    787         addStartTag(element, start, allAttributesStartPos,
    788             nextAttributeStartPos,
    789             pos, isSingleTag, attributes);
    790       }
    791     }
    792 
    793     return pos;
    794   }
    795 
    796   /**
    797    * Lookups the element in our whitelist(s). Whitelists are consulted in
    798    * reverse chronological order (starting from the most recently added
    799    * whitelist), allowing clients to override the default behavior.
    800    *
    801    * @param name Element name.
    802    * @return Element.
    803    */
    804   HTML.Element lookupElement(String name) {
    805     ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size());
    806     while (iter.hasPrevious()) {
    807       HTML.Element elem = iter.previous().lookupElement(name);
    808       if (elem != null) {
    809         return elem;
    810       }
    811     }
    812     return null;
    813   }
    814 
    815   /**
    816    * Lookups the attribute in our whitelist(s). Whitelists are consulted in
    817    * reverse chronological order (starting from the most recently added
    818    * whitelist), allowing clients to override the default behavior.
    819    *
    820    * @param name Attribute name.
    821    * @return Attribute.
    822    */
    823   HTML.Attribute lookupAttribute(String name) {
    824     ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size());
    825     while (iter.hasPrevious()) {
    826       HTML.Attribute attr = iter.previous().lookupAttribute(name);
    827       if (attr != null) {
    828         return attr;
    829       }
    830     }
    831     return null;
    832   }
    833 
    834   /**
    835    * @param element Tag element
    836    * @param startPos Start of tag, including '<'
    837    * @param startAttributesPos Start of attributes. This is the first
    838    * character after the tag name. If there are no attributes, this is the end
    839    * of the tag.
    840    * @param endAttributesPos First position after last attribute
    841    * @param endPos End of tag, including '>' character
    842    * @param isSingleTag True iff this is a self-terminating tag
    843    * @param attributes Tag attributes
    844    */
    845   private void addStartTag(HTML.Element element, final int startPos,
    846       final int startAttributesPos, final int endAttributesPos,
    847       final int endPos, final boolean isSingleTag,
    848       ArrayList<HtmlDocument.TagAttribute> attributes) {
    849     X.assertTrue(startPos < startAttributesPos);
    850     X.assertTrue(startAttributesPos <= endAttributesPos);
    851     X.assertTrue(endAttributesPos <= endPos);
    852 
    853     if (preserveAll) {
    854       String beforeAttrs = html.substring(startPos, startAttributesPos);
    855       String afterAttrs = html.substring(endAttributesPos, endPos);
    856       HtmlDocument.Tag tag = (isSingleTag)
    857           ? HtmlDocument.createSelfTerminatingTag(element, attributes,
    858               beforeAttrs, afterAttrs)
    859           : HtmlDocument.createTag(element, attributes,
    860               beforeAttrs, afterAttrs);
    861       nodes.add(tag);
    862     } else if (preserveValidHtml) {
    863       // This is the beginning of the tag up through the tag name. It should not
    864       // be possible for this to contain characters needing escaping, but we add
    865       // this redundant check to avoid an XSS attack that might get past our
    866       // parser but trick a browser into executing a script.
    867       X.assertTrue(html.charAt(startPos) == '<');
    868       StringBuilder beforeAttrs = new StringBuilder("<");
    869       String tagName = html.substring(startPos + 1, startAttributesPos);
    870       beforeAttrs.append(CharEscapers.asciiHtmlEscaper().escape(tagName));
    871 
    872       // Verify end-of-tag characters
    873       int endContentPos = endPos - 1;
    874       X.assertTrue(html.charAt(endContentPos) == '>');
    875       if (isSingleTag) {
    876         --endContentPos;
    877         X.assertTrue(html.charAt(endContentPos) == '/');
    878       }
    879       X.assertTrue(endAttributesPos <= endContentPos);
    880 
    881       // This is any extra characters between the last attribute and the end of
    882       // the tag.
    883       X.assertTrue(endAttributesPos < endPos);
    884       String afterAttrs = html.substring(endAttributesPos, endPos);
    885 
    886       // Strip all but preceding whitespace.
    887       HtmlDocument.Tag tag = (isSingleTag)
    888           ? HtmlDocument.createSelfTerminatingTag(element, attributes,
    889               beforeAttrs.toString(), afterAttrs)
    890           : HtmlDocument.createTag(element, attributes,
    891               beforeAttrs.toString(), afterAttrs);
    892       nodes.add(tag);
    893     } else {
    894       // Normalize.
    895       HtmlDocument.Tag tag = (isSingleTag)
    896           ? HtmlDocument.createSelfTerminatingTag(element, attributes)
    897           : HtmlDocument.createTag(element, attributes);
    898       nodes.add(tag);
    899     }
    900   }
    901 
    902  /**
    903    * @param element End tag element.
    904    * @param startPos Start of tag, including '<'.
    905    * @param startAttributesPos Start of attributes. This is the first
    906    * character after the tag name. If there are no attributes, this is the end
    907    * of the tag.
    908    * @param endPos End of tag. This usually contains the '>' character, but in
    909    * the case where browsers force a termination of a malformed tag, it doesn't.
    910    */
    911   private void addEndTag(HTML.Element element, final int startPos,
    912       final int startAttributesPos, final int endPos) {
    913     X.assertTrue(element != null);
    914     X.assertTrue(html.charAt(startPos) == '<');
    915     X.assertTrue(html.charAt(startPos + 1) == '/');
    916 
    917     if (preserveAll) {
    918       // Preserve all: keep actual content even if it's malformed.
    919       X.assertTrue(startPos < endPos);
    920       String content = html.substring(startPos, endPos);
    921       nodes.add(HtmlDocument.createEndTag(element, content));
    922     } else if (preserveValidHtml) {
    923       // Preserve valid: terminate the tag.
    924 
    925       StringBuilder validContent = new StringBuilder("</");
    926 
    927       // This is the beginning of the tag up through the tag name. It should not
    928       // be possible for this to contain characters needing escaping, but we add
    929       // this redundant check to avoid an XSS attack that might get past our
    930       // parser but trick a browser into executing a script.
    931       X.assertTrue(startPos < startAttributesPos);
    932       String tagName = html.substring(startPos + 2, startAttributesPos);
    933       validContent.append(CharEscapers.asciiHtmlEscaper().escape(tagName));
    934 
    935       // This is the rest of the tag, including any attributes.
    936       // See bug 874396 (Buganizer). We don't allow attributes in an end tag.
    937       X.assertTrue(startAttributesPos <= endPos);
    938       String endOfTag = html.substring(startAttributesPos, endPos);
    939       if (endOfTag.charAt(endOfTag.length() - 1) != '>') {
    940         endOfTag += '>';
    941       }
    942 
    943       // Strip everything but leading whitespace.
    944       validContent.append(endOfTag.replaceAll("\\S+.*>", ">"));
    945 
    946       nodes.add(HtmlDocument.createEndTag(element, validContent.toString()));
    947     } else {
    948       // Normalize: ignore the original content.
    949       nodes.add(HtmlDocument.createEndTag(element));
    950     }
    951   }
    952 
    953   /**
    954    * Creates and adds an attribute to the list.
    955    *
    956    * @param attributes Destination of new attribute.
    957    * @param scanner Scanned attribute.
    958    * @param startPos start position (inclusive) in original HTML of this
    959    *        attribute, including preceeding separator characters (generally this
    960    *        is whitespace, but it might contain other characters). This is the
    961    *        end position of the tag name or previous attribute +1.
    962    * @param endPos end position (exclusive) in original HTML of this attribute.
    963    */
    964   private void addAttribute(ArrayList<HtmlDocument.TagAttribute> attributes,
    965       AttributeScanner scanner, final int startPos, final int endPos) {
    966     X.assertTrue(startPos < endPos);
    967 
    968     String name = scanner.getName();
    969     X.assertTrue(name != null);
    970     HTML.Attribute htmlAttribute = lookupAttribute(name);
    971 
    972     // This can be null when there's no value, e.g., input.checked attribute.
    973     String value = scanner.getValue();
    974 
    975     if (htmlAttribute == null) {
    976       // Unknown attribute.
    977       if (DEBUG) {
    978         debug("Unknown attribute: " + name);
    979       }
    980       if (preserveAll) {
    981         String original = html.substring(startPos, endPos);
    982         attributes.add(HtmlDocument.createTagAttribute(
    983             lookupUnknownAttribute(name), value, original));
    984       }
    985     } else {
    986       String unescapedValue = (value == null) ? null : StringUtil.unescapeHTML(value);
    987       if (preserveAll) {
    988         attributes.add(HtmlDocument.createTagAttribute(htmlAttribute,
    989             unescapedValue, html.substring(startPos, endPos)));
    990       } else if (preserveValidHtml) {
    991         StringBuilder original = new StringBuilder();
    992 
    993         // This includes any separator characters between the tag name or
    994         // preceding attribute and this one.
    995         // This addresses bugs 870757 and 875303 (Buganizer).
    996         // Don't allow non-whitespace separators between attributes.
    997         X.assertTrue(startPos <= scanner.startNamePos);
    998         String originalPrefix = html.substring(
    999             startPos, scanner.startNamePos).replaceAll("\\S+", "");
   1000         if (originalPrefix.length() == 0) {
   1001           originalPrefix = " ";
   1002         }
   1003         original.append(originalPrefix);
   1004 
   1005         if (value == null) {
   1006           // This includes the name and any following whitespace. Escape in case
   1007           // the name has any quotes or '<' that could confuse a browser.
   1008           X.assertTrue(scanner.startNamePos < endPos);
   1009           String nameEtc = html.substring(scanner.startNamePos, endPos);
   1010           original.append(CharEscapers.asciiHtmlEscaper().escape(nameEtc));
   1011         } else {
   1012           // Escape name in case the name has any quotes or '<' that could
   1013           // confuse a browser.
   1014           original.append(CharEscapers.asciiHtmlEscaper().escape(name));
   1015 
   1016           // This includes the equal sign, and any other whitespace
   1017           // between the name and value. It also contains the opening quote
   1018           // character if there is one.
   1019           X.assertTrue(scanner.endNamePos < scanner.startValuePos);
   1020           original.append(html.substring(scanner.endNamePos, scanner.startValuePos));
   1021 
   1022           // This is the value, excluding any quotes.
   1023           if (scanner.attrValueIsQuoted) {
   1024             // Officially a '<' can be valid in an attribute value, but to be
   1025             // safe we always escape them.
   1026             original.append(value.replaceAll("<", "&lt;"));
   1027           } else {
   1028             // This addresses bug 881426 (Buganizer). Put quotes around any
   1029             // dangerous characters, which is what most of the browsers do.
   1030             if (NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX.matcher(value).find()) {
   1031               original.append('"');
   1032               original.append(value.replaceAll("\"", "&quot;"));
   1033               original.append('"');
   1034             } else {
   1035               original.append(value);
   1036             }
   1037           }
   1038 
   1039           // This includes end quote, if applicable.
   1040           X.assertTrue(scanner.endValuePos <= endPos);
   1041           original.append(html.substring(scanner.endValuePos, endPos));
   1042         }
   1043 
   1044         attributes.add(HtmlDocument.createTagAttribute(
   1045             htmlAttribute, unescapedValue, original.toString()));
   1046       } else {
   1047         attributes.add(HtmlDocument.createTagAttribute(
   1048             htmlAttribute, unescapedValue));
   1049       }
   1050     }
   1051   }
   1052 
   1053   //------------------------------------------------------------------------
   1054   // Comment scanning
   1055   //------------------------------------------------------------------------
   1056   private static final String START_COMMENT = "<!--";
   1057   private static final String END_COMMENT = "-->";
   1058 
   1059   private int scanComment(final int start, final int end) {
   1060 
   1061     X.assertTrue(html.regionMatches(start, START_COMMENT, 0, START_COMMENT.length()));
   1062 
   1063     // Scan for end of comment
   1064     int pos = html.indexOf(END_COMMENT, start + START_COMMENT.length());
   1065     if (pos != -1) {
   1066       pos += END_COMMENT.length();
   1067     } else {
   1068       // Look for '>'. If we can't find that, the rest of the text is comments.
   1069       pos = html.indexOf('>', start + 4);
   1070       if (pos != -1) {
   1071         ++pos;
   1072       } else {
   1073         pos = end;
   1074       }
   1075     }
   1076 
   1077     if (preserveAll) {
   1078       nodes.add(HtmlDocument.createHtmlComment(html.substring(start, pos)));
   1079     }
   1080 
   1081     return pos;
   1082   }
   1083 
   1084   //------------------------------------------------------------------------
   1085   // CDATA scanning
   1086   //------------------------------------------------------------------------
   1087   int scanCDATA(final int start, final int end) {
   1088 
   1089     // Get the tag: must be either STYLE or SCRIPT
   1090     HtmlDocument.Tag tag = (HtmlDocument.Tag) nodes.get(nodes.size() - 1);
   1091     HTML.Element element = tag.getElement();
   1092     X.assertTrue(HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element));
   1093 
   1094     int pos;
   1095     for (pos = start; pos < end; pos++) {
   1096       if (pos + 2 < end &&
   1097           html.charAt(pos) == '<' &&
   1098           html.charAt(pos + 1) == '/' &&
   1099           html.regionMatches(true, pos + 2, element.getName(), 0,
   1100                               element.getName().length())) {
   1101         break;
   1102       }
   1103     }
   1104 
   1105     // Add a CDATA node
   1106     if (pos > start) {
   1107       HtmlDocument.CDATA cdata =
   1108         HtmlDocument.createCDATA(html.substring(start, pos));
   1109       nodes.add(cdata);
   1110     }
   1111 
   1112     state = State.IN_TAG;
   1113     return pos;
   1114   }
   1115 
   1116   //------------------------------------------------------------------------
   1117   public static void main(String[] args) throws IOException {
   1118 
   1119     DEBUG = true;
   1120 
   1121     String html = new String(ByteStreams.toByteArray(System.in), "ISO-8859-1");
   1122 
   1123     HtmlParser parser = new HtmlParser();
   1124     HtmlDocument doc = parser.parse(html);
   1125     System.out.println(doc.toString());
   1126   }
   1127 
   1128   private static void debug(String str) {
   1129     System.err.println(str);
   1130   }
   1131 }