Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.android.mail.common.html.parser;
     18 
     19 import com.google.android.mail.common.base.CharEscapers;
     20 import com.google.android.mail.common.base.CharMatcher;
     21 import com.google.android.mail.common.base.StringUtil;
     22 import com.google.android.mail.common.base.X;
     23 import com.google.common.collect.Lists;
     24 
     25 import java.io.PrintWriter;
     26 import java.io.StringWriter;
     27 import java.util.ArrayList;
     28 import java.util.Arrays;
     29 import java.util.List;
     30 
     31 
     32 /**
     33  * HtmlDocument is a container for a list of html nodes, and represents the
     34  * entire html document. It provides toHTML(), which prints out the html
     35  * text, toXHTML(), which prints out XHTML text, and toString(), which
     36  * prints the document in debug format.
     37  *
     38  * @author jlim (at) google.com (Jing Yee Lim)
     39  */
     40 public class HtmlDocument {
     41   /** List of Node objects */
     42   private final List<Node> nodes;
     43 
     /**
      * Wraps an existing node list as a html document.
      *
      * <p>The list is stored by reference, not copied, so later changes made to
      * it by the caller are visible through this document.
      *
      * @param nodes list of html nodes; serialized in list order
      */
     public HtmlDocument(List<Node> nodes) {
       this.nodes = nodes;
     }
     51 
     52   /** Gets the list of nodes */
     53   public List<Node> getNodes() {
     54     return nodes;
     55   }
     56 
     57   /** Returns a HTML string for the current document */
     58   public String toHTML() {
     59     StringBuilder sb = new StringBuilder(nodes.size() * 10);
     60     for (Node n : nodes) {
     61       n.toHTML(sb);
     62     }
     63     return sb.toString();
     64   }
     65 
     66   /** Returns a XHTML string for the current document */
     67   public String toXHTML() {
     68     StringBuilder sb = new StringBuilder(nodes.size() * 10);
     69     for (Node n : nodes) {
     70       n.toXHTML(sb);
     71     }
     72     return sb.toString();
     73   }
     74 
     75   /**
     76    * Returns, as much as possible, original content of preparsed nodes.  This
     77    * is only different from toHTML() if the nodes were created with original
     78    * content, e.g., by HtmlParser in preserve mode.
     79    */
     80   public String toOriginalHTML() {
     81     StringBuilder sb = new StringBuilder(nodes.size() * 10);
     82     for (Node n : nodes) {
     83       n.toOriginalHTML(sb);
     84     }
     85     return sb.toString();
     86   }
     87 
     88   /** Returns the HTML document in debug format */
     89   @Override
     90   public String toString() {
     91     StringWriter strWriter = new StringWriter();
     92     accept(new DebugPrinter(new PrintWriter(strWriter)));
     93     return strWriter.toString();
     94   }
     95 
     96   /**
     97    * Creates start Tag Node.
     98    * @see HtmlDocument#createTag(HTML.Element, List, String, String)
     99    */
    100   public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) {
    101     return createTag(element, attributes, null, null);
    102   }
    103 
    104   /**
    105    * Creates start Tag Node.
    106    * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
    107    */
    108   public static Tag createTag(HTML.Element element,
    109       List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
    110       String originalHtmlAfterAttributes) {
    111     return new Tag(element, attributes, false, originalHtmlBeforeAttributes,
    112         originalHtmlAfterAttributes);
    113   }
    114 
    115   /**
    116    * Creates self-terminating Tag Node.
    117    * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String)
    118    */
    119   public static Tag createSelfTerminatingTag(HTML.Element element,
    120       List<TagAttribute> attributes) {
    121     return createSelfTerminatingTag(element, attributes, null, null);
    122   }
    123 
    124   /**
    125    * Creates self-terminating Tag Node.
    126    * @see HtmlDocument#createTag(HTML.Element, List, String, String)
    127    */
    128   public static Tag createSelfTerminatingTag(HTML.Element element,
    129       List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
    130       String originalHtmlAfterAttributes) {
    131     return new Tag(element, attributes, true, originalHtmlBeforeAttributes,
    132         originalHtmlAfterAttributes);
    133   }
    134 
    135   /**
    136    * @see HtmlDocument#createEndTag(HTML.Element, String)
    137    */
    138   public static EndTag createEndTag(HTML.Element element) {
    139     return createEndTag(element, null);
    140   }
    141 
    142   /**
    143    * @see HtmlDocument.EndTag#EndTag(HTML.Element, String)
    144    */
    145   public static EndTag createEndTag(HTML.Element element, String originalHtml) {
    146     return new EndTag(element, originalHtml);
    147   }
    148 
    149   /**
    150    * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String)
    151    */
    152   public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) {
    153     return createTagAttribute(attr, value, null);
    154   }
    155 
    156   /**
    157    * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String)
    158    */
    159   public static TagAttribute createTagAttribute(HTML.Attribute attr,
    160       String value, String originalHtml) {
    161     X.assertTrue(attr != null);
    162     return new TagAttribute(attr, value, originalHtml);
    163   }
    164 
    165   /**
    166    * @see HtmlDocument#createText(String, String)
    167    */
    168   public static Text createText(String text) {
    169     return createText(text, null);
    170   }
    171 
    172   /**
    173    * Creates a Text node.
    174    * @see UnescapedText#UnescapedText(String, String)
    175    */
    176   public static Text createText(String text, String original) {
    177     return new UnescapedText(text, original);
    178   }
    179 
    180   /**
    181    * Creates a Text node where the content hasn't been unescaped yet (this will
    182    * be done lazily).
    183    */
    184   public static Text createEscapedText(String htmlText, String original) {
    185     return new EscapedText(htmlText, original);
    186   }
    187 
    188   /**
    189    * Creates an Comment node.
    190    * @see Comment#Comment(String)
    191    */
    192   public static Comment createHtmlComment(String content) {
    193     return new Comment(content);
    194   }
    195 
    196   /**
    197    * Creates a CDATA node.
    198    * @see CDATA#CDATA(String)
    199    */
    200   public static CDATA createCDATA(String text) {
    201     return new CDATA(text);
    202   }
    203 
    204   /** Accepts a Visitor */
    205   public void accept(Visitor v) {
    206     v.start();
    207     for (Node node : nodes) {
    208       node.accept(v);
    209     }
    210     v.finish();
    211   }
    212 
    213   /**
    214    * @param filter results of this filter replace the existing nodes
    215    * @return new document with filtered nodes
    216    */
    217   public HtmlDocument filter(MultiplexFilter filter) {
    218     filter.start();
    219     List<Node> newNodes = new ArrayList<Node>();
    220     for (Node node : nodes) {
    221       filter.filter(node, newNodes);
    222     }
    223     filter.finish(newNodes);
    224     return new HtmlDocument(newNodes);
    225   }
    226 
    227   /**
    228    * Html node
    229    */
    230   public static abstract class Node {
    231 
    232     /** Accepts a visitor */
    233     public abstract void accept(Visitor visitor);
    234 
    235     /** Converts to HTML */
    236     public String toHTML() {
    237       StringBuilder sb = new StringBuilder();
    238       toHTML(sb);
    239       return sb.toString();
    240     }
    241 
    242     /** Converts to HTML */
    243     public abstract void toHTML(StringBuilder sb);
    244 
    245     /** Converts to XHTML */
    246     public String toXHTML() {
    247       StringBuilder sb = new StringBuilder();
    248       toXHTML(sb);
    249       return sb.toString();
    250     }
    251 
    252     /** Converts to XHTML */
    253     public abstract void toXHTML(StringBuilder sb);
    254 
    255     /**
    256      * @return Original if it's available; otherwise, returns
    257      * <code>toHTML()</code>
    258      */
    259     public String toOriginalHTML() {
    260       StringBuilder sb = new StringBuilder();
    261       toOriginalHTML(sb);
    262       return sb.toString();
    263     }
    264 
    265     /**
    266      * @param sb Destination of HTML to be appended.  Appends original if it's
    267      * available; otherwise, appends <code>toHTML()</code>
    268      */
    269     public abstract void toOriginalHTML(StringBuilder sb);
    270   }
    271 
    272   /**
    273    * HTML comment node.
    274    */
    275   public static class Comment extends Node {
    276 
    277     private final String content;
    278 
    279     /**
    280      * @param content Raw comment, including "&lt;!--" and "--&gt;".
    281      */
    282     public Comment(String content) {
    283       this.content = content;
    284     }
    285 
    286     @Override
    287     public void accept(Visitor visitor) {
    288       visitor.visitComment(this);
    289     }
    290 
    291     /**
    292      * Emit original unchanged.
    293      * @param sb Destination of result.
    294      */
    295     @Override
    296     public void toHTML(StringBuilder sb) {
    297       sb.append(content);
    298     }
    299 
    300     /**
    301      * Emit original unchanged.
    302      * @param sb Destination of result.
    303      */
    304     @Override
    305     public void toXHTML(StringBuilder sb) {
    306       sb.append(content);
    307     }
    308 
    309     /**
    310      * Emit original unchanged.
    311      * @param sb Destination of result.
    312      */
    313     @Override
    314     public void toOriginalHTML(StringBuilder sb) {
    315       sb.append(content);
    316     }
    317 
    318     /**
    319      * @return Original unchanged.
    320      */
    321     public String getContent() {
    322       return content;
    323     }
    324   }
    325 
    326   /**
    327    * Text node
    328    */
    329   public static abstract class Text extends Node {
    330 
    331     /**
    332      * unaltered original content of this node
    333      */
    334     private final String originalHtml;
    335 
    336     /**
    337      * content of this node in HTML format
    338      */
    339     private String html;
    340 
    341     /**
    342      * @param originalHtml Unaltered original HTML. If not null,
    343      *        toOriginalHTML() will return this.
    344      */
    345     protected Text(String originalHtml) {
    346       this.originalHtml = originalHtml;
    347     }
    348 
    349     /**
    350      * Gets the plain, unescaped text.
    351      */
    352     abstract public String getText();
    353 
    354     // Returns true if it contains only white space
    355     public boolean isWhitespace() {
    356       String text = getText();
    357       int len = text.length();
    358       for (int i = 0; i < len; i++) {
    359         if (!Character.isWhitespace(text.charAt(i))) {
    360           return false;
    361         }
    362       }
    363       return true;
    364     }
    365 
    366     @Override
    367     public boolean equals(Object o) {
    368       if (o == this) {
    369         return true;
    370       }
    371       if (o instanceof Text) {
    372         Text that = (Text) o;
    373 
    374         return this.originalHtml == null ? that.originalHtml == null
    375             : this.originalHtml.equals(that.originalHtml);
    376       }
    377       return false;
    378     }
    379 
    380     @Override
    381     public int hashCode() {
    382       return originalHtml == null ? 0 : originalHtml.hashCode();
    383     }
    384 
    385     @Override
    386     public String toString() {
    387       return getText();
    388     }
    389 
    390     /** Extends Node.accept */
    391     @Override
    392     public void accept(Visitor visitor) {
    393       visitor.visitText(this);
    394     }
    395 
    396     /**
    397      * Gets the HTML, with HTML entities escaped.
    398      */
    399     @Override
    400     public void toHTML(StringBuilder sb) {
    401       if (html == null) {
    402         html = CharEscapers.asciiHtmlEscaper().escape(getText());
    403       }
    404       sb.append(html);
    405     }
    406 
    407     /**
    408      * @see HtmlDocument.Text#toHTML(StringBuilder)
    409      */
    410     @Override
    411     public void toXHTML(StringBuilder sb) {
    412       toHTML(sb);
    413     }
    414 
    415     /**
    416      * @param sb Appends original HTML to this if available.  Otherwise,
    417      * same as toHTML().
    418      */
    419     @Override
    420     public void toOriginalHTML(StringBuilder sb) {
    421       if (originalHtml != null) {
    422         sb.append(originalHtml);
    423       } else {
    424         toHTML(sb);
    425       }
    426     }
    427 
    428     /**
    429      * @return the original HTML (possibly with entities unescaped if the
    430      * document was malformed). May be null if original HTML was not preserved
    431      * (see constructor argument of {@link HtmlParser})
    432      */
    433     public String getOriginalHTML() {
    434       return originalHtml;
    435     }
    436   }
    437 
    438   /**
    439    * {@link Text} implementation where the given text is assumed to have been
    440    * already HTML unescaped.
    441    */
    442   private static class UnescapedText extends Text {
    443     /**
    444      * content of this node as plain, unescaped text
    445      */
    446     protected final String text;
    447 
    448     private UnescapedText(String plainText, String originalHtml) {
    449       super(originalHtml);
    450       X.assertTrue(plainText != null);
    451       this.text = plainText;
    452     }
    453 
    454     @Override public String getText() {
    455       return text;
    456     }
    457   }
    458 
    459   /**
    460    * {@link Text} implementation where the given text is not unescaped yet, and
    461    * unescaping will only be done lazily.
    462    */
    463   private static class EscapedText extends Text {
    464     private final String htmlText;
    465     private String text;
    466 
    467     private EscapedText(String htmlText, String originalHtml) {
    468       super(originalHtml);
    469       this.htmlText = htmlText;
    470     }
    471 
    472     @Override public String getText() {
    473       if (text == null) {
    474         text = StringUtil.unescapeHTML(htmlText);
    475       }
    476       return text;
    477     }
    478   }
    479 
    480   /**
    481    * CDATA node is a subclass of Text node.
    482    */
    483   public static class CDATA extends UnescapedText {
    484     private CDATA(String text) {
    485       super(text, text);
    486     }
    487 
    488     @Override public void toHTML(StringBuilder sb) {
    489       // Do not htmlescape CDATA text
    490       sb.append(text);
    491     }
    492 
    493     @Override public void toXHTML(StringBuilder sb) {
    494       sb.append("<![CDATA[")
    495         .append(text)
    496         .append("]]>");
    497     }
    498   }
    499 
    500   /**
    501    * Tag is a HTML open tag.
    502    */
    503   public static class Tag extends Node {
    504     // The element
    505     private final HTML.Element element;
    506 
    507     // List of TagAttribute objects. This may be null.
    508     private List<TagAttribute> attributes;
    509 
    510     private final boolean isSelfTerminating;
    511 
    512     private final String originalHtmlBeforeAttributes;
    513 
    514     private final String originalHtmlAfterAttributes;
    515 
    516     /**
    517      * @param element the HTML4 element
    518      * @param attributes list of TagAttribute objects, may be null
    519      * @param isSelfTerminating
    520      * @param originalHtmlBeforeAttributes Original tag's full content before
    521      *        first attribute, including beginning '&lt;'. This should not
    522      *        include preceeding whitespace for the first attribute, as that
    523      *        should be included in the attribute node. If not null, tag will
    524      *        preserve this original content. e.g., if original tag were
    525      *        "&lt;foO bar='zbc'&gt;", case of foO would be preserved. This
    526      *        method does not validate that
    527      *        <code>originalHtmlBeforeAttributes</code> is a valid tag String.
    528      * @param originalHtmlAfterAttributes Full content of original tag after
    529      *        last attribute, including ending '>'. If not null, tag will
    530      *        preserve this original content. e.g., if original tag were
    531      *        "&lt;foo bar='zbc'  &gt;", the spaces before '&gt;' be preserved.
    532      *        This method does not validate that
    533      *        <code>originalHtmlAfterAttributes</code> is a valid tag String.
    534      */
    535     private Tag(HTML.Element element, List<TagAttribute> attributes,
    536         boolean isSelfTerminating, String originalHtmlBeforeAttributes,
    537         String originalHtmlAfterAttributes) {
    538       X.assertTrue(element != null);
    539       this.element = element;
    540       this.attributes = attributes;
    541       this.isSelfTerminating = isSelfTerminating;
    542       this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes;
    543       this.originalHtmlAfterAttributes = originalHtmlAfterAttributes;
    544     }
    545 
    546     /** Gets the name */
    547     public String getName() {
    548       return element.getName();
    549     }
    550 
    551     /** Gets the element */
    552     public HTML.Element getElement() {
    553       return element;
    554     }
    555 
    556     /** Adds an attribute */
    557     public void addAttribute(HTML.Attribute attr, String value) {
    558       X.assertTrue(attr != null);
    559       addAttribute(new TagAttribute(attr, value, null));
    560     }
    561 
    562     /** Adds an attribute */
    563     public void addAttribute(TagAttribute attr) {
    564       X.assertTrue(attr != null);
    565       if (attributes == null) {
    566         attributes = new ArrayList<TagAttribute>();
    567       }
    568       attributes.add(attr);
    569     }
    570 
    571     /** Gets the list of attributes, note that this maybe null. */
    572     public List<TagAttribute> getAttributes() {
    573       return attributes;
    574     }
    575 
    576     /** Finds and returns a TagAttribute, or null if not found */
    577     public TagAttribute getAttribute(HTML.Attribute attr) {
    578       if (attributes != null) {
    579         for (TagAttribute attribute : attributes) {
    580           if (attribute.getAttribute().equals(attr)) {
    581             return attribute;
    582           }
    583         }
    584       }
    585       return null;
    586     }
    587 
    588     /**
    589      * Finds and returns list of TagAttribute of given attribute
    590      * type, or empty list if not found,
    591      */
    592     public List<TagAttribute> getAttributes(HTML.Attribute attr) {
    593       List<TagAttribute> result = Lists.newArrayList();
    594       if (attributes != null) {
    595         for (TagAttribute attribute : attributes) {
    596           if (attribute.getAttribute().equals(attr)) {
    597             result.add(attribute);
    598           }
    599         }
    600       }
    601       return result;
    602     }
    603 
    604     /** Returns debug string */
    605     @Override
    606     public String toString() {
    607       StringBuilder sb = new StringBuilder();
    608       sb.append("Start Tag: ");
    609       sb.append(element.getName());
    610       if (attributes != null) {
    611         for (TagAttribute attr : attributes) {
    612           sb.append(' ');
    613           sb.append(attr.toString());
    614         }
    615       }
    616       return sb.toString();
    617     }
    618 
    619     /** Implements Node.accept */
    620     @Override
    621     public void accept(Visitor visitor) {
    622       visitor.visitTag(this);
    623     }
    624 
    625     /** Implements Node.toHTML */
    626     @Override
    627     public void toHTML(StringBuilder sb) {
    628       serialize(sb, SerializeType.HTML);
    629     }
    630 
    631     @Override
    632     public void toXHTML(StringBuilder sb) {
    633       serialize(sb, SerializeType.XHTML);
    634     }
    635 
    636     @Override
    637     public void toOriginalHTML(StringBuilder sb) {
    638       serialize(sb, SerializeType.ORIGINAL_HTML);
    639     }
    640 
    641     /**
    642      * Specifies format of serialized output.
    643      */
    644     private enum SerializeType {
    645       ORIGINAL_HTML, HTML, XHTML
    646     }
    647 
    648     private void serialize(StringBuilder sb, SerializeType type) {
    649       // before attributes
    650       if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) {
    651         sb.append(originalHtmlBeforeAttributes);
    652       } else {
    653         sb.append('<');
    654         sb.append(element.getName());
    655       }
    656 
    657       // attributes
    658       if (attributes != null) {
    659         for (TagAttribute attr : attributes) {
    660           // attribute includes leading whitespace, so we needn't add it here
    661           if (type == SerializeType.ORIGINAL_HTML) {
    662             attr.toOriginalHTML(sb);
    663           } else if (type == SerializeType.HTML) {
    664             attr.toHTML(sb);
    665           } else {
    666             attr.toXHTML(sb);
    667           }
    668         }
    669       }
    670 
    671       // after attributes
    672       if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) {
    673         sb.append(originalHtmlAfterAttributes);
    674       } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) {
    675         sb.append(" />");
    676       } else {
    677         sb.append('>');
    678       }
    679     }
    680 
    681     public boolean isSelfTerminating() {
    682       return isSelfTerminating;
    683     }
    684 
    685     public String getOriginalHtmlBeforeAttributes() {
    686       return originalHtmlBeforeAttributes;
    687     }
    688 
    689     public String getOriginalHtmlAfterAttributes() {
    690       return originalHtmlAfterAttributes;
    691     }
    692   }
    693 
    694   /**
    695    * EndTag is a closing HTML tag.
    696    */
    697   public static class EndTag extends Node {
    698     // The element
    699     private final HTML.Element element;
    700 
    701     private final String originalHtml;
    702 
    703     /**
    704      * @param element The HTML.Element element.  Can not be null.
    705      * @param originalHtml Full content of original tag, including beginning
    706      * and ending '<' and '>'.  If not null, tag will preserve this original
    707      * content. e.g., if original tag were "&lt;/foo &gt;", the space after foo
    708      * would be preserved.  This method does not validate that originalHtml is a
    709      * valid tag String.
    710      */
    711     private EndTag(HTML.Element element, String originalHtml) {
    712       X.assertTrue(element != null);
    713       this.element = element;
    714       this.originalHtml = originalHtml;
    715     }
    716 
    717     /** Gets the name */
    718     public String getName() {
    719       return element.getName();
    720     }
    721 
    722     /** Gets the element */
    723     public HTML.Element getElement() {
    724       return element;
    725     }
    726 
    727     /** Returns debug string */
    728     @Override
    729     public String toString() {
    730       return "End Tag: " + element.getName();
    731     }
    732 
    733     /** Implements Node.accept */
    734     @Override
    735     public void accept(Visitor visitor) {
    736       visitor.visitEndTag(this);
    737     }
    738 
    739     /** Implements Node.toHTML */
    740     @Override
    741     public void toHTML(StringBuilder sb) {
    742       sb.append("</");
    743       sb.append(element.getName());
    744       sb.append('>');
    745     }
    746 
    747     @Override
    748     public void toXHTML(StringBuilder sb) {
    749       toHTML(sb);
    750     }
    751 
    752     @Override
    753     public void toOriginalHTML(StringBuilder sb) {
    754       if (originalHtml != null) {
    755         sb.append(originalHtml);
    756       } else {
    757         toHTML(sb);
    758       }
    759     }
    760   }
    761 
    762   /**
    763    * TagAttribute represents an attribute in a HTML tag.
    764    */
    765   public static class TagAttribute {
    766     private final HTML.Attribute attribute;
    767     private String value;
    768     private String originalHtml;
    769 
    770     /**
    771      * @param attribute the HTML.Attribute. Can't be null.
    772      * @param value The value in plain-text format. This can be null if the
    773      *        attribute has no value.
    774      * @param originalHtml If not null, toOriginalHTML() will preserve original
    775      *        content. This should contain any leading whitespace from the
    776      *        original.
    777      */
    778     private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) {
    779       X.assertTrue(attribute != null);
    780       this.attribute = attribute;
    781       this.value = value;
    782       this.originalHtml = originalHtml;
    783     }
    784 
    785     /** Gets the name */
    786     public String getName() {
    787       return attribute.getName();
    788     }
    789 
    790     /** Gets the HTML.Attribute information */
    791     public HTML.Attribute getAttribute() {
    792       return attribute;
    793     }
    794 
    795     /**
    796      * Sets the attribute value.
    797      * This value must be in plain-text, not html-escaped.
    798      * This can be null, if the attribute has no values.
    799      * This clears <code>originalHtml_</code> if it were set, so
    800      * <code>toOriginalHTML()</code> might not preserve original any more.
    801      */
    802     public void setValue(String value) {
    803       this.value = value;
    804       originalHtml = null;
    805     }
    806 
    807     /** Returns the attribute value in plain-text, never null */
    808     public String getValue() {
    809       return value != null ? value : "";
    810     }
    811 
    812     /** Returns true if the attribute value is not empty */
    813     public boolean hasValue() {
    814       return value != null;
    815     }
    816 
    817     /**
    818      * Writes out the attribute in HTML format with all necessary preceding
    819      * whitespace. Emits originalHtml_ if it were specified to the constructor.
    820      * Otherwise, emits a new name="value" string with a single preceding space.
    821      */
    822     public void toHTML(StringBuilder sb) {
    823       sb.append(' ');
    824       sb.append(attribute.getName());
    825       if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) {
    826         sb.append("=\"");
    827         sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
    828         sb.append("\"");
    829       }
    830     }
    831 
    832     /** Returns the attribute html string */
    833     public String toHTML() {
    834       StringBuilder sb = new StringBuilder();
    835       toHTML(sb);
    836       return sb.toString();
    837     }
    838 
    839     /**
    840      * Writes out the attribute in XHTML format (value is always appended,
    841      * even if it is empty) with all necessary preceeding whitespace.
    842      */
    843     public void toXHTML(StringBuilder sb) {
    844       sb.append(' ');
    845       sb.append(attribute.getName()).append("=\"");
    846 
    847       // Assume that value-less attribute are boolean attributes like "disabled"
    848       if (hasValue()) {
    849         sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
    850       } else {
    851         sb.append(attribute.getName());
    852       }
    853 
    854       sb.append("\"");
    855     }
    856 
    857     /** Returns the attribute XHTML string */
    858     public String toXHTML() {
    859       StringBuilder sb = new StringBuilder();
    860       toXHTML(sb);
    861       return sb.toString();
    862     }
    863 
    864     /**
    865      * @param sb Destination to which attribute is written, in its original
    866      * preparsed form if possible.
    867      */
    868     public void toOriginalHTML(StringBuilder sb) {
    869       if (originalHtml != null) {
    870         sb.append(originalHtml);
    871       } else {
    872         toHTML(sb);
    873       }
    874     }
    875 
    876     /**
    877      * Writes out the attribute in its original form as it was parsed..
    878      */
    879     public String toOriginalHTML() {
    880       StringBuilder sb = new StringBuilder();
    881       toOriginalHTML(sb);
    882       return sb.toString();
    883     }
    884 
    885     @Override
    886     public String toString() {
    887       return "{" + attribute.getName() + "=" + value + "}";
    888     }
    889   }
    890 
    891   /**
    892    * Filter is like Visitor, except it implies that the nodes may be changed,
    893    * whereas HtmlDocument.Visitor just implies that the nodes are iterated
    894    * over. A Filter can behave just like a Visitor if it merely returns the
    895    * same node that it visited. Also, methods may be called on a node to change
    896    * the values it contains. Alternatively, a new node entirely can be created
    897    * and returned, which will essentially replace the previous node with the
    898    * new node in the document tree. A node may be removed by returning null
    899    * instead of a node.
    900    */
    901   public static interface Filter {
    902     /** This is called first */
    903     void start();
    904 
    905     /** A text node */
    906     Text visitText(Text n);
    907 
    908     /** An open tag */
    909     Tag visitTag(Tag n);
    910 
    911     /** End tag */
    912     EndTag visitEndTag(EndTag n);
    913 
    914     /** HTML comment */
    915     Comment visitComment(Comment n);
    916 
    917     /* Called at the end. */
    918     void finish();
    919   }
    920 
    921   /**
    922    * Like Filter, except each node may be replaced by multiple nodes.  Also,
    923    * does not do double dispatch accept/visit.
    924    */
    925   public static interface MultiplexFilter {
    926     /**
    927      * Called first.
    928      */
    929     void start();
    930 
    931     /**
    932      * @param originalNode node to filter
    933      * @param out Destination to which this object appends nodes to replace
    934      * originalNode.  Can not be null.
    935      */
    936     void filter(Node originalNode, List<Node> out);
    937 
    938     /**
    939      * Called at the end.
    940      * @param out Destination to which this object appends nodes at the end of
    941      * the document.  Can not be null.
    942      */
    943     void finish(List<Node> out);
    944   }
    945 
    946   /**
    947    * Converts a normal {@link Filter} into a {@link MultiplexFilter}.
    948    */
    949   public static class MultiplexFilterAdapter implements MultiplexFilter {
    950 
    951     private final Filter filter;
    952 
    953     public MultiplexFilterAdapter(Filter filter) {
    954       this.filter = filter;
    955     }
    956 
    957     public void start() {
    958       filter.start();
    959     }
    960 
    961     public void filter(Node originalNode, List<Node> out) {
    962       if (originalNode == null) {
    963         return;
    964       }
    965 
    966       Node resultNode;
    967       if (originalNode instanceof Tag) {
    968         resultNode = filter.visitTag((Tag) originalNode);
    969       } else if (originalNode instanceof Text) {
    970         resultNode = filter.visitText((Text) originalNode);
    971       } else if (originalNode instanceof EndTag) {
    972         resultNode = filter.visitEndTag((EndTag) originalNode);
    973       } else if (originalNode instanceof Comment) {
    974         resultNode = filter.visitComment((Comment) originalNode);
    975       } else {
    976         throw new IllegalArgumentException("unknown node type: " + originalNode.getClass());
    977       }
    978 
    979       if (resultNode != null) {
    980         out.add(resultNode);
    981       }
    982     }
    983 
    984     public void finish(List<Node> out) {
    985       filter.finish();
    986     }
    987   }
    988 
    989   /**
    990    * Like Filter, except each node may be replaced by multiple nodes.  Also,
    991    * does not do double dispatch accept/visit.  Dispatches filterNode() to
    992    * node-specific methods.
    993    */
    994   public static abstract class SimpleMultiplexFilter implements MultiplexFilter {
    995 
    996     /**
    997      * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
    998      */
    999     public void filter(Node originalNode, List<Node> out) {
   1000       if (originalNode == null) {
   1001         return;
   1002       }
   1003 
   1004       if (originalNode instanceof Tag) {
   1005         filterTag((Tag) originalNode, out);
   1006       } else if (originalNode instanceof Text) {
   1007         filterText((Text) originalNode, out);
   1008       } else if (originalNode instanceof EndTag) {
   1009         filterEndTag((EndTag) originalNode, out);
   1010       } else if (originalNode instanceof Comment) {
   1011         filterComment((Comment) originalNode, out);
   1012       } else {
   1013         throw new IllegalArgumentException("unknown node type: "
   1014             + originalNode.getClass());
   1015       }
   1016     }
   1017 
   1018     public abstract void filterTag(Tag originalTag, List<Node> out);
   1019 
   1020     public abstract void filterText(Text originalText, List<Node> out);
   1021 
   1022     public abstract void filterEndTag(EndTag originalEndTag, List<Node> out);
   1023 
   1024     public void filterComment(Comment originalComment, List<Node> out) {
   1025     }
   1026   }
   1027 
   1028   /**
   1029    * Contains a list of filters which are applied, in order, to each Node.  The
   1030    * output of each becomes the input to the next.  As soon as one returns an
   1031    * empty list it breaks the chain.
   1032    */
   1033   public static class MultiplexFilterChain implements MultiplexFilter {
   1034 
   1035     private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>();
   1036 
   1037     /**
   1038      * @param sourceFilters these filters are applied in List order
   1039      */
   1040     public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) {
   1041       filters.addAll(sourceFilters);
   1042     }
   1043 
   1044     /**
   1045      * @see HtmlDocument.MultiplexFilter#start()
   1046      */
   1047     public void start() {
   1048       for (MultiplexFilter filter : filters) {
   1049         filter.start();
   1050       }
   1051     }
   1052 
   1053     /**
   1054      * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
   1055      */
   1056     public void filter(Node originalNode, List<Node> out) {
   1057       List<Node> result = new ArrayList<Node>();
   1058       result.add(originalNode);
   1059 
   1060       // loop through filters until one returns nothing, or until we're out of
   1061       // filters
   1062       for (MultiplexFilter filter : filters) {
   1063         if (result.isEmpty()) {
   1064           return;
   1065         }
   1066 
   1067         // apply filter to each node and collect results
   1068         List<Node> newResult = new ArrayList<Node>();
   1069         for (Node node : result) {
   1070           filter.filter(node, newResult);
   1071         }
   1072         result = newResult;
   1073       }
   1074 
   1075       out.addAll(result);
   1076     }
   1077 
   1078     /**
   1079      * @see HtmlDocument.MultiplexFilter#finish(List)
   1080      */
   1081     public void finish(List<Node> out) {
   1082       List<Node> result = new ArrayList<Node>();
   1083 
   1084       // loop through filters until one returns nothing, or until we're out of
   1085       // filters
   1086       for (MultiplexFilter filter : filters) {
   1087         // apply filter to each node and collect results
   1088         List<Node> newResult = new ArrayList<Node>();
   1089         for (Node node : result) {
   1090           filter.filter(node, newResult);
   1091         }
   1092         filter.finish(newResult);
   1093         result = newResult;
   1094       }
   1095 
   1096       out.addAll(result);
   1097     }
   1098   }
   1099 
   1100   /**
   1101    * Html visitor allows external code to iterate through the nodes in the
   1102    * document. See HtmlDocument.accept.
   1103    */
   1104   public static interface Visitor {
   1105     /** This is called first */
   1106     void start();
   1107 
   1108     /** A text node */
   1109     void visitText(Text n);
   1110 
   1111     /** An open tag */
   1112     void visitTag(Tag n);
   1113 
   1114     /** End tag */
   1115     void visitEndTag(EndTag n);
   1116 
   1117     /** comment */
   1118     void visitComment(Comment n);
   1119 
   1120     /* Called at the end. */
   1121     void finish();
   1122   }
   1123 
   1124   /**
   1125    * An implementation of the Visitor interface which simply delegates its
   1126    * methods to a wrapped instance of another Visitor.
   1127    *
   1128    * <p>This is useful for chaining Visitors together.
   1129    */
   1130   public static class VisitorWrapper implements Visitor {
   1131     private final Visitor wrapped;
   1132 
   1133     protected VisitorWrapper(Visitor wrap) {
   1134       wrapped = wrap;
   1135     }
   1136 
   1137     public void start() {
   1138       wrapped.start();
   1139     }
   1140 
   1141     public void visitText(Text n) {
   1142       wrapped.visitText(n);
   1143     }
   1144 
   1145     public void visitTag(Tag n) {
   1146       wrapped.visitTag(n);
   1147     }
   1148 
   1149     public void visitEndTag(EndTag n) {
   1150       wrapped.visitEndTag(n);
   1151     }
   1152 
   1153     public void visitComment(Comment n) {
   1154       wrapped.visitComment(n);
   1155     }
   1156 
   1157     public void finish() {
   1158       wrapped.finish();
   1159     }
   1160   }
   1161 
   1162   /**
   1163    * A special helper Visitor that builds a HtmlDocument.
   1164    */
   1165   public static class Builder implements Visitor {
   1166     private final boolean preserveComments;
   1167     private final List<Node> nodes = new ArrayList<Node>();
   1168     private HtmlDocument doc;
   1169 
   1170     /**
   1171      * @see Builder#Builder(boolean)
   1172      */
   1173     public Builder() {
   1174       this(false);
   1175     }
   1176 
   1177     /**
   1178      * @param preserveComments If false, ignores Comment nodes
   1179      */
   1180     public Builder(boolean preserveComments) {
   1181       this.preserveComments = preserveComments;
   1182     }
   1183 
   1184     public void addNode(Node node) {
   1185       nodes.add(node);
   1186     }
   1187     public void start() {
   1188     }
   1189     public void visitText(Text t) {
   1190       addNode(t);
   1191     }
   1192     public void visitTag(Tag t) {
   1193       addNode(t);
   1194     }
   1195     public void visitComment(Comment n) {
   1196       if (preserveComments) {
   1197         addNode(n);
   1198       }
   1199     }
   1200     public void visitEndTag(EndTag t) {
   1201       addNode(t);
   1202     }
   1203     public void finish() {
   1204       doc = new HtmlDocument(nodes);
   1205     }
   1206 
   1207     /** Gets the html document that has been constructed */
   1208     public HtmlDocument getDocument() {
   1209       return doc;
   1210     }
   1211   }
   1212 
   1213   /**
   1214    * A Visitor that prints out the html document in debug format.
   1215    */
   1216   public static class DebugPrinter implements Visitor {
   1217 
   1218     private final PrintWriter writer;
   1219 
   1220     public DebugPrinter(PrintWriter writer) {
   1221       this.writer = writer;
   1222     }
   1223 
   1224     public void start() {
   1225     }
   1226 
   1227     public void visitText(Text t) {
   1228       writeCollapsed("TEXT", t.getText());
   1229     }
   1230 
   1231     public void visitComment(Comment n) {
   1232       writeCollapsed("COMMENT", n.getContent());
   1233     }
   1234 
   1235     private void writeCollapsed(String type, String s) {
   1236       writer.print(type);
   1237       writer.print(": ");
   1238       String noNewlines = s.replace("\n", " ");
   1239       // Use CharMatcher#WHITESPACE?
   1240       String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' ');
   1241       writer.print(collapsed);
   1242     }
   1243 
   1244     public void visitTag(Tag tag) {
   1245       writer.print("==<" + tag.getName() + ">");
   1246       List<TagAttribute> attributes = tag.getAttributes();
   1247       if (attributes != null) {
   1248 
   1249         // Attribute values
   1250         List<String> attrs = new ArrayList<String>();
   1251         for (TagAttribute a : attributes) {
   1252           attrs.add("[" + a.getName() + " : " + a.getValue() + "]");
   1253         }
   1254         String[] array = attrs.toArray(new String[attrs.size()]);
   1255 
   1256         // Sort the attributes so that it's easier to read and compare
   1257         Arrays.sort(array);
   1258         for (int i = 0; i < array.length; i++) {
   1259           writer.print(" " + array[i]);
   1260         }
   1261       }
   1262       writer.println();
   1263     }
   1264 
   1265     public void visitEndTag(EndTag endtag) {
   1266       writer.println("==</" + endtag.getName() + ">");
   1267     }
   1268 
   1269     public void finish() {
   1270     }
   1271   }
   1272 
   1273 }