Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.android.mail.lib.html.parser;
     17 
     18 import com.android.mail.lib.base.CharEscapers;
     19 import com.android.mail.lib.base.CharMatcher;
     20 import com.android.mail.lib.base.StringUtil;
     21 import com.android.mail.lib.base.X;
     22 import com.google.common.collect.Lists;
     23 
     24 import java.io.PrintWriter;
     25 import java.io.StringWriter;
     26 import java.util.ArrayList;
     27 import java.util.Arrays;
     28 import java.util.List;
     29 
     30 
     31 /**
     32  * HtmlDocument is a container for a list of html nodes, and represents the
     33  * entire html document. It contains toHTML() method which prints out the html
     34  * text, toXHTML for printing out XHTML text and toString() which prints out in
     35  * debug format.
     36  *
     37  * @author jlim (at) google.com (Jing Yee Lim)
     38  */
     39 public class HtmlDocument {
     40   /** List of Node objects */
     41   private final List<Node> nodes;
     42 
  /**
   * Creates a Html document.
   *
   * <p>The given list is stored by reference (it is not copied), so it is
   * the same list object that {@link #getNodes()} later returns.
   *
   * @param nodes list of html nodes
   */
  public HtmlDocument(List<Node> nodes) {
    this.nodes = nodes;
  }
     50 
  /**
   * Gets the list of nodes.
   *
   * @return the list held by this document itself, not a copy
   */
  public List<Node> getNodes() {
    return nodes;
  }
     55 
     56   /** Returns a HTML string for the current document */
     57   public String toHTML() {
     58     StringBuilder sb = new StringBuilder(nodes.size() * 10);
     59     for (Node n : nodes) {
     60       n.toHTML(sb);
     61     }
     62     return sb.toString();
     63   }
     64 
     65   /** Returns a XHTML string for the current document */
     66   public String toXHTML() {
     67     StringBuilder sb = new StringBuilder(nodes.size() * 10);
     68     for (Node n : nodes) {
     69       n.toXHTML(sb);
     70     }
     71     return sb.toString();
     72   }
     73 
     74   /**
     75    * Returns, as much as possible, original content of preparsed nodes.  This
     76    * is only different from toHTML() if the nodes were created with original
     77    * content, e.g., by HtmlParser in preserve mode.
     78    */
     79   public String toOriginalHTML() {
     80     StringBuilder sb = new StringBuilder(nodes.size() * 10);
     81     for (Node n : nodes) {
     82       n.toOriginalHTML(sb);
     83     }
     84     return sb.toString();
     85   }
     86 
     87   /** Returns the HTML document in debug format */
     88   @Override
     89   public String toString() {
     90     StringWriter strWriter = new StringWriter();
     91     accept(new DebugPrinter(new PrintWriter(strWriter)));
     92     return strWriter.toString();
     93   }
     94 
  /**
   * Creates start Tag Node without any preserved original HTML.
   * Delegates to the four-argument overload, passing null for both
   * original-HTML arguments.
   *
   * @param element the element of the tag
   * @param attributes list of TagAttribute objects, may be null
   * @return a new, non-self-terminating start tag
   * @see HtmlDocument#createTag(HTML.Element, List, String, String)
   */
  public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) {
    return createTag(element, attributes, null, null);
  }
    102 
  /**
   * Creates start Tag Node (constructed with isSelfTerminating = false).
   *
   * @param element the element of the tag
   * @param attributes list of TagAttribute objects, may be null
   * @param originalHtmlBeforeAttributes original tag content before the first
   *        attribute, or null if the original is not preserved
   * @param originalHtmlAfterAttributes original tag content after the last
   *        attribute, or null if the original is not preserved
   * @return a new, non-self-terminating start tag
   * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
   */
  public static Tag createTag(HTML.Element element,
      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
      String originalHtmlAfterAttributes) {
    return new Tag(element, attributes, false, originalHtmlBeforeAttributes,
        originalHtmlAfterAttributes);
  }
    113 
  /**
   * Creates self-terminating Tag Node without any preserved original HTML.
   * Delegates to the four-argument overload, passing null for both
   * original-HTML arguments.
   *
   * @param element the element of the tag
   * @param attributes list of TagAttribute objects, may be null
   * @return a new self-terminating tag
   * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String)
   */
  public static Tag createSelfTerminatingTag(HTML.Element element,
      List<TagAttribute> attributes) {
    return createSelfTerminatingTag(element, attributes, null, null);
  }
    122 
  /**
   * Creates self-terminating Tag Node (constructed with
   * isSelfTerminating = true).
   *
   * @param element the element of the tag
   * @param attributes list of TagAttribute objects, may be null
   * @param originalHtmlBeforeAttributes original tag content before the first
   *        attribute, or null if the original is not preserved
   * @param originalHtmlAfterAttributes original tag content after the last
   *        attribute, or null if the original is not preserved
   * @return a new self-terminating tag
   * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
   */
  public static Tag createSelfTerminatingTag(HTML.Element element,
      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
      String originalHtmlAfterAttributes) {
    return new Tag(element, attributes, true, originalHtmlBeforeAttributes,
        originalHtmlAfterAttributes);
  }
    133 
  /**
   * Creates an EndTag node without preserved original HTML. Delegates to the
   * two-argument overload, passing null as the original HTML.
   *
   * @param element the element being closed
   * @return a new end tag
   * @see HtmlDocument#createEndTag(HTML.Element, String)
   */
  public static EndTag createEndTag(HTML.Element element) {
    return createEndTag(element, null);
  }
    140 
  /**
   * Creates an EndTag node.
   *
   * @param element the element being closed
   * @param originalHtml full content of the original tag, or null if the
   *        original is not preserved
   * @return a new end tag
   * @see HtmlDocument.EndTag#EndTag(HTML.Element, String)
   */
  public static EndTag createEndTag(HTML.Element element, String originalHtml) {
    return new EndTag(element, originalHtml);
  }
    147 
  /**
   * Creates a TagAttribute without preserved original HTML. Delegates to the
   * three-argument overload, passing null as the original HTML.
   *
   * @param attr the attribute
   * @param value the value in plain-text format; may be null if the attribute
   *        has no value
   * @return a new tag attribute
   * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String)
   */
  public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) {
    return createTagAttribute(attr, value, null);
  }
    154 
  /**
   * Creates a TagAttribute.
   *
   * @param attr the attribute; checked with X.assertTrue to be non-null
   * @param value the value in plain-text format; may be null if the attribute
   *        has no value
   * @param originalHtml original content of the attribute, or null if the
   *        original is not preserved
   * @return a new tag attribute
   * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String)
   */
  public static TagAttribute createTagAttribute(HTML.Attribute attr,
      String value, String originalHtml) {
    X.assertTrue(attr != null);
    return new TagAttribute(attr, value, originalHtml);
  }
    163 
  /**
   * Creates a Text node without preserved original HTML. Delegates to the
   * two-argument overload, passing null as the original.
   *
   * @param text the plain, already unescaped text
   * @return a new text node
   * @see HtmlDocument#createText(String, String)
   */
  public static Text createText(String text) {
    return createText(text, null);
  }
    170 
  /**
   * Creates a Text node from text that has already been unescaped.
   *
   * @param text the plain, unescaped text; the UnescapedText constructor
   *        checks with X.assertTrue that it is non-null
   * @param original unaltered original HTML, or null if the original is not
   *        preserved
   * @return a new text node
   * @see UnescapedText#UnescapedText(String, String)
   */
  public static Text createText(String text, String original) {
    return new UnescapedText(text, original);
  }
    178 
  /**
   * Creates a Text node where the content hasn't been unescaped yet (this will
   * be done lazily).
   *
   * @param htmlText the text still in HTML-escaped form
   * @param original unaltered original HTML, or null if the original is not
   *        preserved
   * @return a new text node backed by an EscapedText
   */
  public static Text createEscapedText(String htmlText, String original) {
    return new EscapedText(htmlText, original);
  }
    186 
  /**
   * Creates a Comment node.
   *
   * @param content raw comment, including "&lt;!--" and "--&gt;"
   * @return a new comment node
   * @see Comment#Comment(String)
   */
  public static Comment createHtmlComment(String content) {
    return new Comment(content);
  }
    194 
  /**
   * Creates a CDATA node.
   *
   * @param text content of the CDATA node; the CDATA constructor passes it to
   *        its superclass as both the plain text and the original HTML
   * @return a new CDATA node
   * @see CDATA#CDATA(String)
   */
  public static CDATA createCDATA(String text) {
    return new CDATA(text);
  }
    202 
    203   /** Accepts a Visitor */
    204   public void accept(Visitor v) {
    205     v.start();
    206     for (Node node : nodes) {
    207       node.accept(v);
    208     }
    209     v.finish();
    210   }
    211 
    212   /**
    213    * @param filter results of this filter replace the existing nodes
    214    * @return new document with filtered nodes
    215    */
    216   public HtmlDocument filter(MultiplexFilter filter) {
    217     filter.start();
    218     List<Node> newNodes = new ArrayList<Node>();
    219     for (Node node : nodes) {
    220       filter.filter(node, newNodes);
    221     }
    222     filter.finish(newNodes);
    223     return new HtmlDocument(newNodes);
    224   }
    225 
    226   /**
    227    * Html node
    228    */
    229   public static abstract class Node {
    230 
    231     /** Accepts a visitor */
    232     public abstract void accept(Visitor visitor);
    233 
    234     /** Converts to HTML */
    235     public String toHTML() {
    236       StringBuilder sb = new StringBuilder();
    237       toHTML(sb);
    238       return sb.toString();
    239     }
    240 
    241     /** Converts to HTML */
    242     public abstract void toHTML(StringBuilder sb);
    243 
    244     /** Converts to XHTML */
    245     public String toXHTML() {
    246       StringBuilder sb = new StringBuilder();
    247       toXHTML(sb);
    248       return sb.toString();
    249     }
    250 
    251     /** Converts to XHTML */
    252     public abstract void toXHTML(StringBuilder sb);
    253 
    254     /**
    255      * @return Original if it's available; otherwise, returns
    256      * <code>toHTML()</code>
    257      */
    258     public String toOriginalHTML() {
    259       StringBuilder sb = new StringBuilder();
    260       toOriginalHTML(sb);
    261       return sb.toString();
    262     }
    263 
    264     /**
    265      * @param sb Destination of HTML to be appended.  Appends original if it's
    266      * available; otherwise, appends <code>toHTML()</code>
    267      */
    268     public abstract void toOriginalHTML(StringBuilder sb);
    269   }
    270 
    271   /**
    272    * HTML comment node.
    273    */
    274   public static class Comment extends Node {
    275 
    276     private final String content;
    277 
    278     /**
    279      * @param content Raw comment, including "&lt;!--" and "--&gt;".
    280      */
    281     public Comment(String content) {
    282       this.content = content;
    283     }
    284 
    285     @Override
    286     public void accept(Visitor visitor) {
    287       visitor.visitComment(this);
    288     }
    289 
    290     /**
    291      * Emit original unchanged.
    292      * @param sb Destination of result.
    293      */
    294     @Override
    295     public void toHTML(StringBuilder sb) {
    296       sb.append(content);
    297     }
    298 
    299     /**
    300      * Emit original unchanged.
    301      * @param sb Destination of result.
    302      */
    303     @Override
    304     public void toXHTML(StringBuilder sb) {
    305       sb.append(content);
    306     }
    307 
    308     /**
    309      * Emit original unchanged.
    310      * @param sb Destination of result.
    311      */
    312     @Override
    313     public void toOriginalHTML(StringBuilder sb) {
    314       sb.append(content);
    315     }
    316 
    317     /**
    318      * @return Original unchanged.
    319      */
    320     public String getContent() {
    321       return content;
    322     }
    323   }
    324 
    325   /**
    326    * Text node
    327    */
    328   public static abstract class Text extends Node {
    329 
    330     /**
    331      * unaltered original content of this node
    332      */
    333     private final String originalHtml;
    334 
    335     /**
    336      * content of this node in HTML format
    337      */
    338     private String html;
    339 
    340     /**
    341      * @param originalHtml Unaltered original HTML. If not null,
    342      *        toOriginalHTML() will return this.
    343      */
    344     protected Text(String originalHtml) {
    345       this.originalHtml = originalHtml;
    346     }
    347 
    348     /**
    349      * Gets the plain, unescaped text.
    350      */
    351     abstract public String getText();
    352 
    353     // Returns true if it contains only white space
    354     public boolean isWhitespace() {
    355       String text = getText();
    356       int len = text.length();
    357       for (int i = 0; i < len; i++) {
    358         if (!Character.isWhitespace(text.charAt(i))) {
    359           return false;
    360         }
    361       }
    362       return true;
    363     }
    364 
    365     @Override
    366     public boolean equals(Object o) {
    367       if (o == this) {
    368         return true;
    369       }
    370       if (o instanceof Text) {
    371         Text that = (Text) o;
    372 
    373         return this.originalHtml == null ? that.originalHtml == null
    374             : this.originalHtml.equals(that.originalHtml);
    375       }
    376       return false;
    377     }
    378 
    379     @Override
    380     public int hashCode() {
    381       return originalHtml == null ? 0 : originalHtml.hashCode();
    382     }
    383 
    384     @Override
    385     public String toString() {
    386       return getText();
    387     }
    388 
    389     /** Extends Node.accept */
    390     @Override
    391     public void accept(Visitor visitor) {
    392       visitor.visitText(this);
    393     }
    394 
    395     /**
    396      * Gets the HTML, with HTML entities escaped.
    397      */
    398     @Override
    399     public void toHTML(StringBuilder sb) {
    400       if (html == null) {
    401         html = CharEscapers.asciiHtmlEscaper().escape(getText());
    402       }
    403       sb.append(html);
    404     }
    405 
    406     /**
    407      * @see HtmlDocument.Text#toHTML(StringBuilder)
    408      */
    409     @Override
    410     public void toXHTML(StringBuilder sb) {
    411       toHTML(sb);
    412     }
    413 
    414     /**
    415      * @param sb Appends original HTML to this if available.  Otherwise,
    416      * same as toHTML().
    417      */
    418     @Override
    419     public void toOriginalHTML(StringBuilder sb) {
    420       if (originalHtml != null) {
    421         sb.append(originalHtml);
    422       } else {
    423         toHTML(sb);
    424       }
    425     }
    426 
    427     /**
    428      * @return the original HTML (possibly with entities unescaped if the
    429      * document was malformed). May be null if original HTML was not preserved
    430      * (see constructor argument of {@link HtmlParser})
    431      */
    432     public String getOriginalHTML() {
    433       return originalHtml;
    434     }
    435   }
    436 
  /**
   * {@link Text} implementation where the given text is assumed to have been
   * already HTML unescaped.
   */
  private static class UnescapedText extends Text {
    /**
     * content of this node as plain, unescaped text
     */
    protected final String text;

    /**
     * @param plainText the plain, unescaped text; checked with X.assertTrue
     *        to be non-null
     * @param originalHtml unaltered original HTML, passed on to {@link Text};
     *        may be null
     */
    private UnescapedText(String plainText, String originalHtml) {
      super(originalHtml);
      X.assertTrue(plainText != null);
      this.text = plainText;
    }

    /** Returns the text exactly as it was given to the constructor. */
    @Override public String getText() {
      return text;
    }
  }
    457 
    458   /**
    459    * {@link Text} implementation where the given text is not unescaped yet, and
    460    * unescaping will only be done lazily.
    461    */
    462   private static class EscapedText extends Text {
    463     private final String htmlText;
    464     private String text;
    465 
    466     private EscapedText(String htmlText, String originalHtml) {
    467       super(originalHtml);
    468       this.htmlText = htmlText;
    469     }
    470 
    471     @Override public String getText() {
    472       if (text == null) {
    473         text = StringUtil.unescapeHTML(htmlText);
    474       }
    475       return text;
    476     }
    477   }
    478 
    479   /**
    480    * CDATA node is a subclass of Text node.
    481    */
    482   public static class CDATA extends UnescapedText {
    483     private CDATA(String text) {
    484       super(text, text);
    485     }
    486 
    487     @Override public void toHTML(StringBuilder sb) {
    488       // Do not htmlescape CDATA text
    489       sb.append(text);
    490     }
    491 
    492     @Override public void toXHTML(StringBuilder sb) {
    493       sb.append("<![CDATA[")
    494         .append(text)
    495         .append("]]>");
    496     }
    497   }
    498 
    499   /**
    500    * Tag is a HTML open tag.
    501    */
    502   public static class Tag extends Node {
    503     // The element
    504     private final HTML.Element element;
    505 
    506     // List of TagAttribute objects. This may be null.
    507     private List<TagAttribute> attributes;
    508 
    509     private final boolean isSelfTerminating;
    510 
    511     private final String originalHtmlBeforeAttributes;
    512 
    513     private final String originalHtmlAfterAttributes;
    514 
    515     /**
    516      * @param element the HTML4 element
    517      * @param attributes list of TagAttribute objects, may be null
    518      * @param isSelfTerminating
    519      * @param originalHtmlBeforeAttributes Original tag's full content before
    520      *        first attribute, including beginning '&lt;'. This should not
    521      *        include preceeding whitespace for the first attribute, as that
    522      *        should be included in the attribute node. If not null, tag will
    523      *        preserve this original content. e.g., if original tag were
    524      *        "&lt;foO bar='zbc'&gt;", case of foO would be preserved. This
    525      *        method does not validate that
    526      *        <code>originalHtmlBeforeAttributes</code> is a valid tag String.
    527      * @param originalHtmlAfterAttributes Full content of original tag after
    528      *        last attribute, including ending '>'. If not null, tag will
    529      *        preserve this original content. e.g., if original tag were
    530      *        "&lt;foo bar='zbc'  &gt;", the spaces before '&gt;' be preserved.
    531      *        This method does not validate that
    532      *        <code>originalHtmlAfterAttributes</code> is a valid tag String.
    533      */
    534     private Tag(HTML.Element element, List<TagAttribute> attributes,
    535         boolean isSelfTerminating, String originalHtmlBeforeAttributes,
    536         String originalHtmlAfterAttributes) {
    537       X.assertTrue(element != null);
    538       this.element = element;
    539       this.attributes = attributes;
    540       this.isSelfTerminating = isSelfTerminating;
    541       this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes;
    542       this.originalHtmlAfterAttributes = originalHtmlAfterAttributes;
    543     }
    544 
    545     /** Gets the name */
    546     public String getName() {
    547       return element.getName();
    548     }
    549 
    550     /** Gets the element */
    551     public HTML.Element getElement() {
    552       return element;
    553     }
    554 
    555     /** Adds an attribute */
    556     public void addAttribute(HTML.Attribute attr, String value) {
    557       X.assertTrue(attr != null);
    558       addAttribute(new TagAttribute(attr, value, null));
    559     }
    560 
    561     /** Adds an attribute */
    562     public void addAttribute(TagAttribute attr) {
    563       X.assertTrue(attr != null);
    564       if (attributes == null) {
    565         attributes = new ArrayList<TagAttribute>();
    566       }
    567       attributes.add(attr);
    568     }
    569 
    570     /** Gets the list of attributes, note that this maybe null. */
    571     public List<TagAttribute> getAttributes() {
    572       return attributes;
    573     }
    574 
    575     /** Finds and returns a TagAttribute, or null if not found */
    576     public TagAttribute getAttribute(HTML.Attribute attr) {
    577       if (attributes != null) {
    578         for (TagAttribute attribute : attributes) {
    579           if (attribute.getAttribute().equals(attr)) {
    580             return attribute;
    581           }
    582         }
    583       }
    584       return null;
    585     }
    586 
    587     /**
    588      * Finds and returns list of TagAttribute of given attribute
    589      * type, or empty list if not found,
    590      */
    591     public List<TagAttribute> getAttributes(HTML.Attribute attr) {
    592       List<TagAttribute> result = Lists.newArrayList();
    593       if (attributes != null) {
    594         for (TagAttribute attribute : attributes) {
    595           if (attribute.getAttribute().equals(attr)) {
    596             result.add(attribute);
    597           }
    598         }
    599       }
    600       return result;
    601     }
    602 
    603     /** Returns debug string */
    604     @Override
    605     public String toString() {
    606       StringBuilder sb = new StringBuilder();
    607       sb.append("Start Tag: ");
    608       sb.append(element.getName());
    609       if (attributes != null) {
    610         for (TagAttribute attr : attributes) {
    611           sb.append(' ');
    612           sb.append(attr.toString());
    613         }
    614       }
    615       return sb.toString();
    616     }
    617 
    618     /** Implements Node.accept */
    619     @Override
    620     public void accept(Visitor visitor) {
    621       visitor.visitTag(this);
    622     }
    623 
    624     /** Implements Node.toHTML */
    625     @Override
    626     public void toHTML(StringBuilder sb) {
    627       serialize(sb, SerializeType.HTML);
    628     }
    629 
    630     @Override
    631     public void toXHTML(StringBuilder sb) {
    632       serialize(sb, SerializeType.XHTML);
    633     }
    634 
    635     @Override
    636     public void toOriginalHTML(StringBuilder sb) {
    637       serialize(sb, SerializeType.ORIGINAL_HTML);
    638     }
    639 
    640     /**
    641      * Specifies format of serialized output.
    642      */
    643     private enum SerializeType {
    644       ORIGINAL_HTML, HTML, XHTML
    645     }
    646 
    647     private void serialize(StringBuilder sb, SerializeType type) {
    648       // before attributes
    649       if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) {
    650         sb.append(originalHtmlBeforeAttributes);
    651       } else {
    652         sb.append('<');
    653         sb.append(element.getName());
    654       }
    655 
    656       // attributes
    657       if (attributes != null) {
    658         for (TagAttribute attr : attributes) {
    659           // attribute includes leading whitespace, so we needn't add it here
    660           if (type == SerializeType.ORIGINAL_HTML) {
    661             attr.toOriginalHTML(sb);
    662           } else if (type == SerializeType.HTML) {
    663             attr.toHTML(sb);
    664           } else {
    665             attr.toXHTML(sb);
    666           }
    667         }
    668       }
    669 
    670       // after attributes
    671       if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) {
    672         sb.append(originalHtmlAfterAttributes);
    673       } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) {
    674         sb.append(" />");
    675       } else {
    676         sb.append('>');
    677       }
    678     }
    679 
    680     public boolean isSelfTerminating() {
    681       return isSelfTerminating;
    682     }
    683 
    684     public String getOriginalHtmlBeforeAttributes() {
    685       return originalHtmlBeforeAttributes;
    686     }
    687 
    688     public String getOriginalHtmlAfterAttributes() {
    689       return originalHtmlAfterAttributes;
    690     }
    691   }
    692 
    693   /**
    694    * EndTag is a closing HTML tag.
    695    */
    696   public static class EndTag extends Node {
    697     // The element
    698     private final HTML.Element element;
    699 
    700     private final String originalHtml;
    701 
    702     /**
    703      * @param element The HTML.Element element.  Can not be null.
    704      * @param originalHtml Full content of original tag, including beginning
    705      * and ending '<' and '>'.  If not null, tag will preserve this original
    706      * content. e.g., if original tag were "&lt;/foo &gt;", the space after foo
    707      * would be preserved.  This method does not validate that originalHtml is a
    708      * valid tag String.
    709      */
    710     private EndTag(HTML.Element element, String originalHtml) {
    711       X.assertTrue(element != null);
    712       this.element = element;
    713       this.originalHtml = originalHtml;
    714     }
    715 
    716     /** Gets the name */
    717     public String getName() {
    718       return element.getName();
    719     }
    720 
    721     /** Gets the element */
    722     public HTML.Element getElement() {
    723       return element;
    724     }
    725 
    726     /** Returns debug string */
    727     @Override
    728     public String toString() {
    729       return "End Tag: " + element.getName();
    730     }
    731 
    732     /** Implements Node.accept */
    733     @Override
    734     public void accept(Visitor visitor) {
    735       visitor.visitEndTag(this);
    736     }
    737 
    738     /** Implements Node.toHTML */
    739     @Override
    740     public void toHTML(StringBuilder sb) {
    741       sb.append("</");
    742       sb.append(element.getName());
    743       sb.append('>');
    744     }
    745 
    746     @Override
    747     public void toXHTML(StringBuilder sb) {
    748       toHTML(sb);
    749     }
    750 
    751     @Override
    752     public void toOriginalHTML(StringBuilder sb) {
    753       if (originalHtml != null) {
    754         sb.append(originalHtml);
    755       } else {
    756         toHTML(sb);
    757       }
    758     }
    759   }
    760 
    761   /**
    762    * TagAttribute represents an attribute in a HTML tag.
    763    */
    764   public static class TagAttribute {
    765     private final HTML.Attribute attribute;
    766     private String value;
    767     private String originalHtml;
    768 
    769     /**
    770      * @param attribute the HTML.Attribute. Can't be null.
    771      * @param value The value in plain-text format. This can be null if the
    772      *        attribute has no value.
    773      * @param originalHtml If not null, toOriginalHTML() will preserve original
    774      *        content. This should contain any leading whitespace from the
    775      *        original.
    776      */
    777     private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) {
    778       X.assertTrue(attribute != null);
    779       this.attribute = attribute;
    780       this.value = value;
    781       this.originalHtml = originalHtml;
    782     }
    783 
    784     /** Gets the name */
    785     public String getName() {
    786       return attribute.getName();
    787     }
    788 
    789     /** Gets the HTML.Attribute information */
    790     public HTML.Attribute getAttribute() {
    791       return attribute;
    792     }
    793 
    794     /**
    795      * Sets the attribute value.
    796      * This value must be in plain-text, not html-escaped.
    797      * This can be null, if the attribute has no values.
    798      * This clears <code>originalHtml_</code> if it were set, so
    799      * <code>toOriginalHTML()</code> might not preserve original any more.
    800      */
    801     public void setValue(String value) {
    802       this.value = value;
    803       originalHtml = null;
    804     }
    805 
    806     /** Returns the attribute value in plain-text, never null */
    807     public String getValue() {
    808       return value != null ? value : "";
    809     }
    810 
    811     /** Returns true if the attribute value is not empty */
    812     public boolean hasValue() {
    813       return value != null;
    814     }
    815 
    816     /**
    817      * Writes out the attribute in HTML format with all necessary preceding
    818      * whitespace. Emits originalHtml_ if it were specified to the constructor.
    819      * Otherwise, emits a new name="value" string with a single preceding space.
    820      */
    821     public void toHTML(StringBuilder sb) {
    822       sb.append(' ');
    823       sb.append(attribute.getName());
    824       if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) {
    825         sb.append("=\"");
    826         sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
    827         sb.append("\"");
    828       }
    829     }
    830 
    831     /** Returns the attribute html string */
    832     public String toHTML() {
    833       StringBuilder sb = new StringBuilder();
    834       toHTML(sb);
    835       return sb.toString();
    836     }
    837 
    838     /**
    839      * Writes out the attribute in XHTML format (value is always appended,
    840      * even if it is empty) with all necessary preceeding whitespace.
    841      */
    842     public void toXHTML(StringBuilder sb) {
    843       sb.append(' ');
    844       sb.append(attribute.getName()).append("=\"");
    845 
    846       // Assume that value-less attribute are boolean attributes like "disabled"
    847       if (hasValue()) {
    848         sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
    849       } else {
    850         sb.append(attribute.getName());
    851       }
    852 
    853       sb.append("\"");
    854     }
    855 
    856     /** Returns the attribute XHTML string */
    857     public String toXHTML() {
    858       StringBuilder sb = new StringBuilder();
    859       toXHTML(sb);
    860       return sb.toString();
    861     }
    862 
    863     /**
    864      * @param sb Destination to which attribute is written, in its original
    865      * preparsed form if possible.
    866      */
    867     public void toOriginalHTML(StringBuilder sb) {
    868       if (originalHtml != null) {
    869         sb.append(originalHtml);
    870       } else {
    871         toHTML(sb);
    872       }
    873     }
    874 
    875     /**
    876      * Writes out the attribute in its original form as it was parsed..
    877      */
    878     public String toOriginalHTML() {
    879       StringBuilder sb = new StringBuilder();
    880       toOriginalHTML(sb);
    881       return sb.toString();
    882     }
    883 
    884     @Override
    885     public String toString() {
    886       return "{" + attribute.getName() + "=" + value + "}";
    887     }
    888   }
    889 
  /**
   * Filter is like Visitor, except it implies that the nodes may be changed,
   * whereas HtmlDocument.Visitor just implies that the nodes are iterated
   * over. A Filter can behave just like a Visitor if it merely returns the
   * same node that it visited. Also, methods may be called on a node to change
   * the values it contains. Alternatively, a new node entirely can be created
   * and returned, which will essentially replace the previous node with the
   * new node in the document tree. A node may be removed by returning null
   * instead of a node.
   */
  public static interface Filter {
    /** This is called first */
    void start();

    /**
     * A text node.
     * @param n the text node being visited
     * @return the node to use in its place, or null to remove it
     */
    Text visitText(Text n);

    /**
     * An open tag.
     * @param n the open tag being visited
     * @return the node to use in its place, or null to remove it
     */
    Tag visitTag(Tag n);

    /**
     * End tag.
     * @param n the end tag being visited
     * @return the node to use in its place, or null to remove it
     */
    EndTag visitEndTag(EndTag n);

    /**
     * HTML comment.
     * @param n the comment being visited
     * @return the node to use in its place, or null to remove it
     */
    Comment visitComment(Comment n);

    /** Called at the end. */
    void finish();
  }
    919 
  /**
   * Like Filter, except each node may be replaced by multiple nodes.  Also,
   * does not do double dispatch accept/visit.
   *
   * <p>Instead of returning a replacement, an implementation appends zero or
   * more nodes to the {@code out} list it is handed; appending nothing drops
   * the node.
   */
  public static interface MultiplexFilter {
    /**
     * Called first.
     */
    void start();

    /**
     * Filters a single node.
     *
     * @param originalNode node to filter
     * @param out Destination to which this object appends nodes to replace
     * originalNode.  Can not be null.
     */
    void filter(Node originalNode, List<Node> out);

    /**
     * Called at the end.
     * @param out Destination to which this object appends nodes at the end of
     * the document.  Can not be null.
     */
    void finish(List<Node> out);
  }
    944 
    945   /**
    946    * Converts a normal {@link Filter} into a {@link MultiplexFilter}.
    947    */
    948   public static class MultiplexFilterAdapter implements MultiplexFilter {
    949 
    950     private final Filter filter;
    951 
    952     public MultiplexFilterAdapter(Filter filter) {
    953       this.filter = filter;
    954     }
    955 
    956     public void start() {
    957       filter.start();
    958     }
    959 
    960     public void filter(Node originalNode, List<Node> out) {
    961       if (originalNode == null) {
    962         return;
    963       }
    964 
    965       Node resultNode;
    966       if (originalNode instanceof Tag) {
    967         resultNode = filter.visitTag((Tag) originalNode);
    968       } else if (originalNode instanceof Text) {
    969         resultNode = filter.visitText((Text) originalNode);
    970       } else if (originalNode instanceof EndTag) {
    971         resultNode = filter.visitEndTag((EndTag) originalNode);
    972       } else if (originalNode instanceof Comment) {
    973         resultNode = filter.visitComment((Comment) originalNode);
    974       } else {
    975         throw new IllegalArgumentException("unknown node type: " + originalNode.getClass());
    976       }
    977 
    978       if (resultNode != null) {
    979         out.add(resultNode);
    980       }
    981     }
    982 
    983     public void finish(List<Node> out) {
    984       filter.finish();
    985     }
    986   }
    987 
    988   /**
    989    * Like Filter, except each node may be replaced by multiple nodes.  Also,
    990    * does not do double dispatch accept/visit.  Dispatches filterNode() to
    991    * node-specific methods.
    992    */
    993   public static abstract class SimpleMultiplexFilter implements MultiplexFilter {
    994 
    995     /**
    996      * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
    997      */
    998     public void filter(Node originalNode, List<Node> out) {
    999       if (originalNode == null) {
   1000         return;
   1001       }
   1002 
   1003       if (originalNode instanceof Tag) {
   1004         filterTag((Tag) originalNode, out);
   1005       } else if (originalNode instanceof Text) {
   1006         filterText((Text) originalNode, out);
   1007       } else if (originalNode instanceof EndTag) {
   1008         filterEndTag((EndTag) originalNode, out);
   1009       } else if (originalNode instanceof Comment) {
   1010         filterComment((Comment) originalNode, out);
   1011       } else {
   1012         throw new IllegalArgumentException("unknown node type: "
   1013             + originalNode.getClass());
   1014       }
   1015     }
   1016 
   1017     public abstract void filterTag(Tag originalTag, List<Node> out);
   1018 
   1019     public abstract void filterText(Text originalText, List<Node> out);
   1020 
   1021     public abstract void filterEndTag(EndTag originalEndTag, List<Node> out);
   1022 
   1023     public void filterComment(Comment originalComment, List<Node> out) {
   1024     }
   1025   }
   1026 
   1027   /**
   1028    * Contains a list of filters which are applied, in order, to each Node.  The
   1029    * output of each becomes the input to the next.  As soon as one returns an
   1030    * empty list it breaks the chain.
   1031    */
   1032   public static class MultiplexFilterChain implements MultiplexFilter {
   1033 
   1034     private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>();
   1035 
   1036     /**
   1037      * @param sourceFilters these filters are applied in List order
   1038      */
   1039     public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) {
   1040       filters.addAll(sourceFilters);
   1041     }
   1042 
   1043     /**
   1044      * @see HtmlDocument.MultiplexFilter#start()
   1045      */
   1046     public void start() {
   1047       for (MultiplexFilter filter : filters) {
   1048         filter.start();
   1049       }
   1050     }
   1051 
   1052     /**
   1053      * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
   1054      */
   1055     public void filter(Node originalNode, List<Node> out) {
   1056       List<Node> result = new ArrayList<Node>();
   1057       result.add(originalNode);
   1058 
   1059       // loop through filters until one returns nothing, or until we're out of
   1060       // filters
   1061       for (MultiplexFilter filter : filters) {
   1062         if (result.isEmpty()) {
   1063           return;
   1064         }
   1065 
   1066         // apply filter to each node and collect results
   1067         List<Node> newResult = new ArrayList<Node>();
   1068         for (Node node : result) {
   1069           filter.filter(node, newResult);
   1070         }
   1071         result = newResult;
   1072       }
   1073 
   1074       out.addAll(result);
   1075     }
   1076 
   1077     /**
   1078      * @see HtmlDocument.MultiplexFilter#finish(List)
   1079      */
   1080     public void finish(List<Node> out) {
   1081       List<Node> result = new ArrayList<Node>();
   1082 
   1083       // loop through filters until one returns nothing, or until we're out of
   1084       // filters
   1085       for (MultiplexFilter filter : filters) {
   1086         // apply filter to each node and collect results
   1087         List<Node> newResult = new ArrayList<Node>();
   1088         for (Node node : result) {
   1089           filter.filter(node, newResult);
   1090         }
   1091         filter.finish(newResult);
   1092         result = newResult;
   1093       }
   1094 
   1095       out.addAll(result);
   1096     }
   1097   }
   1098 
  /**
   * Html visitor allows external code to iterate through the nodes in the
   * document. See HtmlDocument.accept.
   *
   * <p>Unlike {@link Filter}, the visit methods return nothing, so a Visitor
   * cannot replace or remove the nodes it is shown.
   */
  public static interface Visitor {
    /** This is called first */
    void start();

    /** A text node */
    void visitText(Text n);

    /** An open tag */
    void visitTag(Tag n);

    /** End tag */
    void visitEndTag(EndTag n);

    /** comment */
    void visitComment(Comment n);

    /** Called at the end. */
    void finish();
  }
   1122 
   1123   /**
   1124    * An implementation of the Visitor interface which simply delegates its
   1125    * methods to a wrapped instance of another Visitor.
   1126    *
   1127    * <p>This is useful for chaining Visitors together.
   1128    */
   1129   public static class VisitorWrapper implements Visitor {
   1130     private final Visitor wrapped;
   1131 
   1132     protected VisitorWrapper(Visitor wrap) {
   1133       wrapped = wrap;
   1134     }
   1135 
   1136     public void start() {
   1137       wrapped.start();
   1138     }
   1139 
   1140     public void visitText(Text n) {
   1141       wrapped.visitText(n);
   1142     }
   1143 
   1144     public void visitTag(Tag n) {
   1145       wrapped.visitTag(n);
   1146     }
   1147 
   1148     public void visitEndTag(EndTag n) {
   1149       wrapped.visitEndTag(n);
   1150     }
   1151 
   1152     public void visitComment(Comment n) {
   1153       wrapped.visitComment(n);
   1154     }
   1155 
   1156     public void finish() {
   1157       wrapped.finish();
   1158     }
   1159   }
   1160 
   1161   /**
   1162    * A special helper Visitor that builds a HtmlDocument.
   1163    */
   1164   public static class Builder implements Visitor {
   1165     private final boolean preserveComments;
   1166     private final List<Node> nodes = new ArrayList<Node>();
   1167     private HtmlDocument doc;
   1168 
   1169     /**
   1170      * @see Builder#Builder(boolean)
   1171      */
   1172     public Builder() {
   1173       this(false);
   1174     }
   1175 
   1176     /**
   1177      * @param preserveComments If false, ignores Comment nodes
   1178      */
   1179     public Builder(boolean preserveComments) {
   1180       this.preserveComments = preserveComments;
   1181     }
   1182 
   1183     public void addNode(Node node) {
   1184       nodes.add(node);
   1185     }
   1186     public void start() {
   1187     }
   1188     public void visitText(Text t) {
   1189       addNode(t);
   1190     }
   1191     public void visitTag(Tag t) {
   1192       addNode(t);
   1193     }
   1194     public void visitComment(Comment n) {
   1195       if (preserveComments) {
   1196         addNode(n);
   1197       }
   1198     }
   1199     public void visitEndTag(EndTag t) {
   1200       addNode(t);
   1201     }
   1202     public void finish() {
   1203       doc = new HtmlDocument(nodes);
   1204     }
   1205 
   1206     /** Gets the html document that has been constructed */
   1207     public HtmlDocument getDocument() {
   1208       return doc;
   1209     }
   1210   }
   1211 
   1212   /**
   1213    * A Visitor that prints out the html document in debug format.
   1214    */
   1215   public static class DebugPrinter implements Visitor {
   1216 
   1217     private final PrintWriter writer;
   1218 
   1219     public DebugPrinter(PrintWriter writer) {
   1220       this.writer = writer;
   1221     }
   1222 
   1223     public void start() {
   1224     }
   1225 
   1226     public void visitText(Text t) {
   1227       writeCollapsed("TEXT", t.getText());
   1228     }
   1229 
   1230     public void visitComment(Comment n) {
   1231       writeCollapsed("COMMENT", n.getContent());
   1232     }
   1233 
   1234     private void writeCollapsed(String type, String s) {
   1235       writer.print(type);
   1236       writer.print(": ");
   1237       String noNewlines = s.replace("\n", " ");
   1238       // Use CharMatcher#WHITESPACE?
   1239       String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' ');
   1240       writer.print(collapsed);
   1241     }
   1242 
   1243     public void visitTag(Tag tag) {
   1244       writer.print("==<" + tag.getName() + ">");
   1245       List<TagAttribute> attributes = tag.getAttributes();
   1246       if (attributes != null) {
   1247 
   1248         // Attribute values
   1249         List<String> attrs = new ArrayList<String>();
   1250         for (TagAttribute a : attributes) {
   1251           attrs.add("[" + a.getName() + " : " + a.getValue() + "]");
   1252         }
   1253         String[] array = attrs.toArray(new String[attrs.size()]);
   1254 
   1255         // Sort the attributes so that it's easier to read and compare
   1256         Arrays.sort(array);
   1257         for (int i = 0; i < array.length; i++) {
   1258           writer.print(" " + array[i]);
   1259         }
   1260       }
   1261       writer.println();
   1262     }
   1263 
   1264     public void visitEndTag(EndTag endtag) {
   1265       writer.println("==</" + endtag.getName() + ">");
   1266     }
   1267 
   1268     public void finish() {
   1269     }
   1270   }
   1271 
   1272 }