/**
 * Copyright (c) 2004, Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.android.mail.common.html.parser;

import com.google.android.mail.common.base.CharEscapers;
import com.google.android.mail.common.base.CharMatcher;
import com.google.android.mail.common.base.StringUtil;
import com.google.android.mail.common.base.X;
import com.google.common.collect.Lists;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


/**
 * HtmlDocument is a container for a flat list of html nodes, and represents
 * the entire html document. It contains toHTML() method which prints out the
 * html text, toXHTML for printing out XHTML text, toOriginalHTML() for
 * reproducing the preparsed content where it was preserved, and toString()
 * which prints out in debug format.
 *
 * <p>The node list passed to the constructor is stored and returned by
 * {@link #getNodes()} as-is; no defensive copy is made.
 *
 * @author jlim (at) google.com (Jing Yee Lim)
 */
public class HtmlDocument {
  /** List of Node objects, in document order. */
  private final List<Node> nodes;

  /**
   * Creates a Html document.
   * @param nodes list of html nodes; the list is kept by reference, not copied
   */
  public HtmlDocument(List<Node> nodes) {
    this.nodes = nodes;
  }

  /** Gets the list of nodes (the same list instance given to the constructor). */
  public List<Node> getNodes() {
    return nodes;
  }

  /** Returns a HTML string for the current document */
  public String toHTML() {
    // Initial capacity is only a rough guess of 10 characters per node.
    StringBuilder sb = new StringBuilder(nodes.size() * 10);
    for (Node n : nodes) {
      n.toHTML(sb);
    }
    return sb.toString();
  }

  /** Returns a XHTML string for the current document */
  public String toXHTML() {
    StringBuilder sb = new StringBuilder(nodes.size() * 10);
    for (Node n : nodes) {
      n.toXHTML(sb);
    }
    return sb.toString();
  }

  /**
   * Returns, as much as possible, original content of preparsed nodes. This
   * is only different from toHTML() if the nodes were created with original
   * content, e.g., by HtmlParser in preserve mode.
   */
  public String toOriginalHTML() {
    StringBuilder sb = new StringBuilder(nodes.size() * 10);
    for (Node n : nodes) {
      n.toOriginalHTML(sb);
    }
    return sb.toString();
  }

  /** Returns the HTML document in debug format (see {@link DebugPrinter}). */
  @Override
  public String toString() {
    StringWriter strWriter = new StringWriter();
    accept(new DebugPrinter(new PrintWriter(strWriter)));
    return strWriter.toString();
  }

  /**
   * Creates start Tag Node without preserved original HTML.
   * @see HtmlDocument#createTag(HTML.Element, List, String, String)
   */
  public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) {
    return createTag(element, attributes, null, null);
  }

  /**
   * Creates start Tag Node.
   * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
   */
  public static Tag createTag(HTML.Element element,
      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
      String originalHtmlAfterAttributes) {
    return new Tag(element, attributes, false, originalHtmlBeforeAttributes,
        originalHtmlAfterAttributes);
  }

  /**
   * Creates self-terminating Tag Node without preserved original HTML.
   * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String)
   */
  public static Tag createSelfTerminatingTag(HTML.Element element,
      List<TagAttribute> attributes) {
    return createSelfTerminatingTag(element, attributes, null, null);
  }

  /**
   * Creates self-terminating Tag Node.
   * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
   */
  public static Tag createSelfTerminatingTag(HTML.Element element,
      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
      String originalHtmlAfterAttributes) {
    return new Tag(element, attributes, true, originalHtmlBeforeAttributes,
        originalHtmlAfterAttributes);
  }

  /**
   * Creates an EndTag node without preserved original HTML.
   * @see HtmlDocument#createEndTag(HTML.Element, String)
   */
  public static EndTag createEndTag(HTML.Element element) {
    return createEndTag(element, null);
  }

  /**
   * Creates an EndTag node.
   * @see HtmlDocument.EndTag#EndTag(HTML.Element, String)
   */
  public static EndTag createEndTag(HTML.Element element, String originalHtml) {
    return new EndTag(element, originalHtml);
  }

  /**
   * Creates a TagAttribute without preserved original HTML.
   * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String)
   */
  public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) {
    return createTagAttribute(attr, value, null);
  }

  /**
   * Creates a TagAttribute. {@code attr} is asserted to be non-null.
   * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String)
   */
  public static TagAttribute createTagAttribute(HTML.Attribute attr,
      String value, String originalHtml) {
    X.assertTrue(attr != null);
    return new TagAttribute(attr, value, originalHtml);
  }

  /**
   * Creates a Text node from plain text, without preserved original HTML.
   * @see HtmlDocument#createText(String, String)
   */
  public static Text createText(String text) {
    return createText(text, null);
  }

  /**
   * Creates a Text node from plain (already unescaped) text.
   * @see UnescapedText#UnescapedText(String, String)
   */
  public static Text createText(String text, String original) {
    return new UnescapedText(text, original);
  }

  /**
   * Creates a Text node where the content hasn't been unescaped yet (this will
   * be done lazily).
   */
  public static Text createEscapedText(String htmlText, String original) {
    return new EscapedText(htmlText, original);
  }

  /**
   * Creates a Comment node.
   * @see Comment#Comment(String)
   */
  public static Comment createHtmlComment(String content) {
    return new Comment(content);
  }

  /**
   * Creates a CDATA node.
   * @see CDATA#CDATA(String)
   */
  public static CDATA createCDATA(String text) {
    return new CDATA(text);
  }

  /**
   * Accepts a Visitor: calls {@code start()}, then visits every node in
   * document order, then calls {@code finish()}.
   */
  public void accept(Visitor v) {
    v.start();
    for (Node node : nodes) {
      node.accept(v);
    }
    v.finish();
  }

  /**
   * Runs every node through the filter, between the filter's {@code start()}
   * and {@code finish(List)} calls. This document is not modified.
   *
   * @param filter results of this filter replace the existing nodes
   * @return new document with filtered nodes
   */
  public HtmlDocument filter(MultiplexFilter filter) {
    filter.start();
    List<Node> newNodes = new ArrayList<Node>();
    for (Node node : nodes) {
      filter.filter(node, newNodes);
    }
    filter.finish(newNodes);
    return new HtmlDocument(newNodes);
  }

  /**
   * Html node. Each serialization format has a String-returning convenience
   * method that delegates to the abstract StringBuilder-appending variant.
   */
  public static abstract class Node {

    /** Accepts a visitor */
    public abstract void accept(Visitor visitor);

    /** Converts to HTML */
    public String toHTML() {
      StringBuilder sb = new StringBuilder();
      toHTML(sb);
      return sb.toString();
    }

    /**
     * Converts to HTML
     * @param sb Destination to which the HTML is appended.
     */
    public abstract void toHTML(StringBuilder sb);

    /** Converts to XHTML */
    public String toXHTML() {
      StringBuilder sb = new StringBuilder();
      toXHTML(sb);
      return sb.toString();
    }

    /**
     * Converts to XHTML
     * @param sb Destination to which the XHTML is appended.
     */
    public abstract void toXHTML(StringBuilder sb);

    /**
     * @return Original if it's available; otherwise, returns
     * <code>toHTML()</code>
     */
    public String toOriginalHTML() {
      StringBuilder sb = new StringBuilder();
      toOriginalHTML(sb);
      return sb.toString();
    }

    /**
     * @param sb Destination of HTML to be appended.  Appends original if it's
     * available; otherwise, appends <code>toHTML()</code>
     */
    public abstract void toOriginalHTML(StringBuilder sb);
  }

  /**
   * HTML comment node. The raw comment is emitted unchanged by every
   * serialization format.
   */
  public static class Comment extends Node {

    private final String content;

    /**
     * @param content Raw comment, including "&lt;!--" and "--&gt;".
     */
    public Comment(String content) {
      this.content = content;
    }

    @Override
    public void accept(Visitor visitor) {
      visitor.visitComment(this);
    }

    /**
     * Emit original unchanged.
     * @param sb Destination of result.
     */
    @Override
    public void toHTML(StringBuilder sb) {
      sb.append(content);
    }

    /**
     * Emit original unchanged.
     * @param sb Destination of result.
     */
    @Override
    public void toXHTML(StringBuilder sb) {
      sb.append(content);
    }

    /**
     * Emit original unchanged.
     * @param sb Destination of result.
     */
    @Override
    public void toOriginalHTML(StringBuilder sb) {
      sb.append(content);
    }

    /**
     * @return Original unchanged.
     */
    public String getContent() {
      return content;
    }
  }

  /**
   * Text node
   */
  public static abstract class Text extends Node {

    /**
     * unaltered original content of this node; may be null
     */
    private final String originalHtml;

    /**
     * content of this node in HTML format; computed and cached on the first
     * call to {@link #toHTML(StringBuilder)}
     */
    private String html;

    /**
     * @param originalHtml Unaltered original HTML. If not null,
     *        toOriginalHTML() will return this.
     */
    protected Text(String originalHtml) {
      this.originalHtml = originalHtml;
    }

    /**
     * Gets the plain, unescaped text.
     */
    abstract public String getText();

    /**
     * Returns true if the text contains only white space, as defined by
     * {@link Character#isWhitespace(char)}. Empty text also yields true.
     */
    public boolean isWhitespace() {
      String text = getText();
      int len = text.length();
      for (int i = 0; i < len; i++) {
        if (!Character.isWhitespace(text.charAt(i))) {
          return false;
        }
      }
      return true;
    }

    /**
     * Compares this node with another Text node by their original HTML only;
     * the value of {@link #getText()} is not compared.
     */
    // NOTE(review): two Text nodes that both lack original HTML compare equal
    // regardless of their text -- confirm that this is intended.
    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o instanceof Text) {
        Text that = (Text) o;

        return this.originalHtml == null ? that.originalHtml == null
            : this.originalHtml.equals(that.originalHtml);
      }
      return false;
    }

    /** Hash of the original HTML (0 if there is none), matching equals(). */
    @Override
    public int hashCode() {
      return originalHtml == null ? 0 : originalHtml.hashCode();
    }

    /** Returns the plain, unescaped text. */
    @Override
    public String toString() {
      return getText();
    }

    /** Extends Node.accept */
    @Override
    public void accept(Visitor visitor) {
      visitor.visitText(this);
    }

    /**
     * Gets the HTML, with HTML entities escaped. The escaped form is computed
     * on the first call and reused afterwards.
     */
    @Override
    public void toHTML(StringBuilder sb) {
      if (html == null) {
        html = CharEscapers.asciiHtmlEscaper().escape(getText());
      }
      sb.append(html);
    }

    /**
     * @see HtmlDocument.Text#toHTML(StringBuilder)
     */
    @Override
    public void toXHTML(StringBuilder sb) {
      toHTML(sb);
    }

    /**
     * @param sb Appends original HTML to this if available.  Otherwise,
     * same as toHTML().
     */
    @Override
    public void toOriginalHTML(StringBuilder sb) {
      if (originalHtml != null) {
        sb.append(originalHtml);
      } else {
        toHTML(sb);
      }
    }

    /**
     * @return the original HTML (possibly with entities unescaped if the
     * document was malformed). May be null if original HTML was not preserved
     * (see constructor argument of {@link HtmlParser})
     */
    public String getOriginalHTML() {
      return originalHtml;
    }
  }

  /**
   * {@link Text} implementation where the given text is assumed to have been
   * already HTML unescaped.
   */
  private static class UnescapedText extends Text {
    /**
     * content of this node as plain, unescaped text
     */
    protected final String text;

    /**
     * @param plainText plain, unescaped text; asserted to be non-null
     * @param originalHtml unaltered original HTML, may be null
     */
    private UnescapedText(String plainText, String originalHtml) {
      super(originalHtml);
      X.assertTrue(plainText != null);
      this.text = plainText;
    }

    @Override public String getText() {
      return text;
    }
  }

  /**
   * {@link Text} implementation where the given text is not unescaped yet, and
   * unescaping will only be done lazily.
   */
  private static class EscapedText extends Text {
    /** HTML-escaped content as given to the constructor. */
    private final String htmlText;
    /** Unescaped content; null until the first call to getText(). */
    private String text;

    private EscapedText(String htmlText, String originalHtml) {
      super(originalHtml);
      this.htmlText = htmlText;
    }

    /** Unescapes the HTML text on first use and caches the result. */
    @Override public String getText() {
      if (text == null) {
        text = StringUtil.unescapeHTML(htmlText);
      }
      return text;
    }
  }

  /**
   * CDATA node is a subclass of Text node. The given text is used both as the
   * plain text and as the original HTML.
   */
  public static class CDATA extends UnescapedText {
    private CDATA(String text) {
      super(text, text);
    }

    @Override public void toHTML(StringBuilder sb) {
      // Do not htmlescape CDATA text
      sb.append(text);
    }

    /** Emits the text wrapped in a {@code <![CDATA[ ... ]]>} section. */
    @Override public void toXHTML(StringBuilder sb) {
      sb.append("<![CDATA[")
          .append(text)
          .append("]]>");
    }
  }

  /**
   * Tag is a HTML open tag.
   */
  public static class Tag extends Node {
    // The element
    private final HTML.Element element;

    // List of TagAttribute objects. This may be null.
    private List<TagAttribute> attributes;

    private final boolean isSelfTerminating;

    private final String originalHtmlBeforeAttributes;

    private final String originalHtmlAfterAttributes;

    /**
     * @param element the HTML4 element; asserted to be non-null
     * @param attributes list of TagAttribute objects, may be null. The list
     *        is kept by reference and is appended to by addAttribute().
     * @param isSelfTerminating whether the tag is self-terminating; such tags
     *        are closed with " /&gt;" in XHTML output
     * @param originalHtmlBeforeAttributes Original tag's full content before
     *        first attribute, including beginning '&lt;'. This should not
     *        include preceding whitespace for the first attribute, as that
     *        should be included in the attribute node. If not null, tag will
     *        preserve this original content. e.g., if original tag were
     *        "&lt;foO bar='zbc'&gt;", case of foO would be preserved. This
     *        method does not validate that
     *        <code>originalHtmlBeforeAttributes</code> is a valid tag String.
     * @param originalHtmlAfterAttributes Full content of original tag after
     *        last attribute, including ending '&gt;'. If not null, tag will
     *        preserve this original content. e.g., if original tag were
     *        "&lt;foo bar='zbc'  &gt;", the spaces before '&gt;' would be
     *        preserved. This method does not validate that
     *        <code>originalHtmlAfterAttributes</code> is a valid tag String.
     */
    private Tag(HTML.Element element, List<TagAttribute> attributes,
        boolean isSelfTerminating, String originalHtmlBeforeAttributes,
        String originalHtmlAfterAttributes) {
      X.assertTrue(element != null);
      this.element = element;
      this.attributes = attributes;
      this.isSelfTerminating = isSelfTerminating;
      this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes;
      this.originalHtmlAfterAttributes = originalHtmlAfterAttributes;
    }

    /** Gets the name */
    public String getName() {
      return element.getName();
    }

    /** Gets the element */
    public HTML.Element getElement() {
      return element;
    }

    /** Adds an attribute with the given plain-text value and no original HTML */
    public void addAttribute(HTML.Attribute attr, String value) {
      X.assertTrue(attr != null);
      addAttribute(new TagAttribute(attr, value, null));
    }

    /** Adds an attribute, creating the attribute list if there is none yet */
    public void addAttribute(TagAttribute attr) {
      X.assertTrue(attr != null);
      if (attributes == null) {
        attributes = new ArrayList<TagAttribute>();
      }
      attributes.add(attr);
    }

    /** Gets the list of attributes, note that this may be null. */
    public List<TagAttribute> getAttributes() {
      return attributes;
    }

    /** Finds and returns the first matching TagAttribute, or null if not found */
    public TagAttribute getAttribute(HTML.Attribute attr) {
      if (attributes != null) {
        for (TagAttribute attribute : attributes) {
          if (attribute.getAttribute().equals(attr)) {
            return attribute;
          }
        }
      }
      return null;
    }

    /**
     * Finds and returns list of TagAttribute of given attribute
     * type, or empty list if not found. The returned list is newly created.
     */
    public List<TagAttribute> getAttributes(HTML.Attribute attr) {
      List<TagAttribute> result = Lists.newArrayList();
      if (attributes != null) {
        for (TagAttribute attribute : attributes) {
          if (attribute.getAttribute().equals(attr)) {
            result.add(attribute);
          }
        }
      }
      return result;
    }

    /** Returns debug string */
    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("Start Tag: ");
      sb.append(element.getName());
      if (attributes != null) {
        for (TagAttribute attr : attributes) {
          sb.append(' ');
          sb.append(attr.toString());
        }
      }
      return sb.toString();
    }

    /** Implements Node.accept */
    @Override
    public void accept(Visitor visitor) {
      visitor.visitTag(this);
    }

    /** Implements Node.toHTML */
    @Override
    public void toHTML(StringBuilder sb) {
      serialize(sb, SerializeType.HTML);
    }

    /** Implements Node.toXHTML */
    @Override
    public void toXHTML(StringBuilder sb) {
      serialize(sb, SerializeType.XHTML);
    }

    /** Implements Node.toOriginalHTML */
    @Override
    public void toOriginalHTML(StringBuilder sb) {
      serialize(sb, SerializeType.ORIGINAL_HTML);
    }

    /**
     * Specifies format of serialized output.
     */
    private enum SerializeType {
      ORIGINAL_HTML, HTML, XHTML
    }

    /**
     * Appends this tag to {@code sb} in the requested format. The preserved
     * original fragments are used only for ORIGINAL_HTML and only where they
     * are non-null; otherwise the tag is regenerated from the element name.
     */
    private void serialize(StringBuilder sb, SerializeType type) {
      // before attributes
      if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) {
        sb.append(originalHtmlBeforeAttributes);
      } else {
        sb.append('<');
        sb.append(element.getName());
      }

      // attributes
      if (attributes != null) {
        for (TagAttribute attr : attributes) {
          // attribute includes leading whitespace, so we needn't add it here
          if (type == SerializeType.ORIGINAL_HTML) {
            attr.toOriginalHTML(sb);
          } else if (type == SerializeType.HTML) {
            attr.toHTML(sb);
          } else {
            attr.toXHTML(sb);
          }
        }
      }

      // after attributes
      if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) {
        sb.append(originalHtmlAfterAttributes);
      } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) {
        // XHTML closes self-terminating tags and empty elements in place
        sb.append(" />");
      } else {
        sb.append('>');
      }
    }

    /** Returns the self-terminating flag given at construction. */
    public boolean isSelfTerminating() {
      return isSelfTerminating;
    }

    /** Returns the preserved original HTML before the first attribute, may be null. */
    public String getOriginalHtmlBeforeAttributes() {
      return originalHtmlBeforeAttributes;
    }

    /** Returns the preserved original HTML after the last attribute, may be null. */
    public String getOriginalHtmlAfterAttributes() {
      return originalHtmlAfterAttributes;
    }
  }

  /**
   * EndTag is a closing HTML tag.
   */
  public static class EndTag extends Node {
    // The element
    private final HTML.Element element;

    // Preserved original content of the tag. This may be null.
    private final String originalHtml;

    /**
     * @param element The HTML.Element element.  Can not be null.
     * @param originalHtml Full content of original tag, including beginning
     * and ending '&lt;' and '&gt;'. If not null, tag will preserve this
     * original content. e.g., if original tag were "&lt;/foo &gt;", the space
     * after foo would be preserved. This method does not validate that
     * originalHtml is a valid tag String.
     */
    private EndTag(HTML.Element element, String originalHtml) {
      X.assertTrue(element != null);
      this.element = element;
      this.originalHtml = originalHtml;
    }

    /** Gets the name */
    public String getName() {
      return element.getName();
    }

    /** Gets the element */
    public HTML.Element getElement() {
      return element;
    }

    /** Returns debug string */
    @Override
    public String toString() {
      return "End Tag: " + element.getName();
    }

    /** Implements Node.accept */
    @Override
    public void accept(Visitor visitor) {
      visitor.visitEndTag(this);
    }

    /** Implements Node.toHTML */
    @Override
    public void toHTML(StringBuilder sb) {
      sb.append("</");
      sb.append(element.getName());
      sb.append('>');
    }

    /** Same output as toHTML(StringBuilder). */
    @Override
    public void toXHTML(StringBuilder sb) {
      toHTML(sb);
    }

    /** Appends the original HTML if available; otherwise same as toHTML(). */
    @Override
    public void toOriginalHTML(StringBuilder sb) {
      if (originalHtml != null) {
        sb.append(originalHtml);
      } else {
        toHTML(sb);
      }
    }
  }

  /**
   * TagAttribute represents an attribute in a HTML tag.
   */
  public static class TagAttribute {
    private final HTML.Attribute attribute;
    private String value;
    private String originalHtml;

    /**
     * @param attribute the HTML.Attribute. Can't be null.
     * @param value The value in plain-text format. This can be null if the
     *        attribute has no value.
     * @param originalHtml If not null, toOriginalHTML() will preserve original
     *        content. This should contain any leading whitespace from the
     *        original.
     */
    private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) {
      X.assertTrue(attribute != null);
      this.attribute = attribute;
      this.value = value;
      this.originalHtml = originalHtml;
    }

    /** Gets the name */
    public String getName() {
      return attribute.getName();
    }

    /** Gets the HTML.Attribute information */
    public HTML.Attribute getAttribute() {
      return attribute;
    }

    /**
     * Sets the attribute value.
     * This value must be in plain-text, not html-escaped.
     * This can be null, if the attribute has no values.
     * This clears <code>originalHtml</code> if it were set, so
     * <code>toOriginalHTML()</code> might not preserve original any more.
     */
    public void setValue(String value) {
      this.value = value;
      originalHtml = null;
    }

    /**
     * Returns the attribute value in plain-text, never null (an attribute
     * without a value yields the empty string).
     */
    public String getValue() {
      return value != null ? value : "";
    }

    /**
     * Returns true if the attribute has a value, i.e. the value is not null.
     * An empty-string value still counts as a value.
     */
    public boolean hasValue() {
      return value != null;
    }

    /**
     * Writes out the attribute in HTML format with a single preceding space.
     * The ="value" part is emitted, with the value HTML-escaped, only if the
     * value is not null and the attribute is not of boolean type; otherwise
     * only the name is written. The original HTML is never used here; see
     * {@link #toOriginalHTML(StringBuilder)} for that.
     */
    public void toHTML(StringBuilder sb) {
      sb.append(' ');
      sb.append(attribute.getName());
      if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) {
        sb.append("=\"");
        sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
        sb.append("\"");
      }
    }

    /** Returns the attribute html string */
    public String toHTML() {
      StringBuilder sb = new StringBuilder();
      toHTML(sb);
      return sb.toString();
    }

    /**
     * Writes out the attribute in XHTML format with a single preceding space.
     * A ="..." part is always appended: the HTML-escaped value if there is
     * one (even if it is empty), otherwise the attribute name itself.
     */
    public void toXHTML(StringBuilder sb) {
      sb.append(' ');
      sb.append(attribute.getName()).append("=\"");

      // Assume that value-less attribute are boolean attributes like "disabled"
      if (hasValue()) {
        sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
      } else {
        sb.append(attribute.getName());
      }

      sb.append("\"");
    }

    /** Returns the attribute XHTML string */
    public String toXHTML() {
      StringBuilder sb = new StringBuilder();
      toXHTML(sb);
      return sb.toString();
    }

    /**
     * @param sb Destination to which attribute is written, in its original
     * preparsed form if possible; otherwise same as toHTML().
     */
    public void toOriginalHTML(StringBuilder sb) {
      if (originalHtml != null) {
        sb.append(originalHtml);
      } else {
        toHTML(sb);
      }
    }

    /**
     * Returns the attribute in its original form as it was parsed, if
     * available; otherwise the same string as toHTML().
     */
    public String toOriginalHTML() {
      StringBuilder sb = new StringBuilder();
      toOriginalHTML(sb);
      return sb.toString();
    }

    /** Returns a debug string of the form {name=value}. */
    @Override
    public String toString() {
      return "{" + attribute.getName() + "=" + value + "}";
    }
  }

  /**
   * Filter is like Visitor, except it implies that the nodes may be changed,
   * whereas HtmlDocument.Visitor just implies that the nodes are iterated
   * over. A Filter can behave just like a Visitor if it merely returns the
   * same node that it visited. Also, methods may be called on a node to change
   * the values it contains. Alternatively, a new node entirely can be created
   * and returned, which will essentially replace the previous node with the
   * new node in the document tree. A node may be removed by returning null
   * instead of a node.
   */
  public static interface Filter {
    /** This is called first */
    void start();

    /** A text node */
    Text visitText(Text n);

    /** An open tag */
    Tag visitTag(Tag n);

    /** End tag */
    EndTag visitEndTag(EndTag n);

    /** HTML comment */
    Comment visitComment(Comment n);

    /** Called at the end. */
    void finish();
  }

  /**
   * Like Filter, except each node may be replaced by multiple nodes.  Also,
   * does not do double dispatch accept/visit.
   */
  public static interface MultiplexFilter {
    /**
     * Called first.
     */
    void start();

    /**
     * @param originalNode node to filter
     * @param out Destination to which this object appends nodes to replace
     * originalNode. Can not be null.
     */
    void filter(Node originalNode, List<Node> out);

    /**
     * Called at the end.
     * @param out Destination to which this object appends nodes at the end of
     * the document. Can not be null.
     */
    void finish(List<Node> out);
  }

  /**
   * Converts a normal {@link Filter} into a {@link MultiplexFilter}.
   */
  public static class MultiplexFilterAdapter implements MultiplexFilter {

    private final Filter filter;

    public MultiplexFilterAdapter(Filter filter) {
      this.filter = filter;
    }

    public void start() {
      filter.start();
    }

    /**
     * Dispatches the node to the wrapped filter's matching visit method and
     * appends the result to {@code out} unless it is null. A null
     * {@code originalNode} is ignored.
     *
     * @throws IllegalArgumentException if the node is not a Tag, Text, EndTag
     *         or Comment
     */
    public void filter(Node originalNode, List<Node> out) {
      if (originalNode == null) {
        return;
      }

      Node resultNode;
      if (originalNode instanceof Tag) {
        resultNode = filter.visitTag((Tag) originalNode);
      } else if (originalNode instanceof Text) {
        resultNode = filter.visitText((Text) originalNode);
      } else if (originalNode instanceof EndTag) {
        resultNode = filter.visitEndTag((EndTag) originalNode);
      } else if (originalNode instanceof Comment) {
        resultNode = filter.visitComment((Comment) originalNode);
      } else {
        throw new IllegalArgumentException("unknown node type: " + originalNode.getClass());
      }

      // a null result means the wrapped filter removed the node
      if (resultNode != null) {
        out.add(resultNode);
      }
    }

    /** Calls the wrapped filter's finish(); nothing is appended to out. */
    public void finish(List<Node> out) {
      filter.finish();
    }
  }

  /**
   * Like Filter, except each node may be replaced by multiple nodes.  Also,
   * does not do double dispatch accept/visit.  Dispatches filter() to
   * node-specific methods.
   */
  public static abstract class SimpleMultiplexFilter implements MultiplexFilter {

    /**
     * Dispatches to filterTag, filterText, filterEndTag or filterComment
     * according to the node's type. A null node is ignored.
     *
     * @throws IllegalArgumentException if the node is of none of those types
     * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
     */
    public void filter(Node originalNode, List<Node> out) {
      if (originalNode == null) {
        return;
      }

      if (originalNode instanceof Tag) {
        filterTag((Tag) originalNode, out);
      } else if (originalNode instanceof Text) {
        filterText((Text) originalNode, out);
      } else if (originalNode instanceof EndTag) {
        filterEndTag((EndTag) originalNode, out);
      } else if (originalNode instanceof Comment) {
        filterComment((Comment) originalNode, out);
      } else {
        throw new IllegalArgumentException("unknown node type: "
            + originalNode.getClass());
      }
    }

    /** Filters an open tag, appending its replacement nodes to out. */
    public abstract void filterTag(Tag originalTag, List<Node> out);

    /** Filters a text node, appending its replacement nodes to out. */
    public abstract void filterText(Text originalText, List<Node> out);

    /** Filters an end tag, appending its replacement nodes to out. */
    public abstract void filterEndTag(EndTag originalEndTag, List<Node> out);

    /**
     * Filters a comment. This default implementation appends nothing to out,
     * so comments are dropped unless a subclass overrides it.
     */
    public void filterComment(Comment originalComment, List<Node> out) {
    }
  }

  /**
   * Contains a list of filters which are applied, in order, to each Node.  The
   * output of each becomes the input to the next.  As soon as one returns an
   * empty list it breaks the chain.
   */
  public static class MultiplexFilterChain implements MultiplexFilter {

    private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>();

    /**
     * @param sourceFilters these filters are applied in List order; the list
     *        contents are copied
     */
    public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) {
      filters.addAll(sourceFilters);
    }

    /**
     * Starts every filter in the chain, in order.
     * @see HtmlDocument.MultiplexFilter#start()
     */
    public void start() {
      for (MultiplexFilter filter : filters) {
        filter.start();
      }
    }

    /**
     * Passes the node through the chain; whatever the last filter produces is
     * appended to out.
     * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
     */
    public void filter(Node originalNode, List<Node> out) {
      List<Node> result = new ArrayList<Node>();
      result.add(originalNode);

      // loop through filters until one returns nothing, or until we're out of
      // filters
      for (MultiplexFilter filter : filters) {
        if (result.isEmpty()) {
          return;
        }

        // apply filter to each node and collect results
        List<Node> newResult = new ArrayList<Node>();
        for (Node node : result) {
          filter.filter(node, newResult);
        }
        result = newResult;
      }

      out.addAll(result);
    }

    /**
     * Finishes every filter in the chain, in order. Nodes emitted by the
     * earlier filters while finishing are first run through each later
     * filter's filter() before that filter's own finish() is called; what
     * remains after the last filter is appended to out.
     * @see HtmlDocument.MultiplexFilter#finish(List)
     */
    public void finish(List<Node> out) {
      List<Node> result = new ArrayList<Node>();

      // loop through all filters; unlike filter(), an empty intermediate
      // result does not stop the loop, since every filter must be finished
      for (MultiplexFilter filter : filters) {
        // apply filter to each node and collect results
        List<Node> newResult = new ArrayList<Node>();
        for (Node node : result) {
          filter.filter(node, newResult);
        }
        filter.finish(newResult);
        result = newResult;
      }

      out.addAll(result);
    }
  }

  /**
   * Html visitor allows external code to iterate through the nodes in the
   * document. See HtmlDocument.accept.
   */
  public static interface Visitor {
    /** This is called first */
    void start();

    /** A text node */
    void visitText(Text n);

    /** An open tag */
    void visitTag(Tag n);

    /** End tag */
    void visitEndTag(EndTag n);

    /** comment */
    void visitComment(Comment n);

    /** Called at the end. */
    void finish();
  }

  /**
   * An implementation of the Visitor interface which simply delegates its
   * methods to a wrapped instance of another Visitor.
   *
   * <p>This is useful for chaining Visitors together.
   */
  public static class VisitorWrapper implements Visitor {
    private final Visitor wrapped;

    /**
     * @param wrap the Visitor to which all calls are delegated
     */
    protected VisitorWrapper(Visitor wrap) {
      wrapped = wrap;
    }

    public void start() {
      wrapped.start();
    }

    public void visitText(Text n) {
      wrapped.visitText(n);
    }

    public void visitTag(Tag n) {
      wrapped.visitTag(n);
    }

    public void visitEndTag(EndTag n) {
      wrapped.visitEndTag(n);
    }

    public void visitComment(Comment n) {
      wrapped.visitComment(n);
    }

    public void finish() {
      wrapped.finish();
    }
  }

  /**
   * A special helper Visitor that builds a HtmlDocument. The document is
   * created in finish() and wraps this builder's own node list without
   * copying it.
   */
  public static class Builder implements Visitor {
    private final boolean preserveComments;
    private final List<Node> nodes = new ArrayList<Node>();
    /** The built document; null until finish() has been called. */
    private HtmlDocument doc;

    /**
     * Creates a Builder that ignores Comment nodes.
     * @see Builder#Builder(boolean)
     */
    public Builder() {
      this(false);
    }

    /**
     * @param preserveComments If false, ignores Comment nodes
     */
    public Builder(boolean preserveComments) {
      this.preserveComments = preserveComments;
    }

    /** Appends a node to the document being built. */
    public void addNode(Node node) {
      nodes.add(node);
    }
    public void start() {
    }
    public void visitText(Text t) {
      addNode(t);
    }
    public void visitTag(Tag t) {
      addNode(t);
    }
    public void visitComment(Comment n) {
      if (preserveComments) {
        addNode(n);
      }
    }
    public void visitEndTag(EndTag t) {
      addNode(t);
    }
    public void finish() {
      doc = new HtmlDocument(nodes);
    }

    /**
     * Gets the html document that has been constructed, or null if finish()
     * has not been called yet.
     */
    public HtmlDocument getDocument() {
      return doc;
    }
  }

  /**
   * A Visitor that prints out the html document in debug format.
   */
  public static class DebugPrinter implements Visitor {

    private final PrintWriter writer;

    /**
     * @param writer destination of the debug output; it is not flushed or
     *        closed by this class
     */
    public DebugPrinter(PrintWriter writer) {
      this.writer = writer;
    }

    public void start() {
    }

    /** Prints "TEXT: " followed by the whitespace-collapsed text. */
    public void visitText(Text t) {
      writeCollapsed("TEXT", t.getText());
    }

    /** Prints "COMMENT: " followed by the whitespace-collapsed comment. */
    public void visitComment(Comment n) {
      writeCollapsed("COMMENT", n.getContent());
    }

    /**
     * Prints the type label, ": ", and the string with newlines replaced by
     * spaces and whitespace trimmed and collapsed.
     */
    // NOTE(review): no line terminator is written here, whereas visitTag and
    // visitEndTag end their output with println -- confirm this is intended.
    private void writeCollapsed(String type, String s) {
      writer.print(type);
      writer.print(": ");
      String noNewlines = s.replace("\n", " ");
      // Use CharMatcher#WHITESPACE?
      String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' ');
      writer.print(collapsed);
    }

    /**
     * Prints the tag as "==&lt;name&gt;" followed by its attributes as
     * "[name : value]" entries in sorted order, then a line terminator.
     */
    public void visitTag(Tag tag) {
      writer.print("==<" + tag.getName() + ">");
      List<TagAttribute> attributes = tag.getAttributes();
      if (attributes != null) {

        // Attribute values
        List<String> attrs = new ArrayList<String>();
        for (TagAttribute a : attributes) {
          attrs.add("[" + a.getName() + " : " + a.getValue() + "]");
        }
        String[] array = attrs.toArray(new String[attrs.size()]);

        // Sort the attributes so that it's easier to read and compare
        Arrays.sort(array);
        for (int i = 0; i < array.length; i++) {
          writer.print(" " + array[i]);
        }
      }
      writer.println();
    }

    /** Prints the end tag as "==&lt;/name&gt;" on its own line. */
    public void visitEndTag(EndTag endtag) {
      writer.println("==</" + endtag.getName() + ">");
    }

    public void finish() {
    }
  }

}