1 /** 2 * Copyright (c) 2004, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.android.mail.common.html.parser; 17 18 import com.google.android.mail.common.base.CharMatcher; 19 import com.google.android.mail.common.base.Preconditions; 20 import com.google.android.mail.common.base.X; 21 import com.google.common.annotations.VisibleForTesting; 22 import com.google.common.collect.ImmutableSet; 23 24 import java.util.ArrayList; 25 import java.util.Arrays; 26 import java.util.Collections; 27 import java.util.List; 28 import java.util.Set; 29 import java.util.Stack; 30 import java.util.logging.Logger; 31 32 /** 33 * HtmlTree represents a parsed and well-formed html text, it provides 34 * methods to convert to plain text. It also provides methods to find 35 * well-formed blocks of text, for quote detection. 36 * 37 * We don't really build a html tree data structure. Instead, for 38 * efficiency, and for the ability to do a simple in-order-traversal 39 * of the tree, we simply keeps a linear list of nodes (in-order). 40 * The begin_ and end_ arrays keeps track of the starting end ending 41 * nodes: 42 * 43 * For a string node, begin_[node] = end_[node] = node 44 * For an open tag, begin_[node] = node, end_[node] = the matching end tag 45 * For a close tag, end_[node] = the matching open tag, end_[node] = node 46 * 47 * @author jlim (at) google.com (Jing Yee Lim) 48 */ 49 public class HtmlTree { 50 // http://www.w3.org/TR/html4/struct/text.html#h-9.1 51 private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n"); 52 53 /** 54 * An interface that allows clients to provide their own implementation 55 * of a {@link PlainTextConverter}. 56 */ 57 public static interface PlainTextConverterFactory { 58 /** 59 * Creates a new instance of a {@link PlainTextConverter} to convert 60 * the contents of an {@link HtmlTree} to plain text. 61 */ 62 PlainTextConverter createInstance(); 63 } 64 65 /** 66 * An interface for an object which converts a single HtmlTree into 67 * plaintext. 68 */ 69 public static interface PlainTextConverter { 70 /** 71 * Adds the given node {@code n} to plain text. 72 * 73 * @param n The node to convert to text. 74 * @param nodeNum The number of the node among the list of all notes. 75 * @param endNum The number of the ending node if this is a start node, 76 * otherwise the same as {@code nodeNum}. 77 */ 78 void addNode(HtmlDocument.Node n, int nodeNum, int endNum); 79 80 /** 81 * Returns the current length of the plain text. 82 */ 83 int getPlainTextLength(); 84 85 /** 86 * Returns the current plain text. 87 */ 88 String getPlainText(); 89 } 90 91 /** A factory that produces converters of the default implementation. */ 92 private static final PlainTextConverterFactory DEFAULT_CONVERTER_FACTORY = 93 new PlainTextConverterFactory() { 94 @Override 95 public PlainTextConverter createInstance() { 96 return new DefaultPlainTextConverter(); 97 } 98 }; 99 100 /** Contains html nodes */ 101 private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>(); 102 103 /** Keeps track of beginning and end of each node */ 104 private final Stack<Integer> begins = new Stack<Integer>(); 105 private final Stack<Integer> ends = new Stack<Integer>(); 106 107 /** Plain text (lazy creation) */ 108 private String plainText; 109 110 /** The html string (lazy creation) */ 111 private String html; 112 113 /** textPositions[node pos] = the text position */ 114 private int[] textPositions; 115 116 private PlainTextConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY; 117 118 // For debugging only 119 private static final boolean DEBUG = false; 120 121 private static final Logger logger = Logger.getLogger(HtmlTree.class.getName()); 122 123 //------------------------------------------------------------------------ 124 125 /** HtmlTree can only be constructed from this package */ 126 HtmlTree() { 127 } 128 129 /** 130 * Sets a new {@link PlainTextConverterFactory} to be used to convert 131 * the contents of this tree to plaintext. 132 */ 133 public void setPlainTextConverterFactory(PlainTextConverterFactory factory) { 134 if (factory == null) { 135 throw new NullPointerException("factory must not be null"); 136 } 137 converterFactory = factory; 138 } 139 140 /** 141 * Gets the list of node objects. A node can be either a 142 * Tag, EngTag or a String object. 143 * @return the nodes of the tree 144 */ 145 public List<HtmlDocument.Node> getNodesList() { 146 return Collections.unmodifiableList(nodes); 147 } 148 149 /** 150 * @return number of nodes 151 */ 152 public int getNumNodes() { 153 return nodes.size(); 154 } 155 156 /** 157 * Returns number of matching open tag node, or {@code endTagNodeNum} itself 158 * if it does not point to a closing tag. 159 */ 160 public int findOpenTag(int endTagNodeNum) { 161 X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size()); 162 return begins.get(endTagNodeNum); 163 } 164 165 /** 166 * Returns number of matching closing tag node, or {@code openTagNodeNum} itself 167 * if it does not point to an open tag or points to an open tag with no closing one. 168 */ 169 public int findEndTag(int openTagNodeNum) { 170 X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size()); 171 return ends.get(openTagNodeNum); 172 } 173 174 /** 175 * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself 176 * if it does not point to an open/closing tag (e.g text node or comment). 177 */ 178 public int findPairedTag(int tagNodeNum) { 179 X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size()); 180 int openNodeNum = begins.get(tagNodeNum); 181 int endNodeNum = ends.get(tagNodeNum); 182 return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum; 183 } 184 185 /** 186 * Gets the entire html. 187 */ 188 public String getHtml() { 189 return getHtml(-1); 190 } 191 192 /** 193 * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the 194 * specified size. 195 */ 196 public String getHtml(int wrapSize) { 197 if (html == null) { 198 html = getHtml(0, nodes.size(), wrapSize); 199 } 200 return html; 201 } 202 203 /** Gets parts of the html */ 204 public String getHtml(int fromNode, int toNode) { 205 return getHtml(fromNode, toNode, -1); 206 } 207 208 /** 209 * Gets parts of the html, if wrapSize is > 0, it tries 210 * to do wrapping at the specified size. 211 */ 212 public String getHtml(int fromNode, int toNode, int wrapSize) { 213 X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 214 215 int estSize = (toNode - fromNode) * 10; 216 StringBuilder sb = new StringBuilder(estSize); 217 int lastWrapIndex = 0; // used for wrapping 218 for (int n = fromNode; n < toNode; n++) { 219 HtmlDocument.Node node = nodes.get(n); 220 node.toHTML(sb); 221 // TODO: maybe we can be smarter about this and not add newlines 222 // within <pre> tags, unless the whole long line is encompassed 223 // by the <pre> tag. 224 if (wrapSize > 0) { 225 // We can only wrap if the last outputted node is an element that 226 // breaks the flow. Otherwise, we risk the possibility of inserting 227 // spaces where they shouldn't be. 228 if ((node instanceof HtmlDocument.Tag && 229 ((HtmlDocument.Tag) node).getElement().breaksFlow()) || 230 (node instanceof HtmlDocument.EndTag && 231 ((HtmlDocument.EndTag) node).getElement().breaksFlow())) { 232 // Check to see if there is a newline in the most recent node's html. 233 int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n'); 234 if (recentNewLine != -1) { 235 lastWrapIndex += recentNewLine; 236 } 237 // If the last index - last index of a newline is greater than 238 // wrapSize, add a newline. 239 if (((sb.length() - 1) - lastWrapIndex) > wrapSize) { 240 sb.append('\n'); 241 lastWrapIndex = sb.length() - 1; 242 } 243 } 244 } 245 } 246 247 return sb.toString(); 248 } 249 250 /** 251 * Convert a html region into chunks of html code, each containing 252 * roughly chunkSize characters. 253 */ 254 public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) { 255 X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 256 257 ArrayList<String> chunks = new ArrayList<String>(); 258 259 // Do a best effort attempt to not split apart certain elements (as of now, 260 // just the <textarea>). We cannot guarantee that they will not be split 261 // because the client may specify endpoint nodes that land in the middle 262 // of an element (although this shouldn't happen if the endpoints returned 263 // by createBlocks() are properly used). 264 int stack = 0; 265 boolean balanced = true; 266 267 StringBuilder sb = new StringBuilder(chunkSize + 256); 268 for (int n = fromNode; n < toNode; n++) { 269 HtmlDocument.Node node = nodes.get(n); 270 node.toHTML(sb); 271 272 if (node instanceof HtmlDocument.Tag) { 273 if (HTML4.TEXTAREA_ELEMENT.equals( 274 ((HtmlDocument.Tag) node).getElement())) { 275 stack++; 276 } 277 } 278 if (node instanceof HtmlDocument.EndTag) { 279 if (HTML4.TEXTAREA_ELEMENT.equals( 280 ((HtmlDocument.EndTag) node).getElement())) { 281 if (stack == 0) { 282 balanced = false; 283 } else { 284 stack--; 285 } 286 } 287 } 288 289 if (stack == 0 && sb.length() >= chunkSize) { 290 chunks.add(sb.toString()); 291 sb.setLength(0); 292 } 293 } 294 295 // Don't forget the last chunk! 296 if (sb.length() > 0) { 297 chunks.add(sb.toString()); 298 } 299 300 // If the tree is not balanced (cut off in the middle of a node), log 301 // debug data. Clients should fix their code so that the endpoints from 302 // createBlocks() are properly used. 303 if (!balanced || stack != 0) { 304 StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n"); 305 debug.append(getHtml()); 306 debug.append("\nfromNode: ").append(fromNode); 307 debug.append("\ntoNode: ").append(toNode); 308 debug.append("\nNum nodes_: ").append(getNumNodes()); 309 for (String chunk : chunks) { 310 debug.append("\nChunk:\n").append(chunk); 311 } 312 logger.severe(debug.toString()); 313 } 314 315 return chunks; 316 } 317 318 /** 319 * Returns height (maximum length from root to a leaf) of the HTML tree. 320 * @return height of the HTML tree. 321 */ 322 public int getTreeHeight() { 323 int currentHeight = 0; 324 int maxHeight = 0; 325 326 for (int i = 0; i < nodes.size(); i++) { 327 HtmlDocument.Node node = nodes.get(i); 328 if (node instanceof HtmlDocument.Tag) { 329 currentHeight++; 330 if (currentHeight > maxHeight) { 331 maxHeight = currentHeight; 332 } 333 if (((HtmlDocument.Tag) node).getElement().isEmpty()) { 334 // Empty tags have no closing pair, so decrease counter here. 335 currentHeight--; 336 } 337 } else if (node instanceof HtmlDocument.EndTag) { 338 currentHeight--; 339 } 340 } 341 342 // TODO(anatol): make this value cachable? 343 return maxHeight; 344 } 345 346 //------------------------------------------------------------------------ 347 // Creating well-formed blocks within the html tree. 348 //------------------------------------------------------------------------ 349 /** 350 * A Block represents a region of a html tree that 351 * 1) is well-formed, i.e. for each node in the block, all its descendants 352 * are also contained in the block. So it's safe to wrap the region 353 * within a <table> or <div>, etc. 354 * 2) starts at the beginning of a "line", e.g. a <div>, a <br>. 355 */ 356 public static class Block { 357 /* The starting node */ 358 public int start_node; 359 360 /* The ending node (non-inclusive to the block) */ 361 public int end_node; 362 } 363 364 /** 365 * Creates a list of Blocks, given a text-range. 366 * We may create multiple blocks if one single well-formed Block cannot be 367 * created. 368 * 369 * @param textStart beginning plain-text offset 370 * @param textEnd beginning plain-text offset 371 * @param minNode the smallest node number 372 * @param maxNode the largest node number 373 * @return a list of 0 or more Block objects, never null 374 */ 375 public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) { 376 377 ArrayList<Block> blocks = new ArrayList<Block>(); 378 int startNode = Math.max(getBlockStart(textStart), minNode); 379 int endNode = Math.min(getBlockEnd(textEnd), maxNode); 380 381 if (DEBUG) { 382 debug("Creating block: " + 383 "text pos: " + textStart + "-" + textEnd + "\n" + 384 "node pos: " + startNode + "-" + endNode + "\n" + 385 plainText.substring(textStart, textEnd)); 386 } 387 388 // Split up the block [start, end) into one or more blocks that 389 // are well-formed, and begins at a "line" boundary. 390 int blockStart = -1; 391 for (int n = startNode; n < endNode;) { 392 393 // The node n spans [nBegin, nEnd] 394 int nBegin = begins.get(n); 395 int nEnd = ends.get(n); 396 397 if (blockStart == -1) { 398 // Check if this is a valid start node 399 if (nBegin >= n && nEnd <= endNode && 400 canBeginBlockAt(n)) { 401 blockStart = n; 402 n = nEnd + 1; 403 } else { 404 n++; 405 } 406 continue; 407 } 408 409 // If the node [nBegin, nEnd) lies completely within 410 // the region then proceed to the (nEnd + 1). 411 if (nBegin >= blockStart && nEnd < endNode) { 412 n = nEnd + 1; 413 continue; 414 } 415 416 // If we got here, we have to break up the region into one 417 // or more blocks because the current node cannot be included 418 // in the region. 419 if (DEBUG) { 420 debug("Forcing new block: " + n + " (" + nBegin + " " + nEnd + 421 ") exceeds (" + blockStart + " " + endNode + ")"); 422 } 423 Block b = new Block(); 424 b.start_node = blockStart; 425 b.end_node = n; 426 blocks.add(b); 427 428 blockStart = -1; 429 n++; 430 } 431 432 // Last block 433 if (blockStart != -1) { 434 Block b = new Block(); 435 b.start_node = blockStart; 436 b.end_node = endNode; 437 blocks.add(b); 438 } 439 440 if (DEBUG) { 441 for (int i = 0; i < blocks.size(); i++) { 442 Block b = blocks.get(i); 443 debug("Block " + i + "/" + blocks.size() + ": " + 444 b.start_node + "-" + b.end_node + " " + 445 getPlainText(b.start_node, b.end_node)); 446 } 447 } 448 449 return blocks; 450 } 451 452 /** 453 * Checks if a block can begin starting from a node position 454 */ 455 private boolean canBeginBlockAt(int nodePos) { 456 int textPos = textPositions[nodePos]; 457 458 // Make sure that we don't exceed the text position, this happens 459 // for the last tag nodes. 460 if (textPos == plainText.length()) { 461 textPos--; 462 } 463 464 // Scan backwards to check if a nodePos is at the beginning 465 // of a line. 466 for (int i = textPos; i > 0; i--) { 467 char ch = plainText.charAt(i); 468 if (ch == '\n') { 469 return true; 470 } 471 if (i < textPos && !HTML_WHITESPACE.matches(ch)) { 472 return false; 473 } 474 } 475 return true; 476 } 477 478 /** 479 * Returns the start of a block given a text-pos 480 */ 481 private int getBlockStart(int textPos) { 482 int nodenum = Arrays.binarySearch(textPositions, textPos); 483 if (nodenum >= 0) { 484 // Got an exact node alignment. Get the outer most pos that 485 // matches the text position 486 while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) { 487 nodenum--; 488 } 489 } else { 490 // textPos matches the middle of a node. 491 nodenum = -nodenum - 1; 492 } 493 494 X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 495 return nodenum; 496 } 497 498 /** 499 * Returns the end of a block given a text-pos 500 */ 501 private int getBlockEnd(int textPos) { 502 int nodenum = Arrays.binarySearch(textPositions, textPos); 503 if (nodenum >= 0) { 504 // Got an exact node alignment. 505 while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) { 506 nodenum++; 507 } 508 } else { 509 // textPos matches the middle of a node. 510 nodenum = -nodenum - 2; 511 } 512 X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 513 return nodenum; 514 } 515 516 //------------------------------------------------------------------------ 517 // Plain text view of the html tree 518 //------------------------------------------------------------------------ 519 /** 520 * @return the plain-text position corresponding to the node 521 */ 522 public int getTextPosition(int node) { 523 return textPositions[node]; 524 } 525 526 /** 527 * @return a plain-text String of the html tree 528 */ 529 public String getPlainText() { 530 if (plainText == null) { 531 convertToPlainText(); 532 } 533 return plainText; 534 } 535 536 /** 537 * @return a plain-text String of a part of the html tree 538 */ 539 public String getPlainText(int fromNode, int toNode) { 540 if (plainText == null) { 541 convertToPlainText(); 542 } 543 int textstart = textPositions[fromNode]; 544 int textend = textPositions[toNode]; 545 return plainText.substring(textstart, textend); 546 } 547 548 /** 549 * Converts the html tree to plain text. 550 * We simply iterate through the nodes in the tree. 551 * As we output the plain-text, we keep track of the text position 552 * of each node. 553 * For String nodes, we replace '\n' with ' ' unless we're in a 554 * <pre> block. 555 */ 556 private void convertToPlainText() { 557 X.assertTrue(plainText == null && textPositions == null); 558 559 int numNodes = nodes.size(); 560 561 // Keeps track of start text position of each node, including a last 562 // entry for the size of the text. 563 textPositions = new int[numNodes + 1]; 564 565 PlainTextConverter converter = converterFactory.createInstance(); 566 567 for (int i = 0; i < numNodes; i++) { 568 textPositions[i] = converter.getPlainTextLength(); 569 converter.addNode(nodes.get(i), i, ends.get(i)); 570 } 571 572 // Add a last entry, so that textPositions_[nodes_.size()] is valid. 573 textPositions[numNodes] = converter.getPlainTextLength(); 574 575 plainText = converter.getPlainText(); 576 577 if (DEBUG) { 578 debug("Plain text: " + plainText); 579 580 for (int i = 0; i < nodes.size(); i++) { 581 int textPos = textPositions[i]; 582 String text = plainText.substring(textPos, textPositions[i + 1]); 583 debug("At " + i + ": pos=" + textPos + " " + text); 584 } 585 } 586 } 587 588 /** 589 * Encapsulates the logic for outputting plain text with respect to text 590 * segments, white space separators, line breaks, and quote marks. 591 */ 592 @VisibleForTesting 593 static final class PlainTextPrinter { 594 /** 595 * Separators are whitespace inserted between segments of text. The 596 * semantics are such that between any two segments of text, there is 597 * at most one separator. As such, separators are ordered in increasing 598 * priority, and setting a separator multiple times between text will 599 * result in the single separator with the highest priority being used. 600 * For example, a LineBreak (one newline) will override a Space, but will 601 * be overriden by a BlankLine (two newlines). 602 */ 603 static enum Separator { 604 // The values here must be ordered by increasing priority, as the 605 // enum's ordinal() method is used when determining if a new separator 606 // should override an existing one. 607 None, 608 Space, // single space 609 LineBreak, // single new line 610 BlankLine // two new lines 611 } 612 613 // White space characters that are collapsed as a single space. 614 // Note that characters such as the non-breaking whitespace 615 // and full-width spaces are not equivalent to the normal spaces. 616 private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f"; 617 618 /** 619 * Determines if the given character is considered an HTML space character. 620 * Consecutive HTML space characters are collapsed into a single space when 621 * not within a PRE element. 622 */ 623 private static boolean isHtmlWhiteSpace(char ch) { 624 return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0; 625 } 626 627 // The buffer in which we accumulate the converted plain text 628 private final StringBuilder sb = new StringBuilder(); 629 630 // How many <blockquote> blocks we are in. 631 private int quoteDepth = 0; 632 633 // How many logical newlines are at the end of the buffer we've outputted. 634 // Note that we can't simply count the newlines at the end of the output 635 // buffer because a logical new line may be followed by quote marks. 636 // 637 // We initialize the value to 2 so that we consume any initial separators, 638 // since we don't need separators at the beginning of the output. This also 639 // results in correctly outputting any quote marks at the beginning of the 640 // output if the first piece of text is within a BLOCKQUOTE element. 641 private int endingNewLines = 2; 642 643 // The next separator to be inserted between two text nodes. 644 private Separator separator = Separator.None; 645 646 /** Returns the current length of the text. */ 647 final int getTextLength() { 648 return sb.length(); 649 } 650 651 /** Returns the current text. */ 652 final String getText() { 653 return sb.toString(); 654 } 655 656 /** 657 * Sets the next separator between two text nodes. A Space separator is 658 * used if there is any whitespace between the two text nodes when there is 659 * no intervening element that breaks flow. This is automatically handled 660 * by the {@link #appendNormalText} function so the client never needs to 661 * specify this separator. 662 * <p> 663 * A LineBreak separator (single new line) is used if text segments are 664 * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR, 665 * etc.). The client should set this separator for opening and closing tags 666 * of any element that breaks flow. 667 * <p> 668 * A BlankLine separator (two new lines) should be set for opening and 669 * closing P tags. 670 * <p> 671 * If this method is called multiple times between text nodes, a 672 * separator with a higher priority will override that of a lower priority. 673 */ 674 final void setSeparator(Separator newSeparator) { 675 if (newSeparator.ordinal() > separator.ordinal()) { 676 separator = newSeparator; 677 } 678 } 679 680 /** Increments the current quote depth of the text. */ 681 final void incQuoteDepth() { 682 quoteDepth++; 683 } 684 685 /** Decrements the current quote depth of the text. */ 686 final void decQuoteDepth() { 687 quoteDepth = Math.max(0, quoteDepth - 1); 688 } 689 690 /** 691 * Normalizes the HTML whitespace in the given {@code text} and appends it 692 * as the next segment of text. This will flush any separator that should 693 * be appended before the text, as well as any quote marks that should 694 * follow the last newline if the quote depth is non-zero. 695 */ 696 final void appendNormalText(String text) { 697 if (text.length() == 0) { 698 return; 699 } 700 boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0)); 701 boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1)); 702 703 // Strip beginning and ending whitespace. 704 text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text); 705 706 // Collapse whitespace within the text. 707 text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' '); 708 709 if (startsWithSpace) { 710 setSeparator(Separator.Space); 711 } 712 713 appendTextDirect(text); 714 715 if (endsWithSpace) { 716 setSeparator(Separator.Space); 717 } 718 } 719 720 /** 721 * Appends the given text, preserving all whitespace. This is used for 722 * appending text in a PRE element. 723 */ 724 final void appendPreText(String text) { 725 // We're in a <pre> block. Split the text into lines, and append 726 // each line with appendTextDirect() to preserve white space. 727 String[] lines = text.split("[\\r\\n]", -1); 728 729 // split() will always return an array with at least one element. 730 appendTextDirect(lines[0]); 731 732 // For all of the remaining lines, we append a newline first, which 733 // takes care of any quote marks that we need to output if the quote 734 // depth is non-zero. 735 for (int i = 1; i < lines.length; i++) { 736 appendNewLine(); 737 appendTextDirect(lines[i]); 738 } 739 } 740 741 /** 742 * Appends the {@code text} directly to the output, taking into account 743 * any separator that should be appended before it, and any quote marks 744 * that should follow the last newline if the quote depth is non-zero. 745 * <p> 746 * {@code text} must not contain any new lines--in order to handle 747 * quoting correctly, it is up to the caller to either normalize away the 748 * newlines, or split the text up into separate lines and handle new lines 749 * with the {@link #appendNewLine} method. 750 * <p> 751 * The original {@code text} is not modified in any way. Use this method 752 * when you need to preserve the original white space. 753 * <p> 754 * If the given {@code text} is non empty, this method will result in 755 * {@code endingNewLines} being reset to 0. 756 */ 757 private void appendTextDirect(String text) { 758 if (text.length() == 0) { 759 return; 760 } 761 Preconditions.checkArgument(text.indexOf('\n') < 0, 762 "text must not contain newlines."); 763 flushSeparator(); 764 maybeAddQuoteMarks(true); 765 sb.append(text); 766 endingNewLines = 0; 767 } 768 769 /** 770 * Appends a forced line break, which is the equivalent of a BR element. 771 */ 772 final void appendForcedLineBreak() { 773 flushSeparator(); 774 appendNewLine(); 775 } 776 777 /** 778 * Appends any pending separator to the output buffer. This should be 779 * called before appending text to the buffer. 780 */ 781 private void flushSeparator() { 782 switch (separator) { 783 case Space: 784 if (endingNewLines == 0) { 785 // Only append a space separator if we are not following a new 786 // line character. For example, we don't append a separator 787 // space after a <br> tag, since the <br>'s newline fulfills the 788 // space separation requirement. 789 sb.append(" "); 790 } 791 break; 792 case LineBreak: 793 while (endingNewLines < 1) { 794 appendNewLine(); 795 } 796 break; 797 case BlankLine: 798 while (endingNewLines < 2) { 799 appendNewLine(); 800 } 801 break; 802 } 803 separator = Separator.None; 804 } 805 806 /** 807 * Adds a newline to the output. This handles any quote marks that should 808 * follow any previous new lines, and increments {@code endingNewLines}. 809 */ 810 private void appendNewLine() { 811 maybeAddQuoteMarks(false); 812 sb.append('\n'); 813 endingNewLines++; 814 } 815 816 /** 817 * Adds quote marks to the output if we are at the beginning of a line. 818 * One '>' character is used for every level of quoting we are in. 819 * 820 * @param includeEndingSpace Includes a single space after the quote marks. 821 */ 822 private void maybeAddQuoteMarks(boolean includeEndingSpace) { 823 // We only need to add quote marks if we are at the beginning of line. 824 if (endingNewLines > 0 && quoteDepth > 0) { 825 for (int i = 0; i < quoteDepth; i++) { 826 sb.append('>'); 827 } 828 if (includeEndingSpace) { 829 sb.append(' '); 830 } 831 } 832 } 833 } 834 835 /** 836 * Contains the logic for converting the contents of one HtmlTree into 837 * plaintext. 838 */ 839 public static class DefaultPlainTextConverter implements PlainTextConverter { 840 841 private static final Set<HTML.Element> BLANK_LINE_ELEMENTS = 842 ImmutableSet.of( 843 HTML4.P_ELEMENT, 844 HTML4.BLOCKQUOTE_ELEMENT, 845 HTML4.PRE_ELEMENT); 846 847 private final PlainTextPrinter printer = new PlainTextPrinter(); 848 849 private int preDepth = 0; 850 private int styleDepth = 0; 851 852 @Override 853 public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) { 854 if (n instanceof HtmlDocument.Text) { // A string node 855 856 HtmlDocument.Text textNode = (HtmlDocument.Text) n; 857 String str = textNode.getText(); 858 859 if (preDepth > 0) { 860 printer.appendPreText(str); 861 862 } else if (styleDepth > 0) { 863 // Append nothing 864 } else { 865 printer.appendNormalText(str); 866 } 867 868 } else if (n instanceof HtmlDocument.Tag) { 869 870 // Check for linebreaking tags. 871 HtmlDocument.Tag tag = (HtmlDocument.Tag) n; 872 HTML.Element element = tag.getElement(); 873 874 if (BLANK_LINE_ELEMENTS.contains(element)) { 875 printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 876 877 } else if (HTML4.BR_ELEMENT.equals(element)) { 878 // The <BR> element is special in that it always adds a newline. 879 printer.appendForcedLineBreak(); 880 881 } else if (element.breaksFlow()) { 882 // All other elements that break the flow add a LineBreak separator. 883 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 884 885 if (HTML4.HR_ELEMENT.equals(element)) { 886 printer.appendNormalText("________________________________"); 887 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 888 } 889 } 890 891 if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { 892 printer.incQuoteDepth(); 893 894 } else if (HTML4.PRE_ELEMENT.equals(element)) { 895 preDepth++; 896 } else if (HTML4.STYLE_ELEMENT.equals(element)) { 897 styleDepth++; 898 } 899 900 } else if (n instanceof HtmlDocument.EndTag) { 901 902 // Check for linebreaking tags. 903 HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n; 904 HTML.Element element = endTag.getElement(); 905 906 if (BLANK_LINE_ELEMENTS.contains(element)) { 907 printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 908 909 } else if (element.breaksFlow()) { 910 // All other elements that break the flow add a LineBreak separator. 911 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 912 } 913 914 if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { 915 printer.decQuoteDepth(); 916 917 } else if (HTML4.PRE_ELEMENT.equals(element)) { 918 preDepth--; 919 } else if (HTML4.STYLE_ELEMENT.equals(element)) { 920 styleDepth--; 921 } 922 } 923 } 924 925 @Override 926 public final int getPlainTextLength() { 927 return printer.getTextLength(); 928 } 929 930 @Override 931 public final String getPlainText() { 932 return printer.getText(); 933 } 934 } 935 936 //------------------------------------------------------------------------ 937 // The following methods are used to build the html tree. 938 //------------------------------------------------------------------------ 939 /** For building the html tree */ 940 private Stack<Integer> stack; 941 private int parent; 942 943 /** Starts the build process */ 944 void start() { 945 stack = new Stack<Integer>(); 946 parent = -1; 947 } 948 949 /** Finishes the build process */ 950 void finish() { 951 X.assertTrue(stack.size() == 0); 952 X.assertTrue(parent == -1); 953 } 954 955 /** 956 * Adds a html start tag, there must followed later by a call to addEndTag() 957 * to add the matching end tag 958 */ 959 void addStartTag(HtmlDocument.Tag t) { 960 int nodenum = nodes.size(); 961 addNode(t, nodenum, -1); 962 963 stack.add(parent); 964 parent = nodenum; 965 } 966 967 /** 968 * Adds a html end tag, this must be preceded by a previous matching open tag 969 */ 970 void addEndTag(HtmlDocument.EndTag t) { 971 int nodenum = nodes.size(); 972 addNode(t, parent, nodenum); 973 974 if (parent != -1) { 975 ends.set(parent, nodenum); 976 } 977 978 parent = stack.pop(); 979 } 980 981 /** Adds a singular tag that does not have a corresponding end tag */ 982 void addSingularTag(HtmlDocument.Tag t) { 983 int nodenum = nodes.size(); 984 addNode(t, nodenum, nodenum); 985 } 986 987 /** 988 * Adds a text 989 * @param t a plain-text string 990 */ 991 void addText(HtmlDocument.Text t) { 992 int nodenum = nodes.size(); 993 addNode(t, nodenum, nodenum); 994 } 995 996 /** Adds a node */ 997 private void addNode(HtmlDocument.Node n, int begin, int end) { 998 nodes.add(n); 999 begins.add(begin); 1000 ends.add(end); 1001 } 1002 1003 /** For debugging */ 1004 private static final void debug(String str) { 1005 logger.finest(str); 1006 } 1007 1008 }