1 /** 2 * Copyright (c) 2004, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.mail.lib.html.parser; 17 18 import com.android.mail.lib.base.CharMatcher; 19 import com.android.mail.lib.base.Preconditions; 20 import com.android.mail.lib.base.X; 21 import com.google.common.collect.ImmutableSet; 22 23 import java.util.ArrayList; 24 import java.util.Arrays; 25 import java.util.Collections; 26 import java.util.List; 27 import java.util.Set; 28 import java.util.Stack; 29 import java.util.logging.Logger; 30 31 /** 32 * HtmlTree represents a parsed and well-formed html text, it provides 33 * methods to convert to plain text. It also provides methods to find 34 * well-formed blocks of text, for quote detection. 35 * 36 * We don't really build a html tree data structure. Instead, for 37 * efficiency, and for the ability to do a simple in-order-traversal 38 * of the tree, we simply keeps a linear list of nodes (in-order). 39 * The begin_ and end_ arrays keeps track of the starting end ending 40 * nodes: 41 * 42 * For a string node, begin_[node] = end_[node] = node 43 * For an open tag, begin_[node] = node, end_[node] = the matching end tag 44 * For a close tag, end_[node] = the matching open tag, end_[node] = node 45 * 46 * @author jlim (at) google.com (Jing Yee Lim) 47 */ 48 public class HtmlTree { 49 50 /** 51 * An interface that allows clients to provide their own implementation 52 * of a {@link PlainTextConverter}. 53 */ 54 public static interface PlainTextConverterFactory { 55 /** 56 * Creates a new instance of a {@link PlainTextConverter} to convert 57 * the contents of an {@link HtmlTree} to plain text. 58 */ 59 PlainTextConverter createInstance(); 60 } 61 62 /** 63 * An interface for an object which converts a single HtmlTree into 64 * plaintext. 65 */ 66 public static interface PlainTextConverter { 67 /** 68 * Adds the given node {@code n} to plain text. 69 * 70 * @param n The node to convert to text. 71 * @param nodeNum The number of the node among the list of all notes. 72 * @param endNum The number of the ending node if this is a start node, 73 * otherwise the same as {@code nodeNum}. 74 */ 75 void addNode(HtmlDocument.Node n, int nodeNum, int endNum); 76 77 /** 78 * Returns the current length of the plain text. 79 */ 80 int getPlainTextLength(); 81 82 /** 83 * Returns the current plain text. 84 */ 85 String getPlainText(); 86 } 87 88 /** A factory that produces converters of the default implementation. */ 89 private static final PlainTextConverterFactory DEFAULT_CONVERTER_FACTORY = 90 new PlainTextConverterFactory() { 91 public PlainTextConverter createInstance() { 92 return new DefaultPlainTextConverter(); 93 } 94 }; 95 96 /** Contains html nodes */ 97 private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>(); 98 99 /** Keeps track of beginning and end of each node */ 100 private final Stack<Integer> begins = new Stack<Integer>(); 101 private final Stack<Integer> ends = new Stack<Integer>(); 102 103 /** Plain text (lazy creation) */ 104 private String plainText; 105 106 /** The html string (lazy creation) */ 107 private String html; 108 109 /** textPositions[node pos] = the text position */ 110 private int[] textPositions; 111 112 private PlainTextConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY; 113 114 // For debugging only 115 private static final boolean DEBUG = false; 116 117 private static final Logger logger = Logger.getLogger(HtmlTree.class.getName()); 118 119 //------------------------------------------------------------------------ 120 121 /** HtmlTree can only be constructed from this package */ 122 HtmlTree() { 123 } 124 125 /** 126 * Sets a new {@link PlainTextConverterFactory} to be used to convert 127 * the contents of this tree to plaintext. 128 */ 129 public void setPlainTextConverterFactory(PlainTextConverterFactory factory) { 130 if (factory == null) { 131 throw new NullPointerException("factory must not be null"); 132 } 133 converterFactory = factory; 134 } 135 136 /** 137 * Gets the list of node objects. A node can be either a 138 * Tag, EngTag or a String object. 139 * @return the nodes of the tree 140 */ 141 public List<HtmlDocument.Node> getNodesList() { 142 return Collections.unmodifiableList(nodes); 143 } 144 145 /** 146 * @return number of nodes 147 */ 148 public int getNumNodes() { 149 return nodes.size(); 150 } 151 152 /** 153 * Gets the entire html. 154 */ 155 public String getHtml() { 156 return getHtml(-1); 157 } 158 159 /** 160 * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the 161 * specified size. 162 */ 163 public String getHtml(int wrapSize) { 164 if (html == null) { 165 html = getHtml(0, nodes.size(), wrapSize); 166 } 167 return html; 168 } 169 170 /** Gets parts of the html */ 171 public String getHtml(int fromNode, int toNode) { 172 return getHtml(fromNode, toNode, -1); 173 } 174 175 /** 176 * Gets parts of the html, if wrapSize is > 0, it tries 177 * to do wrapping at the specified size. 178 */ 179 public String getHtml(int fromNode, int toNode, int wrapSize) { 180 X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 181 182 int estSize = (toNode - fromNode) * 10; 183 StringBuilder sb = new StringBuilder(estSize); 184 int lastWrapIndex = 0; // used for wrapping 185 for (int n = fromNode; n < toNode; n++) { 186 HtmlDocument.Node node = nodes.get(n); 187 node.toHTML(sb); 188 // TODO: maybe we can be smarter about this and not add newlines 189 // within <pre> tags, unless the whole long line is encompassed 190 // by the <pre> tag. 191 if (wrapSize > 0) { 192 // We can only wrap if the last outputted node is an element that 193 // breaks the flow. Otherwise, we risk the possibility of inserting 194 // spaces where they shouldn't be. 195 if ((node instanceof HtmlDocument.Tag && 196 ((HtmlDocument.Tag) node).getElement().breaksFlow()) || 197 (node instanceof HtmlDocument.EndTag && 198 ((HtmlDocument.EndTag) node).getElement().breaksFlow())) { 199 // Check to see if there is a newline in the most recent node's html. 200 int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n'); 201 if (recentNewLine != -1) { 202 lastWrapIndex += recentNewLine; 203 } 204 // If the last index - last index of a newline is greater than 205 // wrapSize, add a newline. 206 if (((sb.length() - 1) - lastWrapIndex) > wrapSize) { 207 sb.append('\n'); 208 lastWrapIndex = sb.length() - 1; 209 } 210 } 211 } 212 } 213 214 return sb.toString(); 215 } 216 217 /** 218 * Convert a html region into chunks of html code, each containing 219 * roughly chunkSize characters. 220 */ 221 public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) { 222 X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 223 224 ArrayList<String> chunks = new ArrayList<String>(); 225 226 // Do a best effort attempt to not split apart certain elements (as of now, 227 // just the <textarea>). We cannot guarantee that they will not be split 228 // because the client may specify endpoint nodes that land in the middle 229 // of an element (although this shouldn't happen if the endpoints returned 230 // by createBlocks() are properly used). 231 int stack = 0; 232 boolean balanced = true; 233 234 StringBuilder sb = new StringBuilder(chunkSize + 256); 235 for (int n = fromNode; n < toNode; n++) { 236 HtmlDocument.Node node = nodes.get(n); 237 node.toHTML(sb); 238 239 if (node instanceof HtmlDocument.Tag) { 240 if (HTML4.TEXTAREA_ELEMENT.equals( 241 ((HtmlDocument.Tag)node).getElement())) { 242 stack++; 243 } 244 } 245 if (node instanceof HtmlDocument.EndTag) { 246 if (HTML4.TEXTAREA_ELEMENT.equals( 247 ((HtmlDocument.EndTag)node).getElement())) { 248 if (stack == 0) { 249 balanced = false; 250 } else { 251 stack--; 252 } 253 } 254 } 255 256 if (stack == 0 && sb.length() >= chunkSize) { 257 chunks.add(sb.toString()); 258 sb.setLength(0); 259 } 260 } 261 262 // Don't forget the last chunk! 263 if (sb.length() > 0) { 264 chunks.add(sb.toString()); 265 } 266 267 // If the tree is not balanced (cut off in the middle of a node), log 268 // debug data. Clients should fix their code so that the endpoints from 269 // createBlocks() are properly used. 270 if (!balanced || stack != 0) { 271 StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n"); 272 debug.append(getHtml()); 273 debug.append("\nfromNode: ").append(fromNode); 274 debug.append("\ntoNode: ").append(toNode); 275 debug.append("\nNum nodes_: ").append(getNumNodes()); 276 for (String chunk : chunks) { 277 debug.append("\nChunk:\n").append(chunk); 278 } 279 logger.severe(debug.toString()); 280 } 281 282 return chunks; 283 } 284 285 /** 286 * Returns height (maximum length from root to a leaf) of the HTML tree. 287 * @return height of the HTML tree. 288 */ 289 public int getTreeHeight() { 290 int currentHeight = 0; 291 int maxHeight = 0; 292 293 for (int i = 0; i < nodes.size(); i++) { 294 HtmlDocument.Node node = nodes.get(i); 295 if (node instanceof HtmlDocument.Tag) { 296 currentHeight++; 297 if (currentHeight > maxHeight) { 298 maxHeight = currentHeight; 299 } 300 if (((HtmlDocument.Tag) node).getElement().isEmpty()) { 301 // Empty tags have no closing pair, so decrease counter here. 302 currentHeight--; 303 } 304 } else if (node instanceof HtmlDocument.EndTag) { 305 currentHeight--; 306 } 307 } 308 309 // TODO(anatol): make this value cachable? 310 return maxHeight; 311 } 312 313 //------------------------------------------------------------------------ 314 // Creating well-formed blocks within the html tree. 315 //------------------------------------------------------------------------ 316 /** 317 * A Block represents a region of a html tree that 318 * 1) is well-formed, i.e. for each node in the block, all its descendants 319 * are also contained in the block. So it's safe to wrap the region 320 * within a <table> or <div>, etc. 321 * 2) starts at the beginning of a "line", e.g. a <div>, a <br>. 322 */ 323 public static class Block { 324 /* The starting node */ 325 public int start_node; 326 327 /* The ending node (non-inclusive to the block) */ 328 public int end_node; 329 } 330 331 /** 332 * Creates a list of Blocks, given a text-range. 333 * We may create multiple blocks if one single well-formed Block cannot be 334 * created. 335 * 336 * @param textStart beginning plain-text offset 337 * @param textEnd beginning plain-text offset 338 * @param minNode the smallest node number 339 * @param maxNode the largest node number 340 * @return a list of 0 or more Block objects, never null 341 */ 342 public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) { 343 344 ArrayList<Block> blocks = new ArrayList<Block>(); 345 int startNode = Math.max(getBlockStart(textStart), minNode); 346 int endNode = Math.min(getBlockEnd(textEnd), maxNode); 347 348 if (DEBUG) { 349 debug("Creating block: " + 350 "text pos: " + textStart + "-" + textEnd + "\n" + 351 "node pos: " + startNode + "-" + endNode + "\n" + 352 plainText.substring(textStart, textEnd)); 353 } 354 355 // Split up the block [start, end) into one or more blocks that 356 // are well-formed, and begins at a "line" boundary. 357 int blockStart = -1; 358 for (int n = startNode; n < endNode;) { 359 360 // The node n spans [nBegin, nEnd] 361 int nBegin = begins.get(n); 362 int nEnd = ends.get(n); 363 364 if (blockStart == -1) { 365 // Check if this is a valid start node 366 if (nBegin >= n && nEnd <= endNode && 367 canBeginBlockAt(n)) { 368 blockStart = n; 369 n = nEnd + 1; 370 } else { 371 n++; 372 } 373 continue; 374 } 375 376 // If the node [nBegin, nEnd) lies completely within 377 // the region then proceed to the (nEnd + 1). 378 if (nBegin >= blockStart && nEnd < endNode) { 379 n = nEnd + 1; 380 continue; 381 } 382 383 // If we got here, we have to break up the region into one 384 // or more blocks because the current node cannot be included 385 // in the region. 386 if (DEBUG) { 387 debug("Forcing new block: " + n + " (" + nBegin + " " + nEnd + 388 ") exceeds (" + blockStart + " " + endNode + ")"); 389 } 390 Block b = new Block(); 391 b.start_node = blockStart; 392 b.end_node = n; 393 blocks.add(b); 394 395 blockStart = -1; 396 n++; 397 } 398 399 // Last block 400 if (blockStart != -1) { 401 Block b = new Block(); 402 b.start_node = blockStart; 403 b.end_node = endNode; 404 blocks.add(b); 405 } 406 407 if (DEBUG) { 408 for (int i = 0; i < blocks.size(); i++) { 409 Block b = blocks.get(i); 410 debug("Block " + i + "/" + blocks.size() + ": " + 411 b.start_node + "-" + b.end_node + " " + 412 getPlainText(b.start_node, b.end_node)); 413 } 414 } 415 416 return blocks; 417 } 418 419 /** 420 * Checks if a block can begin starting from a node position 421 */ 422 private boolean canBeginBlockAt(int nodePos) { 423 int textPos = textPositions[nodePos]; 424 425 // Make sure that we don't exceed the text position, this happens 426 // for the last tag nodes. 427 if (textPos == plainText.length()) { 428 textPos--; 429 } 430 431 // Scan backwards to check if a nodePos is at the beginning 432 // of a line. 433 for (int i = textPos; i > 0; i--) { 434 char ch = plainText.charAt(i); 435 if (ch == '\n') { 436 return true; 437 } 438 if (i < textPos && !Character.isWhitespace(ch)) { 439 return false; 440 } 441 } 442 return true; 443 } 444 445 /** 446 * Returns the start of a block given a text-pos 447 */ 448 private int getBlockStart(int textPos) { 449 int nodenum = Arrays.binarySearch(textPositions, textPos); 450 if (nodenum >= 0) { 451 // Got an exact node alignment. Get the outer most pos that 452 // matches the text position 453 while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) { 454 nodenum--; 455 } 456 } else { 457 // textPos matches the middle of a node. 458 nodenum = -nodenum - 1; 459 } 460 461 X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 462 return nodenum; 463 } 464 465 /** 466 * Returns the end of a block given a text-pos 467 */ 468 private int getBlockEnd(int textPos) { 469 int nodenum = Arrays.binarySearch(textPositions, textPos); 470 if (nodenum >= 0) { 471 // Got an exact node alignment. 472 while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) { 473 nodenum++; 474 } 475 } else { 476 // textPos matches the middle of a node. 477 nodenum = -nodenum - 2; 478 } 479 X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 480 return nodenum; 481 } 482 483 //------------------------------------------------------------------------ 484 // Plain text view of the html tree 485 //------------------------------------------------------------------------ 486 /** 487 * @return the plain-text position corresponding to the node 488 */ 489 public int getTextPosition(int node) { 490 return textPositions[node]; 491 } 492 493 /** 494 * @return a plain-text String of the html tree 495 */ 496 public String getPlainText() { 497 if (plainText == null) { 498 convertToPlainText(); 499 } 500 return plainText; 501 } 502 503 /** 504 * @return a plain-text String of a part of the html tree 505 */ 506 public String getPlainText(int fromNode, int toNode) { 507 if (plainText == null) { 508 convertToPlainText(); 509 } 510 int textstart = textPositions[fromNode]; 511 int textend = textPositions[toNode]; 512 return plainText.substring(textstart, textend); 513 } 514 515 /** 516 * Converts the html tree to plain text. 517 * We simply iterate through the nodes in the tree. 518 * As we output the plain-text, we keep track of the text position 519 * of each node. 520 * For String nodes, we replace '\n' with ' ' unless we're in a 521 * <pre> block. 522 */ 523 private void convertToPlainText() { 524 X.assertTrue(plainText == null && textPositions == null); 525 526 int numNodes = nodes.size(); 527 528 // Keeps track of start text position of each node, including a last 529 // entry for the size of the text. 530 textPositions = new int[numNodes + 1]; 531 532 PlainTextConverter converter = converterFactory.createInstance(); 533 534 for (int i = 0; i < numNodes; i++) { 535 textPositions[i] = converter.getPlainTextLength(); 536 converter.addNode(nodes.get(i), i, ends.get(i)); 537 } 538 539 // Add a last entry, so that textPositions_[nodes_.size()] is valid. 540 textPositions[numNodes] = converter.getPlainTextLength(); 541 542 plainText = converter.getPlainText(); 543 544 if (DEBUG) { 545 debug("Plain text: " + plainText); 546 547 for (int i = 0; i < nodes.size(); i++) { 548 int textPos = textPositions[i]; 549 String text = plainText.substring(textPos, textPositions[i + 1]); 550 debug("At " + i + ": pos=" + textPos + " " + text); 551 } 552 } 553 } 554 555 /** 556 * Encapsulates the logic for outputting plain text with respect to text 557 * segments, white space separators, line breaks, and quote marks. 558 */ 559 static final class PlainTextPrinter { 560 /** 561 * Separators are whitespace inserted between segments of text. The 562 * semantics are such that between any two segments of text, there is 563 * at most one separator. As such, separators are ordered in increasing 564 * priority, and setting a separator multiple times between text will 565 * result in the single separator with the highest priority being used. 566 * For example, a LineBreak (one newline) will override a Space, but will 567 * be overriden by a BlankLine (two newlines). 568 */ 569 static enum Separator { 570 // The values here must be ordered by increasing priority, as the 571 // enum's ordinal() method is used when determining if a new separator 572 // should override an existing one. 573 None, 574 Space, // single space 575 LineBreak, // single new line 576 BlankLine // two new lines 577 } 578 579 // White space characters that are collapsed as a single space. 580 // Note that characters such as the non-breaking whitespace 581 // and full-width spaces are not equivalent to the normal spaces. 582 private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f"; 583 584 /** 585 * Determines if the given character is considered an HTML space character. 586 * Consecutive HTML space characters are collapsed into a single space when 587 * not within a PRE element. 588 */ 589 private static boolean isHtmlWhiteSpace(char ch) { 590 return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0; 591 } 592 593 // The buffer in which we accumulate the converted plain text 594 private final StringBuilder sb = new StringBuilder(); 595 596 // How many <blockquote> blocks we are in. 597 private int quoteDepth = 0; 598 599 // How many logical newlines are at the end of the buffer we've outputted. 600 // Note that we can't simply count the newlines at the end of the output 601 // buffer because a logical new line may be followed by quote marks. 602 // 603 // We initialize the value to 2 so that we consume any initial separators, 604 // since we don't need separators at the beginning of the output. This also 605 // results in correctly outputting any quote marks at the beginning of the 606 // output if the first piece of text is within a BLOCKQUOTE element. 607 private int endingNewLines = 2; 608 609 // The next separator to be inserted between two text nodes. 610 private Separator separator = Separator.None; 611 612 /** Returns the current length of the text. */ 613 final int getTextLength() { 614 return sb.length(); 615 } 616 617 /** Returns the current text. */ 618 final String getText() { 619 return sb.toString(); 620 } 621 622 /** 623 * Sets the next separator between two text nodes. A Space separator is 624 * used if there is any whitespace between the two text nodes when there is 625 * no intervening element that breaks flow. This is automatically handled 626 * by the {@link #appendNormalText} function so the client never needs to 627 * specify this separator. 628 * <p> 629 * A LineBreak separator (single new line) is used if text segments are 630 * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR, 631 * etc.). The client should set this separator for opening and closing tags 632 * of any element that breaks flow. 633 * <p> 634 * A BlankLine separator (two new lines) should be set for opening and 635 * closing P tags. 636 * <p> 637 * If this method is called multiple times between text nodes, a 638 * separator with a higher priority will override that of a lower priority. 639 */ 640 final void setSeparator(Separator newSeparator) { 641 if (newSeparator.ordinal() > separator.ordinal()) { 642 separator = newSeparator; 643 } 644 } 645 646 /** Increments the current quote depth of the text. */ 647 final void incQuoteDepth() { 648 quoteDepth++; 649 } 650 651 /** Decrements the current quote depth of the text. */ 652 final void decQuoteDepth() { 653 quoteDepth = Math.max(0, quoteDepth - 1); 654 } 655 656 /** 657 * Normalizes the HTML whitespace in the given {@code text} and appends it 658 * as the next segment of text. This will flush any separator that should 659 * be appended before the text, as well as any quote marks that should 660 * follow the last newline if the quote depth is non-zero. 661 */ 662 final void appendNormalText(String text) { 663 if (text.length() == 0) { 664 return; 665 } 666 boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0)); 667 boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1)); 668 669 // Strip beginning and ending whitespace. 670 text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text); 671 672 // Collapse whitespace within the text. 673 text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' '); 674 675 if (startsWithSpace) { 676 setSeparator(Separator.Space); 677 } 678 679 appendTextDirect(text); 680 681 if (endsWithSpace) { 682 setSeparator(Separator.Space); 683 } 684 } 685 686 /** 687 * Appends the given text, preserving all whitespace. This is used for 688 * appending text in a PRE element. 689 */ 690 final void appendPreText(String text) { 691 // We're in a <pre> block. Split the text into lines, and append 692 // each line with appendTextDirect() to preserve white space. 693 String[] lines = text.split("[\\r\\n]", -1); 694 695 // split() will always return an array with at least one element. 696 appendTextDirect(lines[0]); 697 698 // For all of the remaining lines, we append a newline first, which 699 // takes care of any quote marks that we need to output if the quote 700 // depth is non-zero. 701 for (int i = 1; i < lines.length; i++) { 702 appendNewLine(); 703 appendTextDirect(lines[i]); 704 } 705 } 706 707 /** 708 * Appends the {@code text} directly to the output, taking into account 709 * any separator that should be appended before it, and any quote marks 710 * that should follow the last newline if the quote depth is non-zero. 711 * <p> 712 * {@code text} must not contain any new lines--in order to handle 713 * quoting correctly, it is up to the caller to either normalize away the 714 * newlines, or split the text up into separate lines and handle new lines 715 * with the {@link #appendNewLine} method. 716 * <p> 717 * The original {@code text} is not modified in any way. Use this method 718 * when you need to preserve the original white space. 719 * <p> 720 * If the given {@code text} is non empty, this method will result in 721 * {@code endingNewLines} being reset to 0. 722 */ 723 private void appendTextDirect(String text) { 724 if (text.length() == 0) { 725 return; 726 } 727 Preconditions.checkArgument(text.indexOf('\n') < 0, 728 "text must not contain newlines."); 729 flushSeparator(); 730 maybeAddQuoteMarks(true); 731 sb.append(text); 732 endingNewLines = 0; 733 } 734 735 /** 736 * Appends a forced line break, which is the equivalent of a BR element. 737 */ 738 final void appendForcedLineBreak() { 739 flushSeparator(); 740 appendNewLine(); 741 } 742 743 /** 744 * Appends any pending separator to the output buffer. This should be 745 * called before appending text to the buffer. 746 */ 747 private void flushSeparator() { 748 switch (separator) { 749 case Space: 750 if (endingNewLines == 0) { 751 // Only append a space separator if we are not following a new 752 // line character. For example, we don't append a separator 753 // space after a <br> tag, since the <br>'s newline fulfills the 754 // space separation requirement. 755 sb.append(" "); 756 } 757 break; 758 case LineBreak: 759 while (endingNewLines < 1) { 760 appendNewLine(); 761 } 762 break; 763 case BlankLine: 764 while (endingNewLines < 2) { 765 appendNewLine(); 766 } 767 break; 768 } 769 separator = Separator.None; 770 } 771 772 /** 773 * Adds a newline to the output. This handles any quote marks that should 774 * follow any previous new lines, and increments {@code endingNewLines}. 775 */ 776 private void appendNewLine() { 777 maybeAddQuoteMarks(false); 778 sb.append('\n'); 779 endingNewLines++; 780 } 781 782 /** 783 * Adds quote marks to the output if we are at the beginning of a line. 784 * One '>' character is used for every level of quoting we are in. 785 * 786 * @param includeEndingSpace Includes a single space after the quote marks. 787 */ 788 private void maybeAddQuoteMarks(boolean includeEndingSpace) { 789 // We only need to add quote marks if we are at the beginning of line. 790 if (endingNewLines > 0 && quoteDepth > 0) { 791 for (int i = 0; i < quoteDepth; i++) { 792 sb.append('>'); 793 } 794 if (includeEndingSpace) { 795 sb.append(' '); 796 } 797 } 798 } 799 } 800 801 /** 802 * Contains the logic for converting the contents of one HtmlTree into 803 * plaintext. 804 */ 805 public static class DefaultPlainTextConverter implements PlainTextConverter { 806 807 private static final Set<HTML.Element> BLANK_LINE_ELEMENTS = 808 ImmutableSet.of( 809 HTML4.P_ELEMENT, 810 HTML4.BLOCKQUOTE_ELEMENT, 811 HTML4.PRE_ELEMENT); 812 813 private final PlainTextPrinter printer = new PlainTextPrinter(); 814 815 private int preDepth = 0; 816 817 public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) { 818 if (n instanceof HtmlDocument.Text) { // A string node 819 820 HtmlDocument.Text textNode = (HtmlDocument.Text) n; 821 String str = textNode.getText(); 822 823 if (preDepth > 0) { 824 printer.appendPreText(str); 825 826 } else { 827 printer.appendNormalText(str); 828 } 829 830 } else if (n instanceof HtmlDocument.Tag) { 831 832 // Check for linebreaking tags. 833 HtmlDocument.Tag tag = (HtmlDocument.Tag) n; 834 HTML.Element element = tag.getElement(); 835 836 if (BLANK_LINE_ELEMENTS.contains(element)) { 837 printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 838 839 } else if (HTML4.BR_ELEMENT.equals(element)) { 840 // The <BR> element is special in that it always adds a newline. 841 printer.appendForcedLineBreak(); 842 843 } else if (element.breaksFlow()) { 844 // All other elements that break the flow add a LineBreak separator. 845 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 846 847 if (HTML4.HR_ELEMENT.equals(element)) { 848 printer.appendNormalText("________________________________"); 849 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 850 } 851 } 852 853 if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { 854 printer.incQuoteDepth(); 855 856 } else if (HTML4.PRE_ELEMENT.equals(element)) { 857 preDepth++; 858 } 859 860 } else if (n instanceof HtmlDocument.EndTag) { 861 862 // Check for linebreaking tags. 863 HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n; 864 HTML.Element element = endTag.getElement(); 865 866 if (BLANK_LINE_ELEMENTS.contains(element)) { 867 printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 868 869 } else if (element.breaksFlow()) { 870 // All other elements that break the flow add a LineBreak separator. 871 printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 872 } 873 874 if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { 875 printer.decQuoteDepth(); 876 877 } else if (HTML4.PRE_ELEMENT.equals(element)) { 878 preDepth--; 879 } 880 } 881 } 882 883 public final int getPlainTextLength() { 884 return printer.getTextLength(); 885 } 886 887 public final String getPlainText() { 888 return printer.getText(); 889 } 890 } 891 892 //------------------------------------------------------------------------ 893 // The following methods are used to build the html tree. 894 //------------------------------------------------------------------------ 895 /** For building the html tree */ 896 private Stack<Integer> stack; 897 private int parent; 898 899 /** Starts the build process */ 900 void start() { 901 stack = new Stack<Integer>(); 902 parent = -1; 903 } 904 905 /** Finishes the build process */ 906 void finish() { 907 X.assertTrue(stack.size() == 0); 908 X.assertTrue(parent == -1); 909 } 910 911 /** 912 * to add the matching end tag 913 */ 914 void addStartTag(HtmlDocument.Tag t) { 915 int nodenum = nodes.size(); 916 addNode(t, nodenum, -1); 917 918 stack.add(parent); 919 parent = nodenum; 920 } 921 922 /** 923 * Adds a html end tag, this must be preceded by a previous matching open tag 924 */ 925 void addEndTag(HtmlDocument.EndTag t) { 926 int nodenum = nodes.size(); 927 addNode(t, parent, nodenum); 928 929 if (parent != -1) { 930 ends.set(parent, nodenum); 931 } 932 933 //is this the right pop? 934 parent = stack.pop(); 935 } 936 937 /** Adds a singular tag that does not have a corresponding end tag */ 938 void addSingularTag(HtmlDocument.Tag t) { 939 int nodenum = nodes.size(); 940 addNode(t, nodenum, nodenum); 941 } 942 943 /** 944 * Adds a text 945 * @param t a plain-text string 946 */ 947 void addText(HtmlDocument.Text t) { 948 int nodenum = nodes.size(); 949 addNode(t, nodenum, nodenum); 950 } 951 952 /** Adds a node */ 953 private void addNode(HtmlDocument.Node n, int begin, int end) { 954 955 nodes.add(n); 956 begins.add(begin); 957 ends.add(end); 958 } 959 960 /** For debugging */ 961 private static final void debug(String str) { 962 logger.finest(str); 963 } 964 965 }