Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.android.mail.lib.html.parser;
     17 
     18 import com.android.mail.lib.base.CharMatcher;
     19 import com.android.mail.lib.base.Preconditions;
     20 import com.android.mail.lib.base.X;
     21 import com.google.common.collect.ImmutableSet;
     22 
     23 import java.util.ArrayList;
     24 import java.util.Arrays;
     25 import java.util.Collections;
     26 import java.util.List;
     27 import java.util.Set;
     28 import java.util.Stack;
     29 import java.util.logging.Logger;
     30 
     31 /**
     32  * HtmlTree represents a parsed and well-formed html text, it provides
     33  * methods to convert to plain text. It also provides methods to find
     34  * well-formed blocks of text, for quote detection.
     35  *
     36  * We don't really build a html tree data structure. Instead, for
     37  * efficiency, and for the ability to do a simple in-order-traversal
     38  * of the tree, we simply keeps a linear list of nodes (in-order).
     39  * The begin_ and end_ arrays keeps track of the starting end ending
     40  * nodes:
     41  *
     42  * For a string node, begin_[node] = end_[node] = node
     43  * For an open tag, begin_[node] = node, end_[node] = the matching end tag
     44  * For a close tag, end_[node] = the matching open tag, end_[node] = node
     45  *
     46  * @author jlim (at) google.com (Jing Yee Lim)
     47  */
     48 public class HtmlTree {
     49 
     50   /**
     51    * An interface that allows clients to provide their own implementation
     52    * of a {@link PlainTextConverter}.
     53    */
     54   public static interface PlainTextConverterFactory {
     55     /**
     56      * Creates a new instance of a {@link PlainTextConverter} to convert
     57      * the contents of an {@link HtmlTree} to plain text.
     58      */
     59     PlainTextConverter createInstance();
     60   }
     61 
     62   /**
     63    * An interface for an object which converts a single HtmlTree into
     64    * plaintext.
     65    */
     66   public static interface PlainTextConverter {
     67     /**
     68      * Adds the given node {@code n} to plain text.
     69      *
     70      * @param n The node to convert to text.
     71      * @param nodeNum The number of the node among the list of all notes.
     72      * @param endNum The number of the ending node if this is a start node,
     73      *    otherwise the same as {@code nodeNum}.
     74      */
     75     void addNode(HtmlDocument.Node n, int nodeNum, int endNum);
     76 
     77     /**
     78      * Returns the current length of the plain text.
     79      */
     80     int getPlainTextLength();
     81 
     82     /**
     83      * Returns the current plain text.
     84      */
     85     String getPlainText();
     86   }
     87 
     88   /** A factory that produces converters of the default implementation. */
     89   private static final PlainTextConverterFactory DEFAULT_CONVERTER_FACTORY =
     90       new PlainTextConverterFactory() {
     91         public PlainTextConverter createInstance() {
     92           return new DefaultPlainTextConverter();
     93         }
     94       };
     95 
     96   /** Contains html nodes */
     97   private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>();
     98 
     99   /** Keeps track of beginning and end of each node */
    100   private final Stack<Integer> begins = new Stack<Integer>();
    101   private final Stack<Integer> ends = new Stack<Integer>();
    102 
    103   /** Plain text (lazy creation) */
    104   private String plainText;
    105 
    106   /** The html string (lazy creation) */
    107   private String html;
    108 
    109   /** textPositions[node pos] = the text position */
    110   private int[] textPositions;
    111 
    112   private PlainTextConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY;
    113 
    114   // For debugging only
    115   private static final boolean DEBUG = false;
    116 
    117   private static final Logger logger = Logger.getLogger(HtmlTree.class.getName());
    118 
    119   //------------------------------------------------------------------------
    120 
    121   /** HtmlTree can only be constructed from this package */
    122   HtmlTree() {
    123   }
    124 
    125   /**
    126    * Sets a new {@link PlainTextConverterFactory} to be used to convert
    127    * the contents of this tree to plaintext.
    128    */
    129   public void setPlainTextConverterFactory(PlainTextConverterFactory factory) {
    130     if (factory == null) {
    131       throw new NullPointerException("factory must not be null");
    132     }
    133     converterFactory = factory;
    134   }
    135 
    136   /**
    137    * Gets the list of node objects. A node can be either a
    138    * Tag, EngTag or a String object.
    139    * @return the nodes of the tree
    140    */
    141   public List<HtmlDocument.Node> getNodesList() {
    142     return Collections.unmodifiableList(nodes);
    143   }
    144 
    145   /**
    146    * @return number of nodes
    147    */
    148   public int getNumNodes() {
    149     return nodes.size();
    150   }
    151 
    152   /**
    153    * Gets the entire html.
    154    */
    155   public String getHtml() {
    156     return getHtml(-1);
    157   }
    158 
    159   /**
    160    * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
    161    * specified size.
    162    */
    163   public String getHtml(int wrapSize) {
    164     if (html == null) {
    165       html = getHtml(0, nodes.size(), wrapSize);
    166     }
    167     return html;
    168   }
    169 
    170   /** Gets parts of the html */
    171   public String getHtml(int fromNode, int toNode) {
    172     return getHtml(fromNode, toNode, -1);
    173   }
    174 
    175   /**
    176    * Gets parts of the html, if wrapSize is > 0, it tries
    177    * to do wrapping at the specified size.
    178    */
    179   public String getHtml(int fromNode, int toNode, int wrapSize) {
    180     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
    181 
    182     int estSize = (toNode - fromNode) * 10;
    183     StringBuilder sb = new StringBuilder(estSize);
    184     int lastWrapIndex = 0;      // used for wrapping
    185     for (int n = fromNode; n < toNode; n++) {
    186       HtmlDocument.Node node = nodes.get(n);
    187       node.toHTML(sb);
    188       // TODO: maybe we can be smarter about this and not add newlines
    189       // within <pre> tags, unless the whole long line is encompassed
    190       // by the <pre> tag.
    191       if (wrapSize > 0) {
    192         // We can only wrap if the last outputted node is an element that
    193         // breaks the flow. Otherwise, we risk the possibility of inserting
    194         // spaces where they shouldn't be.
    195         if ((node instanceof HtmlDocument.Tag &&
    196               ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
    197             (node instanceof HtmlDocument.EndTag &&
    198               ((HtmlDocument.EndTag) node).getElement().breaksFlow())) {
    199           // Check to see if there is a newline in the most recent node's html.
    200           int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n');
    201           if (recentNewLine != -1) {
    202             lastWrapIndex += recentNewLine;
    203           }
    204           // If the last index - last index of a newline is greater than
    205           // wrapSize, add a newline.
    206           if (((sb.length() - 1) - lastWrapIndex) > wrapSize) {
    207             sb.append('\n');
    208             lastWrapIndex = sb.length() - 1;
    209           }
    210         }
    211       }
    212     }
    213 
    214     return sb.toString();
    215   }
    216 
    217   /**
    218    * Convert a html region into chunks of html code, each containing
    219    * roughly chunkSize characters.
    220    */
    221   public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) {
    222     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
    223 
    224     ArrayList<String> chunks = new ArrayList<String>();
    225 
    226     // Do a best effort attempt to not split apart certain elements (as of now,
    227     // just the <textarea>). We cannot guarantee that they will not be split
    228     // because the client may specify endpoint nodes that land in the middle
    229     // of an element (although this shouldn't happen if the endpoints returned
    230     // by createBlocks() are properly used).
    231     int stack = 0;
    232     boolean balanced = true;
    233 
    234     StringBuilder sb = new StringBuilder(chunkSize + 256);
    235     for (int n = fromNode; n < toNode; n++) {
    236       HtmlDocument.Node node = nodes.get(n);
    237       node.toHTML(sb);
    238 
    239       if (node instanceof HtmlDocument.Tag) {
    240         if (HTML4.TEXTAREA_ELEMENT.equals(
    241             ((HtmlDocument.Tag)node).getElement())) {
    242           stack++;
    243         }
    244       }
    245       if (node instanceof HtmlDocument.EndTag) {
    246         if (HTML4.TEXTAREA_ELEMENT.equals(
    247             ((HtmlDocument.EndTag)node).getElement())) {
    248           if (stack == 0) {
    249             balanced = false;
    250           } else {
    251             stack--;
    252           }
    253         }
    254       }
    255 
    256       if (stack == 0 && sb.length() >= chunkSize) {
    257         chunks.add(sb.toString());
    258         sb.setLength(0);
    259       }
    260     }
    261 
    262     // Don't forget the last chunk!
    263     if (sb.length() > 0) {
    264       chunks.add(sb.toString());
    265     }
    266 
    267     // If the tree is not balanced (cut off in the middle of a node), log
    268     // debug data. Clients should fix their code so that the endpoints from
    269     // createBlocks() are properly used.
    270     if (!balanced || stack != 0) {
    271       StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n");
    272       debug.append(getHtml());
    273       debug.append("\nfromNode: ").append(fromNode);
    274       debug.append("\ntoNode: ").append(toNode);
    275       debug.append("\nNum nodes_: ").append(getNumNodes());
    276       for (String chunk : chunks) {
    277         debug.append("\nChunk:\n").append(chunk);
    278       }
    279       logger.severe(debug.toString());
    280     }
    281 
    282     return chunks;
    283   }
    284 
    285   /**
    286    * Returns height (maximum length from root to a leaf) of the HTML tree.
    287    * @return height of the HTML tree.
    288    */
    289   public int getTreeHeight() {
    290     int currentHeight = 0;
    291     int maxHeight = 0;
    292 
    293     for (int i = 0; i < nodes.size(); i++) {
    294       HtmlDocument.Node node = nodes.get(i);
    295       if (node instanceof HtmlDocument.Tag) {
    296         currentHeight++;
    297         if (currentHeight > maxHeight) {
    298           maxHeight = currentHeight;
    299         }
    300         if (((HtmlDocument.Tag) node).getElement().isEmpty()) {
    301           // Empty tags have no closing pair, so decrease counter here.
    302           currentHeight--;
    303         }
    304       } else if (node instanceof HtmlDocument.EndTag) {
    305         currentHeight--;
    306       }
    307     }
    308 
    309     // TODO(anatol): make this value cachable?
    310     return maxHeight;
    311   }
    312 
    313   //------------------------------------------------------------------------
    314   // Creating well-formed blocks within the html tree.
    315   //------------------------------------------------------------------------
    316   /**
    317    * A Block represents a region of a html tree that
    318    * 1) is well-formed, i.e. for each node in the block, all its descendants
    319    * are also contained in the block. So it's safe to wrap the region
    320    * within a <table> or <div>, etc.
    321    * 2) starts at the beginning of a "line", e.g. a <div>, a <br>.
    322    */
    323   public static class Block {
    324     /* The starting node */
    325     public int start_node;
    326 
    327     /* The ending node (non-inclusive to the block) */
    328     public int end_node;
    329   }
    330 
    331   /**
    332    * Creates a list of Blocks, given a text-range.
    333    * We may create multiple blocks if one single well-formed Block cannot be
    334    * created.
    335    *
    336    * @param textStart beginning plain-text offset
    337    * @param textEnd beginning plain-text offset
    338    * @param minNode the smallest node number
    339    * @param maxNode the largest node number
    340    * @return a list of 0 or more Block objects, never null
    341    */
    342   public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) {
    343 
    344     ArrayList<Block> blocks = new ArrayList<Block>();
    345     int startNode = Math.max(getBlockStart(textStart), minNode);
    346     int endNode = Math.min(getBlockEnd(textEnd), maxNode);
    347 
    348     if (DEBUG) {
    349       debug("Creating block: " +
    350             "text pos: " + textStart + "-" + textEnd + "\n" +
    351             "node pos: " + startNode + "-" + endNode + "\n" +
    352             plainText.substring(textStart, textEnd));
    353     }
    354 
    355     // Split up the block [start, end) into one or more blocks that
    356     // are well-formed, and begins at a "line" boundary.
    357     int blockStart = -1;
    358     for (int n = startNode; n < endNode;) {
    359 
    360       // The node n spans [nBegin, nEnd]
    361       int nBegin = begins.get(n);
    362       int nEnd = ends.get(n);
    363 
    364       if (blockStart == -1) {
    365         // Check if this is a valid start node
    366         if (nBegin >= n && nEnd <= endNode &&
    367             canBeginBlockAt(n)) {
    368           blockStart = n;
    369           n = nEnd + 1;
    370         } else {
    371           n++;
    372         }
    373         continue;
    374       }
    375 
    376       // If the node [nBegin, nEnd) lies completely within
    377       // the region then proceed to the (nEnd + 1).
    378       if (nBegin >= blockStart && nEnd < endNode) {
    379         n = nEnd + 1;
    380         continue;
    381       }
    382 
    383       // If we got here, we have to break up the region into one
    384       // or more blocks because the current node cannot be included
    385       // in the region.
    386       if (DEBUG) {
    387         debug("Forcing new block: " + n + " ("  + nBegin + " " + nEnd +
    388               ") exceeds (" + blockStart + " " + endNode + ")");
    389       }
    390       Block b = new Block();
    391       b.start_node = blockStart;
    392       b.end_node = n;
    393       blocks.add(b);
    394 
    395       blockStart = -1;
    396       n++;
    397     }
    398 
    399     // Last block
    400     if (blockStart != -1) {
    401       Block b = new Block();
    402       b.start_node = blockStart;
    403       b.end_node = endNode;
    404       blocks.add(b);
    405     }
    406 
    407     if (DEBUG) {
    408       for (int i = 0; i < blocks.size(); i++) {
    409         Block b = blocks.get(i);
    410         debug("Block " + i + "/" + blocks.size() + ": " +
    411               b.start_node + "-" + b.end_node + " " +
    412               getPlainText(b.start_node, b.end_node));
    413       }
    414     }
    415 
    416     return blocks;
    417   }
    418 
    419   /**
    420    * Checks if a block can begin starting from a node position
    421    */
    422   private boolean canBeginBlockAt(int nodePos) {
    423     int textPos = textPositions[nodePos];
    424 
    425     // Make sure that we don't exceed the text position, this happens
    426     // for the last tag nodes.
    427     if (textPos == plainText.length()) {
    428       textPos--;
    429     }
    430 
    431     // Scan backwards to check if a nodePos is at the beginning
    432     // of a line.
    433     for (int i = textPos; i > 0; i--) {
    434       char ch = plainText.charAt(i);
    435       if (ch == '\n') {
    436         return true;
    437       }
    438       if (i < textPos && !Character.isWhitespace(ch)) {
    439         return false;
    440       }
    441     }
    442     return true;
    443   }
    444 
    445   /**
    446    * Returns the start of a block given a text-pos
    447    */
    448   private int getBlockStart(int textPos) {
    449     int nodenum = Arrays.binarySearch(textPositions, textPos);
    450     if (nodenum >= 0) {
    451       // Got an exact node alignment. Get the outer most pos that
    452       // matches the text position
    453       while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) {
    454         nodenum--;
    455       }
    456     } else {
    457       // textPos matches the middle of a node.
    458       nodenum = -nodenum - 1;
    459     }
    460 
    461     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
    462     return nodenum;
    463   }
    464 
    465   /**
    466    * Returns the end of a block given a text-pos
    467    */
    468   private int getBlockEnd(int textPos) {
    469     int nodenum = Arrays.binarySearch(textPositions, textPos);
    470     if (nodenum >= 0) {
    471       // Got an exact node alignment.
    472       while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) {
    473         nodenum++;
    474       }
    475     } else {
    476       // textPos matches the middle of a node.
    477       nodenum = -nodenum - 2;
    478     }
    479     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
    480     return nodenum;
    481   }
    482 
    483   //------------------------------------------------------------------------
    484   // Plain text view of the html tree
    485   //------------------------------------------------------------------------
    486   /**
    487    * @return the plain-text position corresponding to the node
    488    */
    489   public int getTextPosition(int node) {
    490     return textPositions[node];
    491   }
    492 
    493   /**
    494    * @return a plain-text String of the html tree
    495    */
    496   public String getPlainText() {
    497     if (plainText == null) {
    498       convertToPlainText();
    499     }
    500     return plainText;
    501   }
    502 
    503   /**
    504    * @return a plain-text String of a part of the html tree
    505    */
    506   public String getPlainText(int fromNode, int toNode) {
    507     if (plainText == null) {
    508       convertToPlainText();
    509     }
    510     int textstart = textPositions[fromNode];
    511     int textend = textPositions[toNode];
    512     return plainText.substring(textstart, textend);
    513   }
    514 
    515   /**
    516    * Converts the html tree to plain text.
    517    * We simply iterate through the nodes in the tree.
    518    * As we output the plain-text, we keep track of the text position
    519    * of each node.
    520    * For String nodes, we replace '\n' with ' ' unless we're in a
    521    * <pre> block.
    522    */
    523   private void convertToPlainText() {
    524     X.assertTrue(plainText == null && textPositions == null);
    525 
    526     int numNodes = nodes.size();
    527 
    528     // Keeps track of start text position of each node, including a last
    529     // entry for the size of the text.
    530     textPositions = new int[numNodes + 1];
    531 
    532     PlainTextConverter converter = converterFactory.createInstance();
    533 
    534     for (int i = 0; i < numNodes; i++) {
    535       textPositions[i] = converter.getPlainTextLength();
    536       converter.addNode(nodes.get(i), i, ends.get(i));
    537     }
    538 
    539     // Add a last entry, so that textPositions_[nodes_.size()] is valid.
    540     textPositions[numNodes] = converter.getPlainTextLength();
    541 
    542     plainText = converter.getPlainText();
    543 
    544     if (DEBUG) {
    545       debug("Plain text: " + plainText);
    546 
    547       for (int i = 0; i < nodes.size(); i++) {
    548         int textPos = textPositions[i];
    549         String text = plainText.substring(textPos, textPositions[i + 1]);
    550         debug("At " + i + ": pos=" + textPos + " " +  text);
    551       }
    552     }
    553   }
    554 
    555   /**
    556    * Encapsulates the logic for outputting plain text with respect to text
    557    * segments, white space separators, line breaks, and quote marks.
    558    */
    559   static final class PlainTextPrinter {
    560     /**
    561      * Separators are whitespace inserted between segments of text. The
    562      * semantics are such that between any two segments of text, there is
    563      * at most one separator. As such, separators are ordered in increasing
    564      * priority, and setting a separator multiple times between text will
    565      * result in the single separator with the highest priority being used.
    566      * For example, a LineBreak (one newline) will override a Space, but will
    567      * be overriden by a BlankLine (two newlines).
    568      */
    569     static enum Separator {
    570       // The values here must be ordered by increasing priority, as the
    571       // enum's ordinal() method is used when determining if a new separator
    572       // should override an existing one.
    573       None,
    574       Space,      // single space
    575       LineBreak,  // single new line
    576       BlankLine   // two new lines
    577     }
    578 
    579     // White space characters that are collapsed as a single space.
    580     // Note that characters such as the non-breaking whitespace
    581     // and full-width spaces are not equivalent to the normal spaces.
    582     private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f";
    583 
    584     /**
    585      * Determines if the given character is considered an HTML space character.
    586      * Consecutive HTML space characters are collapsed into a single space when
    587      * not within a PRE element.
    588      */
    589     private static boolean isHtmlWhiteSpace(char ch) {
    590       return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0;
    591     }
    592 
    593     // The buffer in which we accumulate the converted plain text
    594     private final StringBuilder sb = new StringBuilder();
    595 
    596     // How many <blockquote> blocks we are in.
    597     private int quoteDepth = 0;
    598 
    599     // How many logical newlines are at the end of the buffer we've outputted.
    600     // Note that we can't simply count the newlines at the end of the output
    601     // buffer because a logical new line may be followed by quote marks.
    602     //
    603     // We initialize the value to 2 so that we consume any initial separators,
    604     // since we don't need separators at the beginning of the output. This also
    605     // results in correctly outputting any quote marks at the beginning of the
    606     // output if the first piece of text is within a BLOCKQUOTE element.
    607     private int endingNewLines = 2;
    608 
    609     // The next separator to be inserted between two text nodes.
    610     private Separator separator = Separator.None;
    611 
    612     /** Returns the current length of the text. */
    613     final int getTextLength() {
    614       return sb.length();
    615     }
    616 
    617     /** Returns the current text. */
    618     final String getText() {
    619       return sb.toString();
    620     }
    621 
    622     /**
    623      * Sets the next separator between two text nodes. A Space separator is
    624      * used if there is any whitespace between the two text nodes when there is
    625      * no intervening element that breaks flow. This is automatically handled
    626      * by the {@link #appendNormalText} function so the client never needs to
    627      * specify this separator.
    628      * <p>
    629      * A LineBreak separator (single new line) is used if text segments are
    630      * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR,
    631      * etc.). The client should set this separator for opening and closing tags
    632      * of any element that breaks flow.
    633      * <p>
    634      * A BlankLine separator (two new lines) should be set for opening and
    635      * closing P tags.
    636      * <p>
    637      * If this method is called multiple times between text nodes, a
    638      * separator with a higher priority will override that of a lower priority.
    639      */
    640     final void setSeparator(Separator newSeparator) {
    641       if (newSeparator.ordinal() > separator.ordinal()) {
    642         separator = newSeparator;
    643       }
    644     }
    645 
    646     /** Increments the current quote depth of the text. */
    647     final void incQuoteDepth() {
    648       quoteDepth++;
    649     }
    650 
    651     /** Decrements the current quote depth of the text. */
    652     final void decQuoteDepth() {
    653       quoteDepth = Math.max(0, quoteDepth - 1);
    654     }
    655 
    656     /**
    657      * Normalizes the HTML whitespace in the given {@code text} and appends it
    658      * as the next segment of text. This will flush any separator that should
    659      * be appended before the text, as well as any quote marks that should
    660      * follow the last newline if the quote depth is non-zero.
    661      */
    662     final void appendNormalText(String text) {
    663       if (text.length() == 0) {
    664         return;
    665       }
    666       boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0));
    667       boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1));
    668 
    669       // Strip beginning and ending whitespace.
    670       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text);
    671 
    672       // Collapse whitespace within the text.
    673       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' ');
    674 
    675       if (startsWithSpace) {
    676         setSeparator(Separator.Space);
    677       }
    678 
    679       appendTextDirect(text);
    680 
    681       if (endsWithSpace) {
    682         setSeparator(Separator.Space);
    683       }
    684     }
    685 
    686     /**
    687      * Appends the given text, preserving all whitespace. This is used for
    688      * appending text in a PRE element.
    689      */
    690     final void appendPreText(String text) {
    691       // We're in a <pre> block. Split the text into lines, and append
    692       // each line with appendTextDirect() to preserve white space.
    693       String[] lines = text.split("[\\r\\n]", -1);
    694 
    695       // split() will always return an array with at least one element.
    696       appendTextDirect(lines[0]);
    697 
    698       // For all of the remaining lines, we append a newline first, which
    699       // takes care of any quote marks that we need to output if the quote
    700       // depth is non-zero.
    701       for (int i = 1; i < lines.length; i++) {
    702         appendNewLine();
    703         appendTextDirect(lines[i]);
    704       }
    705     }
    706 
    707     /**
    708      * Appends the {@code text} directly to the output, taking into account
    709      * any separator that should be appended before it, and any quote marks
    710      * that should follow the last newline if the quote depth is non-zero.
    711      * <p>
    712      * {@code text} must not contain any new lines--in order to handle
    713      * quoting correctly, it is up to the caller to either normalize away the
    714      * newlines, or split the text up into separate lines and handle new lines
    715      * with the {@link #appendNewLine} method.
    716      * <p>
    717      * The original {@code text} is not modified in any way. Use this method
    718      * when you need to preserve the original white space.
    719      * <p>
    720      * If the given {@code text} is non empty, this method will result in
    721      * {@code endingNewLines} being reset to 0.
    722      */
    723     private void appendTextDirect(String text) {
    724       if (text.length() == 0) {
    725         return;
    726       }
    727       Preconditions.checkArgument(text.indexOf('\n') < 0,
    728                                   "text must not contain newlines.");
    729       flushSeparator();
    730       maybeAddQuoteMarks(true);
    731       sb.append(text);
    732       endingNewLines = 0;
    733     }
    734 
    735     /**
    736      * Appends a forced line break, which is the equivalent of a BR element.
    737      */
    738     final void appendForcedLineBreak() {
    739       flushSeparator();
    740       appendNewLine();
    741     }
    742 
    743     /**
    744      * Appends any pending separator to the output buffer. This should be
    745      * called before appending text to the buffer.
    746      */
    747     private void flushSeparator() {
    748       switch (separator) {
    749         case Space:
    750           if (endingNewLines == 0) {
    751             // Only append a space separator if we are not following a new
    752             // line character. For example, we don't append a separator
    753             // space after a <br> tag, since the <br>'s newline fulfills the
    754             // space separation requirement.
    755             sb.append(" ");
    756           }
    757           break;
    758         case LineBreak:
    759           while (endingNewLines < 1) {
    760             appendNewLine();
    761           }
    762           break;
    763         case BlankLine:
    764           while (endingNewLines < 2) {
    765             appendNewLine();
    766           }
    767           break;
    768       }
    769       separator = Separator.None;
    770     }
    771 
    772     /**
    773      * Adds a newline to the output. This handles any quote marks that should
    774      * follow any previous new lines, and increments {@code endingNewLines}.
    775      */
    776     private void appendNewLine() {
    777       maybeAddQuoteMarks(false);
    778       sb.append('\n');
    779       endingNewLines++;
    780     }
    781 
    782     /**
    783      * Adds quote marks to the output if we are at the beginning of a line.
    784      * One '>' character is used for every level of quoting we are in.
    785      *
    786      * @param includeEndingSpace Includes a single space after the quote marks.
    787      */
    788     private void maybeAddQuoteMarks(boolean includeEndingSpace) {
    789       // We only need to add quote marks if we are at the beginning of line.
    790       if (endingNewLines > 0 && quoteDepth > 0) {
    791         for (int i = 0; i < quoteDepth; i++) {
    792           sb.append('>');
    793         }
    794         if (includeEndingSpace) {
    795           sb.append(' ');
    796         }
    797       }
    798     }
    799   }
    800 
    801   /**
    802    * Contains the logic for converting the contents of one HtmlTree into
    803    * plaintext.
    804    */
    805   public static class DefaultPlainTextConverter implements PlainTextConverter {
    806 
    807     private static final Set<HTML.Element> BLANK_LINE_ELEMENTS =
    808         ImmutableSet.of(
    809             HTML4.P_ELEMENT,
    810             HTML4.BLOCKQUOTE_ELEMENT,
    811             HTML4.PRE_ELEMENT);
    812 
    813     private final PlainTextPrinter printer = new PlainTextPrinter();
    814 
    815     private int preDepth = 0;
    816 
    817     public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) {
    818       if (n instanceof HtmlDocument.Text) {        // A string node
    819 
    820         HtmlDocument.Text textNode = (HtmlDocument.Text) n;
    821         String str = textNode.getText();
    822 
    823         if (preDepth > 0) {
    824           printer.appendPreText(str);
    825 
    826         } else {
    827           printer.appendNormalText(str);
    828         }
    829 
    830       } else if (n instanceof HtmlDocument.Tag) {
    831 
    832         // Check for linebreaking tags.
    833         HtmlDocument.Tag tag = (HtmlDocument.Tag) n;
    834         HTML.Element element = tag.getElement();
    835 
    836         if (BLANK_LINE_ELEMENTS.contains(element)) {
    837           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
    838 
    839         } else if (HTML4.BR_ELEMENT.equals(element)) {
    840           // The <BR> element is special in that it always adds a newline.
    841           printer.appendForcedLineBreak();
    842 
    843         } else if (element.breaksFlow()) {
    844           // All other elements that break the flow add a LineBreak separator.
    845           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    846 
    847           if (HTML4.HR_ELEMENT.equals(element)) {
    848             printer.appendNormalText("________________________________");
    849             printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    850           }
    851         }
    852 
    853         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
    854           printer.incQuoteDepth();
    855 
    856         } else if (HTML4.PRE_ELEMENT.equals(element)) {
    857           preDepth++;
    858         }
    859 
    860       } else if (n instanceof HtmlDocument.EndTag) {
    861 
    862         // Check for linebreaking tags.
    863         HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n;
    864         HTML.Element element = endTag.getElement();
    865 
    866         if (BLANK_LINE_ELEMENTS.contains(element)) {
    867           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
    868 
    869         } else if (element.breaksFlow()) {
    870           // All other elements that break the flow add a LineBreak separator.
    871           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    872         }
    873 
    874         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
    875           printer.decQuoteDepth();
    876 
    877         } else if (HTML4.PRE_ELEMENT.equals(element)) {
    878           preDepth--;
    879         }
    880       }
    881     }
    882 
    883     public final int getPlainTextLength() {
    884       return printer.getTextLength();
    885     }
    886 
    887     public final String getPlainText() {
    888       return printer.getText();
    889     }
    890   }
    891 
    892   //------------------------------------------------------------------------
    893   // The following methods are used to build the html tree.
    894   //------------------------------------------------------------------------
    895   /** For building the html tree */
    896   private Stack<Integer> stack;
    897   private int parent;
    898 
    899   /** Starts the build process */
    900   void start() {
    901     stack = new Stack<Integer>();
    902     parent = -1;
    903   }
    904 
    905   /** Finishes the build process */
    906   void finish() {
    907     X.assertTrue(stack.size() == 0);
    908     X.assertTrue(parent == -1);
    909   }
    910 
    911   /**
    912    * to add the matching end tag
    913    */
    914   void addStartTag(HtmlDocument.Tag t) {
    915     int nodenum = nodes.size();
    916     addNode(t, nodenum, -1);
    917 
    918     stack.add(parent);
    919     parent = nodenum;
    920   }
    921 
    922   /**
    923    * Adds a html end tag, this must be preceded by a previous matching open tag
    924    */
    925   void addEndTag(HtmlDocument.EndTag t) {
    926     int nodenum = nodes.size();
    927     addNode(t, parent, nodenum);
    928 
    929     if (parent != -1) {
    930       ends.set(parent, nodenum);
    931     }
    932 
    933     //is this the right pop?
    934     parent = stack.pop();
    935   }
    936 
    937   /** Adds a singular tag that does not have a corresponding end tag */
    938   void addSingularTag(HtmlDocument.Tag t) {
    939     int nodenum = nodes.size();
    940     addNode(t, nodenum, nodenum);
    941   }
    942 
    943   /**
    944    * Adds a text
    945    * @param t a plain-text string
    946    */
    947   void addText(HtmlDocument.Text t) {
    948     int nodenum = nodes.size();
    949     addNode(t, nodenum, nodenum);
    950   }
    951 
    952   /** Adds a node */
    953   private void addNode(HtmlDocument.Node n, int begin, int end) {
    954 
    955     nodes.add(n);
    956     begins.add(begin);
    957     ends.add(end);
    958   }
    959 
    960   /** For debugging */
    961   private static final void debug(String str) {
    962     logger.finest(str);
    963   }
    964 
    965 }