Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.google.android.mail.common.html.parser;
     17 
     18 import android.text.Spanned;
     19 
     20 import com.google.android.mail.common.base.CharMatcher;
     21 import com.google.android.mail.common.base.Preconditions;
     22 import com.google.android.mail.common.base.X;
     23 import com.google.common.annotations.VisibleForTesting;
     24 import com.google.common.collect.ImmutableSet;
     25 
     26 import java.util.ArrayList;
     27 import java.util.Arrays;
     28 import java.util.Collections;
     29 import java.util.List;
     30 import java.util.Set;
     31 import java.util.Stack;
     32 import java.util.logging.Logger;
     33 
     34 /**
     35  * HtmlTree represents a parsed and well-formed html text, it provides
     36  * methods to convert to plain text. It also provides methods to find
     37  * well-formed blocks of text, for quote detection.
     38  *
     39  * We don't really build a html tree data structure. Instead, for
     40  * efficiency, and for the ability to do a simple in-order-traversal
     41  * of the tree, we simply keeps a linear list of nodes (in-order).
     42  * The begin_ and end_ arrays keeps track of the starting end ending
     43  * nodes:
     44  *
     45  * For a string node, begin_[node] = end_[node] = node
     46  * For an open tag, begin_[node] = node, end_[node] = the matching end tag
     47  * For a close tag, end_[node] = the matching open tag, end_[node] = node
     48  *
     49  * @author jlim (at) google.com (Jing Yee Lim)
     50  */
     51 public class HtmlTree {
     52   // http://www.w3.org/TR/html4/struct/text.html#h-9.1
     53   private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n");
     54 
     55   /**
     56    * An interface that allows clients to provide their own implementation
     57    * of a {@link Converter}.
     58    */
     59   public static interface ConverterFactory {
     60     /**
     61      * Creates a new instance of a {@link Converter} to convert
     62      * the contents of an {@link HtmlTree} to some resulting object.
     63      */
     64     Converter createInstance();
     65   }
     66 
     67   /**
     68    * An interface for an object which converts a single HtmlTree into some object
     69    */
     70   public static interface Converter<T> {
     71     /**
     72      * Adds the given node {@code n} to plain text.
     73      *
     74      * @param n The node to convert to text.
     75      * @param nodeNum The number of the node among the list of all notes.
     76      * @param endNum The number of the ending node if this is a start node,
     77      *    otherwise the same as {@code nodeNum}.
     78      */
     79     void addNode(HtmlDocument.Node n, int nodeNum, int endNum);
     80 
     81     /**
     82      * Returns the current length of the plain text.
     83      */
     84     int getPlainTextLength();
     85 
     86     /**
     87      * Returns the current built object.
     88      */
     89     T getObject();
     90   }
     91 
     92   /** A factory that produces converters of the default implementation. */
     93   private static final ConverterFactory DEFAULT_CONVERTER_FACTORY =
     94       new ConverterFactory() {
     95         @Override
     96         public Converter<String> createInstance() {
     97           return new DefaultPlainTextConverter();
     98         }
     99       };
    100 
    101   /** Contains html nodes */
    102   private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>();
    103 
    104   /** Keeps track of beginning and end of each node */
    105   private final Stack<Integer> begins = new Stack<Integer>();
    106   private final Stack<Integer> ends = new Stack<Integer>();
    107 
    108   /** Plain text (lazy creation) */
    109   private String plainText;
    110 
    111   /** Constructed span (lazy creation) */
    112   private Spanned constructedSpan;
    113 
    114   /** The html string (lazy creation) */
    115   private String html;
    116 
    117   /** textPositions[node pos] = the text position */
    118   private int[] textPositions;
    119 
    120   private ConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY;
    121 
    122   // For debugging only
    123   private static final boolean DEBUG = false;
    124 
    125   private static final Logger logger = Logger.getLogger(HtmlTree.class.getName());
    126 
    127   //------------------------------------------------------------------------
    128 
    129   /** HtmlTree can only be constructed from this package */
    130   HtmlTree() {
    131   }
    132 
    133   /**
    134    * Sets a new {@link ConverterFactory} to be used to convert
    135    * the contents of this tree to plaintext.
    136    */
    137   public void setConverterFactory(ConverterFactory factory) {
    138     if (factory == null) {
    139       throw new NullPointerException("factory must not be null");
    140     }
    141     converterFactory = factory;
    142   }
    143 
    144   /**
    145    * Gets the list of node objects. A node can be either a
    146    * Tag, EngTag or a String object.
    147    * @return the nodes of the tree
    148    */
    149   public List<HtmlDocument.Node> getNodesList() {
    150     return Collections.unmodifiableList(nodes);
    151   }
    152 
    153   /**
    154    * @return number of nodes
    155    */
    156   public int getNumNodes() {
    157     return nodes.size();
    158   }
    159 
    160   /**
    161    * Returns number of matching open tag node, or {@code endTagNodeNum} itself
    162    * if it does not point to a closing tag.
    163    */
    164   public int findOpenTag(int endTagNodeNum) {
    165     X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size());
    166     return begins.get(endTagNodeNum);
    167   }
    168 
    169   /**
    170    * Returns number of matching closing tag node, or {@code openTagNodeNum} itself
    171    * if it does not point to an open tag or points to an open tag with no closing one.
    172    */
    173   public int findEndTag(int openTagNodeNum) {
    174     X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size());
    175     return ends.get(openTagNodeNum);
    176   }
    177 
    178   /**
    179    * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself
    180    * if it does not point to an open/closing tag (e.g text node or comment).
    181    */
    182   public int findPairedTag(int tagNodeNum) {
    183     X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size());
    184     int openNodeNum = begins.get(tagNodeNum);
    185     int endNodeNum = ends.get(tagNodeNum);
    186     return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum;
    187   }
    188 
    189   /**
    190    * Gets the entire html.
    191    */
    192   public String getHtml() {
    193     return getHtml(-1);
    194   }
    195 
    196   /**
    197    * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
    198    * specified size.
    199    */
    200   public String getHtml(int wrapSize) {
    201     if (html == null) {
    202       html = getHtml(0, nodes.size(), wrapSize);
    203     }
    204     return html;
    205   }
    206 
    207   /** Gets parts of the html */
    208   public String getHtml(int fromNode, int toNode) {
    209     return getHtml(fromNode, toNode, -1);
    210   }
    211 
    212   /**
    213    * Gets parts of the html, if wrapSize is > 0, it tries
    214    * to do wrapping at the specified size.
    215    */
    216   public String getHtml(int fromNode, int toNode, int wrapSize) {
    217     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
    218 
    219     int estSize = (toNode - fromNode) * 10;
    220     StringBuilder sb = new StringBuilder(estSize);
    221     int lastWrapIndex = 0;      // used for wrapping
    222     for (int n = fromNode; n < toNode; n++) {
    223       HtmlDocument.Node node = nodes.get(n);
    224       node.toHTML(sb);
    225       // TODO: maybe we can be smarter about this and not add newlines
    226       // within <pre> tags, unless the whole long line is encompassed
    227       // by the <pre> tag.
    228       if (wrapSize > 0) {
    229         // We can only wrap if the last outputted node is an element that
    230         // breaks the flow. Otherwise, we risk the possibility of inserting
    231         // spaces where they shouldn't be.
    232         if ((node instanceof HtmlDocument.Tag &&
    233               ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
    234             (node instanceof HtmlDocument.EndTag &&
    235               ((HtmlDocument.EndTag) node).getElement().breaksFlow())) {
    236           // Check to see if there is a newline in the most recent node's html.
    237           int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n');
    238           if (recentNewLine != -1) {
    239             lastWrapIndex += recentNewLine;
    240           }
    241           // If the last index - last index of a newline is greater than
    242           // wrapSize, add a newline.
    243           if (((sb.length() - 1) - lastWrapIndex) > wrapSize) {
    244             sb.append('\n');
    245             lastWrapIndex = sb.length() - 1;
    246           }
    247         }
    248       }
    249     }
    250 
    251     return sb.toString();
    252   }
    253 
    254   /**
    255    * Convert a html region into chunks of html code, each containing
    256    * roughly chunkSize characters.
    257    */
    258   public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) {
    259     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
    260 
    261     ArrayList<String> chunks = new ArrayList<String>();
    262 
    263     // Do a best effort attempt to not split apart certain elements (as of now,
    264     // just the <textarea>). We cannot guarantee that they will not be split
    265     // because the client may specify endpoint nodes that land in the middle
    266     // of an element (although this shouldn't happen if the endpoints returned
    267     // by createBlocks() are properly used).
    268     int stack = 0;
    269     boolean balanced = true;
    270 
    271     StringBuilder sb = new StringBuilder(chunkSize + 256);
    272     for (int n = fromNode; n < toNode; n++) {
    273       HtmlDocument.Node node = nodes.get(n);
    274       node.toHTML(sb);
    275 
    276       if (node instanceof HtmlDocument.Tag) {
    277         if (HTML4.TEXTAREA_ELEMENT.equals(
    278             ((HtmlDocument.Tag) node).getElement())) {
    279           stack++;
    280         }
    281       }
    282       if (node instanceof HtmlDocument.EndTag) {
    283         if (HTML4.TEXTAREA_ELEMENT.equals(
    284             ((HtmlDocument.EndTag) node).getElement())) {
    285           if (stack == 0) {
    286             balanced = false;
    287           } else {
    288             stack--;
    289           }
    290         }
    291       }
    292 
    293       if (stack == 0 && sb.length() >= chunkSize) {
    294         chunks.add(sb.toString());
    295         sb.setLength(0);
    296       }
    297     }
    298 
    299     // Don't forget the last chunk!
    300     if (sb.length() > 0) {
    301       chunks.add(sb.toString());
    302     }
    303 
    304     // If the tree is not balanced (cut off in the middle of a node), log
    305     // debug data. Clients should fix their code so that the endpoints from
    306     // createBlocks() are properly used.
    307     if (!balanced || stack != 0) {
    308       StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n");
    309       debug.append(getHtml());
    310       debug.append("\nfromNode: ").append(fromNode);
    311       debug.append("\ntoNode: ").append(toNode);
    312       debug.append("\nNum nodes_: ").append(getNumNodes());
    313       for (String chunk : chunks) {
    314         debug.append("\nChunk:\n").append(chunk);
    315       }
    316       logger.severe(debug.toString());
    317     }
    318 
    319     return chunks;
    320   }
    321 
    322   /**
    323    * Returns height (maximum length from root to a leaf) of the HTML tree.
    324    * @return height of the HTML tree.
    325    */
    326   public int getTreeHeight() {
    327     int currentHeight = 0;
    328     int maxHeight = 0;
    329 
    330     for (int i = 0; i < nodes.size(); i++) {
    331       HtmlDocument.Node node = nodes.get(i);
    332       if (node instanceof HtmlDocument.Tag) {
    333         currentHeight++;
    334         if (currentHeight > maxHeight) {
    335           maxHeight = currentHeight;
    336         }
    337         if (((HtmlDocument.Tag) node).getElement().isEmpty()) {
    338           // Empty tags have no closing pair, so decrease counter here.
    339           currentHeight--;
    340         }
    341       } else if (node instanceof HtmlDocument.EndTag) {
    342         currentHeight--;
    343       }
    344     }
    345 
    346     // TODO(anatol): make this value cachable?
    347     return maxHeight;
    348   }
    349 
    350   //------------------------------------------------------------------------
    351   // Creating well-formed blocks within the html tree.
    352   //------------------------------------------------------------------------
    353   /**
    354    * A Block represents a region of a html tree that
    355    * 1) is well-formed, i.e. for each node in the block, all its descendants
    356    * are also contained in the block. So it's safe to wrap the region
    357    * within a <table> or <div>, etc.
    358    * 2) starts at the beginning of a "line", e.g. a <div>, a <br>.
    359    */
    360   public static class Block {
    361     /* The starting node */
    362     public int start_node;
    363 
    364     /* The ending node (non-inclusive to the block) */
    365     public int end_node;
    366   }
    367 
    368   /**
    369    * Creates a list of Blocks, given a text-range.
    370    * We may create multiple blocks if one single well-formed Block cannot be
    371    * created.
    372    *
    373    * @param textStart beginning plain-text offset
    374    * @param textEnd beginning plain-text offset
    375    * @param minNode the smallest node number
    376    * @param maxNode the largest node number
    377    * @return a list of 0 or more Block objects, never null
    378    */
    379   public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) {
    380 
    381     ArrayList<Block> blocks = new ArrayList<Block>();
    382     int startNode = Math.max(getBlockStart(textStart), minNode);
    383     int endNode = Math.min(getBlockEnd(textEnd), maxNode);
    384 
    385     if (DEBUG) {
    386       debug("Creating block: " +
    387             "text pos: " + textStart + "-" + textEnd + "\n" +
    388             "node pos: " + startNode + "-" + endNode + "\n" +
    389             plainText.substring(textStart, textEnd));
    390     }
    391 
    392     // Split up the block [start, end) into one or more blocks that
    393     // are well-formed, and begins at a "line" boundary.
    394     int blockStart = -1;
    395     for (int n = startNode; n < endNode;) {
    396 
    397       // The node n spans [nBegin, nEnd]
    398       int nBegin = begins.get(n);
    399       int nEnd = ends.get(n);
    400 
    401       if (blockStart == -1) {
    402         // Check if this is a valid start node
    403         if (nBegin >= n && nEnd <= endNode &&
    404             canBeginBlockAt(n)) {
    405           blockStart = n;
    406           n = nEnd + 1;
    407         } else {
    408           n++;
    409         }
    410         continue;
    411       }
    412 
    413       // If the node [nBegin, nEnd) lies completely within
    414       // the region then proceed to the (nEnd + 1).
    415       if (nBegin >= blockStart && nEnd < endNode) {
    416         n = nEnd + 1;
    417         continue;
    418       }
    419 
    420       // If we got here, we have to break up the region into one
    421       // or more blocks because the current node cannot be included
    422       // in the region.
    423       if (DEBUG) {
    424         debug("Forcing new block: " + n + " ("  + nBegin + " " + nEnd +
    425               ") exceeds (" + blockStart + " " + endNode + ")");
    426       }
    427       Block b = new Block();
    428       b.start_node = blockStart;
    429       b.end_node = n;
    430       blocks.add(b);
    431 
    432       blockStart = -1;
    433       n++;
    434     }
    435 
    436     // Last block
    437     if (blockStart != -1) {
    438       Block b = new Block();
    439       b.start_node = blockStart;
    440       b.end_node = endNode;
    441       blocks.add(b);
    442     }
    443 
    444     if (DEBUG) {
    445       for (int i = 0; i < blocks.size(); i++) {
    446         Block b = blocks.get(i);
    447         debug("Block " + i + "/" + blocks.size() + ": " +
    448               b.start_node + "-" + b.end_node + " " +
    449               getPlainText(b.start_node, b.end_node));
    450       }
    451     }
    452 
    453     return blocks;
    454   }
    455 
    456   /**
    457    * Checks if a block can begin starting from a node position
    458    */
    459   private boolean canBeginBlockAt(int nodePos) {
    460     int textPos = textPositions[nodePos];
    461 
    462     // Make sure that we don't exceed the text position, this happens
    463     // for the last tag nodes.
    464     if (textPos == plainText.length()) {
    465       textPos--;
    466     }
    467 
    468     // Scan backwards to check if a nodePos is at the beginning
    469     // of a line.
    470     for (int i = textPos; i > 0; i--) {
    471       char ch = plainText.charAt(i);
    472       if (ch == '\n') {
    473         return true;
    474       }
    475       if (i < textPos && !HTML_WHITESPACE.matches(ch)) {
    476         return false;
    477       }
    478     }
    479     return true;
    480   }
    481 
    482   /**
    483    * Returns the start of a block given a text-pos
    484    */
    485   private int getBlockStart(int textPos) {
    486     int nodenum = Arrays.binarySearch(textPositions, textPos);
    487     if (nodenum >= 0) {
    488       // Got an exact node alignment. Get the outer most pos that
    489       // matches the text position
    490       while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) {
    491         nodenum--;
    492       }
    493     } else {
    494       // textPos matches the middle of a node.
    495       nodenum = -nodenum - 1;
    496     }
    497 
    498     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
    499     return nodenum;
    500   }
    501 
    502   /**
    503    * Returns the end of a block given a text-pos
    504    */
    505   private int getBlockEnd(int textPos) {
    506     int nodenum = Arrays.binarySearch(textPositions, textPos);
    507     if (nodenum >= 0) {
    508       // Got an exact node alignment.
    509       while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) {
    510         nodenum++;
    511       }
    512     } else {
    513       // textPos matches the middle of a node.
    514       nodenum = -nodenum - 2;
    515     }
    516     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
    517     return nodenum;
    518   }
    519 
    520   //------------------------------------------------------------------------
    521   // Plain text view of the html tree
    522   //------------------------------------------------------------------------
    523   /**
    524    * @return the plain-text position corresponding to the node
    525    */
    526   public int getTextPosition(int node) {
    527     return textPositions[node];
    528   }
    529 
    530   /**
    531    * @return a plain-text String of the html tree
    532    */
    533   public String getPlainText() {
    534     if (plainText == null) {
    535       convertToPlainText();
    536     }
    537     return plainText;
    538   }
    539 
    540   /**
    541    * @return a plain-text String of a part of the html tree
    542    */
    543   public String getPlainText(int fromNode, int toNode) {
    544     if (plainText == null) {
    545       convertToPlainText();
    546     }
    547     int textstart = textPositions[fromNode];
    548     int textend = textPositions[toNode];
    549     return plainText.substring(textstart, textend);
    550   }
    551 
    552   /**
    553    * Converts the html tree to plain text.
    554    * We simply iterate through the nodes in the tree.
    555    * As we output the plain-text, we keep track of the text position
    556    * of each node.
    557    * For String nodes, we replace '\n' with ' ' unless we're in a
    558    * <pre> block.
    559    */
    560   private void convertToPlainText() {
    561     X.assertTrue(plainText == null && textPositions == null);
    562 
    563     int numNodes = nodes.size();
    564 
    565     // Keeps track of start text position of each node, including a last
    566     // entry for the size of the text.
    567     textPositions = new int[numNodes + 1];
    568 
    569     Converter<String> converter = (Converter<String>) converterFactory.createInstance();
    570 
    571     for (int i = 0; i < numNodes; i++) {
    572       textPositions[i] = converter.getPlainTextLength();
    573       converter.addNode(nodes.get(i), i, ends.get(i));
    574     }
    575 
    576     // Add a last entry, so that textPositions_[nodes_.size()] is valid.
    577     textPositions[numNodes] = converter.getPlainTextLength();
    578 
    579     plainText = converter.getObject();
    580 
    581     if (DEBUG) {
    582       debug("Plain text: " + plainText);
    583 
    584       for (int i = 0; i < nodes.size(); i++) {
    585         int textPos = textPositions[i];
    586         String text = plainText.substring(textPos, textPositions[i + 1]);
    587         debug("At " + i + ": pos=" + textPos + " " +  text);
    588       }
    589     }
    590   }
    591 
    592     //------------------------------------------------------------------------
    593     // Spanned view of the html tree
    594     //------------------------------------------------------------------------
    595     /**
    596      * @return a Spanned representation of the html tree
    597      */
    598     public Spanned getSpanned() {
    599         if (constructedSpan == null) {
    600             convertToSpan();
    601         }
    602         return constructedSpan;
    603     }
    604 
    605     /**
    606      * Converts the html tree to plain text.
    607      * We simply iterate through the nodes in the tree.
    608      * As we output the plain-text, we keep track of the text position
    609      * of each node.
    610      * For String nodes, we replace '\n' with ' ' unless we're in a
    611      * <pre> block.
    612      */
    613     private void convertToSpan() {
    614         X.assertTrue(constructedSpan == null);
    615 
    616         int numNodes = nodes.size();
    617 
    618         Converter<Spanned> converter = (Converter<Spanned>) converterFactory.createInstance();
    619 
    620         for (int i = 0; i < numNodes; i++) {
    621             converter.addNode(nodes.get(i), i, ends.get(i));
    622         }
    623 
    624         constructedSpan = converter.getObject();
    625     }
    626 
    627   /**
    628    * Encapsulates the logic for outputting plain text with respect to text
    629    * segments, white space separators, line breaks, and quote marks.
    630    */
    631   @VisibleForTesting
    632   static final class PlainTextPrinter {
    633     /**
    634      * Separators are whitespace inserted between segments of text. The
    635      * semantics are such that between any two segments of text, there is
    636      * at most one separator. As such, separators are ordered in increasing
    637      * priority, and setting a separator multiple times between text will
    638      * result in the single separator with the highest priority being used.
    639      * For example, a LineBreak (one newline) will override a Space, but will
    640      * be overriden by a BlankLine (two newlines).
    641      */
    642     static enum Separator {
    643       // The values here must be ordered by increasing priority, as the
    644       // enum's ordinal() method is used when determining if a new separator
    645       // should override an existing one.
    646       None,
    647       Space,      // single space
    648       LineBreak,  // single new line
    649       BlankLine   // two new lines
    650     }
    651 
    652     // White space characters that are collapsed as a single space.
    653     // Note that characters such as the non-breaking whitespace
    654     // and full-width spaces are not equivalent to the normal spaces.
    655     private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f";
    656 
    657     /**
    658      * Determines if the given character is considered an HTML space character.
    659      * Consecutive HTML space characters are collapsed into a single space when
    660      * not within a PRE element.
    661      */
    662     private static boolean isHtmlWhiteSpace(char ch) {
    663       return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0;
    664     }
    665 
    666     // The buffer in which we accumulate the converted plain text
    667     private final StringBuilder sb = new StringBuilder();
    668 
    669     // How many <blockquote> blocks we are in.
    670     private int quoteDepth = 0;
    671 
    672     // How many logical newlines are at the end of the buffer we've outputted.
    673     // Note that we can't simply count the newlines at the end of the output
    674     // buffer because a logical new line may be followed by quote marks.
    675     //
    676     // We initialize the value to 2 so that we consume any initial separators,
    677     // since we don't need separators at the beginning of the output. This also
    678     // results in correctly outputting any quote marks at the beginning of the
    679     // output if the first piece of text is within a BLOCKQUOTE element.
    680     private int endingNewLines = 2;
    681 
    682     // The next separator to be inserted between two text nodes.
    683     private Separator separator = Separator.None;
    684 
    685     /** Returns the current length of the text. */
    686     final int getTextLength() {
    687       return sb.length();
    688     }
    689 
    690     /** Returns the current text. */
    691     final String getText() {
    692       return sb.toString();
    693     }
    694 
    695     /**
    696      * Sets the next separator between two text nodes. A Space separator is
    697      * used if there is any whitespace between the two text nodes when there is
    698      * no intervening element that breaks flow. This is automatically handled
    699      * by the {@link #appendNormalText} function so the client never needs to
    700      * specify this separator.
    701      * <p>
    702      * A LineBreak separator (single new line) is used if text segments are
    703      * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR,
    704      * etc.). The client should set this separator for opening and closing tags
    705      * of any element that breaks flow.
    706      * <p>
    707      * A BlankLine separator (two new lines) should be set for opening and
    708      * closing P tags.
    709      * <p>
    710      * If this method is called multiple times between text nodes, a
    711      * separator with a higher priority will override that of a lower priority.
    712      */
    713     final void setSeparator(Separator newSeparator) {
    714       if (newSeparator.ordinal() > separator.ordinal()) {
    715         separator = newSeparator;
    716       }
    717     }
    718 
    719     /** Increments the current quote depth of the text. */
    720     final void incQuoteDepth() {
    721       quoteDepth++;
    722     }
    723 
    724     /** Decrements the current quote depth of the text. */
    725     final void decQuoteDepth() {
    726       quoteDepth = Math.max(0, quoteDepth - 1);
    727     }
    728 
    729     /**
    730      * Normalizes the HTML whitespace in the given {@code text} and appends it
    731      * as the next segment of text. This will flush any separator that should
    732      * be appended before the text, as well as any quote marks that should
    733      * follow the last newline if the quote depth is non-zero.
    734      */
    735     final void appendNormalText(String text) {
    736       if (text.length() == 0) {
    737         return;
    738       }
    739       boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0));
    740       boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1));
    741 
    742       // Strip beginning and ending whitespace.
    743       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text);
    744 
    745       // Collapse whitespace within the text.
    746       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' ');
    747 
    748       if (startsWithSpace) {
    749         setSeparator(Separator.Space);
    750       }
    751 
    752       appendTextDirect(text);
    753 
    754       if (endsWithSpace) {
    755         setSeparator(Separator.Space);
    756       }
    757     }
    758 
    759     /**
    760      * Appends the given text, preserving all whitespace. This is used for
    761      * appending text in a PRE element.
    762      */
    763     final void appendPreText(String text) {
    764       // We're in a <pre> block. Split the text into lines, and append
    765       // each line with appendTextDirect() to preserve white space.
    766       String[] lines = text.split("[\\r\\n]", -1);
    767 
    768       // split() will always return an array with at least one element.
    769       appendTextDirect(lines[0]);
    770 
    771       // For all of the remaining lines, we append a newline first, which
    772       // takes care of any quote marks that we need to output if the quote
    773       // depth is non-zero.
    774       for (int i = 1; i < lines.length; i++) {
    775         appendNewLine();
    776         appendTextDirect(lines[i]);
    777       }
    778     }
    779 
    780     /**
    781      * Appends the {@code text} directly to the output, taking into account
    782      * any separator that should be appended before it, and any quote marks
    783      * that should follow the last newline if the quote depth is non-zero.
    784      * <p>
    785      * {@code text} must not contain any new lines--in order to handle
    786      * quoting correctly, it is up to the caller to either normalize away the
    787      * newlines, or split the text up into separate lines and handle new lines
    788      * with the {@link #appendNewLine} method.
    789      * <p>
    790      * The original {@code text} is not modified in any way. Use this method
    791      * when you need to preserve the original white space.
    792      * <p>
    793      * If the given {@code text} is non empty, this method will result in
    794      * {@code endingNewLines} being reset to 0.
    795      */
    796     private void appendTextDirect(String text) {
    797       if (text.length() == 0) {
    798         return;
    799       }
    800       Preconditions.checkArgument(text.indexOf('\n') < 0,
    801                                   "text must not contain newlines.");
    802       flushSeparator();
    803       maybeAddQuoteMarks(true);
    804       sb.append(text);
    805       endingNewLines = 0;
    806     }
    807 
    808     /**
    809      * Appends a forced line break, which is the equivalent of a BR element.
    810      */
    811     final void appendForcedLineBreak() {
    812       flushSeparator();
    813       appendNewLine();
    814     }
    815 
    816     /**
    817      * Appends any pending separator to the output buffer. This should be
    818      * called before appending text to the buffer.
    819      */
    820     private void flushSeparator() {
    821       switch (separator) {
    822         case Space:
    823           if (endingNewLines == 0) {
    824             // Only append a space separator if we are not following a new
    825             // line character. For example, we don't append a separator
    826             // space after a <br> tag, since the <br>'s newline fulfills the
    827             // space separation requirement.
    828             sb.append(" ");
    829           }
    830           break;
    831         case LineBreak:
    832           while (endingNewLines < 1) {
    833             appendNewLine();
    834           }
    835           break;
    836         case BlankLine:
    837           while (endingNewLines < 2) {
    838             appendNewLine();
    839           }
    840           break;
    841       }
    842       separator = Separator.None;
    843     }
    844 
    845     /**
    846      * Adds a newline to the output. This handles any quote marks that should
    847      * follow any previous new lines, and increments {@code endingNewLines}.
    848      */
    849     private void appendNewLine() {
    850       maybeAddQuoteMarks(false);
    851       sb.append('\n');
    852       endingNewLines++;
    853     }
    854 
    855     /**
    856      * Adds quote marks to the output if we are at the beginning of a line.
    857      * One '>' character is used for every level of quoting we are in.
    858      *
    859      * @param includeEndingSpace Includes a single space after the quote marks.
    860      */
    861     private void maybeAddQuoteMarks(boolean includeEndingSpace) {
    862       // We only need to add quote marks if we are at the beginning of line.
    863       if (endingNewLines > 0 && quoteDepth > 0) {
    864         for (int i = 0; i < quoteDepth; i++) {
    865           sb.append('>');
    866         }
    867         if (includeEndingSpace) {
    868           sb.append(' ');
    869         }
    870       }
    871     }
    872   }
    873 
    874   /**
    875    * Contains the logic for converting the contents of one HtmlTree into
    876    * plaintext.
    877    */
    878   public static class DefaultPlainTextConverter implements Converter<String> {
    879 
    880     private static final Set<HTML.Element> BLANK_LINE_ELEMENTS =
    881         ImmutableSet.of(
    882             HTML4.P_ELEMENT,
    883             HTML4.BLOCKQUOTE_ELEMENT,
    884             HTML4.PRE_ELEMENT);
    885 
    886     private final PlainTextPrinter printer = new PlainTextPrinter();
    887 
    888     private int preDepth = 0;
    889     private int styleDepth = 0;
    890 
    891     @Override
    892     public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) {
    893       if (n instanceof HtmlDocument.Text) {        // A string node
    894 
    895         HtmlDocument.Text textNode = (HtmlDocument.Text) n;
    896         String str = textNode.getText();
    897 
    898         if (preDepth > 0) {
    899           printer.appendPreText(str);
    900 
    901         } else if (styleDepth > 0) {
    902           // Append nothing
    903         } else {
    904           printer.appendNormalText(str);
    905         }
    906 
    907       } else if (n instanceof HtmlDocument.Tag) {
    908 
    909         // Check for linebreaking tags.
    910         HtmlDocument.Tag tag = (HtmlDocument.Tag) n;
    911         HTML.Element element = tag.getElement();
    912 
    913         if (BLANK_LINE_ELEMENTS.contains(element)) {
    914           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
    915 
    916         } else if (HTML4.BR_ELEMENT.equals(element)) {
    917           // The <BR> element is special in that it always adds a newline.
    918           printer.appendForcedLineBreak();
    919 
    920         } else if (element.breaksFlow()) {
    921           // All other elements that break the flow add a LineBreak separator.
    922           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    923 
    924           if (HTML4.HR_ELEMENT.equals(element)) {
    925             printer.appendNormalText("________________________________");
    926             printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    927           }
    928         }
    929 
    930         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
    931           printer.incQuoteDepth();
    932 
    933         } else if (HTML4.PRE_ELEMENT.equals(element)) {
    934           preDepth++;
    935         } else if (HTML4.STYLE_ELEMENT.equals(element)) {
    936           styleDepth++;
    937         }
    938 
    939       } else if (n instanceof HtmlDocument.EndTag) {
    940 
    941         // Check for linebreaking tags.
    942         HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n;
    943         HTML.Element element = endTag.getElement();
    944 
    945         if (BLANK_LINE_ELEMENTS.contains(element)) {
    946           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
    947 
    948         } else if (element.breaksFlow()) {
    949           // All other elements that break the flow add a LineBreak separator.
    950           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    951         }
    952 
    953         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
    954           printer.decQuoteDepth();
    955 
    956         } else if (HTML4.PRE_ELEMENT.equals(element)) {
    957           preDepth--;
    958         } else if (HTML4.STYLE_ELEMENT.equals(element)) {
    959           styleDepth--;
    960         }
    961       }
    962     }
    963 
    964     @Override
    965     public final int getPlainTextLength() {
    966       return printer.getTextLength();
    967     }
    968 
    969     @Override
    970     public final String getObject() {
    971       return printer.getText();
    972     }
    973   }
    974 
    975   //------------------------------------------------------------------------
    976   // The following methods are used to build the html tree.
    977   //------------------------------------------------------------------------
    978   /** For building the html tree */
    979   private Stack<Integer> stack;
    980   private int parent;
    981 
    982   /** Starts the build process */
    983   void start() {
    984     stack = new Stack<Integer>();
    985     parent = -1;
    986   }
    987 
    988   /** Finishes the build process */
    989   void finish() {
    990     X.assertTrue(stack.size() == 0);
    991     X.assertTrue(parent == -1);
    992   }
    993 
    994   /**
    995    * Adds a html start tag, there must followed later by a call to addEndTag()
    996    * to add the matching end tag
    997    */
    998   void addStartTag(HtmlDocument.Tag t) {
    999     int nodenum = nodes.size();
   1000     addNode(t, nodenum, -1);
   1001 
   1002     stack.add(parent);
   1003     parent = nodenum;
   1004   }
   1005 
   1006   /**
   1007    * Adds a html end tag, this must be preceded by a previous matching open tag
   1008    */
   1009   void addEndTag(HtmlDocument.EndTag t) {
   1010     int nodenum = nodes.size();
   1011     addNode(t, parent, nodenum);
   1012 
   1013     if (parent != -1) {
   1014       ends.set(parent, nodenum);
   1015     }
   1016 
   1017     parent = stack.pop();
   1018   }
   1019 
   1020   /** Adds a singular tag that does not have a corresponding end tag */
   1021   void addSingularTag(HtmlDocument.Tag t) {
   1022     int nodenum = nodes.size();
   1023     addNode(t, nodenum, nodenum);
   1024   }
   1025 
   1026   /**
   1027    * Adds a text
   1028    * @param t a plain-text string
   1029    */
   1030   void addText(HtmlDocument.Text t) {
   1031     int nodenum = nodes.size();
   1032     addNode(t, nodenum, nodenum);
   1033   }
   1034 
   1035   /** Adds a node */
   1036   private void addNode(HtmlDocument.Node n, int begin, int end) {
   1037     nodes.add(n);
   1038     begins.add(begin);
   1039     ends.add(end);
   1040   }
   1041 
   1042   /** For debugging */
   1043   private static final void debug(String str) {
   1044     logger.finest(str);
   1045   }
   1046 
   1047 }