Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.google.android.mail.common.html.parser;
     17 
     18 import com.google.android.mail.common.base.CharMatcher;
     19 import com.google.android.mail.common.base.Preconditions;
     20 import com.google.android.mail.common.base.X;
     21 import com.google.common.annotations.VisibleForTesting;
     22 import com.google.common.collect.ImmutableSet;
     23 
     24 import java.util.ArrayList;
     25 import java.util.Arrays;
     26 import java.util.Collections;
     27 import java.util.List;
     28 import java.util.Set;
     29 import java.util.Stack;
     30 import java.util.logging.Logger;
     31 
     32 /**
     33  * HtmlTree represents a parsed and well-formed html text, it provides
     34  * methods to convert to plain text. It also provides methods to find
     35  * well-formed blocks of text, for quote detection.
     36  *
     37  * We don't really build a html tree data structure. Instead, for
     38  * efficiency, and for the ability to do a simple in-order-traversal
     39  * of the tree, we simply keeps a linear list of nodes (in-order).
     40  * The begin_ and end_ arrays keeps track of the starting end ending
     41  * nodes:
     42  *
     43  * For a string node, begin_[node] = end_[node] = node
     44  * For an open tag, begin_[node] = node, end_[node] = the matching end tag
     45  * For a close tag, end_[node] = the matching open tag, end_[node] = node
     46  *
     47  * @author jlim (at) google.com (Jing Yee Lim)
     48  */
     49 public class HtmlTree {
     50   // http://www.w3.org/TR/html4/struct/text.html#h-9.1
     51   private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n");
     52 
     53   /**
     54    * An interface that allows clients to provide their own implementation
     55    * of a {@link PlainTextConverter}.
     56    */
     57   public static interface PlainTextConverterFactory {
     58     /**
     59      * Creates a new instance of a {@link PlainTextConverter} to convert
     60      * the contents of an {@link HtmlTree} to plain text.
     61      */
     62     PlainTextConverter createInstance();
     63   }
     64 
     65   /**
     66    * An interface for an object which converts a single HtmlTree into
     67    * plaintext.
     68    */
     69   public static interface PlainTextConverter {
     70     /**
     71      * Adds the given node {@code n} to plain text.
     72      *
     73      * @param n The node to convert to text.
     74      * @param nodeNum The number of the node among the list of all notes.
     75      * @param endNum The number of the ending node if this is a start node,
     76      *    otherwise the same as {@code nodeNum}.
     77      */
     78     void addNode(HtmlDocument.Node n, int nodeNum, int endNum);
     79 
     80     /**
     81      * Returns the current length of the plain text.
     82      */
     83     int getPlainTextLength();
     84 
     85     /**
     86      * Returns the current plain text.
     87      */
     88     String getPlainText();
     89   }
     90 
     91   /** A factory that produces converters of the default implementation. */
     92   private static final PlainTextConverterFactory DEFAULT_CONVERTER_FACTORY =
     93       new PlainTextConverterFactory() {
     94         @Override
     95         public PlainTextConverter createInstance() {
     96           return new DefaultPlainTextConverter();
     97         }
     98       };
     99 
    100   /** Contains html nodes */
    101   private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>();
    102 
    103   /** Keeps track of beginning and end of each node */
    104   private final Stack<Integer> begins = new Stack<Integer>();
    105   private final Stack<Integer> ends = new Stack<Integer>();
    106 
    107   /** Plain text (lazy creation) */
    108   private String plainText;
    109 
    110   /** The html string (lazy creation) */
    111   private String html;
    112 
    113   /** textPositions[node pos] = the text position */
    114   private int[] textPositions;
    115 
    116   private PlainTextConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY;
    117 
    118   // For debugging only
    119   private static final boolean DEBUG = false;
    120 
    121   private static final Logger logger = Logger.getLogger(HtmlTree.class.getName());
    122 
    123   //------------------------------------------------------------------------
    124 
    125   /** HtmlTree can only be constructed from this package */
    126   HtmlTree() {
    127   }
    128 
    129   /**
    130    * Sets a new {@link PlainTextConverterFactory} to be used to convert
    131    * the contents of this tree to plaintext.
    132    */
    133   public void setPlainTextConverterFactory(PlainTextConverterFactory factory) {
    134     if (factory == null) {
    135       throw new NullPointerException("factory must not be null");
    136     }
    137     converterFactory = factory;
    138   }
    139 
    140   /**
    141    * Gets the list of node objects. A node can be either a
    142    * Tag, EngTag or a String object.
    143    * @return the nodes of the tree
    144    */
    145   public List<HtmlDocument.Node> getNodesList() {
    146     return Collections.unmodifiableList(nodes);
    147   }
    148 
    149   /**
    150    * @return number of nodes
    151    */
    152   public int getNumNodes() {
    153     return nodes.size();
    154   }
    155 
    156   /**
    157    * Returns number of matching open tag node, or {@code endTagNodeNum} itself
    158    * if it does not point to a closing tag.
    159    */
    160   public int findOpenTag(int endTagNodeNum) {
    161     X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size());
    162     return begins.get(endTagNodeNum);
    163   }
    164 
    165   /**
    166    * Returns number of matching closing tag node, or {@code openTagNodeNum} itself
    167    * if it does not point to an open tag or points to an open tag with no closing one.
    168    */
    169   public int findEndTag(int openTagNodeNum) {
    170     X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size());
    171     return ends.get(openTagNodeNum);
    172   }
    173 
    174   /**
    175    * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself
    176    * if it does not point to an open/closing tag (e.g text node or comment).
    177    */
    178   public int findPairedTag(int tagNodeNum) {
    179     X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size());
    180     int openNodeNum = begins.get(tagNodeNum);
    181     int endNodeNum = ends.get(tagNodeNum);
    182     return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum;
    183   }
    184 
    185   /**
    186    * Gets the entire html.
    187    */
    188   public String getHtml() {
    189     return getHtml(-1);
    190   }
    191 
    192   /**
    193    * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
    194    * specified size.
    195    */
    196   public String getHtml(int wrapSize) {
    197     if (html == null) {
    198       html = getHtml(0, nodes.size(), wrapSize);
    199     }
    200     return html;
    201   }
    202 
    203   /** Gets parts of the html */
    204   public String getHtml(int fromNode, int toNode) {
    205     return getHtml(fromNode, toNode, -1);
    206   }
    207 
    208   /**
    209    * Gets parts of the html, if wrapSize is > 0, it tries
    210    * to do wrapping at the specified size.
    211    */
    212   public String getHtml(int fromNode, int toNode, int wrapSize) {
    213     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
    214 
    215     int estSize = (toNode - fromNode) * 10;
    216     StringBuilder sb = new StringBuilder(estSize);
    217     int lastWrapIndex = 0;      // used for wrapping
    218     for (int n = fromNode; n < toNode; n++) {
    219       HtmlDocument.Node node = nodes.get(n);
    220       node.toHTML(sb);
    221       // TODO: maybe we can be smarter about this and not add newlines
    222       // within <pre> tags, unless the whole long line is encompassed
    223       // by the <pre> tag.
    224       if (wrapSize > 0) {
    225         // We can only wrap if the last outputted node is an element that
    226         // breaks the flow. Otherwise, we risk the possibility of inserting
    227         // spaces where they shouldn't be.
    228         if ((node instanceof HtmlDocument.Tag &&
    229               ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
    230             (node instanceof HtmlDocument.EndTag &&
    231               ((HtmlDocument.EndTag) node).getElement().breaksFlow())) {
    232           // Check to see if there is a newline in the most recent node's html.
    233           int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n');
    234           if (recentNewLine != -1) {
    235             lastWrapIndex += recentNewLine;
    236           }
    237           // If the last index - last index of a newline is greater than
    238           // wrapSize, add a newline.
    239           if (((sb.length() - 1) - lastWrapIndex) > wrapSize) {
    240             sb.append('\n');
    241             lastWrapIndex = sb.length() - 1;
    242           }
    243         }
    244       }
    245     }
    246 
    247     return sb.toString();
    248   }
    249 
    250   /**
    251    * Convert a html region into chunks of html code, each containing
    252    * roughly chunkSize characters.
    253    */
    254   public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) {
    255     X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
    256 
    257     ArrayList<String> chunks = new ArrayList<String>();
    258 
    259     // Do a best effort attempt to not split apart certain elements (as of now,
    260     // just the <textarea>). We cannot guarantee that they will not be split
    261     // because the client may specify endpoint nodes that land in the middle
    262     // of an element (although this shouldn't happen if the endpoints returned
    263     // by createBlocks() are properly used).
    264     int stack = 0;
    265     boolean balanced = true;
    266 
    267     StringBuilder sb = new StringBuilder(chunkSize + 256);
    268     for (int n = fromNode; n < toNode; n++) {
    269       HtmlDocument.Node node = nodes.get(n);
    270       node.toHTML(sb);
    271 
    272       if (node instanceof HtmlDocument.Tag) {
    273         if (HTML4.TEXTAREA_ELEMENT.equals(
    274             ((HtmlDocument.Tag) node).getElement())) {
    275           stack++;
    276         }
    277       }
    278       if (node instanceof HtmlDocument.EndTag) {
    279         if (HTML4.TEXTAREA_ELEMENT.equals(
    280             ((HtmlDocument.EndTag) node).getElement())) {
    281           if (stack == 0) {
    282             balanced = false;
    283           } else {
    284             stack--;
    285           }
    286         }
    287       }
    288 
    289       if (stack == 0 && sb.length() >= chunkSize) {
    290         chunks.add(sb.toString());
    291         sb.setLength(0);
    292       }
    293     }
    294 
    295     // Don't forget the last chunk!
    296     if (sb.length() > 0) {
    297       chunks.add(sb.toString());
    298     }
    299 
    300     // If the tree is not balanced (cut off in the middle of a node), log
    301     // debug data. Clients should fix their code so that the endpoints from
    302     // createBlocks() are properly used.
    303     if (!balanced || stack != 0) {
    304       StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n");
    305       debug.append(getHtml());
    306       debug.append("\nfromNode: ").append(fromNode);
    307       debug.append("\ntoNode: ").append(toNode);
    308       debug.append("\nNum nodes_: ").append(getNumNodes());
    309       for (String chunk : chunks) {
    310         debug.append("\nChunk:\n").append(chunk);
    311       }
    312       logger.severe(debug.toString());
    313     }
    314 
    315     return chunks;
    316   }
    317 
    318   /**
    319    * Returns height (maximum length from root to a leaf) of the HTML tree.
    320    * @return height of the HTML tree.
    321    */
    322   public int getTreeHeight() {
    323     int currentHeight = 0;
    324     int maxHeight = 0;
    325 
    326     for (int i = 0; i < nodes.size(); i++) {
    327       HtmlDocument.Node node = nodes.get(i);
    328       if (node instanceof HtmlDocument.Tag) {
    329         currentHeight++;
    330         if (currentHeight > maxHeight) {
    331           maxHeight = currentHeight;
    332         }
    333         if (((HtmlDocument.Tag) node).getElement().isEmpty()) {
    334           // Empty tags have no closing pair, so decrease counter here.
    335           currentHeight--;
    336         }
    337       } else if (node instanceof HtmlDocument.EndTag) {
    338         currentHeight--;
    339       }
    340     }
    341 
    342     // TODO(anatol): make this value cachable?
    343     return maxHeight;
    344   }
    345 
    346   //------------------------------------------------------------------------
    347   // Creating well-formed blocks within the html tree.
    348   //------------------------------------------------------------------------
    349   /**
    350    * A Block represents a region of a html tree that
    351    * 1) is well-formed, i.e. for each node in the block, all its descendants
    352    * are also contained in the block. So it's safe to wrap the region
    353    * within a <table> or <div>, etc.
    354    * 2) starts at the beginning of a "line", e.g. a <div>, a <br>.
    355    */
    356   public static class Block {
    357     /* The starting node */
    358     public int start_node;
    359 
    360     /* The ending node (non-inclusive to the block) */
    361     public int end_node;
    362   }
    363 
    364   /**
    365    * Creates a list of Blocks, given a text-range.
    366    * We may create multiple blocks if one single well-formed Block cannot be
    367    * created.
    368    *
    369    * @param textStart beginning plain-text offset
    370    * @param textEnd beginning plain-text offset
    371    * @param minNode the smallest node number
    372    * @param maxNode the largest node number
    373    * @return a list of 0 or more Block objects, never null
    374    */
    375   public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) {
    376 
    377     ArrayList<Block> blocks = new ArrayList<Block>();
    378     int startNode = Math.max(getBlockStart(textStart), minNode);
    379     int endNode = Math.min(getBlockEnd(textEnd), maxNode);
    380 
    381     if (DEBUG) {
    382       debug("Creating block: " +
    383             "text pos: " + textStart + "-" + textEnd + "\n" +
    384             "node pos: " + startNode + "-" + endNode + "\n" +
    385             plainText.substring(textStart, textEnd));
    386     }
    387 
    388     // Split up the block [start, end) into one or more blocks that
    389     // are well-formed, and begins at a "line" boundary.
    390     int blockStart = -1;
    391     for (int n = startNode; n < endNode;) {
    392 
    393       // The node n spans [nBegin, nEnd]
    394       int nBegin = begins.get(n);
    395       int nEnd = ends.get(n);
    396 
    397       if (blockStart == -1) {
    398         // Check if this is a valid start node
    399         if (nBegin >= n && nEnd <= endNode &&
    400             canBeginBlockAt(n)) {
    401           blockStart = n;
    402           n = nEnd + 1;
    403         } else {
    404           n++;
    405         }
    406         continue;
    407       }
    408 
    409       // If the node [nBegin, nEnd) lies completely within
    410       // the region then proceed to the (nEnd + 1).
    411       if (nBegin >= blockStart && nEnd < endNode) {
    412         n = nEnd + 1;
    413         continue;
    414       }
    415 
    416       // If we got here, we have to break up the region into one
    417       // or more blocks because the current node cannot be included
    418       // in the region.
    419       if (DEBUG) {
    420         debug("Forcing new block: " + n + " ("  + nBegin + " " + nEnd +
    421               ") exceeds (" + blockStart + " " + endNode + ")");
    422       }
    423       Block b = new Block();
    424       b.start_node = blockStart;
    425       b.end_node = n;
    426       blocks.add(b);
    427 
    428       blockStart = -1;
    429       n++;
    430     }
    431 
    432     // Last block
    433     if (blockStart != -1) {
    434       Block b = new Block();
    435       b.start_node = blockStart;
    436       b.end_node = endNode;
    437       blocks.add(b);
    438     }
    439 
    440     if (DEBUG) {
    441       for (int i = 0; i < blocks.size(); i++) {
    442         Block b = blocks.get(i);
    443         debug("Block " + i + "/" + blocks.size() + ": " +
    444               b.start_node + "-" + b.end_node + " " +
    445               getPlainText(b.start_node, b.end_node));
    446       }
    447     }
    448 
    449     return blocks;
    450   }
    451 
    452   /**
    453    * Checks if a block can begin starting from a node position
    454    */
    455   private boolean canBeginBlockAt(int nodePos) {
    456     int textPos = textPositions[nodePos];
    457 
    458     // Make sure that we don't exceed the text position, this happens
    459     // for the last tag nodes.
    460     if (textPos == plainText.length()) {
    461       textPos--;
    462     }
    463 
    464     // Scan backwards to check if a nodePos is at the beginning
    465     // of a line.
    466     for (int i = textPos; i > 0; i--) {
    467       char ch = plainText.charAt(i);
    468       if (ch == '\n') {
    469         return true;
    470       }
    471       if (i < textPos && !HTML_WHITESPACE.matches(ch)) {
    472         return false;
    473       }
    474     }
    475     return true;
    476   }
    477 
    478   /**
    479    * Returns the start of a block given a text-pos
    480    */
    481   private int getBlockStart(int textPos) {
    482     int nodenum = Arrays.binarySearch(textPositions, textPos);
    483     if (nodenum >= 0) {
    484       // Got an exact node alignment. Get the outer most pos that
    485       // matches the text position
    486       while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) {
    487         nodenum--;
    488       }
    489     } else {
    490       // textPos matches the middle of a node.
    491       nodenum = -nodenum - 1;
    492     }
    493 
    494     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
    495     return nodenum;
    496   }
    497 
    498   /**
    499    * Returns the end of a block given a text-pos
    500    */
    501   private int getBlockEnd(int textPos) {
    502     int nodenum = Arrays.binarySearch(textPositions, textPos);
    503     if (nodenum >= 0) {
    504       // Got an exact node alignment.
    505       while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) {
    506         nodenum++;
    507       }
    508     } else {
    509       // textPos matches the middle of a node.
    510       nodenum = -nodenum - 2;
    511     }
    512     X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
    513     return nodenum;
    514   }
    515 
    516   //------------------------------------------------------------------------
    517   // Plain text view of the html tree
    518   //------------------------------------------------------------------------
    519   /**
    520    * @return the plain-text position corresponding to the node
    521    */
    522   public int getTextPosition(int node) {
    523     return textPositions[node];
    524   }
    525 
    526   /**
    527    * @return a plain-text String of the html tree
    528    */
    529   public String getPlainText() {
    530     if (plainText == null) {
    531       convertToPlainText();
    532     }
    533     return plainText;
    534   }
    535 
    536   /**
    537    * @return a plain-text String of a part of the html tree
    538    */
    539   public String getPlainText(int fromNode, int toNode) {
    540     if (plainText == null) {
    541       convertToPlainText();
    542     }
    543     int textstart = textPositions[fromNode];
    544     int textend = textPositions[toNode];
    545     return plainText.substring(textstart, textend);
    546   }
    547 
    548   /**
    549    * Converts the html tree to plain text.
    550    * We simply iterate through the nodes in the tree.
    551    * As we output the plain-text, we keep track of the text position
    552    * of each node.
    553    * For String nodes, we replace '\n' with ' ' unless we're in a
    554    * <pre> block.
    555    */
    556   private void convertToPlainText() {
    557     X.assertTrue(plainText == null && textPositions == null);
    558 
    559     int numNodes = nodes.size();
    560 
    561     // Keeps track of start text position of each node, including a last
    562     // entry for the size of the text.
    563     textPositions = new int[numNodes + 1];
    564 
    565     PlainTextConverter converter = converterFactory.createInstance();
    566 
    567     for (int i = 0; i < numNodes; i++) {
    568       textPositions[i] = converter.getPlainTextLength();
    569       converter.addNode(nodes.get(i), i, ends.get(i));
    570     }
    571 
    572     // Add a last entry, so that textPositions_[nodes_.size()] is valid.
    573     textPositions[numNodes] = converter.getPlainTextLength();
    574 
    575     plainText = converter.getPlainText();
    576 
    577     if (DEBUG) {
    578       debug("Plain text: " + plainText);
    579 
    580       for (int i = 0; i < nodes.size(); i++) {
    581         int textPos = textPositions[i];
    582         String text = plainText.substring(textPos, textPositions[i + 1]);
    583         debug("At " + i + ": pos=" + textPos + " " +  text);
    584       }
    585     }
    586   }
    587 
    588   /**
    589    * Encapsulates the logic for outputting plain text with respect to text
    590    * segments, white space separators, line breaks, and quote marks.
    591    */
    592   @VisibleForTesting
    593   static final class PlainTextPrinter {
    594     /**
    595      * Separators are whitespace inserted between segments of text. The
    596      * semantics are such that between any two segments of text, there is
    597      * at most one separator. As such, separators are ordered in increasing
    598      * priority, and setting a separator multiple times between text will
    599      * result in the single separator with the highest priority being used.
    600      * For example, a LineBreak (one newline) will override a Space, but will
    601      * be overriden by a BlankLine (two newlines).
    602      */
    603     static enum Separator {
    604       // The values here must be ordered by increasing priority, as the
    605       // enum's ordinal() method is used when determining if a new separator
    606       // should override an existing one.
    607       None,
    608       Space,      // single space
    609       LineBreak,  // single new line
    610       BlankLine   // two new lines
    611     }
    612 
    613     // White space characters that are collapsed as a single space.
    614     // Note that characters such as the non-breaking whitespace
    615     // and full-width spaces are not equivalent to the normal spaces.
    616     private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f";
    617 
    618     /**
    619      * Determines if the given character is considered an HTML space character.
    620      * Consecutive HTML space characters are collapsed into a single space when
    621      * not within a PRE element.
    622      */
    623     private static boolean isHtmlWhiteSpace(char ch) {
    624       return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0;
    625     }
    626 
    627     // The buffer in which we accumulate the converted plain text
    628     private final StringBuilder sb = new StringBuilder();
    629 
    630     // How many <blockquote> blocks we are in.
    631     private int quoteDepth = 0;
    632 
    633     // How many logical newlines are at the end of the buffer we've outputted.
    634     // Note that we can't simply count the newlines at the end of the output
    635     // buffer because a logical new line may be followed by quote marks.
    636     //
    637     // We initialize the value to 2 so that we consume any initial separators,
    638     // since we don't need separators at the beginning of the output. This also
    639     // results in correctly outputting any quote marks at the beginning of the
    640     // output if the first piece of text is within a BLOCKQUOTE element.
    641     private int endingNewLines = 2;
    642 
    643     // The next separator to be inserted between two text nodes.
    644     private Separator separator = Separator.None;
    645 
    646     /** Returns the current length of the text. */
    647     final int getTextLength() {
    648       return sb.length();
    649     }
    650 
    651     /** Returns the current text. */
    652     final String getText() {
    653       return sb.toString();
    654     }
    655 
    656     /**
    657      * Sets the next separator between two text nodes. A Space separator is
    658      * used if there is any whitespace between the two text nodes when there is
    659      * no intervening element that breaks flow. This is automatically handled
    660      * by the {@link #appendNormalText} function so the client never needs to
    661      * specify this separator.
    662      * <p>
    663      * A LineBreak separator (single new line) is used if text segments are
    664      * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR,
    665      * etc.). The client should set this separator for opening and closing tags
    666      * of any element that breaks flow.
    667      * <p>
    668      * A BlankLine separator (two new lines) should be set for opening and
    669      * closing P tags.
    670      * <p>
    671      * If this method is called multiple times between text nodes, a
    672      * separator with a higher priority will override that of a lower priority.
    673      */
    674     final void setSeparator(Separator newSeparator) {
    675       if (newSeparator.ordinal() > separator.ordinal()) {
    676         separator = newSeparator;
    677       }
    678     }
    679 
    680     /** Increments the current quote depth of the text. */
    681     final void incQuoteDepth() {
    682       quoteDepth++;
    683     }
    684 
    685     /** Decrements the current quote depth of the text. */
    686     final void decQuoteDepth() {
    687       quoteDepth = Math.max(0, quoteDepth - 1);
    688     }
    689 
    690     /**
    691      * Normalizes the HTML whitespace in the given {@code text} and appends it
    692      * as the next segment of text. This will flush any separator that should
    693      * be appended before the text, as well as any quote marks that should
    694      * follow the last newline if the quote depth is non-zero.
    695      */
    696     final void appendNormalText(String text) {
    697       if (text.length() == 0) {
    698         return;
    699       }
    700       boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0));
    701       boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1));
    702 
    703       // Strip beginning and ending whitespace.
    704       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text);
    705 
    706       // Collapse whitespace within the text.
    707       text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' ');
    708 
    709       if (startsWithSpace) {
    710         setSeparator(Separator.Space);
    711       }
    712 
    713       appendTextDirect(text);
    714 
    715       if (endsWithSpace) {
    716         setSeparator(Separator.Space);
    717       }
    718     }
    719 
    720     /**
    721      * Appends the given text, preserving all whitespace. This is used for
    722      * appending text in a PRE element.
    723      */
    724     final void appendPreText(String text) {
    725       // We're in a <pre> block. Split the text into lines, and append
    726       // each line with appendTextDirect() to preserve white space.
    727       String[] lines = text.split("[\\r\\n]", -1);
    728 
    729       // split() will always return an array with at least one element.
    730       appendTextDirect(lines[0]);
    731 
    732       // For all of the remaining lines, we append a newline first, which
    733       // takes care of any quote marks that we need to output if the quote
    734       // depth is non-zero.
    735       for (int i = 1; i < lines.length; i++) {
    736         appendNewLine();
    737         appendTextDirect(lines[i]);
    738       }
    739     }
    740 
    741     /**
    742      * Appends the {@code text} directly to the output, taking into account
    743      * any separator that should be appended before it, and any quote marks
    744      * that should follow the last newline if the quote depth is non-zero.
    745      * <p>
    746      * {@code text} must not contain any new lines--in order to handle
    747      * quoting correctly, it is up to the caller to either normalize away the
    748      * newlines, or split the text up into separate lines and handle new lines
    749      * with the {@link #appendNewLine} method.
    750      * <p>
    751      * The original {@code text} is not modified in any way. Use this method
    752      * when you need to preserve the original white space.
    753      * <p>
    754      * If the given {@code text} is non empty, this method will result in
    755      * {@code endingNewLines} being reset to 0.
    756      */
    757     private void appendTextDirect(String text) {
    758       if (text.length() == 0) {
    759         return;
    760       }
    761       Preconditions.checkArgument(text.indexOf('\n') < 0,
    762                                   "text must not contain newlines.");
    763       flushSeparator();
    764       maybeAddQuoteMarks(true);
    765       sb.append(text);
    766       endingNewLines = 0;
    767     }
    768 
    769     /**
    770      * Appends a forced line break, which is the equivalent of a BR element.
    771      */
    772     final void appendForcedLineBreak() {
    773       flushSeparator();
    774       appendNewLine();
    775     }
    776 
    777     /**
    778      * Appends any pending separator to the output buffer. This should be
    779      * called before appending text to the buffer.
    780      */
    781     private void flushSeparator() {
    782       switch (separator) {
    783         case Space:
    784           if (endingNewLines == 0) {
    785             // Only append a space separator if we are not following a new
    786             // line character. For example, we don't append a separator
    787             // space after a <br> tag, since the <br>'s newline fulfills the
    788             // space separation requirement.
    789             sb.append(" ");
    790           }
    791           break;
    792         case LineBreak:
    793           while (endingNewLines < 1) {
    794             appendNewLine();
    795           }
    796           break;
    797         case BlankLine:
    798           while (endingNewLines < 2) {
    799             appendNewLine();
    800           }
    801           break;
    802       }
    803       separator = Separator.None;
    804     }
    805 
    806     /**
    807      * Adds a newline to the output. This handles any quote marks that should
    808      * follow any previous new lines, and increments {@code endingNewLines}.
    809      */
    810     private void appendNewLine() {
    811       maybeAddQuoteMarks(false);
    812       sb.append('\n');
    813       endingNewLines++;
    814     }
    815 
    816     /**
    817      * Adds quote marks to the output if we are at the beginning of a line.
    818      * One '>' character is used for every level of quoting we are in.
    819      *
    820      * @param includeEndingSpace Includes a single space after the quote marks.
    821      */
    822     private void maybeAddQuoteMarks(boolean includeEndingSpace) {
    823       // We only need to add quote marks if we are at the beginning of line.
    824       if (endingNewLines > 0 && quoteDepth > 0) {
    825         for (int i = 0; i < quoteDepth; i++) {
    826           sb.append('>');
    827         }
    828         if (includeEndingSpace) {
    829           sb.append(' ');
    830         }
    831       }
    832     }
    833   }
    834 
    835   /**
    836    * Contains the logic for converting the contents of one HtmlTree into
    837    * plaintext.
    838    */
    839   public static class DefaultPlainTextConverter implements PlainTextConverter {
    840 
    841     private static final Set<HTML.Element> BLANK_LINE_ELEMENTS =
    842         ImmutableSet.of(
    843             HTML4.P_ELEMENT,
    844             HTML4.BLOCKQUOTE_ELEMENT,
    845             HTML4.PRE_ELEMENT);
    846 
    847     private final PlainTextPrinter printer = new PlainTextPrinter();
    848 
    849     private int preDepth = 0;
    850     private int styleDepth = 0;
    851 
    852     @Override
    853     public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) {
    854       if (n instanceof HtmlDocument.Text) {        // A string node
    855 
    856         HtmlDocument.Text textNode = (HtmlDocument.Text) n;
    857         String str = textNode.getText();
    858 
    859         if (preDepth > 0) {
    860           printer.appendPreText(str);
    861 
    862         } else if (styleDepth > 0) {
    863           // Append nothing
    864         } else {
    865           printer.appendNormalText(str);
    866         }
    867 
    868       } else if (n instanceof HtmlDocument.Tag) {
    869 
    870         // Check for linebreaking tags.
    871         HtmlDocument.Tag tag = (HtmlDocument.Tag) n;
    872         HTML.Element element = tag.getElement();
    873 
    874         if (BLANK_LINE_ELEMENTS.contains(element)) {
    875           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
    876 
    877         } else if (HTML4.BR_ELEMENT.equals(element)) {
    878           // The <BR> element is special in that it always adds a newline.
    879           printer.appendForcedLineBreak();
    880 
    881         } else if (element.breaksFlow()) {
    882           // All other elements that break the flow add a LineBreak separator.
    883           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    884 
    885           if (HTML4.HR_ELEMENT.equals(element)) {
    886             printer.appendNormalText("________________________________");
    887             printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    888           }
    889         }
    890 
    891         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
    892           printer.incQuoteDepth();
    893 
    894         } else if (HTML4.PRE_ELEMENT.equals(element)) {
    895           preDepth++;
    896         } else if (HTML4.STYLE_ELEMENT.equals(element)) {
    897           styleDepth++;
    898         }
    899 
    900       } else if (n instanceof HtmlDocument.EndTag) {
    901 
    902         // Check for linebreaking tags.
    903         HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n;
    904         HTML.Element element = endTag.getElement();
    905 
    906         if (BLANK_LINE_ELEMENTS.contains(element)) {
    907           printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
    908 
    909         } else if (element.breaksFlow()) {
    910           // All other elements that break the flow add a LineBreak separator.
    911           printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
    912         }
    913 
    914         if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
    915           printer.decQuoteDepth();
    916 
    917         } else if (HTML4.PRE_ELEMENT.equals(element)) {
    918           preDepth--;
    919         } else if (HTML4.STYLE_ELEMENT.equals(element)) {
    920           styleDepth--;
    921         }
    922       }
    923     }
    924 
    925     @Override
    926     public final int getPlainTextLength() {
    927       return printer.getTextLength();
    928     }
    929 
    930     @Override
    931     public final String getPlainText() {
    932       return printer.getText();
    933     }
    934   }
    935 
    936   //------------------------------------------------------------------------
    937   // The following methods are used to build the html tree.
    938   //------------------------------------------------------------------------
    939   /** For building the html tree */
    940   private Stack<Integer> stack;
    941   private int parent;
    942 
    943   /** Starts the build process */
    944   void start() {
    945     stack = new Stack<Integer>();
    946     parent = -1;
    947   }
    948 
    949   /** Finishes the build process */
    950   void finish() {
    951     X.assertTrue(stack.size() == 0);
    952     X.assertTrue(parent == -1);
    953   }
    954 
    955   /**
    956    * Adds a html start tag, there must followed later by a call to addEndTag()
    957    * to add the matching end tag
    958    */
    959   void addStartTag(HtmlDocument.Tag t) {
    960     int nodenum = nodes.size();
    961     addNode(t, nodenum, -1);
    962 
    963     stack.add(parent);
    964     parent = nodenum;
    965   }
    966 
    967   /**
    968    * Adds a html end tag, this must be preceded by a previous matching open tag
    969    */
    970   void addEndTag(HtmlDocument.EndTag t) {
    971     int nodenum = nodes.size();
    972     addNode(t, parent, nodenum);
    973 
    974     if (parent != -1) {
    975       ends.set(parent, nodenum);
    976     }
    977 
    978     parent = stack.pop();
    979   }
    980 
    981   /** Adds a singular tag that does not have a corresponding end tag */
    982   void addSingularTag(HtmlDocument.Tag t) {
    983     int nodenum = nodes.size();
    984     addNode(t, nodenum, nodenum);
    985   }
    986 
    987   /**
    988    * Adds a text
    989    * @param t a plain-text string
    990    */
    991   void addText(HtmlDocument.Text t) {
    992     int nodenum = nodes.size();
    993     addNode(t, nodenum, nodenum);
    994   }
    995 
    996   /** Adds a node */
    997   private void addNode(HtmlDocument.Node n, int begin, int end) {
    998     nodes.add(n);
    999     begins.add(begin);
   1000     ends.add(end);
   1001   }
   1002 
   1003   /** For debugging */
   1004   private static final void debug(String str) {
   1005     logger.finest(str);
   1006   }
   1007 
   1008 }