Home | History | Annotate | Download | only in parser
      1 /**
      2  * Copyright (c) 2004, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.google.android.mail.common.html.parser;
     17 
     18 import com.google.android.mail.common.base.X;
     19 import com.google.android.mail.common.html.parser.HtmlDocument.EndTag;
     20 import com.google.common.io.ByteStreams;
     21 
     22 import java.io.IOException;
     23 import java.util.ArrayList;
     24 import java.util.List;
     25 import java.util.logging.Level;
     26 import java.util.logging.Logger;
     27 
     28 /**
     29  * HtmlTreeBuilder builds a well-formed HtmlTree.
     30  *
     31  * @see HtmlTree
     32  * @author jlim (at) google.com (Jing Yee Lim)
     33  */
     34 public class HtmlTreeBuilder implements HtmlDocument.Visitor {
     35 
     36   private static final Logger logger = Logger.getLogger(HtmlTreeBuilder.class.getName());
     37 
     38   /** Stack contains HTML4.Element objects to keep track of unclosed tags */
     39   private final List<HTML.Element> stack = new ArrayList<HTML.Element>();
     40   private final TableFixer tableFixer = new TableFixer();
     41   private HtmlTree tree;
     42   private boolean built = false;
     43 
     44   /** Gets the built html tree */
     45   public HtmlTree getTree() {
     46     X.assertTrue(built);
     47     return tree;
     48   }
     49 
     50   /** Implements HtmlDocument.Visitor.start */
     51   public void start() {
     52     tree = new HtmlTree();
     53     tree.start();
     54   }
     55 
     56   /** Implements HtmlDocument.Visitor.finish */
     57   public void finish() {
     58     // Close all tags
     59     while (stack.size() > 0) {
     60       addMissingEndTag();
     61     }
     62     tableFixer.finish();
     63     tree.finish();
     64 
     65     built = true;
     66   }
     67 
     68   /** Implements HtmlDocument.Visitor.visitTag */
     69   public void visitTag(HtmlDocument.Tag t) {
     70     tableFixer.seeTag(t);
     71 
     72     HTML.Element element = t.getElement();
     73     if (element.isEmpty()) {
     74       tree.addSingularTag(t);
     75     } else if (t.isSelfTerminating()) {
     76       // Explicitly create a non-selfterminating open tag and add it to the tree
     77       // and also immediately add the corresponding close tag. This is done
     78       // so that the toHTML, toXHTML and toOriginalHTML of the tree's node list
     79       // will be balanced consistently.
     80       // Otherwise there is a possibility of "<span /></span>" for example, if
     81       // the created tree is converted to string through toXHTML.
     82       tree.addStartTag(HtmlDocument.createTag(element,
     83           t.getAttributes(), t.getOriginalHtmlBeforeAttributes(),
     84           t.getOriginalHtmlAfterAttributes()));
     85       EndTag end = HtmlDocument.createEndTag(element);
     86       tableFixer.seeEndTag(end);
     87       tree.addEndTag(end);
     88     } else {
     89       tree.addStartTag(t);
     90       push(element);                       // Track the open tags
     91     }
     92   }
     93 
     94   /** Implements HtmlVisitor.visit */
     95   public void visitEndTag(HtmlDocument.EndTag t) {
     96 
     97     // Here we pop back to the start tag
     98     HTML.Element element = t.getElement();
     99     int pos = findStartTag(element);
    100     if (pos >= 0) {
    101 
    102       // Add missing end-tags if any
    103       while (pos < stack.size() - 1) {
    104         addMissingEndTag();
    105       }
    106 
    107       pop();
    108       tableFixer.seeEndTag(t);
    109       tree.addEndTag(t);
    110 
    111     } else {
    112       // Not found, ignore this end tag
    113       logger.finest("Ignoring end tag: " + element.getName());
    114     }
    115   }
    116 
    117   /** Implements HtmlDocument.Visitor.visitText */
    118   public void visitText(HtmlDocument.Text t) {
    119     tableFixer.seeText(t);
    120     tree.addText(t);
    121   }
    122 
    123   /** Implements HtmlDocument.Visitor.visitComment */
    124   public void visitComment(HtmlDocument.Comment n) {
    125     // ignore
    126   }
    127 
    128   /** Finds the start tag from the stack, returns -1 if not found */
    129   private int findStartTag(HTML.Element element) {
    130     for (int i = stack.size() - 1; i >= 0; i--) {
    131       HTML.Element e = stack.get(i);
    132       if (e == element) {
    133         return i;
    134       }
    135     }
    136     return -1;
    137   }
    138 
    139   /**
    140    * Adds a close tag corresponding to a tag on the stack, if
    141    * the tag needs a close tag.
    142    */
    143   private void addMissingEndTag() {
    144     HTML.Element element = pop();
    145 
    146     HtmlDocument.EndTag endTag = HtmlDocument.createEndTag(element);
    147     tableFixer.seeEndTag(endTag);
    148     tree.addEndTag(endTag);
    149   }
    150 
    151   /** Pushes a tag onto the stack */
    152   private void push(HTML.Element element) {
    153     stack.add(element);
    154   }
    155 
    156   /** Pops an elemnt from the stack */
    157   private HTML.Element pop() {
    158     return stack.remove(stack.size() - 1);
    159   }
    160 
    161   /**
    162    * The TableFixer makes sure that a <table> structure is more or less well
    163    * formed. Note that it only ensures that data within the <table> tag doesn't
    164    * "leak out" of the table.
    165    *
    166    * For instance, all the tags here are balanced with end tags. But the
    167    * 'outside' text ends up leaking out of the table.
    168    * <table><tr><td bgcolor=yellow>
    169    * <table><table>inside</table><td>outside</td></table>
    170    * </td></tr></table>
    171    *
    172    * The TableFixer makes sure that
    173    * 1) Within a table:, text and other elements are enclosed within a TD.
    174    *    A TD tag is inserted where necessary.
    175    * 2) All table structure tags are enclosed within a <table>. A TABLE tag
    176    *    is inserted where necessary.
    177    *
    178    * Note that the TableFixer only adds open tags, it doesn't add end tags.
    179    * The HtmlTreeVerifier ensures that all open tags are properly matched
    180    * up and closed.
    181    *
    182    * @author Jing Yee Lim (jlim (at) google.com)
    183    */
    184   class TableFixer {
    185 
    186     private int tables = 0;             // table nesting level
    187 
    188     // States within a <table>
    189     static final int NULL = 0;
    190     static final int IN_CELL = 1;       // in a <td> or <th> tag
    191     static final int IN_CAPTION = 2;    // in a <caption> tag
    192 
    193     private int state;
    194 
    195     void seeTag(HtmlDocument.Tag tag) {
    196       HTML.Element element = tag.getElement();
    197       if (element.getType() == HTML.Element.TABLE_TYPE) {
    198 
    199         if (HTML4.TABLE_ELEMENT.equals(element)) {
    200           if (tables > 0) {
    201             ensureCellState();
    202           }
    203           tables++;
    204           state = NULL;
    205 
    206         } else {
    207           // Make sure that we're in a table
    208           ensureTableState();
    209 
    210           // In cell/caption?
    211           if (HTML4.TD_ELEMENT.equals(element) ||
    212               HTML4.TH_ELEMENT.equals(element)) {
    213             state = IN_CELL;
    214 
    215           } else if (HTML4.CAPTION_ELEMENT.equals(element)) {
    216             state = IN_CAPTION;
    217           }
    218         }
    219       } else {
    220         if (tables > 0) {
    221 
    222           // Ok to have a form element outside a table cell.
    223           // e.g. <TR><FORM><TD>...
    224           if (!HTML4.FORM_ELEMENT.equals(element)) {
    225             ensureCellState();
    226           }
    227         }
    228       }
    229     }
    230 
    231     void seeEndTag(HtmlDocument.EndTag endTag) {
    232       HTML.Element element= endTag.getElement();
    233 
    234       if (tables > 0 && element.getType() == HTML.Element.TABLE_TYPE) {
    235 
    236         if (HTML4.TD_ELEMENT.equals(element) ||
    237             HTML4.TR_ELEMENT.equals(element) ||
    238             HTML4.TH_ELEMENT.equals(element)) {
    239           // End of a cell
    240           state = NULL;
    241 
    242         } else if (HTML4.CAPTION_ELEMENT.equals(element)) { // End caption
    243           state = NULL;
    244 
    245         } else if (HTML4.TABLE_ELEMENT.equals(element)) { // End table
    246           X.assertTrue(tables > 0);
    247           tables--;
    248           state = (tables > 0) ? IN_CELL : NULL;
    249         }
    250       }
    251     }
    252 
    253     void seeText(HtmlDocument.Text textNode) {
    254       // If we're in a table, but not in a cell or caption, and the
    255       // text is not whitespace, add a <TD>
    256       if (tables > 0 &&
    257           state == NULL &&
    258           !textNode.isWhitespace()) {
    259         ensureCellState();
    260       }
    261     }
    262 
    263     void finish() {
    264       X.assertTrue(tables == 0);
    265       X.assertTrue(state == NULL);
    266     }
    267 
    268     // Ensure that we're within a TABLE
    269     private void ensureTableState() {
    270       if (tables == 0) {
    271         push(HTML4.TABLE_ELEMENT);
    272 
    273         HtmlDocument.Tag tableTag =
    274           HtmlDocument.createTag(HTML4.TABLE_ELEMENT, null);
    275         tree.addStartTag(tableTag);
    276 
    277         tables++;
    278       }
    279     }
    280 
    281     // Ensure that we're within a TD or TH cell
    282     private void ensureCellState() {
    283       if (state != IN_CELL) {
    284         push(HTML4.TD_ELEMENT);
    285 
    286         HtmlDocument.Tag tdTag = HtmlDocument.createTag(HTML4.TD_ELEMENT, null);
    287         tree.addStartTag(tdTag);
    288 
    289         state = IN_CELL;
    290       }
    291     }
    292   }
    293 
    294   /** For testing */
    295   public static void main(String[] args) throws IOException {
    296     logger.setLevel(Level.FINEST);
    297 
    298     String html = new String(ByteStreams.toByteArray(System.in));
    299     HtmlParser parser = new HtmlParser();
    300     HtmlDocument doc = parser.parse(html);
    301 
    302     HtmlTreeBuilder builder = new HtmlTreeBuilder();
    303     doc.accept(builder);
    304     String outputHtml = builder.getTree().getHtml();
    305 
    306     System.out.println(outputHtml);
    307   }
    308 }