Home | History | Annotate | Download | only in template
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.clearsilver.jsilver.template;
     18 
     19 import java.io.IOException;
     20 
     21 /**
     22  * HTML whitespace stripper to be used by JSilver.  It removes leading and
     23  * trailing whitespace, it reduces contiguous whitespace characters with just
     24  * the first character, and removes lines of nothing but whitespace.
     25  *
     26  * It does not strip whitespace inside the following elements:
     27  * <ul>
     28  * <li> PRE
     29  * <li> VERBATIM
     30  * <li> TEXTAREA
     31  * <li> SCRIPT
     32  * </ul>
     33  * It also strips out empty lines and leading whitespace inside HTML tags (i.e.
     34  * between '<' and '>') and inside SCRIPT elements.  It leaves trailing
     35  * whitespace since that is more costly to remove and tends to not be common
     36  * based on how templates are created (they don't have trailing whitespace).
     37  * <p>
     38  * Loadtests indicate that this class can strip whitespace almost as quickly
     39  * as just reading every character from a string (20% slower).
     40  * <p>
     41  * While not strictly compatible with the JNI Clearsilver whitestripping
     42  * function, we are not aware of any differences that yield functionally
     43  * different HTML output. However, we encourage users to verify for themselves
     44  * and report any differences.
     45  */
     46 public class HtmlWhiteSpaceStripper implements Appendable {
     47 
     48   // Object to output stripped content to.
     49   private final Appendable out;
     50   // Level of whitespace stripping to perform. (Currently not used).
     51   // TODO: Determine what the exact differences are in levels in
     52   // JNI Clearsilver and see if it is worth porting it.
     53   private final int level;
     54 
     55   // Has any non-whitespace character been seen since the start of the line.
     56   private boolean nonWsSeen = false;
     57   // Was there previously one or more whitespace chars? If so, we should output
     58   // the first whitespace char in the sequence before any other non-whitespace
     59   // character. 0 signifies no pending whitespace.
     60   private char pendingWs = 0;
     61 
     62   // We just saw the start of an HTML tag '<'.
     63   private boolean startHtmlTag = false;
     64   // Are we currently in an opening HTML tag (not "</").
     65   private boolean inOpenTag = false;
     66   // Are we currently in a closing HTML tag.
     67   private boolean inCloseTag = false;
     68   // Are we currently in an HTML tag name.
     69   private boolean inTagName = false;
     70 
     71   // Are we between <textarea> tags
     72   private int textAreaScope = 0;
     73   // Are we between <pre> tags
     74   private int preScope = 0;
     75   // Are we between verbatim flags
     76   private int verbatimScope = 0;
     77   // Are we between <script> tags
     78   private int scriptScope = 0;
     79 
     80   // Used to hold HTML tag element name.
     81   private StringBuilder tagName = new StringBuilder(16);
     82 
     83   /**
     84    * Intermediate Appendable object that strips whitespace as it passes through characters to
     85    * another Appendable object.
     86    *
     87    * @param out The Appendable object to dump the stripped output to.
     88    */
     89   public HtmlWhiteSpaceStripper(Appendable out) {
     90     this(out, 1);
     91   }
     92 
     93   /**
     94    * Intermediate Appendable object that strips whitespace as it passes through characters to
     95    * another Appendable object.
     96    *
     97    * @param out The Appendable object to dump the stripped output to.
     98    * @param level Ignored for now.
     99    */
    100   public HtmlWhiteSpaceStripper(Appendable out, int level) {
    101     this.out = out;
    102     this.level = level;
    103   }
    104 
    105   @Override
    106   public String toString() {
    107     return out.toString();
    108   }
    109 
    110   @Override
    111   public Appendable append(CharSequence csq) throws IOException {
    112     return append(csq, 0, csq.length());
    113   }
    114 
    115   @Override
    116   public Appendable append(CharSequence csq, int start, int end) throws IOException {
    117     for (int i = start; i < end; i++) {
    118       append(csq.charAt(i));
    119     }
    120     return this;
    121   }
    122 
    123   @Override
    124   public Appendable append(char c) throws IOException {
    125     if (inOpenTag || inCloseTag) {
    126       // In an HTML tag.
    127       if (startHtmlTag) {
    128         // This is the first character in an HTML tag.
    129         if (c == '/') {
    130           // We are in a close tag.
    131           inOpenTag = false;
    132           inCloseTag = true;
    133         } else {
    134           // This is the first non-'/' character in an HTML tag.
    135           startHtmlTag = false;
    136           if (isTagNameStartChar(c)) {
    137             // we have a valid tag name first char.
    138             inTagName = true;
    139             tagName.append(c);
    140           }
    141         }
    142       } else if (inTagName) {
    143         // We were last parsing the name of an HTML attribute.
    144         if (isTagNameChar(c)) {
    145           tagName.append(c);
    146         } else {
    147           processTagName();
    148           inTagName = false;
    149         }
    150       }
    151       if (c == '>') {
    152         // We are at the end of the tag.
    153         inOpenTag = inCloseTag = false;
    154         nonWsSeen = true;
    155       }
    156       stripLeadingWsAndEmptyLines(c);
    157     } else {
    158       // Outside of HTML tag.
    159       if (c == '<') {
    160         // Starting a new HTML tag.
    161         inOpenTag = true;
    162         startHtmlTag = true;
    163       }
    164       if (preScope > 0 || verbatimScope > 0 || textAreaScope > 0) {
    165         // In an HTML element that we want to preserve whitespace in.
    166         out.append(c);
    167       } else if (scriptScope > 0) {
    168         // Want to remove newlines only.
    169         stripLeadingWsAndEmptyLines(c);
    170       } else {
    171         stripAll(c);
    172       }
    173     }
    174 
    175     return this;
    176   }
    177 
    178   private void stripLeadingWsAndEmptyLines(char c) throws IOException {
    179     // Detect and delete empty lines.
    180     switch (c) {
    181       case '\n':
    182         if (nonWsSeen) {
    183           out.append(c);
    184         }
    185         nonWsSeen = false;
    186         break;
    187       case ' ':
    188       case '\t':
    189       case '\r':
    190         if (nonWsSeen) {
    191           out.append(c);
    192         }
    193         break;
    194       default:
    195         if (!nonWsSeen) {
    196           nonWsSeen = true;
    197         }
    198         out.append(c);
    199     }
    200   }
    201 
    202   private void stripAll(char c) throws IOException {
    203     // All that remains is content that is safe to remove whitespace from.
    204     switch (c) {
    205       case '\n':
    206         if (nonWsSeen) {
    207           // We don't want blank lines so we don't output linefeed unless we
    208           // saw non-whitespace.
    209           out.append(c);
    210         }
    211         // We don't want trailing whitespace.
    212         pendingWs = 0;
    213         nonWsSeen = false;
    214         break;
    215       case ' ':
    216       case '\t':
    217       case '\r':
    218         if (nonWsSeen) {
    219           pendingWs = c;
    220         } else {
    221           // Omit leading whitespace
    222         }
    223         break;
    224       default:
    225         if (pendingWs != 0) {
    226           out.append(pendingWs);
    227           pendingWs = 0;
    228         }
    229         nonWsSeen = true;
    230         out.append(c);
    231     }
    232   }
    233 
    234   private int updateScope(int current, int inc) {
    235     current += inc;
    236     return current < 0 ? 0 : current;
    237   }
    238 
    239   /**
    240    * This code assumes well-formed HTML as input with HTML elements opening and closing properly in
    241    * the right order.
    242    */
    243   private void processTagName() {
    244     inTagName = false;
    245     String name = tagName.toString();
    246     tagName.delete(0, tagName.length());
    247     int inc = inOpenTag ? 1 : -1;
    248     if ("textarea".equalsIgnoreCase(name)) {
    249       textAreaScope = updateScope(textAreaScope, inc);
    250     } else if ("pre".equalsIgnoreCase(name)) {
    251       preScope = updateScope(preScope, inc);
    252     } else if ("verbatim".equalsIgnoreCase(name)) {
    253       verbatimScope = updateScope(verbatimScope, inc);
    254     } else if ("script".equalsIgnoreCase(name)) {
    255       scriptScope = updateScope(scriptScope, inc);
    256     }
    257   }
    258 
    259   private boolean isTagNameStartChar(char c) {
    260     return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    261   }
    262 
    263   // From W3C HTML spec.
    264   private boolean isTagNameChar(char c) {
    265     return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_')
    266         || (c == '-') || (c == ':') || (c == '.');
    267   }
    268 
    269   /**
    270    * Note, we treat '\n' as a separate special character as it has special rules since it determines
    271    * what a 'line' of content is for doing leading and trailing whitespace removal and empty line
    272    * removal.
    273    */
    274   private boolean isWs(char c) {
    275     return c == ' ' || c == '\t' || c == '\r';
    276   }
    277 }
    278