1 /* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.clearsilver.jsilver.template; 18 19 import java.io.IOException; 20 21 /** 22 * HTML whitespace stripper to be used by JSilver. It removes leading and 23 * trailing whitespace, it reduces contiguous whitespace characters with just 24 * the first character, and removes lines of nothing but whitespace. 25 * 26 * It does not strip whitespace inside the following elements: 27 * <ul> 28 * <li> PRE 29 * <li> VERBATIM 30 * <li> TEXTAREA 31 * <li> SCRIPT 32 * </ul> 33 * It also strips out empty lines and leading whitespace inside HTML tags (i.e. 34 * between '<' and '>') and inside SCRIPT elements. It leaves trailing 35 * whitespace since that is more costly to remove and tends to not be common 36 * based on how templates are created (they don't have trailing whitespace). 37 * <p> 38 * Loadtests indicate that this class can strip whitespace almost as quickly 39 * as just reading every character from a string (20% slower). 40 * <p> 41 * While not strictly compatible with the JNI Clearsilver whitestripping 42 * function, we are not aware of any differences that yield functionally 43 * different HTML output. However, we encourage users to verify for themselves 44 * and report any differences. 45 */ 46 public class HtmlWhiteSpaceStripper implements Appendable { 47 48 // Object to output stripped content to. 49 private final Appendable out; 50 // Level of whitespace stripping to perform. (Currently not used). 51 // TODO: Determine what the exact differences are in levels in 52 // JNI Clearsilver and see if it is worth porting it. 53 private final int level; 54 55 // Has any non-whitespace character been seen since the start of the line. 56 private boolean nonWsSeen = false; 57 // Was there previously one or more whitespace chars? If so, we should output 58 // the first whitespace char in the sequence before any other non-whitespace 59 // character. 0 signifies no pending whitespace. 60 private char pendingWs = 0; 61 62 // We just saw the start of an HTML tag '<'. 63 private boolean startHtmlTag = false; 64 // Are we currently in an opening HTML tag (not "</"). 65 private boolean inOpenTag = false; 66 // Are we currently in a closing HTML tag. 67 private boolean inCloseTag = false; 68 // Are we currently in an HTML tag name. 69 private boolean inTagName = false; 70 71 // Are we between <textarea> tags 72 private int textAreaScope = 0; 73 // Are we between <pre> tags 74 private int preScope = 0; 75 // Are we between verbatim flags 76 private int verbatimScope = 0; 77 // Are we between <script> tags 78 private int scriptScope = 0; 79 80 // Used to hold HTML tag element name. 81 private StringBuilder tagName = new StringBuilder(16); 82 83 /** 84 * Intermediate Appendable object that strips whitespace as it passes through characters to 85 * another Appendable object. 86 * 87 * @param out The Appendable object to dump the stripped output to. 88 */ 89 public HtmlWhiteSpaceStripper(Appendable out) { 90 this(out, 1); 91 } 92 93 /** 94 * Intermediate Appendable object that strips whitespace as it passes through characters to 95 * another Appendable object. 96 * 97 * @param out The Appendable object to dump the stripped output to. 98 * @param level Ignored for now. 99 */ 100 public HtmlWhiteSpaceStripper(Appendable out, int level) { 101 this.out = out; 102 this.level = level; 103 } 104 105 @Override 106 public String toString() { 107 return out.toString(); 108 } 109 110 @Override 111 public Appendable append(CharSequence csq) throws IOException { 112 return append(csq, 0, csq.length()); 113 } 114 115 @Override 116 public Appendable append(CharSequence csq, int start, int end) throws IOException { 117 for (int i = start; i < end; i++) { 118 append(csq.charAt(i)); 119 } 120 return this; 121 } 122 123 @Override 124 public Appendable append(char c) throws IOException { 125 if (inOpenTag || inCloseTag) { 126 // In an HTML tag. 127 if (startHtmlTag) { 128 // This is the first character in an HTML tag. 129 if (c == '/') { 130 // We are in a close tag. 131 inOpenTag = false; 132 inCloseTag = true; 133 } else { 134 // This is the first non-'/' character in an HTML tag. 135 startHtmlTag = false; 136 if (isTagNameStartChar(c)) { 137 // we have a valid tag name first char. 138 inTagName = true; 139 tagName.append(c); 140 } 141 } 142 } else if (inTagName) { 143 // We were last parsing the name of an HTML attribute. 144 if (isTagNameChar(c)) { 145 tagName.append(c); 146 } else { 147 processTagName(); 148 inTagName = false; 149 } 150 } 151 if (c == '>') { 152 // We are at the end of the tag. 153 inOpenTag = inCloseTag = false; 154 nonWsSeen = true; 155 } 156 stripLeadingWsAndEmptyLines(c); 157 } else { 158 // Outside of HTML tag. 159 if (c == '<') { 160 // Starting a new HTML tag. 161 inOpenTag = true; 162 startHtmlTag = true; 163 } 164 if (preScope > 0 || verbatimScope > 0 || textAreaScope > 0) { 165 // In an HTML element that we want to preserve whitespace in. 166 out.append(c); 167 } else if (scriptScope > 0) { 168 // Want to remove newlines only. 169 stripLeadingWsAndEmptyLines(c); 170 } else { 171 stripAll(c); 172 } 173 } 174 175 return this; 176 } 177 178 private void stripLeadingWsAndEmptyLines(char c) throws IOException { 179 // Detect and delete empty lines. 180 switch (c) { 181 case '\n': 182 if (nonWsSeen) { 183 out.append(c); 184 } 185 nonWsSeen = false; 186 break; 187 case ' ': 188 case '\t': 189 case '\r': 190 if (nonWsSeen) { 191 out.append(c); 192 } 193 break; 194 default: 195 if (!nonWsSeen) { 196 nonWsSeen = true; 197 } 198 out.append(c); 199 } 200 } 201 202 private void stripAll(char c) throws IOException { 203 // All that remains is content that is safe to remove whitespace from. 204 switch (c) { 205 case '\n': 206 if (nonWsSeen) { 207 // We don't want blank lines so we don't output linefeed unless we 208 // saw non-whitespace. 209 out.append(c); 210 } 211 // We don't want trailing whitespace. 212 pendingWs = 0; 213 nonWsSeen = false; 214 break; 215 case ' ': 216 case '\t': 217 case '\r': 218 if (nonWsSeen) { 219 pendingWs = c; 220 } else { 221 // Omit leading whitespace 222 } 223 break; 224 default: 225 if (pendingWs != 0) { 226 out.append(pendingWs); 227 pendingWs = 0; 228 } 229 nonWsSeen = true; 230 out.append(c); 231 } 232 } 233 234 private int updateScope(int current, int inc) { 235 current += inc; 236 return current < 0 ? 0 : current; 237 } 238 239 /** 240 * This code assumes well-formed HTML as input with HTML elements opening and closing properly in 241 * the right order. 242 */ 243 private void processTagName() { 244 inTagName = false; 245 String name = tagName.toString(); 246 tagName.delete(0, tagName.length()); 247 int inc = inOpenTag ? 1 : -1; 248 if ("textarea".equalsIgnoreCase(name)) { 249 textAreaScope = updateScope(textAreaScope, inc); 250 } else if ("pre".equalsIgnoreCase(name)) { 251 preScope = updateScope(preScope, inc); 252 } else if ("verbatim".equalsIgnoreCase(name)) { 253 verbatimScope = updateScope(verbatimScope, inc); 254 } else if ("script".equalsIgnoreCase(name)) { 255 scriptScope = updateScope(scriptScope, inc); 256 } 257 } 258 259 private boolean isTagNameStartChar(char c) { 260 return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); 261 } 262 263 // From W3C HTML spec. 264 private boolean isTagNameChar(char c) { 265 return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_') 266 || (c == '-') || (c == ':') || (c == '.'); 267 } 268 269 /** 270 * Note, we treat '\n' as a separate special character as it has special rules since it determines 271 * what a 'line' of content is for doing leading and trailing whitespace removal and empty line 272 * removal. 273 */ 274 private boolean isWs(char c) { 275 return c == ' ' || c == '\t' || c == '\r'; 276 } 277 } 278