Home | History | Annotate | Download | only in html
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.clearsilver.jsilver.functions.html;
     18 
     19 import com.google.clearsilver.jsilver.functions.TextFilter;
     20 import com.google.clearsilver.jsilver.functions.escape.HtmlEscapeFunction;
     21 import com.google.clearsilver.jsilver.functions.escape.SimpleEscapingFunction;
     22 
     23 import java.io.IOException;
     24 import java.util.regex.Matcher;
     25 import java.util.regex.Pattern;
     26 
     27 /**
     28  * This class implements the ClearSilver text_html function.
     29  *
     30  * It converts plain text into html, including adding 'tt' tags to ascii art and linking email and
     31  * web addresses.
     32  *
     33  * Note this implementation differs from ClearSilver, in that it html escapes the contents of links
     34  * and mailtos.
     35  */
     36 public class TextHtmlFunction implements TextFilter {
     37 
     38   // These regular expressions are adapted from html.c in the ClearSilver
     39   // source.
     40 
     41   // Regular expression used to match email addresses, taken from the
     42   // ClearSilver source to maintain compatibility.
     43   private static final String EMAIL_REGEXP =
     44       "[^]\\[@:;<>\\\"()\\s\\p{Cntrl}]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
     45 
     46   // Regular expression used to match urls without a scheme (www.foo.com),
     47   // adapted from the ClearSilver source to maintain compatibility.
     48   private static final String WITH_SCHEME_REGEXP = "(?:http|https|ftp|mailto):[^\\s>\"]*";
     49 
     50   // Regular expression used to match urls with a scheme (http://www.foo.com),
     51   // adapted from the ClearSilver source to maintain compatibility.
     52   private static final String WITHOUT_SCHEME_REGEXP = "www\\.[-a-z0-9\\.]+[^\\s;\">]*";
     53 
     54   // Pattern to match any string in the input that is linkable.
     55   private static final Pattern LINKABLES =
     56       Pattern.compile("(" + EMAIL_REGEXP + ")|(" + WITH_SCHEME_REGEXP + ")|("
     57           + WITHOUT_SCHEME_REGEXP + ")", Pattern.CASE_INSENSITIVE);
     58 
     59   // Matching groups for the LINKABLES pattern.
     60   private static final int EMAIL_GROUP = 1;
     61   private static final int WITH_SCHEME_GROUP = 2;
     62 
     63   // We don't have access to the global html escaper here, so create a new one.
     64   private final HtmlEscapeFunction htmlEscaper = new HtmlEscapeFunction(false);
     65 
     66   // Escapes a small set of non-safe html characters, and does a a very small
     67   // amount of formatting.
     68   private final SimpleEscapingFunction htmlCharEscaper =
     69       new SimpleEscapingFunction(new char[] {'<', '>', '&', '\n', '\r'}) {
     70 
     71         @Override
     72         protected String getEscapeString(char c) {
     73           switch (c) {
     74             case '<':
     75               return "&lt;";
     76             case '>':
     77               return "&gt;";
     78             case '&':
     79               return "&amp;";
     80             case '\n':
     81               return "<br/>\n";
     82             case '\r':
     83               return "";
     84             default:
     85               return null;
     86           }
     87         }
     88 
     89       };
     90 
     91   @Override
     92   public void filter(String in, Appendable out) throws IOException {
     93 
     94     boolean hasAsciiArt = hasAsciiArt(in);
     95 
     96     // Add 'tt' tag to a string that contains 'ascii-art'.
     97     if (hasAsciiArt) {
     98       out.append("<tt>");
     99     }
    100 
    101     splitAndConvert(in, out);
    102 
    103     if (hasAsciiArt) {
    104       out.append("</tt>");
    105     }
    106   }
    107 
    108   /**
    109    * Splits the input string into blocks of normal text or linkable text. The linkable text is
    110    * converted into anchor tags before being appended to the output. The normal text is escaped and
    111    * appended to the output.
    112    */
    113   private void splitAndConvert(String in, Appendable out) throws IOException {
    114     Matcher matcher = LINKABLES.matcher(in);
    115     int end = in.length();
    116     int matchStart;
    117     int matchEnd;
    118     int regionStart = 0;
    119 
    120     // Keep looking for email addresses and web links until there are none left.
    121     while (matcher.find()) {
    122       matchStart = matcher.start();
    123       matchEnd = matcher.end();
    124 
    125       // Escape all the text from the end of the previous match to the start of
    126       // this match, and append it to the output.
    127       htmlCharEscaper.filter(in.subSequence(regionStart, matchStart).toString(), out);
    128 
    129       // Don't include a . or , in the text that is linked.
    130       if (in.charAt(matchEnd - 1) == ',' || in.charAt(matchEnd - 1) == '.') {
    131         matchEnd--;
    132       }
    133 
    134       if (matcher.group(EMAIL_GROUP) != null) {
    135         formatEmail(in, matchStart, matchEnd, out);
    136       } else {
    137         formatUrl(in, matchStart, matchEnd,
    138         // Add a scheme if the one wasn't found.
    139             matcher.group(WITH_SCHEME_GROUP) == null, out);
    140       }
    141 
    142       regionStart = matchEnd;
    143     }
    144 
    145     // Escape the text after the last match, and append it to the output.
    146     htmlCharEscaper.filter(in.substring(regionStart, end), out);
    147   }
    148 
    149   /**
    150    * Formats the input sequence into a suitable mailto: anchor tag and appends it to the output.
    151    *
    152    * @param in The string that contains the email.
    153    * @param start The start of the email address in the whole string.
    154    * @param end The end of the email in the whole string.
    155    * @param out The text output that the email address should be appended to.
    156    * @throws IOException
    157    */
    158   private void formatEmail(String in, int start, int end, Appendable out) throws IOException {
    159 
    160     String emailPart = in.substring(start, end);
    161 
    162     out.append("<a href=\"mailto:");
    163     htmlEscaper.filter(emailPart, out);
    164     out.append("\">");
    165     htmlEscaper.filter(emailPart, out);
    166     out.append("</a>");
    167   }
    168 
    169   /**
    170    * Formats the input sequence into a suitable anchor tag and appends it to the output.
    171    *
    172    * @param in The string that contains the url.
    173    * @param start The start of the url in the containing string.
    174    * @param end The end of the url in the containing string.
    175    * @param addScheme true if 'http://' should be added to the anchor.
    176    * @param out The text output that the url should be appended to.
    177    * @throws IOException
    178    */
    179   private void formatUrl(String in, int start, int end, boolean addScheme, Appendable out)
    180       throws IOException {
    181 
    182     String urlPart = in.substring(start, end);
    183 
    184     out.append(" <a target=\"_blank\" href=\"");
    185     if (addScheme) {
    186       out.append("http://");
    187     }
    188     htmlEscaper.filter(urlPart, out);
    189     out.append("\">");
    190     htmlEscaper.filter(urlPart, out);
    191     out.append("</a>");
    192   }
    193 
    194   /**
    195    * Attempts to detect if a string contains ascii art, whitespace such as tabs will suppress ascii
    196    * art detection.
    197    *
    198    * This method takes its conditions from ClearSilver to maintain compatibility. See
    199    * has_space_formatting in html.c in the ClearSilver source.
    200    *
    201    * @param in The string to analyze for ascii art.
    202    * @return true if it is believed that the string contains ascii art.
    203    */
    204   private boolean hasAsciiArt(String in) {
    205     int spaces = 0;
    206     int returns = 0;
    207     int asciiArt = 0;
    208     int x = 0;
    209     char[] inChars = in.toCharArray();
    210 
    211     int length = in.length();
    212     for (x = 0; x < length; x++) {
    213 
    214       switch (inChars[x]) {
    215         case '\t':
    216           return false;
    217 
    218         case '\r':
    219           break;
    220 
    221         case ' ':
    222           // Ignore spaces after full stops.
    223           if (x == 0 || inChars[x - 1] != '.') {
    224             spaces++;
    225           }
    226           break;
    227 
    228         case '\n':
    229           spaces = 0;
    230           returns++;
    231           break;
    232 
    233         // Characters to count towards the art total.
    234         case '/':
    235         case '\\':
    236         case '<':
    237         case '>':
    238         case ':':
    239         case '[':
    240         case ']':
    241         case '!':
    242         case '@':
    243         case '#':
    244         case '$':
    245         case '%':
    246         case '^':
    247         case '&':
    248         case '*':
    249         case '(':
    250         case ')':
    251         case '|':
    252           asciiArt++;
    253           if (asciiArt > 3) {
    254             return true;
    255           }
    256           break;
    257 
    258         default:
    259           if (returns > 2) {
    260             return false;
    261           }
    262           if (spaces > 2) {
    263             return false;
    264           }
    265           returns = 0;
    266           spaces = 0;
    267           asciiArt = 0;
    268           break;
    269       }
    270     }
    271 
    272     return false;
    273   }
    274 }
    275