Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser.util;
     18 
     19 import com.google.common.base.Preconditions;
     20 import com.google.common.collect.ImmutableMap;
     21 
     22 import java.util.Map;
     23 
     24 /**
     25  * <p>Decodes (unescapes) HTML entities with the complication that these
     26  * are received one character at a time hence must be stored temporarily.
     27  * Also, we may receive some "junk" characters before the actual
     28  * entity which we will discard.
     29  *
     30  * <p>This class is designed to be 100% compatible with the corresponding
     31  * logic in the C-version of the
     32  * {@link com.google.security.streamhtmlparser.HtmlParser}, found
     33  * in <code>htmlparser.c</code>. There are however a few intentional
     34  * differences outlines below:
     35  * <ul>
     36  *   <li>We accept lower and upper-case hex NCRs, the C-version
     37  *       accepts only lower-case ones.
     38  *   <li>The output on some invalid inputs may be different. This is
     39  *       currently in the process of consolidation with Filipe.
     40  *   <li>The API is a bit different, I find this one better suited
     41  *       for Java. In particular, the C method <code>processChar</code>
     42  *       returns the output {@code String} whereas in Java, we return
     43  *       a status code and then provide the {@code String} in a separate
     44  *       method <code>getEntity</code>. It is cleaner as it avoids the
     45  *       need to return empty {@code String}s during incomplete processing.
     46  * </ul>
     47  *
     48  * <p>Valid HTML entities have one of the following three forms:
     49  * <ul>
     50  *   <li><code>&amp;dd;</code> where dd is a number in decimal (base 10) form.
     51  *   <li><code>&amp;x|Xyy;</code> where yy is a hex-number (base 16).
     52  *   <li><code>&&lt;html-entity&gt;;</code> where
     53  *       <code>&lt;html-entity&gt;</code> is one of <code>lt</code>,
     54  *       <code>gt</code>, <code>amp</code>, <code>quot</code> or
     55  *       <code>apos</code>.
     56  * </ul>
     57  *
     58  * <p>A <code>reset</code> method is provided to facilitate object re-use.
     59  */
     60 public class EntityResolver {
     61 
     62   /**
     63    * Returned in <code>processChar</code> method.
     64    * <p>
     65    * <ul>
     66    *   <li><code>NOT_STARTED</code> indicates we are still processing
     67    *       trailing characters before the start of an entity.
     68    *       The caller may want to save the characters it provided us.
     69    *   <li><code>IN_PROGRESS</code> indicates we are currently processing
     70    *       characters part of an entity.
     71    *   <li><code>COMPLETED</code> indicates we have finished processing
     72    *       an entity. The caller can then invoke <code>getEntity</code>
     73    *       then re-set the object for future re-use.
     74    * </ul>
     75    */
     76   public enum Status {
     77     NOT_STARTED("Not Started"),
     78     IN_PROGRESS("In Progress"),
     79     COMPLETED("Completed");
     80 
     81     private final String message;
     82 
     83     private Status(String message) {
     84       this.message = message;
     85     }
     86 
     87     /**
     88      * Returns a brief description of the {@code Status} for
     89      * debugging purposes. The format of the returned {@code String}
     90      * is not fully specified nor guaranteed to remain the same.
     91      */
     92     @Override
     93     public String toString() {
     94       return message;
     95     }
     96   }
     97 
     98   /**
     99    * How many characters to store as we are processing an entity. Once we
    100    * reach that size, we know the entity is definitely invalid. The size
    101    * is higher than needed but keeping it as-is for compatibility with
    102    * the C-version.
    103    */
    104   private static final int MAX_ENTITY_SIZE = 10;
    105 
    106   /**
    107    * Map containing the recognized HTML entities and their decoded values.
    108    * The trailing ';' is not included in the key but it is accounted for.
    109    */
    110   private static final Map<String, String> HTML_ENTITIES_MAP =
    111       new ImmutableMap.Builder<String, String>()
    112           .put("&lt", "<")
    113           .put("&gt", ">")
    114           .put("&amp", "&")
    115           .put("&apos", "'")
    116           .build();
    117 
    118   /** Storage for received until characters until an HTML entity is complete. */
    119   private final StringBuilder sb;
    120 
    121   /**
    122    * Indicates the state we are in. see {@link EntityResolver.Status}.
    123    */
    124   private Status status;
    125   private String entity;
    126 
    127   /**
    128    * Constructs an entity resolver that is initially empty and
    129    * with status {@code NOT_STARTED}, see {@link EntityResolver.Status}.
    130    *
    131    */
    132   public EntityResolver() {
    133     sb = new StringBuilder();
    134     status = Status.NOT_STARTED;
    135     entity = "";
    136   }
    137 
    138   /**
    139    * Constructs an entity resolver that is an exact copy of
    140    * the one provided. In particular it has the same contents
    141    * and status.
    142    *
    143    * @param aEntityResolver the entity resolver to copy
    144    */
    145   public EntityResolver(EntityResolver aEntityResolver) {
    146     sb = new StringBuilder();
    147     sb.replace(0, sb.length(), aEntityResolver.sb.toString());
    148     entity = aEntityResolver.entity;
    149     status = aEntityResolver.status;
    150   }
    151 
    152   /**
    153    * Returns the object to its original state for re-use, deleting any
    154    * stored characters that may be present.
    155    */
    156   public void reset() {
    157     status = Status.NOT_STARTED;
    158     sb.setLength(0);
    159     entity = "";
    160   }
    161 
    162   /**
    163    * Returns the full state of the <code>StreamEntityResolver</code>
    164    * in a human readable form. The format of the returned <code>String</code>
    165    * is not specified and is subject to change.
    166    *
    167    * @return full state of this object
    168    */
    169   @Override
    170   public String toString() {
    171     return String.format("Status: %s; Contents (%d): %s", status.toString(),
    172                          sb.length(), sb.toString());
    173   }
    174 
    175   /**
    176    * Returns the decoded HTML Entity. Should only be called
    177    * after {@code processChar} returned status {@code COMPLETED}.
    178    *
    179    * @return the decoded HTML Entity or an empty {@code String} if
    180    *         we were called with any status other than {@code COMPLETED}
    181    */
    182   public String getEntity() {
    183     return entity;
    184   }
    185 
    186   /**
    187    * Processes a character from the input stream and decodes any html entities
    188    * from that processed input stream.
    189    *
    190    * @param input the {@code char} to process
    191    * @return the processed {@code String}. Typically returns an empty
    192    *         {@code String} while awaiting for more characters to complete
    193    *         processing of the entity.
    194    */
    195   public Status processChar(char input) {
    196     // Developer error if the precondition fails.
    197     Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0);
    198     if (status == Status.NOT_STARTED) {
    199       if (input == '&') {
    200         sb.append(input);
    201         status = Status.IN_PROGRESS;
    202       }
    203     } else if (status == Status.IN_PROGRESS) {
    204       if ((input == ';') || (HtmlUtils.isHtmlSpace(input))) {
    205         status = Status.COMPLETED;
    206         entity = convertEntity(input);
    207       } else {
    208         if (sb.length() < MAX_ENTITY_SIZE) {
    209           sb.append(input);
    210         } else {
    211           status = Status.COMPLETED;
    212           entity = uncovertedInput(input);
    213         }
    214       }
    215     } else {
    216       // Status.COMPLETED, ignore character, do nothing.
    217     }
    218     return status;
    219   }
    220 
    221   /**
    222    * Performs the decoding of a complete HTML entity and saves the
    223    * result back into the buffer.
    224    * <a href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1">
    225    * Numeric Character References</a>
    226    *
    227    * @param terminator the last character read, unused on successful
    228    *        conversions since it is the end delimiter of the entity
    229    * @return The decoded entity or the original input if we could not decode it.
    230    */
    231   private String convertEntity(char terminator) {
    232     // Developer error if the buffer was empty or does not start with '&'.
    233     Preconditions.checkArgument(sb.length() > 0);
    234     Preconditions.checkArgument(sb.charAt(0) == '&');
    235 
    236     if (sb.length() > 1) {
    237       if (sb.charAt(1) == '#') {
    238         if (sb.length() <= 2) {    // Error => return content as-is.
    239           return uncovertedInput(terminator);
    240         }
    241         try {
    242           if ((sb.charAt(2) == 'x') || (sb.charAt(2) == 'X')) {    // Hex NCR
    243             return new String(Character.toChars(
    244                 Integer.parseInt(sb.substring(3), 16)));
    245           } else {                                              // Decimal NCR
    246             return new String(Character.toChars(
    247                 Integer.parseInt(sb.substring(2))));
    248           }
    249         } catch (NumberFormatException e) {
    250           return uncovertedInput(terminator);
    251         }
    252       }
    253 
    254       // See if it matches any of the few recognized entities.
    255       String key = sb.toString();
    256       if (HTML_ENTITIES_MAP.containsKey(key)) {
    257         return HTML_ENTITIES_MAP.get(key);
    258       }
    259     }
    260     // Covers the case of a lonely '&' given or valid/invalid unknown entities.
    261     return uncovertedInput(terminator);
    262   }
    263 
    264   private String uncovertedInput(char terminator) {
    265     return String.format("%s%c", sb.toString(), terminator);
    266   }
    267 }
    268