Home | History | Annotate | Download | only in impl
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser.impl;
     18 
     19 import com.google.common.base.Preconditions;
     20 import com.google.streamhtmlparser.ExternalState;
     21 import com.google.streamhtmlparser.Parser;
     22 import com.google.streamhtmlparser.ParseException;
     23 import com.google.streamhtmlparser.util.HtmlUtils;
     24 
     25 import java.util.Map;
     26 
     27 /**
     28  * An implementation of the {@code Parser} interface that is common to both
     29  * {@code HtmlParser} and {@code JavascriptParser}.
     30  *
     31  * <p>Provides methods for parsing input and ensuring that all in-state,
     32  * entering-a-state and exiting-a-state callbacks are invoked as appropriate.
     33  *
     34  * <p>This class started as abstract but it was found better for testing to
     35  * make it instantiatable so that the parsing logic can be tested with dummy
     36  * state transitions.
     37  */
     38 public class GenericParser implements Parser {
     39 
     40   protected final ParserStateTable parserStateTable;
     41   protected final Map<InternalState, ExternalState> intToExtStateTable;
     42   protected final InternalState initialState;
     43   protected InternalState currentState;
     44   protected int lineNumber;
     45   protected int columnNumber;
     46 
     47   protected GenericParser(ParserStateTable parserStateTable,
     48                           Map<InternalState, ExternalState> intToExtStateTable,
     49                           InternalState initialState) {
     50     this.parserStateTable = parserStateTable;
     51     this.intToExtStateTable = intToExtStateTable;
     52     this.initialState = initialState;
     53     this.currentState = initialState;
     54     this.lineNumber = 1;
     55     this.columnNumber = 1;
     56   }
     57 
     58   /**
     59    * Constructs a generic parser that is an exact copy of the
     60    * one given. Note that here too, data structures that do not
     61    * change are shallow-copied (parser state table and state mappings).
     62    *
     63    * @param aGenericParser the {@code GenericParser} to copy
     64    */
     65   protected GenericParser(GenericParser aGenericParser) {
     66     parserStateTable = aGenericParser.parserStateTable;
     67     intToExtStateTable = aGenericParser.intToExtStateTable;
     68     initialState = aGenericParser.initialState;
     69     currentState = aGenericParser.currentState;
     70     lineNumber = aGenericParser.lineNumber;
     71     columnNumber = aGenericParser.columnNumber;
     72   }
     73 
     74   /**
     75    * Tell the parser to process the provided {@code String}. This is just a
     76    * convenience method that wraps over {@link Parser#parse(char)}.
     77    * @param input the {@code String} to parse
     78    * @throws ParseException if an unrecoverable error occurred during parsing
     79    */
     80   @Override
     81   public void parse(String input) throws ParseException {
     82     for (int i = 0; i < input.length(); i++)
     83       parse(input.charAt(i));
     84   }
     85 
     86   /**
     87    * Main loop for parsing of input.
     88    *
     89    * <p>Absent any callbacks defined, this function simply determines the
     90    * next state to switch to based on the <code>ParserStateTable</code> which is
     91    * derived from a state-machine configuration file in the original C++ parser.
     92    *
     93    * <p>However some states have specific callbacks defined which when
     94    * receiving specific characters may decide to overwrite the next state to
     95    * go to. Hence the next state is a function both of the main state table
     96    * in {@code ParserStateTable} as well as specific run-time information
     97    * from the callback functions.
     98    *
     99    * <p>Also note that the callbacks are called in a proper sequence,
    100    * first the exit-state one then the enter-state one and finally the
    101    * in-state one. Changing the order may result in a functional change.
    102    *
    103    * @param input the input character to parse (process)
    104    * @throws ParseException if an unrecoverable error occurred during parsing
    105    */
    106   @Override
    107   public void parse(char input) throws ParseException {
    108     InternalState nextState =
    109         parserStateTable.getNextState(currentState, input);
    110 
    111     if (nextState == InternalState.INTERNAL_ERROR_STATE) {
    112         String errorMsg =
    113             String.format("Unexpected character '%s' in int_state '%s' " +
    114                           "(ext_state '%s')",
    115                           HtmlUtils.encodeCharForAscii(input),
    116                           currentState.getName(), getState().getName());
    117       currentState = InternalState.INTERNAL_ERROR_STATE;
    118       throw new ParseException(this, errorMsg);
    119     }
    120 
    121     if (currentState != nextState) {
    122       nextState = handleExitState(currentState, nextState, input);
    123     }
    124     if (currentState != nextState) {
    125       nextState = handleEnterState(nextState, nextState, input);
    126     }
    127     nextState = handleInState(nextState, input);
    128     currentState = nextState;
    129     record(input);
    130 
    131     columnNumber++;
    132     if (input == '\n') {
    133       lineNumber++;
    134       columnNumber = 1;
    135     }
    136   }
    137 
    138   /**
    139    * Return the current state of the parser.
    140    */
    141   @Override
    142   public ExternalState getState() {
    143     if (!intToExtStateTable.containsKey(currentState)) {
    144       throw new NullPointerException("Did not find external state mapping " +
    145                                      "For internal state: " + currentState);
    146     }
    147     return intToExtStateTable.get(currentState);
    148   }
    149 
    150   /**
    151    * Reset the parser back to its initial default state.
    152    */
    153   @Override
    154   public void reset() {
    155     currentState = initialState;
    156     lineNumber = 1;
    157     columnNumber = 1;
    158   }
    159 
    160   /**
    161    * Sets the current line number which is returned during error messages.
    162    */
    163   @Override
    164   public void setLineNumber(int lineNumber) {
    165     this.lineNumber = lineNumber;
    166   }
    167 
    168   /**
    169    * Returns the current line number.
    170    */
    171   @Override
    172   public int getLineNumber() {
    173     return lineNumber;
    174   }
    175 
    176   /**
    177    * Sets the current column number which is returned during error messages.
    178    */
    179   @Override
    180   public void setColumnNumber(int columnNumber) {
    181     this.columnNumber = columnNumber;
    182   }
    183 
    184   /**
    185    * Returns the current column number.
    186    */
    187   @Override
    188   public int getColumnNumber() {
    189     return columnNumber;
    190   }
    191 
    192   InternalState getCurrentInternalState() {
    193     return currentState;
    194   }
    195 
    196   protected void setNextState(InternalState nextState) throws ParseException {
    197     Preconditions.checkNotNull(nextState);   // Developer error if it triggers.
    198 
    199     /* We are not actually parsing hence providing
    200      * a null char to the event handlers.
    201      */
    202     // TODO: Complicated logic to follow in C++ but clean it up.
    203     final char nullChar = '\0';
    204 
    205     if (currentState != nextState) {
    206       nextState = handleExitState(currentState, nextState, nullChar);
    207     }
    208     if (currentState != nextState) {
    209       handleEnterState(nextState, nextState, nullChar);
    210     }
    211     currentState = nextState;
    212   }
    213 
    214   /**
    215    * Invoked when the parser enters a new state.
    216    *
    217    * @param currentState the current state of the parser
    218    * @param expectedNextState the next state according to the
    219    *        state table definition
    220    * @param input the last character parsed
    221    * @return the state to change to, could be the same as the
    222    *         {@code expectedNextState} provided
    223    * @throws ParseException if an unrecoverable error occurred during parsing
    224    */
    225   protected InternalState handleEnterState(InternalState currentState,
    226                                            InternalState expectedNextState,
    227                                            char input) throws ParseException {
    228     return expectedNextState;
    229   }
    230 
    231   /**
    232    * Invoked when the parser exits a state.
    233    *
    234    * @param currentState the current state of the parser
    235    * @param expectedNextState the next state according to the
    236    *        state table definition
    237    * @param input the last character parsed
    238    * @return the state to change to, could be the same as the
    239    *         {@code expectedNextState} provided
    240    * @throws ParseException if an unrecoverable error occurred during parsing
    241    */
    242   protected InternalState handleExitState(InternalState currentState,
    243                                           InternalState expectedNextState,
    244                                           char input) throws ParseException {
    245     return expectedNextState;
    246   }
    247 
    248   /**
    249    * Invoked for each character read when no state change occured.
    250    *
    251    * @param currentState the current state of the parser
    252    * @param input the last character parsed
    253    * @return the state to change to, could be the same as the
    254    *         {@code expectedNextState} provided
    255    * @throws ParseException if an unrecoverable error occurred during parsing
    256    */
    257   protected InternalState handleInState(InternalState currentState,
    258                                         char input) throws ParseException {
    259     return currentState;
    260   }
    261 
    262   /**
    263    * Perform some processing on the given character. Derived classes
    264    * may override this method in order to perform additional logic
    265    * on every processed character beyond the logic defined in
    266    * state transitions.
    267    *
    268    * @param input the input character to operate on
    269    */
    270   protected void record(char input) { }
    271 }
    272