Home | History | Annotate | Download | only in impl
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser.impl;
     18 
     19 import com.google.common.base.Preconditions;
     20 
     21 /**
     22  * Holds a state table which is defined as the set of all
     23  * recognized state transitions and the set of characters that
     24  * trigger them.
     25  *
     26  * <p>The logic of what character causes what state transition is derived from
     27  * a base definition written as a Python configuration file in the original
     28  * C++ parser.
     29  *
     30  * <p>This class provides methods to initially build the state table and then
     31  * methods at parsing time to determine the transitions to subsequent states.
     32  *
     33  * <p>Note on characters outside the extended ASCII range: Currently, all state
     34  * transitions in the Python configuration file trigger only on extended
     35  * ASCII characters, that is characters in the Unicode space of [U+0000 to
     36  * U+00FF]. We use that property to design a more efficient state transition
     37  * representation. When receiving characters outside that ASCII range, we
     38  * simply apply the DEFAULT transition for the given state - as we do for any
     39  * character that is not a hot character for that state. If no default
     40  * transition exists, we switch to the Internal Error state.
     41  *
     42  * <p>Technical note: In Java, a {@code char} is a code unit and in some cases
     43  * may not represent a complete Unicode code point. However, when that happens,
     44  * the code units that follow for that code point are all in the surrogate area
     45  * [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger
     46  * any incorrect state transitions.
     47  *
     48  * <p>This class is storage-inefficient but it is static at least
     49  * and not generated for each Parser instance.
     50  */
     51 class ParserStateTable {
     52 
     53   /**
     54    * A limit on how many different states we can have in one state table.
     55    * Can be increased should it no longer be sufficient.
     56    */
     57   private static final int MAX_STATES = 256;
     58 
     59   /**
     60    * We only check transitions for (extended) ASCII characters, hence
     61    * characters in the range 0 to MAX_CHARS -1.
     62    */
     63   private static final int MAX_CHARS = 256;
     64 
     65   /**
     66    * Records all state transitions except those identified as DEFAULT
     67    * transitions. It is two dimensional: Stores a target {@code InternalState}
     68    * given a source state (referenced by its numeric ID) and the current
     69    * character.
     70    */
     71   private final InternalState[][] stateTable;
     72 
     73   /**
     74    * Records all DEFAULT state transitions. These are transitions provided
     75    * using the {@code "[:default:]"} syntax in the Python configuration file.
     76    * There can be only one such transition for any given source state, hence
     77    * the array is one dimensional.
     78    */
     79   private final InternalState[] defaultStateTable;
     80 
     81   public ParserStateTable() {
     82     stateTable = new InternalState[MAX_STATES][MAX_CHARS];
     83     defaultStateTable = new InternalState[MAX_STATES];
     84   }
     85 
     86   /**
     87    * Returns the state to go to when receiving the current {@code char}
     88    * in the {@code from} state.
     89    * Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no
     90    * state transition for that character and no default state transition
     91    * for that state.
     92    *
     93    * <p>For ASCII characters, first look-up an explicit state transition for
     94    * the current character. If none is found, look-up a default transition. For
     95    * non-ASCII characters, look-up a default transition only.
     96    *
     97    * @param from the source state
     98    * @param currentChar the character received
     99    * @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE}
    100    */
    101   InternalState getNextState(InternalState from, int currentChar) {
    102     // TODO: Consider throwing run-time error here.
    103     if (from == null || currentChar < 0)
    104       return InternalState.INTERNAL_ERROR_STATE;
    105 
    106     int id = from.getId();
    107     if (id < 0 || id >= MAX_STATES) {
    108       return InternalState.INTERNAL_ERROR_STATE;
    109     }
    110 
    111     InternalState result = null;
    112     if (currentChar < MAX_CHARS) {
    113       result = stateTable[id][currentChar];
    114     }
    115     if (result == null) {
    116         result = defaultStateTable[from.getId()];
    117     }
    118     return result != null ? result : InternalState.INTERNAL_ERROR_STATE;
    119   }
    120 
    121   void setExpression(String expr, InternalState from, InternalState to) {
    122     if ((expr == null) || (from == null) || (to == null)) {
    123       return;
    124     }
    125 
    126     // This special string indicates a default state transition.
    127     if ("[:default:]".equals(expr)) {
    128       setDefaultDestination(from, to);
    129       return;
    130     }
    131     int i = 0;
    132     while (i < expr.length()) {
    133       // If next char is a '-' which is not the last character of the expr
    134       if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') {
    135         setRange(from, expr.charAt(i), expr.charAt(i + 2), to);
    136         i += 2;
    137       } else {
    138         setDestination(from, expr.charAt(i), to);
    139         i++;
    140       }
    141     }
    142   }
    143 
    144   private void fill(InternalState from, InternalState to) {
    145     char c;
    146     for (c = 0; c < MAX_CHARS; c++) {
    147       setDestination(from, c, to);
    148     }
    149   }
    150 
    151   private void setDefaultDestination(InternalState from, InternalState to) {
    152     Preconditions.checkNotNull(from);   // Developer error if it triggers
    153     Preconditions.checkNotNull(to);     // Developer error if it triggers
    154     int id = from.getId();
    155     if ((id < 0) || (id >= MAX_STATES)) {
    156       return;
    157     }
    158     // TODO: Consider asserting if there was a state transition defined.
    159     defaultStateTable[from.getId()] = to;
    160   }
    161 
    162   private void setDestination(InternalState from, char chr, InternalState to) {
    163     Preconditions.checkNotNull(from);   // Developer error if it triggers
    164     Preconditions.checkNotNull(to);     // Developer error if it triggers
    165     Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS,
    166                                 "char must be in ASCII set: %c", chr);
    167     int id = from.getId();
    168     if ((id < 0) || (id >= MAX_STATES)) {
    169       return;
    170     }
    171     stateTable[from.getId()][chr] = to;
    172   }
    173 
    174   private void setRange(InternalState from, char start, char end,
    175                         InternalState to) {
    176     // Developer error if either trigger.
    177     Preconditions.checkArgument(start >= 0 && start < MAX_CHARS,
    178                                 "char must be in ASCII set: %c", start);
    179     Preconditions.checkArgument(end >= 0 && end < MAX_CHARS,
    180                                 "char must be in ASCII set: %c", end);
    181     char c;
    182     for (c = start; c <= end; c++) {
    183       setDestination(from, c, to);
    184     }
    185   }
    186 }
    187