Home | History | Annotate | Download | only in impl
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser.impl;
     18 
     19 import com.google.common.collect.Maps;
     20 import com.google.streamhtmlparser.ExternalState;
     21 import com.google.streamhtmlparser.JavascriptParser;
     22 import com.google.streamhtmlparser.util.HtmlUtils;
     23 import com.google.streamhtmlparser.util.JavascriptTokenBuffer;
     24 
     25 import java.util.Map;
     26 
     27 /**
     28  * <p>Many comments copied almost verbatim from the original C version.
     29  */
     30 public class JavascriptParserImpl extends GenericParser
     31     implements JavascriptParser {
     32 
     33   final static InternalState JS_TEXT;
     34   final static InternalState JS_Q;
     35   final static InternalState JS_Q_E;
     36   final static InternalState JS_DQ;
     37   final static InternalState JS_DQ_E;
     38   final static InternalState JS_SLASH;
     39   final static InternalState JS_REGEXP_SLASH;
     40   final static InternalState JS_REGEXP;
     41   final static InternalState JS_REGEXP_BRK;
     42   final static InternalState JS_REGEXP_BRK_E;
     43   final static InternalState JS_REGEXP_E;
     44   final static InternalState JS_COM_LN;
     45   final static InternalState JS_COM_ML;
     46   final static InternalState JS_COM_ML_CLOSE;
     47   final static InternalState JS_COM_AFTER;
     48 
     49   static {
     50     JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT");
     51     JS_Q  = InternalState.getInstanceJavascript("JS_Q");
     52     JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E");
     53     JS_DQ = InternalState.getInstanceJavascript("JS_DQ");
     54     JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E");
     55     JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH");
     56     JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP");
     57     JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH");
     58     JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E");
     59     JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK");
     60     JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E");
     61     JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN");
     62     JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML");
     63     JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE");
     64     JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER");
     65   }
     66 
     67   private static final Map<InternalState, ExternalState> STATE_MAPPING =
     68       Maps.newHashMap();
     69   static {
     70     initializeStateMapping();
     71   }
     72 
     73   private static final ParserStateTable STATE_TABLE = new ParserStateTable();
     74   static {
     75     initializeParserStateTable();
     76   }
     77 
     78   private final JavascriptTokenBuffer ccBuffer;
     79 
     80   /**
     81    * Creates a {@code JavascriptParserImpl} object.
     82    */
     83   public JavascriptParserImpl() {
     84     super(STATE_TABLE, STATE_MAPPING, JS_TEXT);
     85     ccBuffer = new JavascriptTokenBuffer();
     86   }
     87 
     88   /**
     89    * Creates a {@code JavascriptParserImpl} object that is a copy
     90    * of the one provided.
     91    *
     92    * @param aJavascriptParserImpl the {@code JavascriptParserImpl} to copy
     93    */
     94   public JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl) {
     95     super(aJavascriptParserImpl);
     96     ccBuffer = new JavascriptTokenBuffer(aJavascriptParserImpl.ccBuffer);
     97   }
     98 
     99   @Override
    100   public void reset() {
    101     super.reset();
    102     currentState = JS_TEXT;
    103   }
    104 
    105   @Override
    106   protected InternalState handleEnterState(InternalState currentState,
    107                                            InternalState expectedNextState,
    108                                            char input) {
    109     InternalState nextState = expectedNextState;
    110     if (currentState == JS_SLASH) {
    111       nextState = enterStateJsSlash(currentState, input);
    112     } else if (currentState == JS_COM_AFTER) {
    113       enterStateJsCommentAfter();
    114     }
    115     return nextState;
    116   }
    117 
    118   @Override
    119   protected InternalState handleExitState(InternalState currentState,
    120                                           InternalState expectedNextState,
    121                                           char input) {
    122     // Nothing to do - no handlers for exit states
    123     return expectedNextState;
    124   }
    125 
    126   @Override
    127   protected InternalState handleInState(InternalState currentState,
    128                                         char input) {
    129     if (currentState == JS_TEXT) {
    130       inStateJsText(input);
    131     }
    132     return currentState;
    133   }
    134 
    135   /**
    136    * Called every time we find a slash ('/') character in the javascript
    137    * text (except for slashes that close comments or regexp literals).
    138    *
    139    * <p>Comment copied verbatim from the corresponding C-version.
    140    *
    141    * <p>Implements the logic to figure out if this slash character is a
    142    * division operator or if it opens a regular expression literal.
    143    * This is heavily inspired by the syntactic resynchronization
    144    * for javascript 2.0:
    145    *
    146    * <p>When we receive a '/', we look at the previous non space character
    147    * to figure out if it's the ending of a punctuator that can precede a
    148    * regexp literal, in which case we assume the current '/' is part of a
    149    * regular expression literal (or the opening of a javascript comment,
    150    * but that part is dealt with in the state machine). The exceptions to
    151    * this are unary operators, so we look back a second character to rule
    152    * out '++' and '--'.
    153    *
    154    * <p> Although it is not straightforward to figure out if the binary
    155    * operator is a postfix of the previous expression or a prefix of the
    156    * regular expression, we rule out the later as it is an uncommon practice.
    157    *
    158    * <p>If we ruled out the previous token to be a valid regexp preceding
    159    * punctuator, we extract the last identifier in the buffer and match
    160    * against a list of keywords that are known to precede expressions in
    161    * the grammar. If we get a match on any of these keywords, then we are
    162    * opening a regular expression, if not, then we have a division operator.
    163    *
    164    * <p>Known cases that are accepted by the grammar but we handle
    165    * differently, although I (falmeida) don't believe there is a
    166    * legitimate usage for those:
    167    *   Division of a regular expression: var result = /test/ / 5;
    168    *   Prefix unary increment of a regular expression: var result = ++/test/;
    169    *   Division of an object literal: { a: 1 } /x/.exec('x');
    170    *
    171    * @param state being entered to
    172    * @param input character being processed
    173    * @return state next state to go to, may be the same as the one we
    174    *     were called with
    175    *
    176    * <a>http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html>
    177    * Syntactic Resynchronization</a>
    178    */
    179   private InternalState enterStateJsSlash(InternalState state, char input) {
    180 
    181     InternalState nextState = state;
    182     int position = -1;
    183 
    184     // Consume the last whitespace
    185     if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(position))) {
    186       --position;
    187     }
    188 
    189     switch (ccBuffer.getChar(position)) {
    190       // Ignore unary increment
    191       case '+':
    192         if (ccBuffer.getChar(position - 1) != '+') {
    193           nextState = JS_REGEXP_SLASH;
    194         }
    195         break;
    196       case '-':
    197         // Ignore unary decrement
    198         if (ccBuffer.getChar(position - 1) != '-') {
    199           nextState = JS_REGEXP_SLASH;
    200         }
    201         break;
    202         // List of punctuator endings except ), ], }, + and - *
    203       case '=':
    204       case '<':
    205       case '>':
    206       case '&':
    207       case '|':
    208       case '!':
    209       case '%':
    210       case '*':
    211       case '/':
    212       case ',':
    213       case ';':
    214       case '?':
    215       case ':':
    216       case '^':
    217       case '~':
    218       case '{':
    219       case '(':
    220       case '[':
    221       case '}':
    222       case '\0':
    223         nextState = JS_REGEXP_SLASH;
    224         break;
    225       default:
    226         String lastIdentifier = ccBuffer.getLastIdentifier();
    227         if (lastIdentifier != null && HtmlUtils
    228             .isJavascriptRegexpPrefix(lastIdentifier)) {
    229           nextState = JS_REGEXP_SLASH;
    230         }
    231     }
    232     ccBuffer.appendChar(input);
    233     return nextState;
    234   }
    235 
    236   /**
    237    * Called at the end of a javascript comment.
    238    *
    239    * <p>When we open a comment, the initial '/' was inserted into the ring
    240    * buffer, but it is not a token and should be considered whitespace
    241    * for parsing purposes.
    242    *
    243    * <p>When we first saw the '/' character, we didn't yet know if it was
    244    * the beginning of a comment, a division operator, or a regexp.
    245    *
    246    * <p>In this function we just replace the inital '/' with a whitespace
    247    * character, unless we had a preceding whitespace character, in which
    248    * case we just remove the '/'. This is needed to ensure all spaces in
    249    * the buffer are correctly folded.
    250    */
    251   private void enterStateJsCommentAfter() {
    252     if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2))) {
    253       ccBuffer.popChar();
    254     } else {
    255       ccBuffer.setChar(-1, ' ');
    256     }
    257   }
    258 
    259   private void inStateJsText(char input) {
    260     ccBuffer.appendChar(input);
    261   }
    262 
    263 // ======================================================= //
    264 // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
    265 // ======================================================= //
    266 
    267   private static void registerMapping(InternalState internalState,
    268                                       ExternalState externalState) {
    269     STATE_MAPPING.put(internalState, externalState);
    270   }
    271 
    272   private static void initializeStateMapping() {
    273     // Each parser implementation must map the error state appropriately.
    274     registerMapping(InternalState.INTERNAL_ERROR_STATE,
    275                     JavascriptParser.STATE_ERROR);
    276 
    277     registerMapping(JS_TEXT, JavascriptParser.STATE_TEXT);
    278     registerMapping(JS_Q, JavascriptParser.STATE_Q);
    279     registerMapping(JS_Q_E, JavascriptParser.STATE_Q);
    280     registerMapping(JS_DQ, JavascriptParser.STATE_DQ);
    281     registerMapping(JS_DQ_E, JavascriptParser.STATE_DQ);
    282     registerMapping(JS_SLASH, JavascriptParser.STATE_TEXT);
    283     registerMapping(JS_REGEXP_SLASH, JavascriptParser.STATE_TEXT);
    284     registerMapping(JS_REGEXP, JavascriptParser.STATE_REGEXP);
    285     registerMapping(JS_REGEXP_BRK,JavascriptParser.STATE_REGEXP);
    286     registerMapping(JS_REGEXP_BRK_E, JavascriptParser.STATE_REGEXP);
    287     registerMapping(JS_REGEXP_E,JavascriptParser.STATE_REGEXP);
    288     registerMapping(JS_COM_LN, JavascriptParser.STATE_COMMENT);
    289     registerMapping(JS_COM_ML, JavascriptParser.STATE_COMMENT);
    290     registerMapping(JS_COM_ML_CLOSE, JavascriptParser.STATE_COMMENT);
    291     registerMapping(JS_COM_AFTER, JavascriptParser.STATE_TEXT);
    292   }
    293 
    294   private static void registerTransition(String expression,
    295                                          InternalState source,
    296                                          InternalState to) {
    297     // It seems to silly to go through a StateTableTransition here
    298     // but it adds extra data checking.
    299     StateTableTransition stt = new StateTableTransition(expression,
    300                                                         source, to);
    301     STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
    302                               stt.getTo());
    303   }
    304 
    305   private static void initializeParserStateTable() {
    306     registerTransition("[:default:]", JS_COM_AFTER, JS_TEXT);
    307     registerTransition("/", JS_COM_AFTER, JS_SLASH);
    308     registerTransition("\"", JS_COM_AFTER, JS_DQ);
    309     registerTransition("\'", JS_COM_AFTER, JS_Q);
    310     registerTransition("[:default:]", JS_COM_ML_CLOSE, JS_COM_ML);
    311     registerTransition("/", JS_COM_ML_CLOSE,JS_COM_AFTER);
    312     registerTransition("[:default:]", JS_COM_ML, JS_COM_ML);
    313     registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE);
    314     registerTransition("[:default:]", JS_COM_LN,JS_COM_LN);
    315     registerTransition("\n", JS_COM_LN,JS_COM_AFTER);
    316     registerTransition("[:default:]", JS_REGEXP_E, JS_REGEXP);
    317     registerTransition("[:default:]", JS_REGEXP_BRK_E, JS_REGEXP_BRK);
    318     registerTransition("[:default:]", JS_REGEXP_BRK, JS_REGEXP_BRK);
    319     registerTransition("]", JS_REGEXP_BRK, JS_REGEXP);
    320     registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E);
    321     registerTransition("[:default:]", JS_REGEXP, JS_REGEXP);
    322     registerTransition("/", JS_REGEXP, JS_TEXT);
    323     registerTransition("[", JS_REGEXP, JS_REGEXP_BRK);
    324     registerTransition("\\", JS_REGEXP, JS_REGEXP_E);
    325     registerTransition("[:default:]", JS_REGEXP_SLASH, JS_REGEXP);
    326     registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK);
    327     registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E);
    328     registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML);
    329     registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN);
    330     registerTransition("[:default:]", JS_SLASH, JS_TEXT);
    331     registerTransition("*", JS_SLASH, JS_COM_ML);
    332     registerTransition("/", JS_SLASH, JS_COM_LN);
    333     registerTransition("[:default:]", JS_DQ_E,JS_DQ);
    334     registerTransition("[:default:]", JS_DQ,JS_DQ);
    335     registerTransition("\"", JS_DQ, JS_TEXT);
    336     registerTransition("\\", JS_DQ, JS_DQ_E);
    337     registerTransition("[:default:]", JS_Q_E,JS_Q);
    338     registerTransition("[:default:]", JS_Q,JS_Q);
    339     registerTransition("\'", JS_Q, JS_TEXT);
    340     registerTransition("\\", JS_Q, JS_Q_E);
    341     registerTransition("[:default:]", JS_TEXT, JS_TEXT);
    342     registerTransition("/", JS_TEXT, JS_SLASH);
    343     registerTransition("\"", JS_TEXT, JS_DQ);
    344     registerTransition("\'", JS_TEXT, JS_Q);
    345   }
    346 }