Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
      4  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #ifndef InputStreamPreprocessor_h
     29 #define InputStreamPreprocessor_h
     30 
     31 #include "platform/text/SegmentedString.h"
     32 #include "wtf/Noncopyable.h"
     33 
     34 namespace blink {
     35 
// Character value (0) named as the end-of-file marker. It is not referenced
// anywhere else in this header.
// NOTE(review): presumably appended to the input when the stream ends, which
// is why InputStreamPreprocessor special-cases a trailing '\0' - confirm
// against the tokenizer callers.
const LChar kEndOfFileMarker = 0;
     37 
     38 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
     39 template <typename Tokenizer>
     40 class InputStreamPreprocessor {
     41     WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
     42 public:
     43     InputStreamPreprocessor(Tokenizer* tokenizer)
     44         : m_tokenizer(tokenizer)
     45     {
     46         reset();
     47     }
     48 
     49     ALWAYS_INLINE UChar nextInputCharacter() const { return m_nextInputCharacter; }
     50 
     51     // Returns whether we succeeded in peeking at the next character.
     52     // The only way we can fail to peek is if there are no more
     53     // characters in |source| (after collapsing \r\n, etc).
     54     ALWAYS_INLINE bool peek(SegmentedString& source)
     55     {
     56         m_nextInputCharacter = source.currentChar();
     57 
     58         // Every branch in this function is expensive, so we have a
     59         // fast-reject branch for characters that don't require special
     60         // handling. Please run the parser benchmark whenever you touch
     61         // this function. It's very hot.
     62         static const UChar specialCharacterMask = '\n' | '\r' | '\0';
     63         if (m_nextInputCharacter & ~specialCharacterMask) {
     64             m_skipNextNewLine = false;
     65             return true;
     66         }
     67         return processNextInputCharacter(source);
     68     }
     69 
     70     // Returns whether there are more characters in |source| after advancing.
     71     ALWAYS_INLINE bool advance(SegmentedString& source)
     72     {
     73         source.advanceAndUpdateLineNumber();
     74         if (source.isEmpty())
     75             return false;
     76         return peek(source);
     77     }
     78 
     79     bool skipNextNewLine() const { return m_skipNextNewLine; }
     80 
     81     void reset(bool skipNextNewLine = false)
     82     {
     83         m_nextInputCharacter = '\0';
     84         m_skipNextNewLine = skipNextNewLine;
     85     }
     86 
     87 private:
     88     bool processNextInputCharacter(SegmentedString& source)
     89     {
     90     ProcessAgain:
     91         ASSERT(m_nextInputCharacter == source.currentChar());
     92 
     93         if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
     94             m_skipNextNewLine = false;
     95             source.advancePastNewlineAndUpdateLineNumber();
     96             if (source.isEmpty())
     97                 return false;
     98             m_nextInputCharacter = source.currentChar();
     99         }
    100         if (m_nextInputCharacter == '\r') {
    101             m_nextInputCharacter = '\n';
    102             m_skipNextNewLine = true;
    103         } else {
    104             m_skipNextNewLine = false;
    105             // FIXME: The spec indicates that the surrogate pair range as well as
    106             // a number of specific character values are parse errors and should be replaced
    107             // by the replacement character. We suspect this is a problem with the spec as doing
    108             // that filtering breaks surrogate pair handling and causes us not to match Minefield.
    109             if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
    110                 if (m_tokenizer->shouldSkipNullCharacters()) {
    111                     source.advancePastNonNewline();
    112                     if (source.isEmpty())
    113                         return false;
    114                     m_nextInputCharacter = source.currentChar();
    115                     goto ProcessAgain;
    116                 }
    117                 m_nextInputCharacter = 0xFFFD;
    118             }
    119         }
    120         return true;
    121     }
    122 
    123     bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
    124     {
    125         return source.isClosed() && source.length() == 1;
    126     }
    127 
    128     Tokenizer* m_tokenizer;
    129 
    130     // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
    131     UChar m_nextInputCharacter;
    132     bool m_skipNextNewLine;
    133 };
    134 
    135 }
    136 
    137 #endif // InputStreamPreprocessor_h
    138 
    139