1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2013 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #ifndef InputStreamPreprocessor_h 29 #define InputStreamPreprocessor_h 30 31 #include "platform/text/SegmentedString.h" 32 #include "wtf/Noncopyable.h" 33 34 namespace WebCore { 35 36 const LChar kEndOfFileMarker = 0; 37 38 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 39 template <typename Tokenizer> 40 class InputStreamPreprocessor { 41 WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor); 42 public: 43 InputStreamPreprocessor(Tokenizer* tokenizer) 44 : m_tokenizer(tokenizer) 45 { 46 reset(); 47 } 48 49 ALWAYS_INLINE UChar nextInputCharacter() const { return m_nextInputCharacter; } 50 51 // Returns whether we succeeded in peeking at the next character. 52 // The only way we can fail to peek is if there are no more 53 // characters in |source| (after collapsing \r\n, etc). 54 ALWAYS_INLINE bool peek(SegmentedString& source) 55 { 56 m_nextInputCharacter = source.currentChar(); 57 58 // Every branch in this function is expensive, so we have a 59 // fast-reject branch for characters that don't require special 60 // handling. Please run the parser benchmark whenever you touch 61 // this function. It's very hot. 62 static const UChar specialCharacterMask = '\n' | '\r' | '\0'; 63 if (m_nextInputCharacter & ~specialCharacterMask) { 64 m_skipNextNewLine = false; 65 return true; 66 } 67 return processNextInputCharacter(source); 68 } 69 70 // Returns whether there are more characters in |source| after advancing. 71 ALWAYS_INLINE bool advance(SegmentedString& source) 72 { 73 source.advanceAndUpdateLineNumber(); 74 if (source.isEmpty()) 75 return false; 76 return peek(source); 77 } 78 79 bool skipNextNewLine() const { return m_skipNextNewLine; } 80 81 void reset(bool skipNextNewLine = false) 82 { 83 m_nextInputCharacter = '\0'; 84 m_skipNextNewLine = skipNextNewLine; 85 } 86 87 private: 88 bool processNextInputCharacter(SegmentedString& source) 89 { 90 ProcessAgain: 91 ASSERT(m_nextInputCharacter == source.currentChar()); 92 93 if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { 94 m_skipNextNewLine = false; 95 source.advancePastNewlineAndUpdateLineNumber(); 96 if (source.isEmpty()) 97 return false; 98 m_nextInputCharacter = source.currentChar(); 99 } 100 if (m_nextInputCharacter == '\r') { 101 m_nextInputCharacter = '\n'; 102 m_skipNextNewLine = true; 103 } else { 104 m_skipNextNewLine = false; 105 // FIXME: The spec indicates that the surrogate pair range as well as 106 // a number of specific character values are parse errors and should be replaced 107 // by the replacement character. We suspect this is a problem with the spec as doing 108 // that filtering breaks surrogate pair handling and causes us not to match Minefield. 109 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) { 110 if (m_tokenizer->shouldSkipNullCharacters()) { 111 source.advancePastNonNewline(); 112 if (source.isEmpty()) 113 return false; 114 m_nextInputCharacter = source.currentChar(); 115 goto ProcessAgain; 116 } 117 m_nextInputCharacter = 0xFFFD; 118 } 119 } 120 return true; 121 } 122 123 bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const 124 { 125 return source.isClosed() && source.length() == 1; 126 } 127 128 Tokenizer* m_tokenizer; 129 130 // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character 131 UChar m_nextInputCharacter; 132 bool m_skipNextNewLine; 133 }; 134 135 } 136 137 #endif // InputStreamPreprocessor_h 138 139