1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 33 #include "core/html/track/vtt/VTTTokenizer.h" 34 35 #include "core/xml/parser/MarkupTokenizerInlines.h" 36 #include "wtf/text/StringBuilder.h" 37 #include "wtf/unicode/CharacterNames.h" 38 39 namespace WebCore { 40 41 #define WEBVTT_BEGIN_STATE(stateName) case stateName: stateName: 42 #define WEBVTT_ADVANCE_TO(stateName) \ 43 do { \ 44 state = stateName; \ 45 ASSERT(!m_input.isEmpty()); \ 46 m_inputStreamPreprocessor.advance(m_input); \ 47 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 48 goto stateName; \ 49 } while (false) 50 51 template<unsigned charactersCount> 52 ALWAYS_INLINE bool equalLiteral(const StringBuilder& s, const char (&characters)[charactersCount]) 53 { 54 return WTF::equal(s, reinterpret_cast<const LChar*>(characters), charactersCount - 1); 55 } 56 57 static void addNewClass(StringBuilder& classes, const StringBuilder& newClass) 58 { 59 if (!classes.isEmpty()) 60 classes.append(' '); 61 classes.append(newClass); 62 } 63 64 inline bool emitToken(VTTToken& resultToken, const VTTToken& token) 65 { 66 resultToken = token; 67 return true; 68 } 69 70 inline bool advanceAndEmitToken(SegmentedString& source, VTTToken& resultToken, const VTTToken& token) 71 { 72 source.advanceAndUpdateLineNumber(); 73 return emitToken(resultToken, token); 74 } 75 76 VTTTokenizer::VTTTokenizer(const String& input) 77 : m_input(input) 78 , m_inputStreamPreprocessor(this) 79 { 80 // Append a EOF marker and close the input "stream". 81 ASSERT(!m_input.isClosed()); 82 m_input.append(SegmentedString(String(&kEndOfFileMarker, 1))); 83 m_input.close(); 84 } 85 86 bool VTTTokenizer::nextToken(VTTToken& token) 87 { 88 if (m_input.isEmpty() || !m_inputStreamPreprocessor.peek(m_input)) 89 return false; 90 91 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 92 if (cc == kEndOfFileMarker) { 93 m_inputStreamPreprocessor.advance(m_input); 94 return false; 95 } 96 97 StringBuilder buffer; 98 StringBuilder result; 99 StringBuilder classes; 100 enum { 101 DataState, 102 EscapeState, 103 TagState, 104 StartTagState, 105 StartTagClassState, 106 StartTagAnnotationState, 107 EndTagState, 108 TimestampTagState, 109 } state = DataState; 110 111 // 4.8.10.13.4 WebVTT cue text tokenizer 112 switch (state) { 113 WEBVTT_BEGIN_STATE(DataState) { 114 if (cc == '&') { 115 buffer.append(static_cast<LChar>(cc)); 116 WEBVTT_ADVANCE_TO(EscapeState); 117 } else if (cc == '<') { 118 if (result.isEmpty()) { 119 WEBVTT_ADVANCE_TO(TagState); 120 } else { 121 // We don't want to advance input or perform a state transition - just return a (new) token. 122 // (On the next call to nextToken we will see '<' again, but take the other branch in this if instead.) 123 return emitToken(token, VTTToken::StringToken(result.toString())); 124 } 125 } else if (cc == kEndOfFileMarker) { 126 return advanceAndEmitToken(m_input, token, VTTToken::StringToken(result.toString())); 127 } else { 128 result.append(cc); 129 WEBVTT_ADVANCE_TO(DataState); 130 } 131 } 132 END_STATE() 133 134 WEBVTT_BEGIN_STATE(EscapeState) { 135 if (cc == ';') { 136 if (equalLiteral(buffer, "&")) { 137 result.append('&'); 138 } else if (equalLiteral(buffer, "<")) { 139 result.append('<'); 140 } else if (equalLiteral(buffer, ">")) { 141 result.append('>'); 142 } else if (equalLiteral(buffer, "&lrm")) { 143 result.append(leftToRightMark); 144 } else if (equalLiteral(buffer, "&rlm")) { 145 result.append(rightToLeftMark); 146 } else if (equalLiteral(buffer, " ")) { 147 result.append(noBreakSpace); 148 } else { 149 buffer.append(static_cast<LChar>(cc)); 150 result.append(buffer); 151 } 152 buffer.clear(); 153 WEBVTT_ADVANCE_TO(DataState); 154 } else if (isASCIIAlphanumeric(cc)) { 155 buffer.append(static_cast<LChar>(cc)); 156 WEBVTT_ADVANCE_TO(EscapeState); 157 } else if (cc == '<') { 158 result.append(buffer); 159 return emitToken(token, VTTToken::StringToken(result.toString())); 160 } else if (cc == kEndOfFileMarker) { 161 result.append(buffer); 162 return advanceAndEmitToken(m_input, token, VTTToken::StringToken(result.toString())); 163 } else { 164 result.append(buffer); 165 buffer.clear(); 166 167 if (cc == '&') { 168 buffer.append(static_cast<LChar>(cc)); 169 WEBVTT_ADVANCE_TO(EscapeState); 170 } 171 result.append(cc); 172 WEBVTT_ADVANCE_TO(DataState); 173 } 174 } 175 END_STATE() 176 177 WEBVTT_BEGIN_STATE(TagState) { 178 if (isTokenizerWhitespace(cc)) { 179 ASSERT(result.isEmpty()); 180 WEBVTT_ADVANCE_TO(StartTagAnnotationState); 181 } else if (cc == '.') { 182 ASSERT(result.isEmpty()); 183 WEBVTT_ADVANCE_TO(StartTagClassState); 184 } else if (cc == '/') { 185 WEBVTT_ADVANCE_TO(EndTagState); 186 } else if (WTF::isASCIIDigit(cc)) { 187 result.append(cc); 188 WEBVTT_ADVANCE_TO(TimestampTagState); 189 } else if (cc == '>' || cc == kEndOfFileMarker) { 190 ASSERT(result.isEmpty()); 191 return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString())); 192 } else { 193 result.append(cc); 194 WEBVTT_ADVANCE_TO(StartTagState); 195 } 196 } 197 END_STATE() 198 199 WEBVTT_BEGIN_STATE(StartTagState) { 200 if (isTokenizerWhitespace(cc)) { 201 WEBVTT_ADVANCE_TO(StartTagAnnotationState); 202 } else if (cc == '.') { 203 WEBVTT_ADVANCE_TO(StartTagClassState); 204 } else if (cc == '>' || cc == kEndOfFileMarker) { 205 return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString())); 206 } else { 207 result.append(cc); 208 WEBVTT_ADVANCE_TO(StartTagState); 209 } 210 } 211 END_STATE() 212 213 WEBVTT_BEGIN_STATE(StartTagClassState) { 214 if (isTokenizerWhitespace(cc)) { 215 addNewClass(classes, buffer); 216 buffer.clear(); 217 WEBVTT_ADVANCE_TO(StartTagAnnotationState); 218 } else if (cc == '.') { 219 addNewClass(classes, buffer); 220 buffer.clear(); 221 WEBVTT_ADVANCE_TO(StartTagClassState); 222 } else if (cc == '>' || cc == kEndOfFileMarker) { 223 addNewClass(classes, buffer); 224 buffer.clear(); 225 return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString(), classes.toAtomicString())); 226 } else { 227 buffer.append(cc); 228 WEBVTT_ADVANCE_TO(StartTagClassState); 229 } 230 } 231 END_STATE() 232 233 WEBVTT_BEGIN_STATE(StartTagAnnotationState) { 234 if (cc == '>' || cc == kEndOfFileMarker) { 235 return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString(), classes.toAtomicString(), buffer.toAtomicString())); 236 } 237 buffer.append(cc); 238 WEBVTT_ADVANCE_TO(StartTagAnnotationState); 239 } 240 END_STATE() 241 242 WEBVTT_BEGIN_STATE(EndTagState) { 243 if (cc == '>' || cc == kEndOfFileMarker) 244 return advanceAndEmitToken(m_input, token, VTTToken::EndTag(result.toString())); 245 result.append(cc); 246 WEBVTT_ADVANCE_TO(EndTagState); 247 } 248 END_STATE() 249 250 WEBVTT_BEGIN_STATE(TimestampTagState) { 251 if (cc == '>' || cc == kEndOfFileMarker) 252 return advanceAndEmitToken(m_input, token, VTTToken::TimestampTag(result.toString())); 253 result.append(cc); 254 WEBVTT_ADVANCE_TO(TimestampTagState); 255 } 256 END_STATE() 257 258 } 259 260 ASSERT_NOT_REACHED(); 261 return false; 262 } 263 264 } 265 266