1 /* 2 * Copyright (C) 2013 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef HTMLToken_h 27 #define HTMLToken_h 28 29 #include "core/dom/Attribute.h" 30 #include "wtf/PassOwnPtr.h" 31 #include "wtf/RefCounted.h" 32 #include "wtf/RefPtr.h" 33 34 namespace blink { 35 36 class DoctypeData { 37 WTF_MAKE_NONCOPYABLE(DoctypeData); 38 public: 39 DoctypeData() 40 : m_hasPublicIdentifier(false) 41 , m_hasSystemIdentifier(false) 42 , m_forceQuirks(false) 43 { 44 } 45 46 bool m_hasPublicIdentifier; 47 bool m_hasSystemIdentifier; 48 WTF::Vector<UChar> m_publicIdentifier; 49 WTF::Vector<UChar> m_systemIdentifier; 50 bool m_forceQuirks; 51 }; 52 53 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name) 54 { 55 for (unsigned i = 0; i < attributes.size(); ++i) { 56 if (attributes.at(i).name().matches(name)) 57 return &attributes.at(i); 58 } 59 return 0; 60 } 61 62 class HTMLToken { 63 WTF_MAKE_NONCOPYABLE(HTMLToken); 64 WTF_MAKE_FAST_ALLOCATED; 65 public: 66 enum Type { 67 Uninitialized, 68 DOCTYPE, 69 StartTag, 70 EndTag, 71 Comment, 72 Character, 73 EndOfFile, 74 }; 75 76 class Attribute { 77 public: 78 class Range { 79 public: 80 int start; 81 int end; 82 }; 83 84 Range nameRange; 85 Range valueRange; 86 Vector<UChar, 32> name; 87 Vector<UChar, 32> value; 88 }; 89 90 typedef Vector<Attribute, 10> AttributeList; 91 92 // By using an inline capacity of 256, we avoid spilling over into an malloced buffer 93 // approximately 99% of the time based on a non-scientific browse around a number of 94 // popular web sites on 23 May 2013. 95 typedef Vector<UChar, 256> DataVector; 96 97 HTMLToken() { clear(); } 98 99 void clear() 100 { 101 m_type = Uninitialized; 102 m_range.start = 0; 103 m_range.end = 0; 104 m_baseOffset = 0; 105 // Don't call Vector::clear() as that would destroy the 106 // alloced VectorBuffer. If the innerHTML'd content has 107 // two 257 character text nodes in a row, we'll needlessly 108 // thrash malloc. When we finally finish the parse the 109 // HTMLToken will be destroyed and the VectorBuffer released. 110 m_data.shrink(0); 111 m_orAllData = 0; 112 } 113 114 bool isUninitialized() { return m_type == Uninitialized; } 115 Type type() const { return m_type; } 116 117 void makeEndOfFile() 118 { 119 ASSERT(m_type == Uninitialized); 120 m_type = EndOfFile; 121 } 122 123 /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */ 124 int startIndex() const { return m_range.start; } 125 int endIndex() const { return m_range.end; } 126 127 void setBaseOffset(int offset) 128 { 129 m_baseOffset = offset; 130 } 131 132 void end(int endOffset) 133 { 134 m_range.end = endOffset - m_baseOffset; 135 } 136 137 const DataVector& data() const 138 { 139 ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag); 140 return m_data; 141 } 142 143 bool isAll8BitData() const 144 { 145 return (m_orAllData <= 0xff); 146 } 147 148 const DataVector& name() const 149 { 150 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 151 return m_data; 152 } 153 154 void appendToName(UChar character) 155 { 156 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 157 ASSERT(character); 158 m_data.append(character); 159 m_orAllData |= character; 160 } 161 162 /* DOCTYPE Tokens */ 163 164 bool forceQuirks() const 165 { 166 ASSERT(m_type == DOCTYPE); 167 return m_doctypeData->m_forceQuirks; 168 } 169 170 void setForceQuirks() 171 { 172 ASSERT(m_type == DOCTYPE); 173 m_doctypeData->m_forceQuirks = true; 174 } 175 176 void beginDOCTYPE() 177 { 178 ASSERT(m_type == Uninitialized); 179 m_type = DOCTYPE; 180 m_doctypeData = adoptPtr(new DoctypeData); 181 } 182 183 void beginDOCTYPE(UChar character) 184 { 185 ASSERT(character); 186 beginDOCTYPE(); 187 m_data.append(character); 188 m_orAllData |= character; 189 } 190 191 // FIXME: Distinguish between a missing public identifer and an empty one. 192 const WTF::Vector<UChar>& publicIdentifier() const 193 { 194 ASSERT(m_type == DOCTYPE); 195 return m_doctypeData->m_publicIdentifier; 196 } 197 198 // FIXME: Distinguish between a missing system identifer and an empty one. 199 const WTF::Vector<UChar>& systemIdentifier() const 200 { 201 ASSERT(m_type == DOCTYPE); 202 return m_doctypeData->m_systemIdentifier; 203 } 204 205 void setPublicIdentifierToEmptyString() 206 { 207 ASSERT(m_type == DOCTYPE); 208 m_doctypeData->m_hasPublicIdentifier = true; 209 m_doctypeData->m_publicIdentifier.clear(); 210 } 211 212 void setSystemIdentifierToEmptyString() 213 { 214 ASSERT(m_type == DOCTYPE); 215 m_doctypeData->m_hasSystemIdentifier = true; 216 m_doctypeData->m_systemIdentifier.clear(); 217 } 218 219 void appendToPublicIdentifier(UChar character) 220 { 221 ASSERT(character); 222 ASSERT(m_type == DOCTYPE); 223 ASSERT(m_doctypeData->m_hasPublicIdentifier); 224 m_doctypeData->m_publicIdentifier.append(character); 225 } 226 227 void appendToSystemIdentifier(UChar character) 228 { 229 ASSERT(character); 230 ASSERT(m_type == DOCTYPE); 231 ASSERT(m_doctypeData->m_hasSystemIdentifier); 232 m_doctypeData->m_systemIdentifier.append(character); 233 } 234 235 PassOwnPtr<DoctypeData> releaseDoctypeData() 236 { 237 return m_doctypeData.release(); 238 } 239 240 /* Start/End Tag Tokens */ 241 242 bool selfClosing() const 243 { 244 ASSERT(m_type == StartTag || m_type == EndTag); 245 return m_selfClosing; 246 } 247 248 void setSelfClosing() 249 { 250 ASSERT(m_type == StartTag || m_type == EndTag); 251 m_selfClosing = true; 252 } 253 254 void beginStartTag(UChar character) 255 { 256 ASSERT(character); 257 ASSERT(m_type == Uninitialized); 258 m_type = StartTag; 259 m_selfClosing = false; 260 m_currentAttribute = 0; 261 m_attributes.clear(); 262 263 m_data.append(character); 264 m_orAllData |= character; 265 } 266 267 void beginEndTag(LChar character) 268 { 269 ASSERT(m_type == Uninitialized); 270 m_type = EndTag; 271 m_selfClosing = false; 272 m_currentAttribute = 0; 273 m_attributes.clear(); 274 275 m_data.append(character); 276 } 277 278 void beginEndTag(const Vector<LChar, 32>& characters) 279 { 280 ASSERT(m_type == Uninitialized); 281 m_type = EndTag; 282 m_selfClosing = false; 283 m_currentAttribute = 0; 284 m_attributes.clear(); 285 286 m_data.appendVector(characters); 287 } 288 289 void addNewAttribute() 290 { 291 ASSERT(m_type == StartTag || m_type == EndTag); 292 m_attributes.grow(m_attributes.size() + 1); 293 m_currentAttribute = &m_attributes.last(); 294 #if ENABLE(ASSERT) 295 m_currentAttribute->nameRange.start = 0; 296 m_currentAttribute->nameRange.end = 0; 297 m_currentAttribute->valueRange.start = 0; 298 m_currentAttribute->valueRange.end = 0; 299 #endif 300 } 301 302 void beginAttributeName(int offset) 303 { 304 m_currentAttribute->nameRange.start = offset - m_baseOffset; 305 } 306 307 void endAttributeName(int offset) 308 { 309 int index = offset - m_baseOffset; 310 m_currentAttribute->nameRange.end = index; 311 m_currentAttribute->valueRange.start = index; 312 m_currentAttribute->valueRange.end = index; 313 } 314 315 void beginAttributeValue(int offset) 316 { 317 m_currentAttribute->valueRange.start = offset - m_baseOffset; 318 #if ENABLE(ASSERT) 319 m_currentAttribute->valueRange.end = 0; 320 #endif 321 } 322 323 void endAttributeValue(int offset) 324 { 325 m_currentAttribute->valueRange.end = offset - m_baseOffset; 326 } 327 328 void appendToAttributeName(UChar character) 329 { 330 ASSERT(character); 331 ASSERT(m_type == StartTag || m_type == EndTag); 332 ASSERT(m_currentAttribute->nameRange.start); 333 m_currentAttribute->name.append(character); 334 } 335 336 void appendToAttributeValue(UChar character) 337 { 338 ASSERT(character); 339 ASSERT(m_type == StartTag || m_type == EndTag); 340 ASSERT(m_currentAttribute->valueRange.start); 341 m_currentAttribute->value.append(character); 342 } 343 344 void appendToAttributeValue(size_t i, const String& value) 345 { 346 ASSERT(!value.isEmpty()); 347 ASSERT(m_type == StartTag || m_type == EndTag); 348 append(m_attributes[i].value, value); 349 } 350 351 const AttributeList& attributes() const 352 { 353 ASSERT(m_type == StartTag || m_type == EndTag); 354 return m_attributes; 355 } 356 357 const Attribute* getAttributeItem(const QualifiedName& name) const 358 { 359 for (unsigned i = 0; i < m_attributes.size(); ++i) { 360 if (AtomicString(m_attributes.at(i).name) == name.localName()) 361 return &m_attributes.at(i); 362 } 363 return 0; 364 } 365 366 // Used by the XSSAuditor to nuke XSS-laden attributes. 367 void eraseValueOfAttribute(size_t i) 368 { 369 ASSERT(m_type == StartTag || m_type == EndTag); 370 m_attributes[i].value.clear(); 371 } 372 373 /* Character Tokens */ 374 375 // Starting a character token works slightly differently than starting 376 // other types of tokens because we want to save a per-character branch. 377 void ensureIsCharacterToken() 378 { 379 ASSERT(m_type == Uninitialized || m_type == Character); 380 m_type = Character; 381 } 382 383 const DataVector& characters() const 384 { 385 ASSERT(m_type == Character); 386 return m_data; 387 } 388 389 void appendToCharacter(char character) 390 { 391 ASSERT(m_type == Character); 392 m_data.append(character); 393 } 394 395 void appendToCharacter(UChar character) 396 { 397 ASSERT(m_type == Character); 398 m_data.append(character); 399 m_orAllData |= character; 400 } 401 402 void appendToCharacter(const Vector<LChar, 32>& characters) 403 { 404 ASSERT(m_type == Character); 405 m_data.appendVector(characters); 406 } 407 408 /* Comment Tokens */ 409 410 const DataVector& comment() const 411 { 412 ASSERT(m_type == Comment); 413 return m_data; 414 } 415 416 void beginComment() 417 { 418 ASSERT(m_type == Uninitialized); 419 m_type = Comment; 420 } 421 422 void appendToComment(UChar character) 423 { 424 ASSERT(character); 425 ASSERT(m_type == Comment); 426 m_data.append(character); 427 m_orAllData |= character; 428 } 429 430 // Only for XSSAuditor 431 void eraseCharacters() 432 { 433 ASSERT(m_type == Character); 434 m_data.clear(); 435 m_orAllData = 0; 436 } 437 438 private: 439 Type m_type; 440 Attribute::Range m_range; // Always starts at zero. 441 int m_baseOffset; 442 DataVector m_data; 443 UChar m_orAllData; 444 445 // For StartTag and EndTag 446 bool m_selfClosing; 447 AttributeList m_attributes; 448 449 // A pointer into m_attributes used during lexing. 450 Attribute* m_currentAttribute; 451 452 // For DOCTYPE 453 OwnPtr<DoctypeData> m_doctypeData; 454 }; 455 456 } 457 458 #endif 459