1 /* 2 * Copyright (C) 2013 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef HTMLToken_h 27 #define HTMLToken_h 28 29 #include "core/dom/Attribute.h" 30 #include "wtf/PassOwnPtr.h" 31 #include "wtf/RefCounted.h" 32 #include "wtf/RefPtr.h" 33 34 namespace WebCore { 35 36 class DoctypeData { 37 WTF_MAKE_NONCOPYABLE(DoctypeData); 38 public: 39 DoctypeData() 40 : m_hasPublicIdentifier(false) 41 , m_hasSystemIdentifier(false) 42 , m_forceQuirks(false) 43 { 44 } 45 46 // FIXME: This should use String instead of Vector<UChar>. 47 bool m_hasPublicIdentifier; 48 bool m_hasSystemIdentifier; 49 WTF::Vector<UChar> m_publicIdentifier; 50 WTF::Vector<UChar> m_systemIdentifier; 51 bool m_forceQuirks; 52 }; 53 54 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name) 55 { 56 for (unsigned i = 0; i < attributes.size(); ++i) { 57 if (attributes.at(i).name().matches(name)) 58 return &attributes.at(i); 59 } 60 return 0; 61 } 62 63 class HTMLToken { 64 WTF_MAKE_NONCOPYABLE(HTMLToken); 65 WTF_MAKE_FAST_ALLOCATED; 66 public: 67 enum Type { 68 Uninitialized, 69 DOCTYPE, 70 StartTag, 71 EndTag, 72 Comment, 73 Character, 74 EndOfFile, 75 }; 76 77 class Attribute { 78 public: 79 class Range { 80 public: 81 int start; 82 int end; 83 }; 84 85 Range nameRange; 86 Range valueRange; 87 Vector<UChar, 32> name; 88 Vector<UChar, 32> value; 89 }; 90 91 typedef Vector<Attribute, 10> AttributeList; 92 93 // By using an inline capacity of 256, we avoid spilling over into an malloced buffer 94 // approximately 99% of the time based on a non-scientific browse around a number of 95 // popular web sites on 23 May 2013. 96 typedef Vector<UChar, 256> DataVector; 97 98 HTMLToken() { clear(); } 99 100 void clear() 101 { 102 m_type = Uninitialized; 103 m_range.start = 0; 104 m_range.end = 0; 105 m_baseOffset = 0; 106 // Don't call Vector::clear() as that would destroy the 107 // alloced VectorBuffer. If the innerHTML'd content has 108 // two 257 character text nodes in a row, we'll needlessly 109 // thrash malloc. When we finally finish the parse the 110 // HTMLToken will be destroyed and the VectorBuffer released. 111 m_data.shrink(0); 112 m_orAllData = 0; 113 } 114 115 bool isUninitialized() { return m_type == Uninitialized; } 116 Type type() const { return m_type; } 117 118 void makeEndOfFile() 119 { 120 ASSERT(m_type == Uninitialized); 121 m_type = EndOfFile; 122 } 123 124 /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */ 125 int startIndex() const { return m_range.start; } 126 int endIndex() const { return m_range.end; } 127 128 void setBaseOffset(int offset) 129 { 130 m_baseOffset = offset; 131 } 132 133 void end(int endOffset) 134 { 135 m_range.end = endOffset - m_baseOffset; 136 } 137 138 const DataVector& data() const 139 { 140 ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag); 141 return m_data; 142 } 143 144 bool isAll8BitData() const 145 { 146 return (m_orAllData <= 0xff); 147 } 148 149 const DataVector& name() const 150 { 151 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 152 return m_data; 153 } 154 155 void appendToName(UChar character) 156 { 157 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 158 ASSERT(character); 159 m_data.append(character); 160 m_orAllData |= character; 161 } 162 163 /* DOCTYPE Tokens */ 164 165 bool forceQuirks() const 166 { 167 ASSERT(m_type == DOCTYPE); 168 return m_doctypeData->m_forceQuirks; 169 } 170 171 void setForceQuirks() 172 { 173 ASSERT(m_type == DOCTYPE); 174 m_doctypeData->m_forceQuirks = true; 175 } 176 177 void beginDOCTYPE() 178 { 179 ASSERT(m_type == Uninitialized); 180 m_type = DOCTYPE; 181 m_doctypeData = adoptPtr(new DoctypeData); 182 } 183 184 void beginDOCTYPE(UChar character) 185 { 186 ASSERT(character); 187 beginDOCTYPE(); 188 m_data.append(character); 189 m_orAllData |= character; 190 } 191 192 // FIXME: Distinguish between a missing public identifer and an empty one. 193 const WTF::Vector<UChar>& publicIdentifier() const 194 { 195 ASSERT(m_type == DOCTYPE); 196 return m_doctypeData->m_publicIdentifier; 197 } 198 199 // FIXME: Distinguish between a missing system identifer and an empty one. 200 const WTF::Vector<UChar>& systemIdentifier() const 201 { 202 ASSERT(m_type == DOCTYPE); 203 return m_doctypeData->m_systemIdentifier; 204 } 205 206 void setPublicIdentifierToEmptyString() 207 { 208 ASSERT(m_type == DOCTYPE); 209 m_doctypeData->m_hasPublicIdentifier = true; 210 m_doctypeData->m_publicIdentifier.clear(); 211 } 212 213 void setSystemIdentifierToEmptyString() 214 { 215 ASSERT(m_type == DOCTYPE); 216 m_doctypeData->m_hasSystemIdentifier = true; 217 m_doctypeData->m_systemIdentifier.clear(); 218 } 219 220 void appendToPublicIdentifier(UChar character) 221 { 222 ASSERT(character); 223 ASSERT(m_type == DOCTYPE); 224 ASSERT(m_doctypeData->m_hasPublicIdentifier); 225 m_doctypeData->m_publicIdentifier.append(character); 226 } 227 228 void appendToSystemIdentifier(UChar character) 229 { 230 ASSERT(character); 231 ASSERT(m_type == DOCTYPE); 232 ASSERT(m_doctypeData->m_hasSystemIdentifier); 233 m_doctypeData->m_systemIdentifier.append(character); 234 } 235 236 PassOwnPtr<DoctypeData> releaseDoctypeData() 237 { 238 return m_doctypeData.release(); 239 } 240 241 /* Start/End Tag Tokens */ 242 243 bool selfClosing() const 244 { 245 ASSERT(m_type == StartTag || m_type == EndTag); 246 return m_selfClosing; 247 } 248 249 void setSelfClosing() 250 { 251 ASSERT(m_type == StartTag || m_type == EndTag); 252 m_selfClosing = true; 253 } 254 255 void beginStartTag(UChar character) 256 { 257 ASSERT(character); 258 ASSERT(m_type == Uninitialized); 259 m_type = StartTag; 260 m_selfClosing = false; 261 m_currentAttribute = 0; 262 m_attributes.clear(); 263 264 m_data.append(character); 265 m_orAllData |= character; 266 } 267 268 void beginEndTag(LChar character) 269 { 270 ASSERT(m_type == Uninitialized); 271 m_type = EndTag; 272 m_selfClosing = false; 273 m_currentAttribute = 0; 274 m_attributes.clear(); 275 276 m_data.append(character); 277 } 278 279 void beginEndTag(const Vector<LChar, 32>& characters) 280 { 281 ASSERT(m_type == Uninitialized); 282 m_type = EndTag; 283 m_selfClosing = false; 284 m_currentAttribute = 0; 285 m_attributes.clear(); 286 287 m_data.appendVector(characters); 288 } 289 290 void addNewAttribute() 291 { 292 ASSERT(m_type == StartTag || m_type == EndTag); 293 m_attributes.grow(m_attributes.size() + 1); 294 m_currentAttribute = &m_attributes.last(); 295 #ifndef NDEBUG 296 m_currentAttribute->nameRange.start = 0; 297 m_currentAttribute->nameRange.end = 0; 298 m_currentAttribute->valueRange.start = 0; 299 m_currentAttribute->valueRange.end = 0; 300 #endif 301 } 302 303 void beginAttributeName(int offset) 304 { 305 m_currentAttribute->nameRange.start = offset - m_baseOffset; 306 } 307 308 void endAttributeName(int offset) 309 { 310 int index = offset - m_baseOffset; 311 m_currentAttribute->nameRange.end = index; 312 m_currentAttribute->valueRange.start = index; 313 m_currentAttribute->valueRange.end = index; 314 } 315 316 void beginAttributeValue(int offset) 317 { 318 m_currentAttribute->valueRange.start = offset - m_baseOffset; 319 #ifndef NDEBUG 320 m_currentAttribute->valueRange.end = 0; 321 #endif 322 } 323 324 void endAttributeValue(int offset) 325 { 326 m_currentAttribute->valueRange.end = offset - m_baseOffset; 327 } 328 329 void appendToAttributeName(UChar character) 330 { 331 ASSERT(character); 332 ASSERT(m_type == StartTag || m_type == EndTag); 333 ASSERT(m_currentAttribute->nameRange.start); 334 m_currentAttribute->name.append(character); 335 } 336 337 void appendToAttributeValue(UChar character) 338 { 339 ASSERT(character); 340 ASSERT(m_type == StartTag || m_type == EndTag); 341 ASSERT(m_currentAttribute->valueRange.start); 342 m_currentAttribute->value.append(character); 343 } 344 345 void appendToAttributeValue(size_t i, const String& value) 346 { 347 ASSERT(!value.isEmpty()); 348 ASSERT(m_type == StartTag || m_type == EndTag); 349 append(m_attributes[i].value, value); 350 } 351 352 const AttributeList& attributes() const 353 { 354 ASSERT(m_type == StartTag || m_type == EndTag); 355 return m_attributes; 356 } 357 358 const Attribute* getAttributeItem(const QualifiedName& name) const 359 { 360 for (unsigned i = 0; i < m_attributes.size(); ++i) { 361 if (AtomicString(m_attributes.at(i).name) == name.localName()) 362 return &m_attributes.at(i); 363 } 364 return 0; 365 } 366 367 // Used by the XSSAuditor to nuke XSS-laden attributes. 368 void eraseValueOfAttribute(size_t i) 369 { 370 ASSERT(m_type == StartTag || m_type == EndTag); 371 m_attributes[i].value.clear(); 372 } 373 374 /* Character Tokens */ 375 376 // Starting a character token works slightly differently than starting 377 // other types of tokens because we want to save a per-character branch. 378 void ensureIsCharacterToken() 379 { 380 ASSERT(m_type == Uninitialized || m_type == Character); 381 m_type = Character; 382 } 383 384 const DataVector& characters() const 385 { 386 ASSERT(m_type == Character); 387 return m_data; 388 } 389 390 void appendToCharacter(char character) 391 { 392 ASSERT(m_type == Character); 393 m_data.append(character); 394 } 395 396 void appendToCharacter(UChar character) 397 { 398 ASSERT(m_type == Character); 399 m_data.append(character); 400 m_orAllData |= character; 401 } 402 403 void appendToCharacter(const Vector<LChar, 32>& characters) 404 { 405 ASSERT(m_type == Character); 406 m_data.appendVector(characters); 407 } 408 409 /* Comment Tokens */ 410 411 const DataVector& comment() const 412 { 413 ASSERT(m_type == Comment); 414 return m_data; 415 } 416 417 void beginComment() 418 { 419 ASSERT(m_type == Uninitialized); 420 m_type = Comment; 421 } 422 423 void appendToComment(UChar character) 424 { 425 ASSERT(character); 426 ASSERT(m_type == Comment); 427 m_data.append(character); 428 m_orAllData |= character; 429 } 430 431 // Only for XSSAuditor 432 void eraseCharacters() 433 { 434 ASSERT(m_type == Character); 435 m_data.clear(); 436 m_orAllData = 0; 437 } 438 439 private: 440 Type m_type; 441 Attribute::Range m_range; // Always starts at zero. 442 int m_baseOffset; 443 DataVector m_data; 444 UChar m_orAllData; 445 446 // For StartTag and EndTag 447 bool m_selfClosing; 448 AttributeList m_attributes; 449 450 // A pointer into m_attributes used during lexing. 451 Attribute* m_currentAttribute; 452 453 // For DOCTYPE 454 OwnPtr<DoctypeData> m_doctypeData; 455 }; 456 457 } 458 459 #endif 460