1 /* 2 * Copyright (C) 2013 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef HTMLToken_h 27 #define HTMLToken_h 28 29 #include "core/dom/Attribute.h" 30 #include "core/html/parser/HTMLToken.h" 31 #include "wtf/PassOwnPtr.h" 32 #include "wtf/RefCounted.h" 33 #include "wtf/RefPtr.h" 34 35 namespace WebCore { 36 37 class DoctypeData { 38 WTF_MAKE_NONCOPYABLE(DoctypeData); 39 public: 40 DoctypeData() 41 : m_hasPublicIdentifier(false) 42 , m_hasSystemIdentifier(false) 43 , m_forceQuirks(false) 44 { 45 } 46 47 // FIXME: This should use String instead of Vector<UChar>. 48 bool m_hasPublicIdentifier; 49 bool m_hasSystemIdentifier; 50 WTF::Vector<UChar> m_publicIdentifier; 51 WTF::Vector<UChar> m_systemIdentifier; 52 bool m_forceQuirks; 53 }; 54 55 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name) 56 { 57 for (unsigned i = 0; i < attributes.size(); ++i) { 58 if (attributes.at(i).name().matches(name)) 59 return &attributes.at(i); 60 } 61 return 0; 62 } 63 64 class HTMLToken { 65 WTF_MAKE_NONCOPYABLE(HTMLToken); 66 WTF_MAKE_FAST_ALLOCATED; 67 public: 68 enum Type { 69 Uninitialized, 70 DOCTYPE, 71 StartTag, 72 EndTag, 73 Comment, 74 Character, 75 EndOfFile, 76 }; 77 78 class Attribute { 79 public: 80 class Range { 81 public: 82 int start; 83 int end; 84 }; 85 86 Range nameRange; 87 Range valueRange; 88 Vector<UChar, 32> name; 89 Vector<UChar, 32> value; 90 }; 91 92 typedef Vector<Attribute, 10> AttributeList; 93 94 // By using an inline capacity of 256, we avoid spilling over into an malloced buffer 95 // approximately 99% of the time based on a non-scientific browse around a number of 96 // popular web sites on 23 May 2013. 97 typedef Vector<UChar, 256> DataVector; 98 99 HTMLToken() { clear(); } 100 101 void clear() 102 { 103 m_type = Uninitialized; 104 m_range.start = 0; 105 m_range.end = 0; 106 m_baseOffset = 0; 107 m_data.clear(); 108 m_orAllData = 0; 109 } 110 111 bool isUninitialized() { return m_type == Uninitialized; } 112 Type type() const { return m_type; } 113 114 void makeEndOfFile() 115 { 116 ASSERT(m_type == Uninitialized); 117 m_type = EndOfFile; 118 } 119 120 /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */ 121 int startIndex() const { return m_range.start; } 122 int endIndex() const { return m_range.end; } 123 124 void setBaseOffset(int offset) 125 { 126 m_baseOffset = offset; 127 } 128 129 void end(int endOffset) 130 { 131 m_range.end = endOffset - m_baseOffset; 132 } 133 134 const DataVector& data() const 135 { 136 ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag); 137 return m_data; 138 } 139 140 bool isAll8BitData() const 141 { 142 return (m_orAllData <= 0xff); 143 } 144 145 const DataVector& name() const 146 { 147 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 148 return m_data; 149 } 150 151 void appendToName(UChar character) 152 { 153 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 154 ASSERT(character); 155 m_data.append(character); 156 m_orAllData |= character; 157 } 158 159 /* DOCTYPE Tokens */ 160 161 bool forceQuirks() const 162 { 163 ASSERT(m_type == DOCTYPE); 164 return m_doctypeData->m_forceQuirks; 165 } 166 167 void setForceQuirks() 168 { 169 ASSERT(m_type == DOCTYPE); 170 m_doctypeData->m_forceQuirks = true; 171 } 172 173 void beginDOCTYPE() 174 { 175 ASSERT(m_type == Uninitialized); 176 m_type = DOCTYPE; 177 m_doctypeData = adoptPtr(new DoctypeData); 178 } 179 180 void beginDOCTYPE(UChar character) 181 { 182 ASSERT(character); 183 beginDOCTYPE(); 184 m_data.append(character); 185 m_orAllData |= character; 186 } 187 188 // FIXME: Distinguish between a missing public identifer and an empty one. 189 const WTF::Vector<UChar>& publicIdentifier() const 190 { 191 ASSERT(m_type == DOCTYPE); 192 return m_doctypeData->m_publicIdentifier; 193 } 194 195 // FIXME: Distinguish between a missing system identifer and an empty one. 196 const WTF::Vector<UChar>& systemIdentifier() const 197 { 198 ASSERT(m_type == DOCTYPE); 199 return m_doctypeData->m_systemIdentifier; 200 } 201 202 void setPublicIdentifierToEmptyString() 203 { 204 ASSERT(m_type == DOCTYPE); 205 m_doctypeData->m_hasPublicIdentifier = true; 206 m_doctypeData->m_publicIdentifier.clear(); 207 } 208 209 void setSystemIdentifierToEmptyString() 210 { 211 ASSERT(m_type == DOCTYPE); 212 m_doctypeData->m_hasSystemIdentifier = true; 213 m_doctypeData->m_systemIdentifier.clear(); 214 } 215 216 void appendToPublicIdentifier(UChar character) 217 { 218 ASSERT(character); 219 ASSERT(m_type == DOCTYPE); 220 ASSERT(m_doctypeData->m_hasPublicIdentifier); 221 m_doctypeData->m_publicIdentifier.append(character); 222 } 223 224 void appendToSystemIdentifier(UChar character) 225 { 226 ASSERT(character); 227 ASSERT(m_type == DOCTYPE); 228 ASSERT(m_doctypeData->m_hasSystemIdentifier); 229 m_doctypeData->m_systemIdentifier.append(character); 230 } 231 232 PassOwnPtr<DoctypeData> releaseDoctypeData() 233 { 234 return m_doctypeData.release(); 235 } 236 237 /* Start/End Tag Tokens */ 238 239 bool selfClosing() const 240 { 241 ASSERT(m_type == StartTag || m_type == EndTag); 242 return m_selfClosing; 243 } 244 245 void setSelfClosing() 246 { 247 ASSERT(m_type == StartTag || m_type == EndTag); 248 m_selfClosing = true; 249 } 250 251 void beginStartTag(UChar character) 252 { 253 ASSERT(character); 254 ASSERT(m_type == Uninitialized); 255 m_type = StartTag; 256 m_selfClosing = false; 257 m_currentAttribute = 0; 258 m_attributes.clear(); 259 260 m_data.append(character); 261 m_orAllData |= character; 262 } 263 264 void beginEndTag(LChar character) 265 { 266 ASSERT(m_type == Uninitialized); 267 m_type = EndTag; 268 m_selfClosing = false; 269 m_currentAttribute = 0; 270 m_attributes.clear(); 271 272 m_data.append(character); 273 } 274 275 void beginEndTag(const Vector<LChar, 32>& characters) 276 { 277 ASSERT(m_type == Uninitialized); 278 m_type = EndTag; 279 m_selfClosing = false; 280 m_currentAttribute = 0; 281 m_attributes.clear(); 282 283 m_data.appendVector(characters); 284 } 285 286 void addNewAttribute() 287 { 288 ASSERT(m_type == StartTag || m_type == EndTag); 289 m_attributes.grow(m_attributes.size() + 1); 290 m_currentAttribute = &m_attributes.last(); 291 #ifndef NDEBUG 292 m_currentAttribute->nameRange.start = 0; 293 m_currentAttribute->nameRange.end = 0; 294 m_currentAttribute->valueRange.start = 0; 295 m_currentAttribute->valueRange.end = 0; 296 #endif 297 } 298 299 void beginAttributeName(int offset) 300 { 301 m_currentAttribute->nameRange.start = offset - m_baseOffset; 302 } 303 304 void endAttributeName(int offset) 305 { 306 int index = offset - m_baseOffset; 307 m_currentAttribute->nameRange.end = index; 308 m_currentAttribute->valueRange.start = index; 309 m_currentAttribute->valueRange.end = index; 310 } 311 312 void beginAttributeValue(int offset) 313 { 314 m_currentAttribute->valueRange.start = offset - m_baseOffset; 315 #ifndef NDEBUG 316 m_currentAttribute->valueRange.end = 0; 317 #endif 318 } 319 320 void endAttributeValue(int offset) 321 { 322 m_currentAttribute->valueRange.end = offset - m_baseOffset; 323 } 324 325 void appendToAttributeName(UChar character) 326 { 327 ASSERT(character); 328 ASSERT(m_type == StartTag || m_type == EndTag); 329 // FIXME: We should be able to add the following ASSERT once we fix 330 // https://bugs.webkit.org/show_bug.cgi?id=62971 331 // ASSERT(m_currentAttribute->nameRange.start); 332 m_currentAttribute->name.append(character); 333 } 334 335 void appendToAttributeValue(UChar character) 336 { 337 ASSERT(character); 338 ASSERT(m_type == StartTag || m_type == EndTag); 339 ASSERT(m_currentAttribute->valueRange.start); 340 m_currentAttribute->value.append(character); 341 } 342 343 void appendToAttributeValue(size_t i, const String& value) 344 { 345 ASSERT(!value.isEmpty()); 346 ASSERT(m_type == StartTag || m_type == EndTag); 347 append(m_attributes[i].value, value); 348 } 349 350 const AttributeList& attributes() const 351 { 352 ASSERT(m_type == StartTag || m_type == EndTag); 353 return m_attributes; 354 } 355 356 const Attribute* getAttributeItem(const QualifiedName& name) const 357 { 358 for (unsigned i = 0; i < m_attributes.size(); ++i) { 359 if (AtomicString(m_attributes.at(i).name) == name.localName()) 360 return &m_attributes.at(i); 361 } 362 return 0; 363 } 364 365 // Used by the XSSAuditor to nuke XSS-laden attributes. 366 void eraseValueOfAttribute(size_t i) 367 { 368 ASSERT(m_type == StartTag || m_type == EndTag); 369 m_attributes[i].value.clear(); 370 } 371 372 /* Character Tokens */ 373 374 // Starting a character token works slightly differently than starting 375 // other types of tokens because we want to save a per-character branch. 376 void ensureIsCharacterToken() 377 { 378 ASSERT(m_type == Uninitialized || m_type == Character); 379 m_type = Character; 380 } 381 382 const DataVector& characters() const 383 { 384 ASSERT(m_type == Character); 385 return m_data; 386 } 387 388 void appendToCharacter(char character) 389 { 390 ASSERT(m_type == Character); 391 m_data.append(character); 392 } 393 394 void appendToCharacter(UChar character) 395 { 396 ASSERT(m_type == Character); 397 m_data.append(character); 398 m_orAllData |= character; 399 } 400 401 void appendToCharacter(const Vector<LChar, 32>& characters) 402 { 403 ASSERT(m_type == Character); 404 m_data.appendVector(characters); 405 } 406 407 /* Comment Tokens */ 408 409 const DataVector& comment() const 410 { 411 ASSERT(m_type == Comment); 412 return m_data; 413 } 414 415 void beginComment() 416 { 417 ASSERT(m_type == Uninitialized); 418 m_type = Comment; 419 } 420 421 void appendToComment(UChar character) 422 { 423 ASSERT(character); 424 ASSERT(m_type == Comment); 425 m_data.append(character); 426 m_orAllData |= character; 427 } 428 429 void eraseCharacters() 430 { 431 ASSERT(m_type == Character); 432 m_data.clear(); 433 m_orAllData = 0; 434 } 435 436 private: 437 Type m_type; 438 Attribute::Range m_range; // Always starts at zero. 439 int m_baseOffset; 440 DataVector m_data; 441 UChar m_orAllData; 442 443 // For StartTag and EndTag 444 bool m_selfClosing; 445 AttributeList m_attributes; 446 447 // A pointer into m_attributes used during lexing. 448 Attribute* m_currentAttribute; 449 450 // For DOCTYPE 451 OwnPtr<DoctypeData> m_doctypeData; 452 }; 453 454 } 455 456 #endif 457