1 /* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef HTMLToken_h 27 #define HTMLToken_h 28 29 #include "NamedNodeMap.h" 30 #include <wtf/PassOwnPtr.h> 31 #include <wtf/Vector.h> 32 33 namespace WebCore { 34 35 class HTMLToken { 36 WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED; 37 public: 38 enum Type { 39 Uninitialized, 40 DOCTYPE, 41 StartTag, 42 EndTag, 43 Comment, 44 Character, 45 EndOfFile, 46 }; 47 48 class Range { 49 public: 50 int m_start; 51 int m_end; 52 }; 53 54 class Attribute { 55 public: 56 Range m_nameRange; 57 Range m_valueRange; 58 WTF::Vector<UChar, 32> m_name; 59 WTF::Vector<UChar, 32> m_value; 60 }; 61 62 typedef WTF::Vector<Attribute, 10> AttributeList; 63 typedef WTF::Vector<UChar, 1024> DataVector; 64 65 HTMLToken() { clear(); } 66 67 void clear() 68 { 69 m_type = Uninitialized; 70 m_range.m_start = 0; 71 m_range.m_end = 0; 72 m_baseOffset = 0; 73 m_data.clear(); 74 } 75 76 bool isUninitialized() { return m_type == Uninitialized; } 77 78 int startIndex() const { return m_range.m_start; } 79 int endIndex() const { return m_range.m_end; } 80 81 void setBaseOffset(int offset) 82 { 83 m_baseOffset = offset; 84 } 85 86 void end(int endOffset) 87 { 88 m_range.m_end = endOffset - m_baseOffset; 89 } 90 91 void makeEndOfFile() 92 { 93 ASSERT(m_type == Uninitialized); 94 m_type = EndOfFile; 95 } 96 97 void beginStartTag(UChar character) 98 { 99 ASSERT(character); 100 ASSERT(m_type == Uninitialized); 101 m_type = StartTag; 102 m_selfClosing = false; 103 m_currentAttribute = 0; 104 m_attributes.clear(); 105 106 m_data.append(character); 107 } 108 109 template<typename T> 110 void beginEndTag(T characters) 111 { 112 ASSERT(m_type == Uninitialized); 113 m_type = EndTag; 114 m_selfClosing = false; 115 m_currentAttribute = 0; 116 m_attributes.clear(); 117 118 m_data.append(characters); 119 } 120 121 // Starting a character token works slightly differently than starting 122 // other types of tokens because we want to save a per-character branch. 123 void ensureIsCharacterToken() 124 { 125 ASSERT(m_type == Uninitialized || m_type == Character); 126 m_type = Character; 127 } 128 129 void beginComment() 130 { 131 ASSERT(m_type == Uninitialized); 132 m_type = Comment; 133 } 134 135 void beginDOCTYPE() 136 { 137 ASSERT(m_type == Uninitialized); 138 m_type = DOCTYPE; 139 m_doctypeData = adoptPtr(new DoctypeData()); 140 } 141 142 void beginDOCTYPE(UChar character) 143 { 144 ASSERT(character); 145 beginDOCTYPE(); 146 m_data.append(character); 147 } 148 149 void appendToName(UChar character) 150 { 151 ASSERT(character); 152 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 153 m_data.append(character); 154 } 155 156 template<typename T> 157 void appendToCharacter(T characters) 158 { 159 ASSERT(m_type == Character); 160 m_data.append(characters); 161 } 162 163 void appendToComment(UChar character) 164 { 165 ASSERT(character); 166 ASSERT(m_type == Comment); 167 m_data.append(character); 168 } 169 170 void addNewAttribute() 171 { 172 ASSERT(m_type == StartTag || m_type == EndTag); 173 m_attributes.grow(m_attributes.size() + 1); 174 m_currentAttribute = &m_attributes.last(); 175 #ifndef NDEBUG 176 m_currentAttribute->m_nameRange.m_start = 0; 177 m_currentAttribute->m_nameRange.m_end = 0; 178 m_currentAttribute->m_valueRange.m_start = 0; 179 m_currentAttribute->m_valueRange.m_end = 0; 180 #endif 181 } 182 183 void beginAttributeName(int offset) 184 { 185 m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset; 186 } 187 188 void endAttributeName(int offset) 189 { 190 int index = offset - m_baseOffset; 191 m_currentAttribute->m_nameRange.m_end = index; 192 m_currentAttribute->m_valueRange.m_start = index; 193 m_currentAttribute->m_valueRange.m_end = index; 194 } 195 196 void beginAttributeValue(int offset) 197 { 198 m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset; 199 #ifndef NDEBUG 200 m_currentAttribute->m_valueRange.m_end = 0; 201 #endif 202 } 203 204 void endAttributeValue(int offset) 205 { 206 m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset; 207 } 208 209 void appendToAttributeName(UChar character) 210 { 211 ASSERT(character); 212 ASSERT(m_type == StartTag || m_type == EndTag); 213 ASSERT(m_currentAttribute->m_nameRange.m_start); 214 m_currentAttribute->m_name.append(character); 215 } 216 217 void appendToAttributeValue(UChar character) 218 { 219 ASSERT(character); 220 ASSERT(m_type == StartTag || m_type == EndTag); 221 ASSERT(m_currentAttribute->m_valueRange.m_start); 222 m_currentAttribute->m_value.append(character); 223 } 224 225 void appendToAttributeValue(size_t i, const String& value) 226 { 227 ASSERT(!value.isEmpty()); 228 ASSERT(m_type == StartTag || m_type == EndTag); 229 m_attributes[i].m_value.append(value.characters(), value.length()); 230 } 231 232 Type type() const { return m_type; } 233 234 bool selfClosing() const 235 { 236 ASSERT(m_type == StartTag || m_type == EndTag); 237 return m_selfClosing; 238 } 239 240 void setSelfClosing() 241 { 242 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 243 m_selfClosing = true; 244 } 245 246 const AttributeList& attributes() const 247 { 248 ASSERT(m_type == StartTag || m_type == EndTag); 249 return m_attributes; 250 } 251 252 const DataVector& name() const 253 { 254 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE); 255 return m_data; 256 } 257 258 void eraseCharacters() 259 { 260 ASSERT(m_type == Character); 261 m_data.clear(); 262 } 263 264 void eraseValueOfAttribute(size_t i) 265 { 266 ASSERT(m_type == StartTag || m_type == EndTag); 267 m_attributes[i].m_value.clear(); 268 } 269 270 const DataVector& characters() const 271 { 272 ASSERT(m_type == Character); 273 return m_data; 274 } 275 276 const DataVector& comment() const 277 { 278 ASSERT(m_type == Comment); 279 return m_data; 280 } 281 282 // FIXME: Distinguish between a missing public identifer and an empty one. 283 const WTF::Vector<UChar>& publicIdentifier() const 284 { 285 ASSERT(m_type == DOCTYPE); 286 return m_doctypeData->m_publicIdentifier; 287 } 288 289 // FIXME: Distinguish between a missing system identifer and an empty one. 290 const WTF::Vector<UChar>& systemIdentifier() const 291 { 292 ASSERT(m_type == DOCTYPE); 293 return m_doctypeData->m_systemIdentifier; 294 } 295 296 void setPublicIdentifierToEmptyString() 297 { 298 ASSERT(m_type == DOCTYPE); 299 m_doctypeData->m_hasPublicIdentifier = true; 300 m_doctypeData->m_publicIdentifier.clear(); 301 } 302 303 void setSystemIdentifierToEmptyString() 304 { 305 ASSERT(m_type == DOCTYPE); 306 m_doctypeData->m_hasSystemIdentifier = true; 307 m_doctypeData->m_systemIdentifier.clear(); 308 } 309 310 bool forceQuirks() const 311 { 312 ASSERT(m_type == DOCTYPE); 313 return m_doctypeData->m_forceQuirks; 314 } 315 316 void setForceQuirks() 317 { 318 ASSERT(m_type == DOCTYPE); 319 m_doctypeData->m_forceQuirks = true; 320 } 321 322 void appendToPublicIdentifier(UChar character) 323 { 324 ASSERT(character); 325 ASSERT(m_type == DOCTYPE); 326 ASSERT(m_doctypeData->m_hasPublicIdentifier); 327 m_doctypeData->m_publicIdentifier.append(character); 328 } 329 330 void appendToSystemIdentifier(UChar character) 331 { 332 ASSERT(character); 333 ASSERT(m_type == DOCTYPE); 334 ASSERT(m_doctypeData->m_hasSystemIdentifier); 335 m_doctypeData->m_systemIdentifier.append(character); 336 } 337 338 private: 339 // FIXME: I'm not sure what the final relationship between HTMLToken and 340 // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll 341 // want to end up with a cleaner interface between the two classes. 342 friend class AtomicHTMLToken; 343 344 class DoctypeData { 345 WTF_MAKE_NONCOPYABLE(DoctypeData); 346 public: 347 DoctypeData() 348 : m_hasPublicIdentifier(false) 349 , m_hasSystemIdentifier(false) 350 , m_forceQuirks(false) 351 { 352 } 353 354 bool m_hasPublicIdentifier; 355 bool m_hasSystemIdentifier; 356 bool m_forceQuirks; 357 WTF::Vector<UChar> m_publicIdentifier; 358 WTF::Vector<UChar> m_systemIdentifier; 359 }; 360 361 Type m_type; 362 Range m_range; // Always starts at zero. 363 int m_baseOffset; 364 365 // "name" for DOCTYPE, StartTag, and EndTag 366 // "characters" for Character 367 // "data" for Comment 368 DataVector m_data; 369 370 // For DOCTYPE 371 OwnPtr<DoctypeData> m_doctypeData; 372 373 // For StartTag and EndTag 374 bool m_selfClosing; 375 AttributeList m_attributes; 376 377 // A pointer into m_attributes used during lexing. 378 Attribute* m_currentAttribute; 379 }; 380 381 // FIXME: This class should eventually be named HTMLToken once we move the 382 // exiting HTMLToken to be internal to the HTMLTokenizer. 383 class AtomicHTMLToken { 384 WTF_MAKE_NONCOPYABLE(AtomicHTMLToken); 385 public: 386 AtomicHTMLToken(HTMLToken& token) 387 : m_type(token.type()) 388 { 389 switch (m_type) { 390 case HTMLToken::Uninitialized: 391 ASSERT_NOT_REACHED(); 392 break; 393 case HTMLToken::DOCTYPE: 394 m_name = AtomicString(token.name().data(), token.name().size()); 395 m_doctypeData = token.m_doctypeData.release(); 396 break; 397 case HTMLToken::EndOfFile: 398 break; 399 case HTMLToken::StartTag: 400 case HTMLToken::EndTag: { 401 m_selfClosing = token.selfClosing(); 402 m_name = AtomicString(token.name().data(), token.name().size()); 403 initializeAttributes(token.attributes()); 404 break; 405 } 406 case HTMLToken::Comment: 407 m_data = String(token.comment().data(), token.comment().size()); 408 break; 409 case HTMLToken::Character: 410 m_externalCharacters = &token.characters(); 411 break; 412 } 413 } 414 415 AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0) 416 : m_type(type) 417 , m_name(name) 418 , m_attributes(attributes) 419 { 420 ASSERT(usesName()); 421 } 422 423 HTMLToken::Type type() const { return m_type; } 424 425 const AtomicString& name() const 426 { 427 ASSERT(usesName()); 428 return m_name; 429 } 430 431 void setName(const AtomicString& name) 432 { 433 ASSERT(usesName()); 434 m_name = name; 435 } 436 437 bool selfClosing() const 438 { 439 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag); 440 return m_selfClosing; 441 } 442 443 Attribute* getAttributeItem(const QualifiedName& attributeName) 444 { 445 ASSERT(usesAttributes()); 446 if (!m_attributes) 447 return 0; 448 return m_attributes->getAttributeItem(attributeName); 449 } 450 451 NamedNodeMap* attributes() const 452 { 453 ASSERT(usesAttributes()); 454 return m_attributes.get(); 455 } 456 457 PassRefPtr<NamedNodeMap> takeAtributes() 458 { 459 ASSERT(usesAttributes()); 460 return m_attributes.release(); 461 } 462 463 const HTMLToken::DataVector& characters() const 464 { 465 ASSERT(m_type == HTMLToken::Character); 466 return *m_externalCharacters; 467 } 468 469 const String& comment() const 470 { 471 ASSERT(m_type == HTMLToken::Comment); 472 return m_data; 473 } 474 475 // FIXME: Distinguish between a missing public identifer and an empty one. 476 WTF::Vector<UChar>& publicIdentifier() const 477 { 478 ASSERT(m_type == HTMLToken::DOCTYPE); 479 return m_doctypeData->m_publicIdentifier; 480 } 481 482 // FIXME: Distinguish between a missing system identifer and an empty one. 483 WTF::Vector<UChar>& systemIdentifier() const 484 { 485 ASSERT(m_type == HTMLToken::DOCTYPE); 486 return m_doctypeData->m_systemIdentifier; 487 } 488 489 bool forceQuirks() const 490 { 491 ASSERT(m_type == HTMLToken::DOCTYPE); 492 return m_doctypeData->m_forceQuirks; 493 } 494 495 private: 496 HTMLToken::Type m_type; 497 498 void initializeAttributes(const HTMLToken::AttributeList& attributes); 499 500 bool usesName() const 501 { 502 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 503 } 504 505 bool usesAttributes() const 506 { 507 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 508 } 509 510 // "name" for DOCTYPE, StartTag, and EndTag 511 AtomicString m_name; 512 513 // "data" for Comment 514 String m_data; 515 516 // "characters" for Character 517 // 518 // We don't want to copy the the characters out of the HTMLToken, so we 519 // keep a pointer to its buffer instead. This buffer is owned by the 520 // HTMLToken and causes a lifetime dependence between these objects. 521 // 522 // FIXME: Add a mechanism for "internalizing" the characters when the 523 // HTMLToken is destructed. 524 const HTMLToken::DataVector* m_externalCharacters; 525 526 // For DOCTYPE 527 OwnPtr<HTMLToken::DoctypeData> m_doctypeData; 528 529 // For StartTag and EndTag 530 bool m_selfClosing; 531 532 RefPtr<NamedNodeMap> m_attributes; 533 }; 534 535 inline void AtomicHTMLToken::initializeAttributes(const HTMLToken::AttributeList& attributes) 536 { 537 size_t size = attributes.size(); 538 if (!size) 539 return; 540 541 m_attributes = NamedNodeMap::create(); 542 m_attributes->reserveInitialCapacity(size); 543 for (size_t i = 0; i < size; ++i) { 544 const HTMLToken::Attribute& attribute = attributes[i]; 545 if (attribute.m_name.isEmpty()) 546 continue; 547 548 ASSERT(attribute.m_nameRange.m_start); 549 ASSERT(attribute.m_nameRange.m_end); 550 ASSERT(attribute.m_valueRange.m_start); 551 ASSERT(attribute.m_valueRange.m_end); 552 553 String name(attribute.m_name.data(), attribute.m_name.size()); 554 String value(attribute.m_value.data(), attribute.m_value.size()); 555 m_attributes->insertAttribute(Attribute::createMapped(name, value), false); 556 } 557 } 558 559 } 560 561 #endif 562