1 /* 2 * Copyright (C) 2003 Lars Knoll (knoll (at) kde.org) 3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde (at) carewolf.com) 4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved. 5 * Copyright (C) 2007 Nicholas Shanks <webkit (at) nickshanks.com> 6 * Copyright (C) 2008 Eric Seidel <eric (at) webkit.org> 7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/) 8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved. 9 * Copyright (C) 2012 Intel Corporation. All rights reserved. 10 * 11 * This library is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU Library General Public 13 * License as published by the Free Software Foundation; either 14 * version 2 of the License, or (at your option) any later version. 15 * 16 * This library is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * Library General Public License for more details. 20 * 21 * You should have received a copy of the GNU Library General Public License 22 * along with this library; see the file COPYING.LIB. If not, write to 23 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 24 * Boston, MA 02110-1301, USA. 25 */ 26 27 #include "config.h" 28 #include "core/css/parser/CSSTokenizer.h" 29 30 #include "core/css/CSSKeyframeRule.h" 31 #include "core/css/MediaQuery.h" 32 #include "core/css/StyleRule.h" 33 #include "core/css/parser/BisonCSSParser.h" 34 #include "core/css/parser/CSSParserValues.h" 35 #include "core/html/parser/HTMLParserIdioms.h" 36 #include "core/svg/SVGParserUtilities.h" 37 38 namespace blink { 39 40 #include "core/CSSGrammar.h" 41 42 enum CharacterType { 43 // Types for the main switch. 
44 45 // The first 4 types must be grouped together, as they 46 // represent the allowed chars in an identifier. 47 CharacterCaselessU, 48 CharacterIdentifierStart, 49 CharacterNumber, 50 CharacterDash, 51 52 CharacterOther, 53 CharacterNull, 54 CharacterWhiteSpace, 55 CharacterEndMediaQueryOrSupports, 56 CharacterEndNthChild, 57 CharacterQuote, 58 CharacterExclamationMark, 59 CharacterHashmark, 60 CharacterDollar, 61 CharacterAsterisk, 62 CharacterPlus, 63 CharacterDot, 64 CharacterSlash, 65 CharacterLess, 66 CharacterAt, 67 CharacterBackSlash, 68 CharacterXor, 69 CharacterVerticalBar, 70 CharacterTilde, 71 }; 72 73 // 128 ASCII codes 74 static const CharacterType typesOfASCIICharacters[128] = { 75 /* 0 - Null */ CharacterNull, 76 /* 1 - Start of Heading */ CharacterOther, 77 /* 2 - Start of Text */ CharacterOther, 78 /* 3 - End of Text */ CharacterOther, 79 /* 4 - End of Transm. */ CharacterOther, 80 /* 5 - Enquiry */ CharacterOther, 81 /* 6 - Acknowledgment */ CharacterOther, 82 /* 7 - Bell */ CharacterOther, 83 /* 8 - Back Space */ CharacterOther, 84 /* 9 - Horizontal Tab */ CharacterWhiteSpace, 85 /* 10 - Line Feed */ CharacterWhiteSpace, 86 /* 11 - Vertical Tab */ CharacterOther, 87 /* 12 - Form Feed */ CharacterWhiteSpace, 88 /* 13 - Carriage Return */ CharacterWhiteSpace, 89 /* 14 - Shift Out */ CharacterOther, 90 /* 15 - Shift In */ CharacterOther, 91 /* 16 - Data Line Escape */ CharacterOther, 92 /* 17 - Device Control 1 */ CharacterOther, 93 /* 18 - Device Control 2 */ CharacterOther, 94 /* 19 - Device Control 3 */ CharacterOther, 95 /* 20 - Device Control 4 */ CharacterOther, 96 /* 21 - Negative Ack. 
*/ CharacterOther, 97 /* 22 - Synchronous Idle */ CharacterOther, 98 /* 23 - End of Transmit */ CharacterOther, 99 /* 24 - Cancel */ CharacterOther, 100 /* 25 - End of Medium */ CharacterOther, 101 /* 26 - Substitute */ CharacterOther, 102 /* 27 - Escape */ CharacterOther, 103 /* 28 - File Separator */ CharacterOther, 104 /* 29 - Group Separator */ CharacterOther, 105 /* 30 - Record Separator */ CharacterOther, 106 /* 31 - Unit Separator */ CharacterOther, 107 /* 32 - Space */ CharacterWhiteSpace, 108 /* 33 - ! */ CharacterExclamationMark, 109 /* 34 - " */ CharacterQuote, 110 /* 35 - # */ CharacterHashmark, 111 /* 36 - $ */ CharacterDollar, 112 /* 37 - % */ CharacterOther, 113 /* 38 - & */ CharacterOther, 114 /* 39 - ' */ CharacterQuote, 115 /* 40 - ( */ CharacterOther, 116 /* 41 - ) */ CharacterEndNthChild, 117 /* 42 - * */ CharacterAsterisk, 118 /* 43 - + */ CharacterPlus, 119 /* 44 - , */ CharacterOther, 120 /* 45 - - */ CharacterDash, 121 /* 46 - . */ CharacterDot, 122 /* 47 - / */ CharacterSlash, 123 /* 48 - 0 */ CharacterNumber, 124 /* 49 - 1 */ CharacterNumber, 125 /* 50 - 2 */ CharacterNumber, 126 /* 51 - 3 */ CharacterNumber, 127 /* 52 - 4 */ CharacterNumber, 128 /* 53 - 5 */ CharacterNumber, 129 /* 54 - 6 */ CharacterNumber, 130 /* 55 - 7 */ CharacterNumber, 131 /* 56 - 8 */ CharacterNumber, 132 /* 57 - 9 */ CharacterNumber, 133 /* 58 - : */ CharacterOther, 134 /* 59 - ; */ CharacterEndMediaQueryOrSupports, 135 /* 60 - < */ CharacterLess, 136 /* 61 - = */ CharacterOther, 137 /* 62 - > */ CharacterOther, 138 /* 63 - ? 
*/ CharacterOther, 139 /* 64 - @ */ CharacterAt, 140 /* 65 - A */ CharacterIdentifierStart, 141 /* 66 - B */ CharacterIdentifierStart, 142 /* 67 - C */ CharacterIdentifierStart, 143 /* 68 - D */ CharacterIdentifierStart, 144 /* 69 - E */ CharacterIdentifierStart, 145 /* 70 - F */ CharacterIdentifierStart, 146 /* 71 - G */ CharacterIdentifierStart, 147 /* 72 - H */ CharacterIdentifierStart, 148 /* 73 - I */ CharacterIdentifierStart, 149 /* 74 - J */ CharacterIdentifierStart, 150 /* 75 - K */ CharacterIdentifierStart, 151 /* 76 - L */ CharacterIdentifierStart, 152 /* 77 - M */ CharacterIdentifierStart, 153 /* 78 - N */ CharacterIdentifierStart, 154 /* 79 - O */ CharacterIdentifierStart, 155 /* 80 - P */ CharacterIdentifierStart, 156 /* 81 - Q */ CharacterIdentifierStart, 157 /* 82 - R */ CharacterIdentifierStart, 158 /* 83 - S */ CharacterIdentifierStart, 159 /* 84 - T */ CharacterIdentifierStart, 160 /* 85 - U */ CharacterCaselessU, 161 /* 86 - V */ CharacterIdentifierStart, 162 /* 87 - W */ CharacterIdentifierStart, 163 /* 88 - X */ CharacterIdentifierStart, 164 /* 89 - Y */ CharacterIdentifierStart, 165 /* 90 - Z */ CharacterIdentifierStart, 166 /* 91 - [ */ CharacterOther, 167 /* 92 - \ */ CharacterBackSlash, 168 /* 93 - ] */ CharacterOther, 169 /* 94 - ^ */ CharacterXor, 170 /* 95 - _ */ CharacterIdentifierStart, 171 /* 96 - ` */ CharacterOther, 172 /* 97 - a */ CharacterIdentifierStart, 173 /* 98 - b */ CharacterIdentifierStart, 174 /* 99 - c */ CharacterIdentifierStart, 175 /* 100 - d */ CharacterIdentifierStart, 176 /* 101 - e */ CharacterIdentifierStart, 177 /* 102 - f */ CharacterIdentifierStart, 178 /* 103 - g */ CharacterIdentifierStart, 179 /* 104 - h */ CharacterIdentifierStart, 180 /* 105 - i */ CharacterIdentifierStart, 181 /* 106 - j */ CharacterIdentifierStart, 182 /* 107 - k */ CharacterIdentifierStart, 183 /* 108 - l */ CharacterIdentifierStart, 184 /* 109 - m */ CharacterIdentifierStart, 185 /* 110 - n */ CharacterIdentifierStart, 186 /* 111 - o 
*/ CharacterIdentifierStart, 187 /* 112 - p */ CharacterIdentifierStart, 188 /* 113 - q */ CharacterIdentifierStart, 189 /* 114 - r */ CharacterIdentifierStart, 190 /* 115 - s */ CharacterIdentifierStart, 191 /* 116 - t */ CharacterIdentifierStart, 192 /* 117 - u */ CharacterCaselessU, 193 /* 118 - v */ CharacterIdentifierStart, 194 /* 119 - w */ CharacterIdentifierStart, 195 /* 120 - x */ CharacterIdentifierStart, 196 /* 121 - y */ CharacterIdentifierStart, 197 /* 122 - z */ CharacterIdentifierStart, 198 /* 123 - { */ CharacterEndMediaQueryOrSupports, 199 /* 124 - | */ CharacterVerticalBar, 200 /* 125 - } */ CharacterOther, 201 /* 126 - ~ */ CharacterTilde, 202 /* 127 - Delete */ CharacterOther, 203 }; 204 205 // Utility functions for the CSS tokenizer. 206 207 template <typename CharacterType> 208 static inline bool isCSSLetter(CharacterType character) 209 { 210 return character >= 128 || typesOfASCIICharacters[character] <= CharacterDash; 211 } 212 213 template <typename CharacterType> 214 static inline bool isCSSEscape(CharacterType character) 215 { 216 return character >= ' ' && character != 127; 217 } 218 219 template <typename CharacterType> 220 static inline bool isURILetter(CharacterType character) 221 { 222 return (character >= '*' && character != 127) || (character >= '#' && character <= '&') || character == '!'; 223 } 224 225 template <typename CharacterType> 226 static inline bool isIdentifierStartAfterDash(CharacterType* currentCharacter) 227 { 228 return isASCIIAlpha(currentCharacter[0]) || currentCharacter[0] == '_' || currentCharacter[0] >= 128 229 || (currentCharacter[0] == '\\' && isCSSEscape(currentCharacter[1])); 230 } 231 232 template <typename CharacterType> 233 static inline bool isEqualToCSSIdentifier(CharacterType* cssString, const char* constantString) 234 { 235 // Compare an character memory data with a zero terminated string. 236 do { 237 // The input must be part of an identifier if constantChar or constString 238 // contains '-'. 
Otherwise toASCIILowerUnchecked('\r') would be equal to '-'. 239 ASSERT((*constantString >= 'a' && *constantString <= 'z') || *constantString == '-'); 240 ASSERT(*constantString != '-' || isCSSLetter(*cssString)); 241 if (toASCIILowerUnchecked(*cssString++) != (*constantString++)) 242 return false; 243 } while (*constantString); 244 return true; 245 } 246 247 template <typename CharacterType> 248 static inline bool isEqualToCSSCaseSensitiveIdentifier(CharacterType* string, const char* constantString) 249 { 250 ASSERT(*constantString); 251 252 do { 253 if (*string++ != *constantString++) 254 return false; 255 } while (*constantString); 256 return true; 257 } 258 259 template <typename CharacterType> 260 static CharacterType* checkAndSkipEscape(CharacterType* currentCharacter) 261 { 262 // Returns with 0, if escape check is failed. Otherwise 263 // it returns with the following character. 264 ASSERT(*currentCharacter == '\\'); 265 266 ++currentCharacter; 267 if (!isCSSEscape(*currentCharacter)) 268 return 0; 269 270 if (isASCIIHexDigit(*currentCharacter)) { 271 int length = 6; 272 273 do { 274 ++currentCharacter; 275 } while (isASCIIHexDigit(*currentCharacter) && --length); 276 277 // Optional space after the escape sequence. 278 if (isHTMLSpace<CharacterType>(*currentCharacter)) 279 ++currentCharacter; 280 return currentCharacter; 281 } 282 return currentCharacter + 1; 283 } 284 285 template <typename CharacterType> 286 static inline CharacterType* skipWhiteSpace(CharacterType* currentCharacter) 287 { 288 while (isHTMLSpace<CharacterType>(*currentCharacter)) 289 ++currentCharacter; 290 return currentCharacter; 291 } 292 293 // Main CSS tokenizer functions. 

// Returns a reference to the 8-bit read cursor, so callers can both read
// and advance it in place.
template <>
inline LChar*& CSSTokenizer::currentCharacter<LChar>()
{
    return m_currentCharacter8;
}

// Returns a reference to the 16-bit read cursor, so callers can both read
// and advance it in place.
template <>
inline UChar*& CSSTokenizer::currentCharacter<UChar>()
{
    return m_currentCharacter16;
}

UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
{
    // Allocates and returns a CSSTokenizer owned buffer for storing
    // UTF-16 data. Used to get a suitable life span for UTF-16
    // strings, identifiers and URIs created by the tokenizer.
    OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);

    // Take the raw pointer before ownership is handed to m_cssStrings16;
    // the returned pointer is non-owning.
    UChar* bufferPtr = buffer.get();

    m_cssStrings16.append(buffer.release());
    return bufferPtr;
}

// Returns the start of the 8-bit input buffer.
template <>
inline LChar* CSSTokenizer::dataStart<LChar>()
{
    return m_dataStart8.get();
}

// Returns the start of the 16-bit input buffer.
template <>
inline UChar* CSSTokenizer::dataStart<UChar>()
{
    return m_dataStart16.get();
}

// Builds the location of the current token: its text (from the token start
// up to the read cursor), the line number recorded when the token started,
// and the token's offset from the start of the input buffer.
template <typename CharacterType>
inline CSSParserLocation CSSTokenizer::tokenLocation()
{
    CSSParserLocation location;
    location.token.init(tokenStart<CharacterType>(), currentCharacter<CharacterType>() - tokenStart<CharacterType>());
    location.lineNumber = m_tokenStartLineNumber;
    location.offset = tokenStart<CharacterType>() - dataStart<CharacterType>();
    return location;
}

// Returns the location of the current token, dispatching on whether the
// source is 8-bit or 16-bit.
CSSParserLocation CSSTokenizer::currentLocation()
{
    if (is8BitSource())
        return tokenLocation<LChar>();
    return tokenLocation<UChar>();
}

template <typename CharacterType>
inline bool CSSTokenizer::isIdentifierStart()
{
    // Check whether an identifier is started. A single leading '-' is
    // skipped before the check; the read cursor itself is not moved.
    return isIdentifierStartAfterDash((*currentCharacter<CharacterType>() != '-') ? currentCharacter<CharacterType>() : currentCharacter<CharacterType>() + 1);
}

// Controls how checkAndSkipString() reacts to an unescaped line break or an
// invalid escape inside a string: fail (AbortIfInvalid) or step over it
// (SkipInvalid).
enum CheckStringValidationMode {
    AbortIfInvalid,
    SkipInvalid
};

template <typename CharacterType>
static inline CharacterType* checkAndSkipString(CharacterType* currentCharacter, int quote, CheckStringValidationMode mode)
{
    // |currentCharacter| points just after the opening quote.
    // If mode is AbortIfInvalid and the string check fails it returns
    // with 0. Otherwise it returns with a pointer to the first
    // character after the string.
    while (true) {
        if (UNLIKELY(*currentCharacter == quote)) {
            // String parsing is successful.
            return currentCharacter + 1;
        }
        if (UNLIKELY(!*currentCharacter)) {
            // String parsing is successful up to end of input.
            return currentCharacter;
        }
        // (c | 0x1) == '\r' matches both '\f' (0x0c) and '\r' (0x0d).
        if (mode == AbortIfInvalid && UNLIKELY(*currentCharacter <= '\r' && (*currentCharacter == '\n' || (*currentCharacter | 0x1) == '\r'))) {
            // String parsing fails on an unescaped '\n', '\f' or '\r'.
            return 0;
        }

        if (LIKELY(currentCharacter[0] != '\\')) {
            ++currentCharacter;
        } else if (currentCharacter[1] == '\n' || currentCharacter[1] == '\f') {
            // Backslash followed by a line break: skip both.
            currentCharacter += 2;
        } else if (currentCharacter[1] == '\r') {
            // Backslash followed by "\r\n" (three characters) or a lone '\r'.
            currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;
        } else {
            CharacterType* next = checkAndSkipEscape(currentCharacter);
            if (!next) {
                if (mode == AbortIfInvalid)
                    return 0;
                // SkipInvalid: step over the backslash only and carry on.
                next = currentCharacter + 1;
            }
            currentCharacter = next;
        }
    }
}

// Decodes the escape sequence starting at the backslash |src| points to and
// advances |src| past it. The caller must already know the escape is valid
// (asserted). Returns the decoded code point: for a hex escape, the value of
// up to six hex digits; otherwise the escaped character itself.
template <typename CharacterType>
unsigned CSSTokenizer::parseEscape(CharacterType*& src)
{
    ASSERT(*src == '\\' && isCSSEscape(src[1]));

    unsigned unicode = 0;

    ++src;
    if (isASCIIHexDigit(*src)) {

        // At most six hex digits belong to the escape.
        int length = 6;

        do {
            unicode = (unicode << 4) + toASCIIHexValue(*src++);
        } while (--length && isASCIIHexDigit(*src));

        // Characters above 0x10ffff are not handled; they are replaced
        // with 0xfffd.
        if (unicode > 0x10ffff)
            unicode = 0xfffd;

        // Optional space after the escape sequence.
        if (isHTMLSpace<CharacterType>(*src))
            ++src;

        return unicode;
    }

    return *src++;
}

// Writes |unicode| to an 8-bit destination and advances |result| by one.
// The value must fit in 8 bits (asserted).
template <>
inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)
{
    ASSERT(unicode <= 0xff);
    *result = unicode;

    ++result;
}

// Writes |unicode| to a 16-bit destination and advances |result| by the
// number of UTF-16 code units written (one or two).
template <>
inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)
{
    // Replace unicode with a surrogate pair when it is bigger than 0xffff.
    if (U16_LENGTH(unicode) == 2) {
        *result++ = U16_LEAD(unicode);
        *result = U16_TRAIL(unicode);
    } else {
        *result = unicode;
    }

    ++result;
}

template <typename SrcCharacterType>
size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
{
    // The decoded form of an identifier (after resolving escape
    // sequences) will not contain more characters (ASCII or UTF-16
    // codepoints) than the input. This code can therefore ignore
    // escape sequences completely.
    // |src| is taken by value, so the caller's cursor is not moved; the
    // result is the number of source characters the identifier spans.
    SrcCharacterType* start = src;
    do {
        if (LIKELY(*src != '\\'))
            src++;
        else
            parseEscape<SrcCharacterType>(src);
    } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));

    return src - start;
}

// Copies an identifier from |src| to |result|, decoding escapes on the way,
// and advances both pointers. |hasEscape| is reset to false on entry and set
// to true if any escape is encountered.
// Returns false if an escape decodes to a value above 0xff while the
// destination is 8-bit; in that case |src| is rewound to the start of that
// escape so the caller can continue with a 16-bit destination.
template <typename SrcCharacterType, typename DestCharacterType>
inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
{
    hasEscape = false;
    do {
        if (LIKELY(*src != '\\')) {
            *result++ = *src++;
        } else {
            hasEscape = true;
            SrcCharacterType* savedEscapeStart = src;
            unsigned unicode = parseEscape<SrcCharacterType>(src);
            if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
                src = savedEscapeStart;
                return false;
            }
            UnicodeToChars(result, unicode);
        }
    } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));

    return true;
}

// Parses the identifier at the read cursor, writing the decoded characters
// through |result| and describing them in |resultString|. Falls back to a
// newly allocated 16-bit buffer when an escape does not fit in 8 bits.
template <typename CharacterType>
inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)
{
    // If a valid identifier start is found, we can safely
    // parse the identifier until the next invalid character.
    ASSERT(isIdentifierStart<CharacterType>());

    CharacterType* start = currentCharacter<CharacterType>();
    if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
        // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
        ASSERT(is8BitSource());
        // Size the buffer for the part already decoded plus an upper bound
        // for the remainder.
        UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(currentCharacter<CharacterType>()));
        UChar* start16 = result16;
        int i = 0;
        for (; i < result - start; i++)
            result16[i] = start[i];

        result16 += i;

        // The cursor was rewound to the offending escape, so this second
        // pass sees it again and sets hasEscape back to true.
        parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);

        resultString.init(start16, result16 - start16);

        return;
    }

    resultString.init(start, result - start);
}

template <typename SrcCharacterType>
size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
{
    // The decoded form of a CSS string (after resolving escape
    // sequences) will not contain more characters (ASCII or UTF-16
    // codepoints) than the input. This code can therefore ignore
    // escape sequences completely and just return the length of the
    // input string (possibly including terminating quote if any).
    SrcCharacterType* end = checkAndSkipString(src, quote, SkipInvalid);
    return end ? end - src : 0;
}

// Copies the body of a quoted string from |src| to |result|, decoding
// escapes and dropping backslash-newline sequences. |src| points just after
// the opening quote; on success it ends up after the closing quote, or at
// the terminating null if the input ends first.
// Returns false if an escape decodes to a value above 0xff while the
// destination is 8-bit; |src| is then rewound to the start of that escape.
template <typename SrcCharacterType, typename DestCharacterType>
inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
{
    while (true) {
        if (UNLIKELY(*src == quote)) {
            // String parsing is done.
            ++src;
            return true;
        }
        if (UNLIKELY(!*src)) {
            // String parsing is done, but don't advance pointer if at the end of input.
            return true;
        }
        if (LIKELY(src[0] != '\\')) {
            *result++ = *src++;
        } else if (src[1] == '\n' || src[1] == '\f') {
            // Escaped line break: contributes nothing to the string.
            src += 2;
        } else if (src[1] == '\r') {
            // Escaped "\r\n" (three characters) or lone '\r'.
            src += src[2] == '\n' ? 3 : 2;
        } else {
            SrcCharacterType* savedEscapeStart = src;
            unsigned unicode = parseEscape<SrcCharacterType>(src);
            if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
                src = savedEscapeStart;
                return false;
            }
            UnicodeToChars(result, unicode);
        }
    }

    // Not reached: the loop above only exits through a return.
    return true;
}

// Parses the quoted string at the read cursor (which points just after the
// opening quote), writing the decoded characters through |result| and
// describing them in |resultString|. Falls back to a newly allocated 16-bit
// buffer when an escape does not fit in 8 bits.
template <typename CharacterType>
inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)
{
    CharacterType* start = currentCharacter<CharacterType>();

    if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
        // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
        ASSERT(is8BitSource());
        // Size the buffer for the part already decoded plus an upper bound
        // for the remainder.
        UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(currentCharacter<CharacterType>(), quote));
        UChar* start16 = result16;
        int i = 0;
        for (; i < result - start; i++)
            result16[i] = start[i];

        result16 += i;

        parseStringInternal(currentCharacter<CharacterType>(), result16, quote);

        resultString.init(start16, result16 - start16);
        return;
    }

    resultString.init(start, result - start);
}

// Looks ahead from the read cursor (without moving it) for the argument of a
// url( ... ) token. On success returns true with:
//   |start| - first character of the URI (after the opening quote, if any),
//   |end|   - the closing ')',
//   |quote| - the quote character used, or 0 for an unquoted URI.
// Returns false if the quoted string or an escape is invalid, or if the URI
// is not followed by optional whitespace and ')'.
template <typename CharacterType>
inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UChar& quote)
{
    start = skipWhiteSpace(currentCharacter<CharacterType>());

    if (*start == '"' || *start == '\'') {
        quote = *start++;
        end = checkAndSkipString(start, quote, AbortIfInvalid);
        if (!end)
            return false;
    } else {
        quote = 0;
        end = start;
        while (isURILetter(*end)) {
            if (LIKELY(*end != '\\')) {
                ++end;
            } else {
                end = checkAndSkipEscape(end);
                if (!end)
                    return false;
            }
        }
    }

    end = skipWhiteSpace(end);
    if (*end != ')')
        return false;

    return true;
}

template <typename SrcCharacterType>
inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
{
    // The decoded form of a URI (after resolving escape sequences)
    // will not contain more characters (ASCII or UTF-16 codepoints)
    // than the input. This code can therefore ignore escape sequences
    // completely.
    // |src| is taken by value, so the caller's cursor is not moved.
    SrcCharacterType* start = src;
    if (quote) {
        ASSERT(quote == '"' || quote == '\'');
        return peekMaxStringLen(src, quote);
    }

    // parseEscape() asserts that the escape is valid, so the input is
    // expected to have been validated first (parseURI does so via findURI).
    while (isURILetter(*src)) {
        if (LIKELY(*src != '\\'))
            src++;
        else
            parseEscape<SrcCharacterType>(src);
    }

    return src - start;
}

// Copies a URI from |src| to |dest|, decoding escapes. A quoted URI is
// handled by parseStringInternal(); an unquoted one runs while
// isURILetter() holds.
// Returns false if an escape decodes to a value above 0xff while the
// destination is 8-bit. Note that in the unquoted path |src| is NOT rewound
// on failure; parseURI() resets the cursor itself before retrying.
template <typename SrcCharacterType, typename DestCharacterType>
inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
{
    if (quote) {
        ASSERT(quote == '"' || quote == '\'');
        return parseStringInternal(src, dest, quote);
    }

    while (isURILetter(*src)) {
        if (LIKELY(*src != '\\')) {
            *dest++ = *src++;
        } else {
            unsigned unicode = parseEscape<SrcCharacterType>(src);
            if (unicode > 0xff && sizeof(DestCharacterType) == 1)
                return false;
            UnicodeToChars(dest, unicode);
        }
    }

    return true;
}

// Tries to parse the argument of url( ... ) at the read cursor. If the text
// is not a well-formed URI the function returns without touching |string|,
// the cursor or m_token. Otherwise |string| receives the decoded URI, the
// cursor is moved past the closing ')' and m_token is set to URI.
template <typename CharacterType>
inline void CSSTokenizer::parseURI(CSSParserString& string)
{
    CharacterType* uriStart;
    CharacterType* uriEnd;
    UChar quote;
    if (!findURI(uriStart, uriEnd, quote))
        return;

    // Decode in place: the write pointer starts at the URI's first character.
    CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
    if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {
        string.init(uriStart, dest - uriStart);
    } else {
        // An escape sequence was encountered that can't be stored in 8 bits.
        // Reset the current character to the start of the URI and re-parse with
        // a 16-bit destination.
        ASSERT(is8BitSource());
        currentCharacter<CharacterType>() = uriStart;
        UChar* result16 = allocateStringBuffer16(peekMaxURILen(currentCharacter<CharacterType>(), quote));
        UChar* uriStart16 = result16;
        bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
        ASSERT_UNUSED(result, result);
        string.init(uriStart16, result16 - uriStart16);
    }

    // findURI() guarantees that |uriEnd| points at ')'; continue after it.
    currentCharacter<CharacterType>() = uriEnd + 1;
    m_token = URI;
}

// Tries to consume the part of a unicode-range token that starts at the '+'
// the read cursor points to (asserted). Accepted forms after the '+':
//   - one to six hex digits,
//   - hex digits and/or '?' wildcards, at most six characters in total,
//   - one to six hex digits, '-', one to six hex digits.
// On success the cursor is moved past the range and true is returned;
// otherwise the cursor is left unchanged and false is returned.
template <typename CharacterType>
inline bool CSSTokenizer::parseUnicodeRange()
{
    CharacterType* character = currentCharacter<CharacterType>() + 1;
    // Number of characters that may still be consumed.
    int length = 6;
    ASSERT(*currentCharacter<CharacterType>() == '+');

    while (isASCIIHexDigit(*character) && length) {
        ++character;
        --length;
    }

    if (length && *character == '?') {
        // At most 5 hex digits followed by a question mark.
        do {
            ++character;
            --length;
        } while (*character == '?' && length);
        currentCharacter<CharacterType>() = character;
        return true;
    }

    if (length < 6) {
        // At least one hex digit.
        if (character[0] == '-' && isASCIIHexDigit(character[1])) {
            // Followed by a dash and a hex digit.
            ++character;
            length = 6;
            do {
                ++character;
            } while (--length && isASCIIHexDigit(*character));
        }
        currentCharacter<CharacterType>() = character;
        return true;
    }
    return false;
}

// Tries to consume optional digits followed by 'n' (either case) at the read
// cursor. On success the cursor is moved past the 'n' and true is returned;
// otherwise the cursor is left unchanged and false is returned.
template <typename CharacterType>
bool CSSTokenizer::parseNthChild()
{
    CharacterType* character = currentCharacter<CharacterType>();

    while (isASCIIDigit(*character))
        ++character;
    if (isASCIIAlphaCaselessEqual(*character, 'n')) {
        currentCharacter<CharacterType>() = character + 1;
        return true;
    }
    return false;
}

// Tries to consume: optional whitespace, a '+' or '-' sign, optional
// whitespace, and at least one digit. On success the cursor is moved past
// the last digit and true is returned; otherwise the cursor is left
// unchanged and false is returned.
template <typename CharacterType>
bool CSSTokenizer::parseNthChildExtra()
{
    CharacterType* character = skipWhiteSpace(currentCharacter<CharacterType>());
    if (*character != '+' && *character != '-')
        return false;

    character = skipWhiteSpace(character + 1);
    if (!isASCIIDigit(*character))
        return false;

    do {
        ++character;
    } while (isASCIIDigit(*character));

    currentCharacter<CharacterType>() = character;
    return true;
}

// Classifies the function name that starts at the token start and is
// |length| characters long. Recognized names either set m_token to a
// dedicated token type or, for the nth-* pseudo classes, switch
// m_parsingMode to NthChildMode; both cases return true. Returns false when
// control reaches the end of the SWITCH.
// NOTE(review): SWITCH/CASE are not defined in this file; presumably they
// match |name| against the listed strings and leave m_token untouched when
// nothing matches - confirm against their definition.
template <typename CharacterType>
inline bool CSSTokenizer::detectFunctionTypeToken(int length)
{
    ASSERT(length > 0);
    CharacterType* name = tokenStart<CharacterType>();
    SWITCH(name, length) {
        CASE("not") {
            m_token = NOTFUNCTION;
            return true;
        }
        CASE("url") {
            m_token = URI;
            return true;
        }
        CASE("cue") {
            m_token = CUEFUNCTION;
            return true;
        }
        CASE("calc") {
            m_token = CALCFUNCTION;
            return true;
        }
        CASE("host") {
            m_token = HOSTFUNCTION;
            return true;
        }
        CASE("host-context") {
            m_token = HOSTCONTEXTFUNCTION;
            return true;
        }
        CASE("nth-child") {
            m_parsingMode = NthChildMode;
            return true;
        }
        CASE("nth-of-type") {
            m_parsingMode = NthChildMode;
            return true;
        }
        CASE("nth-last-child") {
            m_parsingMode = NthChildMode;
            return true;
        }
        CASE("nth-last-of-type") {
            m_parsingMode = NthChildMode;
            return true;
        }
    }
    return false;
}

// Maps the media query keywords "and", "not", "only" and "or" at the token
// start to their MEDIA_* token types. Must only be called in MediaQueryMode
// (asserted).
template <typename CharacterType>
inline void CSSTokenizer::detectMediaQueryToken(int length)
{
    ASSERT(m_parsingMode == MediaQueryMode);
    CharacterType* name = tokenStart<CharacterType>();

    SWITCH(name, length) {
        CASE("and") {
            m_token = MEDIA_AND;
        }
        CASE("not") {
            m_token = MEDIA_NOT;
        }
        CASE("only") {
            m_token = MEDIA_ONLY;
        }
        CASE("or") {
            m_token = MEDIA_OR;
        }
    }
}

// Maps the unit suffix of a number (|length| characters starting at |type|)
// to the matching dimension token type.
// NOTE(review): the comment inside the "dppx" case mentions a compile time
// guard, but no such guard is visible around that case; the comment may be
// stale.
template <typename CharacterType>
inline void CSSTokenizer::detectNumberToken(CharacterType* type, int length)
{
    ASSERT(length > 0);

    SWITCH(type, length) {
        CASE("cm") {
            m_token = CMS;
        }
        CASE("ch") {
            m_token = CHS;
        }
        CASE("deg") {
            m_token = DEGS;
        }
        CASE("dppx") {
            // There is a discussion about the name of this unit on www-style.
            // Keep this compile time guard in place until that is resolved.
            // http://lists.w3.org/Archives/Public/www-style/2012May/0915.html
            m_token = DPPX;
        }
        CASE("dpcm") {
            m_token = DPCM;
        }
        CASE("dpi") {
            m_token = DPI;
        }
        CASE("em") {
            m_token = EMS;
        }
        CASE("ex") {
            m_token = EXS;
        }
        CASE("fr") {
            m_token = FR;
        }
        CASE("grad") {
            m_token = GRADS;
        }
        CASE("hz") {
            m_token = HERTZ;
        }
        CASE("in") {
            m_token = INS;
        }
        CASE("khz") {
            m_token = KHERTZ;
        }
        CASE("mm") {
            m_token = MMS;
        }
        CASE("ms") {
            m_token = MSECS;
        }
        CASE("px") {
            m_token = PXS;
        }
        CASE("pt") {
            m_token = PTS;
        }
        CASE("pc") {
            m_token = PCS;
        }
        CASE("rad") {
            m_token = RADS;
        }
        CASE("rem") {
            m_token = REMS;
        }
        CASE("s") {
            m_token = SECS;
        }
        CASE("turn") {
            m_token = TURNS;
        }
        CASE("vw") {
            m_token = VW;
        }
        CASE("vh") {
            m_token = VH;
        }
        CASE("vmin") {
            m_token = VMIN;
        }
        CASE("vmax") {
            m_token = VMAX;
        }
        CASE("__qem") {
            m_token = QEMS;
        }
    }
}

// Maps the vendor-prefixed function names "-webkit-any" and "-webkit-calc"
// at the token start to ANYFUNCTION and CALCFUNCTION. |length| includes the
// leading dash.
template <typename CharacterType>
inline void CSSTokenizer::detectDashToken(int length)
{
    CharacterType* name = tokenStart<CharacterType>();

    // Ignore leading dash.
    ++name;
    --length;

    SWITCH(name, length) {
        CASE("webkit-any") {
            m_token = ANYFUNCTION;
        }
        CASE("webkit-calc") {
            m_token = CALCFUNCTION;
        }
    }
}

// Maps the at-keyword at the token start to its *_SYM token type. |length|
// includes the leading '@' (asserted to be present).
// Besides setting m_token, some keywords switch the parsing mode: "import"
// and "media" select MediaQueryMode; "supports" and
// "-internal-supports-condition" select SupportsMode.
// Further conditions visible below:
//   - "charset" is only recognized at the very start of the input;
//   - "keyframes" requires the unprefixed CSS animation runtime feature;
//   - all "-internal-*" keywords require m_internal;
//   - the page margin keywords (top-left, bottom-center, ...) and most
//     "-internal-*" keywords are ignored when the name contained an escape.
template <typename CharacterType>
inline void CSSTokenizer::detectAtToken(int length, bool hasEscape)
{
    CharacterType* name = tokenStart<CharacterType>();
    ASSERT(name[0] == '@' && length >= 2);

    // Ignore leading @.
    ++name;
    --length;

    // charset, font-face, import, media, namespace, page, supports,
    // -webkit-keyframes, keyframes, and -webkit-mediaquery are not affected by hasEscape.
    // NOTE(review): there is no "-webkit-mediaquery" case below, so that
    // part of the list looks stale. "viewport",
    // "-internal-keyframe-key-list" and "-internal-supports-condition" do
    // not check hasEscape either.
    SWITCH(name, length) {
        CASE("bottom-left") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMLEFT_SYM;
        }
        CASE("bottom-right") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMRIGHT_SYM;
        }
        CASE("bottom-center") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMCENTER_SYM;
        }
        CASE("bottom-left-corner") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMLEFTCORNER_SYM;
        }
        CASE("bottom-right-corner") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMRIGHTCORNER_SYM;
        }
        CASE("charset") {
            if (name - 1 == dataStart<CharacterType>())
                m_token = CHARSET_SYM;
        }
        CASE("font-face") {
            m_token = FONT_FACE_SYM;
        }
        CASE("import") {
            m_parsingMode = MediaQueryMode;
            m_token = IMPORT_SYM;
        }
        CASE("keyframes") {
            if (RuntimeEnabledFeatures::cssAnimationUnprefixedEnabled())
                m_token = KEYFRAMES_SYM;
        }
        CASE("left-top") {
            if (LIKELY(!hasEscape))
                m_token = LEFTTOP_SYM;
        }
        CASE("left-middle") {
            if (LIKELY(!hasEscape))
                m_token = LEFTMIDDLE_SYM;
        }
        CASE("left-bottom") {
            if (LIKELY(!hasEscape))
                m_token = LEFTBOTTOM_SYM;
        }
        CASE("media") {
            m_parsingMode = MediaQueryMode;
            m_token = MEDIA_SYM;
        }
        CASE("namespace") {
            m_token = NAMESPACE_SYM;
        }
        CASE("page") {
            m_token = PAGE_SYM;
        }
        CASE("right-top") {
            if (LIKELY(!hasEscape))
                m_token = RIGHTTOP_SYM;
        }
        CASE("right-middle") {
            if (LIKELY(!hasEscape))
                m_token = RIGHTMIDDLE_SYM;
        }
        CASE("right-bottom") {
            if (LIKELY(!hasEscape))
                m_token = RIGHTBOTTOM_SYM;
        }
        CASE("supports") {
            m_parsingMode = SupportsMode;
            m_token = SUPPORTS_SYM;
        }
        CASE("top-left") {
            if (LIKELY(!hasEscape))
                m_token = TOPLEFT_SYM;
        }
        CASE("top-right") {
            if (LIKELY(!hasEscape))
                m_token = TOPRIGHT_SYM;
        }
        CASE("top-center") {
            if (LIKELY(!hasEscape))
                m_token = TOPCENTER_SYM;
        }
        CASE("top-left-corner") {
            if (LIKELY(!hasEscape))
                m_token = TOPLEFTCORNER_SYM;
        }
        CASE("top-right-corner") {
            if (LIKELY(!hasEscape))
                m_token = TOPRIGHTCORNER_SYM;
        }
        CASE("viewport") {
            m_token = VIEWPORT_RULE_SYM;
        }
        CASE("-internal-rule") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_RULE_SYM;
        }
        CASE("-internal-decls") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_DECLS_SYM;
        }
        CASE("-internal-value") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_VALUE_SYM;
        }
        CASE("-webkit-keyframes") {
            m_token = WEBKIT_KEYFRAMES_SYM;
        }
        CASE("-internal-selector") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_SELECTOR_SYM;
        }
        CASE("-internal-keyframe-rule") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_KEYFRAME_RULE_SYM;
        }
        CASE("-internal-keyframe-key-list") {
            if (!m_internal)
                return;
            m_token = INTERNAL_KEYFRAME_KEY_LIST_SYM;
        }
        CASE("-internal-supports-condition") {
            if (!m_internal)
                return;
            m_parsingMode = SupportsMode;
            m_token = INTERNAL_SUPPORTS_CONDITION_SYM;
        }
    }
}

// Maps the @supports condition keywords "or", "and" and "not" at the token
// start to their SUPPORTS_* token types. Must only be called in SupportsMode
// (asserted).
template <typename CharacterType>
inline void CSSTokenizer::detectSupportsToken(int length)
{
    ASSERT(m_parsingMode == SupportsMode);
    CharacterType* name = tokenStart<CharacterType>();

    SWITCH(name, length) {
        CASE("or") {
            m_token = SUPPORTS_OR;
        }
        CASE("and") {
            m_token = SUPPORTS_AND;
        }
        CASE("not") {
            m_token = SUPPORTS_NOT;
        }
    }
}

template <typename SrcCharacterType>
int CSSTokenizer::realLex(void* yylvalWithoutType)
{
    YYSTYPE* yylval = static_cast<YYSTYPE*>(yylvalWithoutType);
    // Write pointer for the next character.
1121 SrcCharacterType* result; 1122 CSSParserString resultString; 1123 bool hasEscape; 1124 1125 // The input buffer is terminated by a \0 character, so 1126 // it is safe to read one character ahead of a known non-null. 1127 #if ENABLE(ASSERT) 1128 // In debug we check with an ASSERT that the length is > 0 for string types. 1129 yylval->string.clear(); 1130 #endif 1131 1132 restartAfterComment: 1133 result = currentCharacter<SrcCharacterType>(); 1134 setTokenStart(result); 1135 m_tokenStartLineNumber = m_lineNumber; 1136 m_token = *currentCharacter<SrcCharacterType>(); 1137 ++currentCharacter<SrcCharacterType>(); 1138 1139 switch ((m_token <= 127) ? typesOfASCIICharacters[m_token] : CharacterIdentifierStart) { 1140 case CharacterCaselessU: 1141 if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '+')) { 1142 if (parseUnicodeRange<SrcCharacterType>()) { 1143 m_token = UNICODERANGE; 1144 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1145 break; 1146 } 1147 } 1148 // Fall through to CharacterIdentifierStart. 1149 1150 case CharacterIdentifierStart: 1151 --currentCharacter<SrcCharacterType>(); 1152 parseIdentifier(result, yylval->string, hasEscape); 1153 m_token = IDENT; 1154 1155 if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '(')) { 1156 if (m_parsingMode == SupportsMode && !hasEscape) { 1157 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>()); 1158 if (m_token != IDENT) 1159 break; 1160 } 1161 1162 m_token = FUNCTION; 1163 if (!hasEscape) 1164 detectFunctionTypeToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>()); 1165 1166 // Skip parenthesis 1167 ++currentCharacter<SrcCharacterType>(); 1168 ++result; 1169 1170 if (m_token == URI) { 1171 m_token = FUNCTION; 1172 // Check whether it is really an URI. 
1173 if (yylval->string.is8Bit()) 1174 parseURI<LChar>(yylval->string); 1175 else 1176 parseURI<UChar>(yylval->string); 1177 } 1178 } else if (UNLIKELY(m_parsingMode != NormalMode) && !hasEscape) { 1179 if (m_parsingMode == MediaQueryMode) { 1180 detectMediaQueryToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>()); 1181 } else if (m_parsingMode == SupportsMode) { 1182 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>()); 1183 } else if (m_parsingMode == NthChildMode && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[0], 'n')) { 1184 if (result - tokenStart<SrcCharacterType>() == 1) { 1185 // String "n" is IDENT but "n+1" is NTH. 1186 if (parseNthChildExtra<SrcCharacterType>()) { 1187 m_token = NTH; 1188 yylval->string.m_length = currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>(); 1189 } 1190 } else if (result - tokenStart<SrcCharacterType>() >= 2 && tokenStart<SrcCharacterType>()[1] == '-') { 1191 // String "n-" is IDENT but "n-1" is NTH. 1192 // Set currentCharacter to '-' to continue parsing. 1193 SrcCharacterType* nextCharacter = result; 1194 currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 1; 1195 if (parseNthChildExtra<SrcCharacterType>()) { 1196 m_token = NTH; 1197 yylval->string.setLength(currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1198 } else { 1199 // Revert the change to currentCharacter if unsuccessful. 1200 currentCharacter<SrcCharacterType>() = nextCharacter; 1201 } 1202 } 1203 } 1204 } 1205 break; 1206 1207 case CharacterDot: 1208 if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0])) 1209 break; 1210 // Fall through to CharacterNumber. 1211 1212 case CharacterNumber: { 1213 bool dotSeen = (m_token == '.'); 1214 1215 while (true) { 1216 if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0])) { 1217 // Only one dot is allowed for a number, 1218 // and it must be followed by a digit. 
1219 if (currentCharacter<SrcCharacterType>()[0] != '.' || dotSeen || !isASCIIDigit(currentCharacter<SrcCharacterType>()[1])) 1220 break; 1221 dotSeen = true; 1222 } 1223 ++currentCharacter<SrcCharacterType>(); 1224 } 1225 1226 if (UNLIKELY(m_parsingMode == NthChildMode) && !dotSeen && isASCIIAlphaCaselessEqual(*currentCharacter<SrcCharacterType>(), 'n')) { 1227 // "[0-9]+n" is always an NthChild. 1228 ++currentCharacter<SrcCharacterType>(); 1229 parseNthChildExtra<SrcCharacterType>(); 1230 m_token = NTH; 1231 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1232 break; 1233 } 1234 1235 // We need to take care of units like 'em' or 'ex'. 1236 SrcCharacterType* character = currentCharacter<SrcCharacterType>(); 1237 if (isASCIIAlphaCaselessEqual(*character, 'e')) { 1238 ASSERT(character - tokenStart<SrcCharacterType>() > 0); 1239 ++character; 1240 if (*character == '-' || *character == '+' || isASCIIDigit(*character)) { 1241 ++character; 1242 while (isASCIIDigit(*character)) 1243 ++character; 1244 // Use FLOATTOKEN if the string contains exponents. 1245 dotSeen = true; 1246 currentCharacter<SrcCharacterType>() = character; 1247 } 1248 } 1249 1250 yylval->number = charactersToDouble(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1251 1252 // Type of the function. 1253 if (isIdentifierStart<SrcCharacterType>()) { 1254 SrcCharacterType* type = currentCharacter<SrcCharacterType>(); 1255 result = currentCharacter<SrcCharacterType>(); 1256 1257 parseIdentifier(result, resultString, hasEscape); 1258 1259 m_token = DIMEN; 1260 if (!hasEscape) 1261 detectNumberToken(type, currentCharacter<SrcCharacterType>() - type); 1262 1263 if (m_token == DIMEN) { 1264 // The decoded number is overwritten, but this is intentional. 
1265 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1266 } 1267 } else if (*currentCharacter<SrcCharacterType>() == '%') { 1268 // Although the CSS grammar says {num}% we follow 1269 // webkit at the moment which uses {num}%+. 1270 do { 1271 ++currentCharacter<SrcCharacterType>(); 1272 } while (*currentCharacter<SrcCharacterType>() == '%'); 1273 m_token = PERCENTAGE; 1274 } else { 1275 m_token = dotSeen ? FLOATTOKEN : INTEGER; 1276 } 1277 break; 1278 } 1279 1280 case CharacterDash: 1281 if (isIdentifierStartAfterDash(currentCharacter<SrcCharacterType>())) { 1282 --currentCharacter<SrcCharacterType>(); 1283 parseIdentifier(result, resultString, hasEscape); 1284 m_token = IDENT; 1285 1286 if (*currentCharacter<SrcCharacterType>() == '(') { 1287 m_token = FUNCTION; 1288 if (!hasEscape) 1289 detectDashToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>()); 1290 ++currentCharacter<SrcCharacterType>(); 1291 ++result; 1292 } else if (UNLIKELY(m_parsingMode == NthChildMode) && !hasEscape && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[1], 'n')) { 1293 if (result - tokenStart<SrcCharacterType>() == 2) { 1294 // String "-n" is IDENT but "-n+1" is NTH. 1295 if (parseNthChildExtra<SrcCharacterType>()) { 1296 m_token = NTH; 1297 result = currentCharacter<SrcCharacterType>(); 1298 } 1299 } else if (result - tokenStart<SrcCharacterType>() >= 3 && tokenStart<SrcCharacterType>()[2] == '-') { 1300 // String "-n-" is IDENT but "-n-1" is NTH. 1301 // Set currentCharacter to second '-' of '-n-' to continue parsing. 1302 SrcCharacterType* nextCharacter = result; 1303 currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 2; 1304 if (parseNthChildExtra<SrcCharacterType>()) { 1305 m_token = NTH; 1306 result = currentCharacter<SrcCharacterType>(); 1307 } else { 1308 // Revert the change to currentCharacter if unsuccessful. 
1309 currentCharacter<SrcCharacterType>() = nextCharacter; 1310 } 1311 } 1312 resultString.setLength(result - tokenStart<SrcCharacterType>()); 1313 } 1314 yylval->string = resultString; 1315 } else if (currentCharacter<SrcCharacterType>()[0] == '-' && currentCharacter<SrcCharacterType>()[1] == '>') { 1316 currentCharacter<SrcCharacterType>() += 2; 1317 m_token = SGML_CD; 1318 } else if (UNLIKELY(m_parsingMode == NthChildMode)) { 1319 // "-[0-9]+n" is always an NthChild. 1320 if (parseNthChild<SrcCharacterType>()) { 1321 parseNthChildExtra<SrcCharacterType>(); 1322 m_token = NTH; 1323 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1324 } 1325 } 1326 break; 1327 1328 case CharacterOther: 1329 // m_token is simply the current character. 1330 break; 1331 1332 case CharacterNull: 1333 // Do not advance pointer at the end of input. 1334 --currentCharacter<SrcCharacterType>(); 1335 break; 1336 1337 case CharacterWhiteSpace: 1338 m_token = WHITESPACE; 1339 // Might start with a '\n'. 
1340 --currentCharacter<SrcCharacterType>(); 1341 do { 1342 if (*currentCharacter<SrcCharacterType>() == '\n') 1343 ++m_lineNumber; 1344 ++currentCharacter<SrcCharacterType>(); 1345 } while (*currentCharacter<SrcCharacterType>() <= ' ' && (typesOfASCIICharacters[*currentCharacter<SrcCharacterType>()] == CharacterWhiteSpace)); 1346 break; 1347 1348 case CharacterEndMediaQueryOrSupports: 1349 if (m_parsingMode == MediaQueryMode || m_parsingMode == SupportsMode) 1350 m_parsingMode = NormalMode; 1351 break; 1352 1353 case CharacterEndNthChild: 1354 if (m_parsingMode == NthChildMode) 1355 m_parsingMode = NormalMode; 1356 break; 1357 1358 case CharacterQuote: 1359 if (checkAndSkipString(currentCharacter<SrcCharacterType>(), m_token, AbortIfInvalid)) { 1360 ++result; 1361 parseString<SrcCharacterType>(result, yylval->string, m_token); 1362 m_token = STRING; 1363 } 1364 break; 1365 1366 case CharacterExclamationMark: { 1367 SrcCharacterType* start = skipWhiteSpace(currentCharacter<SrcCharacterType>()); 1368 if (isEqualToCSSIdentifier(start, "important")) { 1369 m_token = IMPORTANT_SYM; 1370 currentCharacter<SrcCharacterType>() = start + 9; 1371 } 1372 break; 1373 } 1374 1375 case CharacterHashmark: { 1376 SrcCharacterType* start = currentCharacter<SrcCharacterType>(); 1377 result = currentCharacter<SrcCharacterType>(); 1378 1379 if (isASCIIDigit(*currentCharacter<SrcCharacterType>())) { 1380 // This must be a valid hex number token. 1381 do { 1382 ++currentCharacter<SrcCharacterType>(); 1383 } while (isASCIIHexDigit(*currentCharacter<SrcCharacterType>())); 1384 m_token = HEX; 1385 yylval->string.init(start, currentCharacter<SrcCharacterType>() - start); 1386 } else if (isIdentifierStart<SrcCharacterType>()) { 1387 m_token = IDSEL; 1388 parseIdentifier(result, yylval->string, hasEscape); 1389 if (!hasEscape) { 1390 // Check whether the identifier is also a valid hex number. 
1391 SrcCharacterType* current = start; 1392 m_token = HEX; 1393 do { 1394 if (!isASCIIHexDigit(*current)) { 1395 m_token = IDSEL; 1396 break; 1397 } 1398 ++current; 1399 } while (current < result); 1400 } 1401 } 1402 break; 1403 } 1404 1405 case CharacterSlash: 1406 // Ignore comments. They are not even considered as white spaces. 1407 if (*currentCharacter<SrcCharacterType>() == '*') { 1408 const CSSParserLocation startLocation = currentLocation(); 1409 if (m_parser.m_observer) { 1410 unsigned startOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>() - 1; // Start with a slash. 1411 m_parser.m_observer->startComment(startOffset - m_parsedTextPrefixLength); 1412 } 1413 ++currentCharacter<SrcCharacterType>(); 1414 while (currentCharacter<SrcCharacterType>()[0] != '*' || currentCharacter<SrcCharacterType>()[1] != '/') { 1415 if (*currentCharacter<SrcCharacterType>() == '\n') 1416 ++m_lineNumber; 1417 if (*currentCharacter<SrcCharacterType>() == '\0') { 1418 // Unterminated comments are simply ignored. 
1419 currentCharacter<SrcCharacterType>() -= 2; 1420 m_parser.reportError(startLocation, UnterminatedCommentCSSError); 1421 break; 1422 } 1423 ++currentCharacter<SrcCharacterType>(); 1424 } 1425 currentCharacter<SrcCharacterType>() += 2; 1426 if (m_parser.m_observer) { 1427 unsigned endOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>(); 1428 unsigned userTextEndOffset = static_cast<unsigned>(m_length - 1 - m_parsedTextSuffixLength); 1429 m_parser.m_observer->endComment(std::min(endOffset, userTextEndOffset) - m_parsedTextPrefixLength); 1430 } 1431 goto restartAfterComment; 1432 } 1433 break; 1434 1435 case CharacterDollar: 1436 if (*currentCharacter<SrcCharacterType>() == '=') { 1437 ++currentCharacter<SrcCharacterType>(); 1438 m_token = ENDSWITH; 1439 } 1440 break; 1441 1442 case CharacterAsterisk: 1443 if (*currentCharacter<SrcCharacterType>() == '=') { 1444 ++currentCharacter<SrcCharacterType>(); 1445 m_token = CONTAINS; 1446 } 1447 break; 1448 1449 case CharacterPlus: 1450 if (UNLIKELY(m_parsingMode == NthChildMode)) { 1451 // Simplest case. "+[0-9]*n" is always NthChild. 1452 if (parseNthChild<SrcCharacterType>()) { 1453 parseNthChildExtra<SrcCharacterType>(); 1454 m_token = NTH; 1455 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>()); 1456 } 1457 } 1458 break; 1459 1460 case CharacterLess: 1461 if (currentCharacter<SrcCharacterType>()[0] == '!' && currentCharacter<SrcCharacterType>()[1] == '-' && currentCharacter<SrcCharacterType>()[2] == '-') { 1462 currentCharacter<SrcCharacterType>() += 3; 1463 m_token = SGML_CD; 1464 } 1465 break; 1466 1467 case CharacterAt: 1468 if (isIdentifierStart<SrcCharacterType>()) { 1469 m_token = ATKEYWORD; 1470 ++result; 1471 parseIdentifier(result, resultString, hasEscape); 1472 // The standard enables unicode escapes in at-rules. 
In this case only the resultString will contain the 1473 // correct identifier, hence we have to use it to determine its length instead of the usual pointer arithmetic. 1474 detectAtToken<SrcCharacterType>(resultString.length() + 1, hasEscape); 1475 } 1476 break; 1477 1478 case CharacterBackSlash: 1479 if (isCSSEscape(*currentCharacter<SrcCharacterType>())) { 1480 --currentCharacter<SrcCharacterType>(); 1481 parseIdentifier(result, yylval->string, hasEscape); 1482 m_token = IDENT; 1483 } 1484 break; 1485 1486 case CharacterXor: 1487 if (*currentCharacter<SrcCharacterType>() == '=') { 1488 ++currentCharacter<SrcCharacterType>(); 1489 m_token = BEGINSWITH; 1490 } 1491 break; 1492 1493 case CharacterVerticalBar: 1494 if (*currentCharacter<SrcCharacterType>() == '=') { 1495 ++currentCharacter<SrcCharacterType>(); 1496 m_token = DASHMATCH; 1497 } 1498 break; 1499 1500 case CharacterTilde: 1501 if (*currentCharacter<SrcCharacterType>() == '=') { 1502 ++currentCharacter<SrcCharacterType>(); 1503 m_token = INCLUDES; 1504 } 1505 break; 1506 1507 default: 1508 ASSERT_NOT_REACHED(); 1509 break; 1510 } 1511 1512 return m_token; 1513 } 1514 1515 template <> 1516 inline void CSSTokenizer::setTokenStart<LChar>(LChar* tokenStart) 1517 { 1518 m_tokenStart.ptr8 = tokenStart; 1519 } 1520 1521 template <> 1522 inline void CSSTokenizer::setTokenStart<UChar>(UChar* tokenStart) 1523 { 1524 m_tokenStart.ptr16 = tokenStart; 1525 } 1526 1527 void CSSTokenizer::setupTokenizer(const char* prefix, unsigned prefixLength, const String& string, const char* suffix, unsigned suffixLength) 1528 { 1529 m_parsedTextPrefixLength = prefixLength; 1530 m_parsedTextSuffixLength = suffixLength; 1531 unsigned stringLength = string.length(); 1532 unsigned length = stringLength + m_parsedTextPrefixLength + m_parsedTextSuffixLength + 1; 1533 m_length = length; 1534 1535 if (!stringLength || string.is8Bit()) { 1536 m_dataStart8 = adoptArrayPtr(new LChar[length]); 1537 for (unsigned i = 0; i < 
m_parsedTextPrefixLength; i++) 1538 m_dataStart8[i] = prefix[i]; 1539 1540 if (stringLength) 1541 memcpy(m_dataStart8.get() + m_parsedTextPrefixLength, string.characters8(), stringLength * sizeof(LChar)); 1542 1543 unsigned start = m_parsedTextPrefixLength + stringLength; 1544 unsigned end = start + suffixLength; 1545 for (unsigned i = start; i < end; i++) 1546 m_dataStart8[i] = suffix[i - start]; 1547 1548 m_dataStart8[length - 1] = 0; 1549 1550 m_is8BitSource = true; 1551 m_currentCharacter8 = m_dataStart8.get(); 1552 m_currentCharacter16 = 0; 1553 setTokenStart<LChar>(m_currentCharacter8); 1554 m_lexFunc = &CSSTokenizer::realLex<LChar>; 1555 return; 1556 } 1557 1558 m_dataStart16 = adoptArrayPtr(new UChar[length]); 1559 for (unsigned i = 0; i < m_parsedTextPrefixLength; i++) 1560 m_dataStart16[i] = prefix[i]; 1561 1562 ASSERT(stringLength); 1563 memcpy(m_dataStart16.get() + m_parsedTextPrefixLength, string.characters16(), stringLength * sizeof(UChar)); 1564 1565 unsigned start = m_parsedTextPrefixLength + stringLength; 1566 unsigned end = start + suffixLength; 1567 for (unsigned i = start; i < end; i++) 1568 m_dataStart16[i] = suffix[i - start]; 1569 1570 m_dataStart16[length - 1] = 0; 1571 1572 m_is8BitSource = false; 1573 m_currentCharacter8 = 0; 1574 m_currentCharacter16 = m_dataStart16.get(); 1575 setTokenStart<UChar>(m_currentCharacter16); 1576 m_lexFunc = &CSSTokenizer::realLex<UChar>; 1577 } 1578 1579 } // namespace blink 1580