Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2003 Lars Knoll (knoll (at) kde.org)
      3  * Copyright (C) 2005 Allan Sandfeld Jensen (kde (at) carewolf.com)
      4  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.
      5  * Copyright (C) 2007 Nicholas Shanks <webkit (at) nickshanks.com>
      6  * Copyright (C) 2008 Eric Seidel <eric (at) webkit.org>
      7  * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
      8  * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.
      9  * Copyright (C) 2012 Intel Corporation. All rights reserved.
     10  *
     11  * This library is free software; you can redistribute it and/or
     12  * modify it under the terms of the GNU Library General Public
     13  * License as published by the Free Software Foundation; either
     14  * version 2 of the License, or (at your option) any later version.
     15  *
     16  * This library is distributed in the hope that it will be useful,
     17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19  * Library General Public License for more details.
     20  *
     21  * You should have received a copy of the GNU Library General Public License
     22  * along with this library; see the file COPYING.LIB.  If not, write to
     23  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     24  * Boston, MA 02110-1301, USA.
     25  */
     26 
     27 #include "config.h"
     28 #include "core/css/parser/CSSTokenizer.h"
     29 
     30 #include "core/css/CSSKeyframeRule.h"
     31 #include "core/css/MediaQuery.h"
     32 #include "core/css/StyleRule.h"
     33 #include "core/css/parser/BisonCSSParser.h"
     34 #include "core/css/parser/CSSParserValues.h"
     35 #include "core/html/parser/HTMLParserIdioms.h"
     36 #include "core/svg/SVGParserUtilities.h"
     37 
     38 namespace blink {
     39 
     40 #include "core/CSSGrammar.h"
     41 
// Classification assigned to each ASCII character by the
// typesOfASCIICharacters table below.
//
// The declaration order matters: isCSSLetter() treats every value that
// compares <= CharacterDash as a character allowed inside an identifier, so
// the first four enumerators must stay first and stay together.
enum CharacterType {
    // Types for the main switch.

    // The first 4 types must be grouped together, as they
    // represent the allowed chars in an identifier.
    CharacterCaselessU,
    CharacterIdentifierStart,
    CharacterNumber,
    CharacterDash,

    // Every other classification; none of these is an identifier character.
    CharacterOther,
    CharacterNull,
    CharacterWhiteSpace,
    CharacterEndMediaQueryOrSupports,
    CharacterEndNthChild,
    CharacterQuote,
    CharacterExclamationMark,
    CharacterHashmark,
    CharacterDollar,
    CharacterAsterisk,
    CharacterPlus,
    CharacterDot,
    CharacterSlash,
    CharacterLess,
    CharacterAt,
    CharacterBackSlash,
    CharacterXor,
    CharacterVerticalBar,
    CharacterTilde,
};
     72 
// 128 ASCII codes
//
// Lookup table indexed by a 7-bit ASCII code; each entry is the CharacterType
// of that character. Callers must only index it with values below 128
// (isCSSLetter() checks for >= 128 before consulting the table).
static const CharacterType typesOfASCIICharacters[128] = {
/*   0 - Null               */ CharacterNull,
/*   1 - Start of Heading   */ CharacterOther,
/*   2 - Start of Text      */ CharacterOther,
/*   3 - End of Text        */ CharacterOther,
/*   4 - End of Transm.     */ CharacterOther,
/*   5 - Enquiry            */ CharacterOther,
/*   6 - Acknowledgment     */ CharacterOther,
/*   7 - Bell               */ CharacterOther,
/*   8 - Back Space         */ CharacterOther,
/*   9 - Horizontal Tab     */ CharacterWhiteSpace,
/*  10 - Line Feed          */ CharacterWhiteSpace,
/*  11 - Vertical Tab       */ CharacterOther,
/*  12 - Form Feed          */ CharacterWhiteSpace,
/*  13 - Carriage Return    */ CharacterWhiteSpace,
/*  14 - Shift Out          */ CharacterOther,
/*  15 - Shift In           */ CharacterOther,
/*  16 - Data Line Escape   */ CharacterOther,
/*  17 - Device Control 1   */ CharacterOther,
/*  18 - Device Control 2   */ CharacterOther,
/*  19 - Device Control 3   */ CharacterOther,
/*  20 - Device Control 4   */ CharacterOther,
/*  21 - Negative Ack.      */ CharacterOther,
/*  22 - Synchronous Idle   */ CharacterOther,
/*  23 - End of Trans. Block*/ CharacterOther,
/*  24 - Cancel             */ CharacterOther,
/*  25 - End of Medium      */ CharacterOther,
/*  26 - Substitute         */ CharacterOther,
/*  27 - Escape             */ CharacterOther,
/*  28 - File Separator     */ CharacterOther,
/*  29 - Group Separator    */ CharacterOther,
/*  30 - Record Separator   */ CharacterOther,
/*  31 - Unit Separator     */ CharacterOther,
/*  32 - Space              */ CharacterWhiteSpace,
/*  33 - !                  */ CharacterExclamationMark,
/*  34 - "                  */ CharacterQuote,
/*  35 - #                  */ CharacterHashmark,
/*  36 - $                  */ CharacterDollar,
/*  37 - %                  */ CharacterOther,
/*  38 - &                  */ CharacterOther,
/*  39 - '                  */ CharacterQuote,
/*  40 - (                  */ CharacterOther,
/*  41 - )                  */ CharacterEndNthChild,
/*  42 - *                  */ CharacterAsterisk,
/*  43 - +                  */ CharacterPlus,
/*  44 - ,                  */ CharacterOther,
/*  45 - -                  */ CharacterDash,
/*  46 - .                  */ CharacterDot,
/*  47 - /                  */ CharacterSlash,
/*  48 - 0                  */ CharacterNumber,
/*  49 - 1                  */ CharacterNumber,
/*  50 - 2                  */ CharacterNumber,
/*  51 - 3                  */ CharacterNumber,
/*  52 - 4                  */ CharacterNumber,
/*  53 - 5                  */ CharacterNumber,
/*  54 - 6                  */ CharacterNumber,
/*  55 - 7                  */ CharacterNumber,
/*  56 - 8                  */ CharacterNumber,
/*  57 - 9                  */ CharacterNumber,
/*  58 - :                  */ CharacterOther,
/*  59 - ;                  */ CharacterEndMediaQueryOrSupports,
/*  60 - <                  */ CharacterLess,
/*  61 - =                  */ CharacterOther,
/*  62 - >                  */ CharacterOther,
/*  63 - ?                  */ CharacterOther,
/*  64 - @                  */ CharacterAt,
/*  65 - A                  */ CharacterIdentifierStart,
/*  66 - B                  */ CharacterIdentifierStart,
/*  67 - C                  */ CharacterIdentifierStart,
/*  68 - D                  */ CharacterIdentifierStart,
/*  69 - E                  */ CharacterIdentifierStart,
/*  70 - F                  */ CharacterIdentifierStart,
/*  71 - G                  */ CharacterIdentifierStart,
/*  72 - H                  */ CharacterIdentifierStart,
/*  73 - I                  */ CharacterIdentifierStart,
/*  74 - J                  */ CharacterIdentifierStart,
/*  75 - K                  */ CharacterIdentifierStart,
/*  76 - L                  */ CharacterIdentifierStart,
/*  77 - M                  */ CharacterIdentifierStart,
/*  78 - N                  */ CharacterIdentifierStart,
/*  79 - O                  */ CharacterIdentifierStart,
/*  80 - P                  */ CharacterIdentifierStart,
/*  81 - Q                  */ CharacterIdentifierStart,
/*  82 - R                  */ CharacterIdentifierStart,
/*  83 - S                  */ CharacterIdentifierStart,
/*  84 - T                  */ CharacterIdentifierStart,
/*  85 - U                  */ CharacterCaselessU,
/*  86 - V                  */ CharacterIdentifierStart,
/*  87 - W                  */ CharacterIdentifierStart,
/*  88 - X                  */ CharacterIdentifierStart,
/*  89 - Y                  */ CharacterIdentifierStart,
/*  90 - Z                  */ CharacterIdentifierStart,
/*  91 - [                  */ CharacterOther,
/*  92 - \                  */ CharacterBackSlash,
/*  93 - ]                  */ CharacterOther,
/*  94 - ^                  */ CharacterXor,
/*  95 - _                  */ CharacterIdentifierStart,
/*  96 - `                  */ CharacterOther,
/*  97 - a                  */ CharacterIdentifierStart,
/*  98 - b                  */ CharacterIdentifierStart,
/*  99 - c                  */ CharacterIdentifierStart,
/* 100 - d                  */ CharacterIdentifierStart,
/* 101 - e                  */ CharacterIdentifierStart,
/* 102 - f                  */ CharacterIdentifierStart,
/* 103 - g                  */ CharacterIdentifierStart,
/* 104 - h                  */ CharacterIdentifierStart,
/* 105 - i                  */ CharacterIdentifierStart,
/* 106 - j                  */ CharacterIdentifierStart,
/* 107 - k                  */ CharacterIdentifierStart,
/* 108 - l                  */ CharacterIdentifierStart,
/* 109 - m                  */ CharacterIdentifierStart,
/* 110 - n                  */ CharacterIdentifierStart,
/* 111 - o                  */ CharacterIdentifierStart,
/* 112 - p                  */ CharacterIdentifierStart,
/* 113 - q                  */ CharacterIdentifierStart,
/* 114 - r                  */ CharacterIdentifierStart,
/* 115 - s                  */ CharacterIdentifierStart,
/* 116 - t                  */ CharacterIdentifierStart,
/* 117 - u                  */ CharacterCaselessU,
/* 118 - v                  */ CharacterIdentifierStart,
/* 119 - w                  */ CharacterIdentifierStart,
/* 120 - x                  */ CharacterIdentifierStart,
/* 121 - y                  */ CharacterIdentifierStart,
/* 122 - z                  */ CharacterIdentifierStart,
/* 123 - {                  */ CharacterEndMediaQueryOrSupports,
/* 124 - |                  */ CharacterVerticalBar,
/* 125 - }                  */ CharacterOther,
/* 126 - ~                  */ CharacterTilde,
/* 127 - Delete             */ CharacterOther,
};
    204 
    205 // Utility functions for the CSS tokenizer.
    206 
    207 template <typename CharacterType>
    208 static inline bool isCSSLetter(CharacterType character)
    209 {
    210     return character >= 128 || typesOfASCIICharacters[character] <= CharacterDash;
    211 }
    212 
    213 template <typename CharacterType>
    214 static inline bool isCSSEscape(CharacterType character)
    215 {
    216     return character >= ' ' && character != 127;
    217 }
    218 
    219 template <typename CharacterType>
    220 static inline bool isURILetter(CharacterType character)
    221 {
    222     return (character >= '*' && character != 127) || (character >= '#' && character <= '&') || character == '!';
    223 }
    224 
    225 template <typename CharacterType>
    226 static inline bool isIdentifierStartAfterDash(CharacterType* currentCharacter)
    227 {
    228     return isASCIIAlpha(currentCharacter[0]) || currentCharacter[0] == '_' || currentCharacter[0] >= 128
    229         || (currentCharacter[0] == '\\' && isCSSEscape(currentCharacter[1]));
    230 }
    231 
    232 template <typename CharacterType>
    233 static inline bool isEqualToCSSIdentifier(CharacterType* cssString, const char* constantString)
    234 {
    235     // Compare an character memory data with a zero terminated string.
    236     do {
    237         // The input must be part of an identifier if constantChar or constString
    238         // contains '-'. Otherwise toASCIILowerUnchecked('\r') would be equal to '-'.
    239         ASSERT((*constantString >= 'a' && *constantString <= 'z') || *constantString == '-');
    240         ASSERT(*constantString != '-' || isCSSLetter(*cssString));
    241         if (toASCIILowerUnchecked(*cssString++) != (*constantString++))
    242             return false;
    243     } while (*constantString);
    244     return true;
    245 }
    246 
    247 template <typename CharacterType>
    248 static inline bool isEqualToCSSCaseSensitiveIdentifier(CharacterType* string, const char* constantString)
    249 {
    250     ASSERT(*constantString);
    251 
    252     do {
    253         if (*string++ != *constantString++)
    254             return false;
    255     } while (*constantString);
    256     return true;
    257 }
    258 
    259 template <typename CharacterType>
    260 static CharacterType* checkAndSkipEscape(CharacterType* currentCharacter)
    261 {
    262     // Returns with 0, if escape check is failed. Otherwise
    263     // it returns with the following character.
    264     ASSERT(*currentCharacter == '\\');
    265 
    266     ++currentCharacter;
    267     if (!isCSSEscape(*currentCharacter))
    268         return 0;
    269 
    270     if (isASCIIHexDigit(*currentCharacter)) {
    271         int length = 6;
    272 
    273         do {
    274             ++currentCharacter;
    275         } while (isASCIIHexDigit(*currentCharacter) && --length);
    276 
    277         // Optional space after the escape sequence.
    278         if (isHTMLSpace<CharacterType>(*currentCharacter))
    279             ++currentCharacter;
    280         return currentCharacter;
    281     }
    282     return currentCharacter + 1;
    283 }
    284 
    285 template <typename CharacterType>
    286 static inline CharacterType* skipWhiteSpace(CharacterType* currentCharacter)
    287 {
    288     while (isHTMLSpace<CharacterType>(*currentCharacter))
    289         ++currentCharacter;
    290     return currentCharacter;
    291 }
    292 
    293 // Main CSS tokenizer functions.
    294 
// LChar specialization: returns a reference to the 8-bit cursor member
// (m_currentCharacter8). Because a reference is returned, callers can both
// read the cursor and assign a new position through it.
template <>
inline LChar*& CSSTokenizer::currentCharacter<LChar>()
{
    return m_currentCharacter8;
}
    300 
// UChar specialization: returns a reference to the 16-bit cursor member
// (m_currentCharacter16). Because a reference is returned, callers can both
// read the cursor and assign a new position through it.
template <>
inline UChar*& CSSTokenizer::currentCharacter<UChar>()
{
    return m_currentCharacter16;
}
    306 
    307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
    308 {
    309     // Allocates and returns a CSSTokenizer owned buffer for storing
    310     // UTF-16 data. Used to get a suitable life span for UTF-16
    311     // strings, identifiers and URIs created by the tokenizer.
    312     OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
    313 
    314     UChar* bufferPtr = buffer.get();
    315 
    316     m_cssStrings16.append(buffer.release());
    317     return bufferPtr;
    318 }
    319 
// LChar specialization: returns the raw pointer held by m_dataStart8.
// tokenLocation() subtracts this from the token start to compute an offset.
template <>
inline LChar* CSSTokenizer::dataStart<LChar>()
{
    return m_dataStart8.get();
}
    325 
// UChar specialization: returns the raw pointer held by m_dataStart16.
// tokenLocation() subtracts this from the token start to compute an offset.
template <>
inline UChar* CSSTokenizer::dataStart<UChar>()
{
    return m_dataStart16.get();
}
    331 
    332 template <typename CharacterType>
    333 inline CSSParserLocation CSSTokenizer::tokenLocation()
    334 {
    335     CSSParserLocation location;
    336     location.token.init(tokenStart<CharacterType>(), currentCharacter<CharacterType>() - tokenStart<CharacterType>());
    337     location.lineNumber = m_tokenStartLineNumber;
    338     location.offset = tokenStart<CharacterType>() - dataStart<CharacterType>();
    339     return location;
    340 }
    341 
    342 CSSParserLocation CSSTokenizer::currentLocation()
    343 {
    344     if (is8BitSource())
    345         return tokenLocation<LChar>();
    346     return tokenLocation<UChar>();
    347 }
    348 
    349 template <typename CharacterType>
    350 inline bool CSSTokenizer::isIdentifierStart()
    351 {
    352     // Check whether an identifier is started.
    353     return isIdentifierStartAfterDash((*currentCharacter<CharacterType>() != '-') ? currentCharacter<CharacterType>() : currentCharacter<CharacterType>() + 1);
    354 }
    355 
// Selects how checkAndSkipString() reacts to malformed string content.
enum CheckStringValidationMode {
    // Return 0 on an unescaped '\n', '\f' or '\r', or on a backslash that
    // does not start a valid escape.
    AbortIfInvalid,
    // Never fail: such line breaks are scanned over like any other character
    // and a backslash that does not start a valid escape is skipped by itself.
    SkipInvalid
};
    360 
    361 template <typename CharacterType>
    362 static inline CharacterType* checkAndSkipString(CharacterType* currentCharacter, int quote, CheckStringValidationMode mode)
    363 {
    364     // If mode is AbortIfInvalid and the string check fails it returns
    365     // with 0. Otherwise it returns with a pointer to the first
    366     // character after the string.
    367     while (true) {
    368         if (UNLIKELY(*currentCharacter == quote)) {
    369             // String parsing is successful.
    370             return currentCharacter + 1;
    371         }
    372         if (UNLIKELY(!*currentCharacter)) {
    373             // String parsing is successful up to end of input.
    374             return currentCharacter;
    375         }
    376         if (mode == AbortIfInvalid && UNLIKELY(*currentCharacter <= '\r' && (*currentCharacter == '\n' || (*currentCharacter | 0x1) == '\r'))) {
    377             // String parsing is failed for character '\n', '\f' or '\r'.
    378             return 0;
    379         }
    380 
    381         if (LIKELY(currentCharacter[0] != '\\')) {
    382             ++currentCharacter;
    383         } else if (currentCharacter[1] == '\n' || currentCharacter[1] == '\f') {
    384             currentCharacter += 2;
    385         } else if (currentCharacter[1] == '\r') {
    386             currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;
    387         } else {
    388             CharacterType* next = checkAndSkipEscape(currentCharacter);
    389             if (!next) {
    390                 if (mode == AbortIfInvalid)
    391                     return 0;
    392                 next = currentCharacter + 1;
    393             }
    394             currentCharacter = next;
    395         }
    396     }
    397 }
    398 
    399 template <typename CharacterType>
    400 unsigned CSSTokenizer::parseEscape(CharacterType*& src)
    401 {
    402     ASSERT(*src == '\\' && isCSSEscape(src[1]));
    403 
    404     unsigned unicode = 0;
    405 
    406     ++src;
    407     if (isASCIIHexDigit(*src)) {
    408 
    409         int length = 6;
    410 
    411         do {
    412             unicode = (unicode << 4) + toASCIIHexValue(*src++);
    413         } while (--length && isASCIIHexDigit(*src));
    414 
    415         // Characters above 0x10ffff are not handled.
    416         if (unicode > 0x10ffff)
    417             unicode = 0xfffd;
    418 
    419         // Optional space after the escape sequence.
    420         if (isHTMLSpace<CharacterType>(*src))
    421             ++src;
    422 
    423         return unicode;
    424     }
    425 
    426     return *src++;
    427 }
    428 
    429 template <>
    430 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)
    431 {
    432     ASSERT(unicode <= 0xff);
    433     *result = unicode;
    434 
    435     ++result;
    436 }
    437 
    438 template <>
    439 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)
    440 {
    441     // Replace unicode with a surrogate pairs when it is bigger than 0xffff
    442     if (U16_LENGTH(unicode) == 2) {
    443         *result++ = U16_LEAD(unicode);
    444         *result = U16_TRAIL(unicode);
    445     } else {
    446         *result = unicode;
    447     }
    448 
    449     ++result;
    450 }
    451 
    452 template <typename SrcCharacterType>
    453 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
    454 {
    455     // The decoded form of an identifier (after resolving escape
    456     // sequences) will not contain more characters (ASCII or UTF-16
    457     // codepoints) than the input. This code can therefore ignore
    458     // escape sequences completely.
    459     SrcCharacterType* start = src;
    460     do {
    461         if (LIKELY(*src != '\\'))
    462             src++;
    463         else
    464             parseEscape<SrcCharacterType>(src);
    465     } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
    466 
    467     return src - start;
    468 }
    469 
    470 template <typename SrcCharacterType, typename DestCharacterType>
    471 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
    472 {
    473     hasEscape = false;
    474     do {
    475         if (LIKELY(*src != '\\')) {
    476             *result++ = *src++;
    477         } else {
    478             hasEscape = true;
    479             SrcCharacterType* savedEscapeStart = src;
    480             unsigned unicode = parseEscape<SrcCharacterType>(src);
    481             if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
    482                 src = savedEscapeStart;
    483                 return false;
    484             }
    485             UnicodeToChars(result, unicode);
    486         }
    487     } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
    488 
    489     return true;
    490 }
    491 
    492 template <typename CharacterType>
    493 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)
    494 {
    495     // If a valid identifier start is found, we can safely
    496     // parse the identifier until the next invalid character.
    497     ASSERT(isIdentifierStart<CharacterType>());
    498 
    499     CharacterType* start = currentCharacter<CharacterType>();
    500     if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
    501         // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
    502         ASSERT(is8BitSource());
    503         UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(currentCharacter<CharacterType>()));
    504         UChar* start16 = result16;
    505         int i = 0;
    506         for (; i < result - start; i++)
    507             result16[i] = start[i];
    508 
    509         result16 += i;
    510 
    511         parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);
    512 
    513         resultString.init(start16, result16 - start16);
    514 
    515         return;
    516     }
    517 
    518     resultString.init(start, result - start);
    519 }
    520 
    521 template <typename SrcCharacterType>
    522 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
    523 {
    524     // The decoded form of a CSS string (after resolving escape
    525     // sequences) will not contain more characters (ASCII or UTF-16
    526     // codepoints) than the input. This code can therefore ignore
    527     // escape sequences completely and just return the length of the
    528     // input string (possibly including terminating quote if any).
    529     SrcCharacterType* end = checkAndSkipString(src, quote, SkipInvalid);
    530     return end ? end - src : 0;
    531 }
    532 
    533 template <typename SrcCharacterType, typename DestCharacterType>
    534 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
    535 {
    536     while (true) {
    537         if (UNLIKELY(*src == quote)) {
    538             // String parsing is done.
    539             ++src;
    540             return true;
    541         }
    542         if (UNLIKELY(!*src)) {
    543             // String parsing is done, but don't advance pointer if at the end of input.
    544             return true;
    545         }
    546         if (LIKELY(src[0] != '\\')) {
    547             *result++ = *src++;
    548         } else if (src[1] == '\n' || src[1] == '\f') {
    549             src += 2;
    550         } else if (src[1] == '\r') {
    551             src += src[2] == '\n' ? 3 : 2;
    552         } else {
    553             SrcCharacterType* savedEscapeStart = src;
    554             unsigned unicode = parseEscape<SrcCharacterType>(src);
    555             if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
    556                 src = savedEscapeStart;
    557                 return false;
    558             }
    559             UnicodeToChars(result, unicode);
    560         }
    561     }
    562 
    563     return true;
    564 }
    565 
    566 template <typename CharacterType>
    567 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)
    568 {
    569     CharacterType* start = currentCharacter<CharacterType>();
    570 
    571     if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
    572         // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
    573         ASSERT(is8BitSource());
    574         UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(currentCharacter<CharacterType>(), quote));
    575         UChar* start16 = result16;
    576         int i = 0;
    577         for (; i < result - start; i++)
    578             result16[i] = start[i];
    579 
    580         result16 += i;
    581 
    582         parseStringInternal(currentCharacter<CharacterType>(), result16, quote);
    583 
    584         resultString.init(start16, result16 - start16);
    585         return;
    586     }
    587 
    588     resultString.init(start, result - start);
    589 }
    590 
    591 template <typename CharacterType>
    592 inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UChar& quote)
    593 {
    594     start = skipWhiteSpace(currentCharacter<CharacterType>());
    595 
    596     if (*start == '"' || *start == '\'') {
    597         quote = *start++;
    598         end = checkAndSkipString(start, quote, AbortIfInvalid);
    599         if (!end)
    600             return false;
    601     } else {
    602         quote = 0;
    603         end = start;
    604         while (isURILetter(*end)) {
    605             if (LIKELY(*end != '\\')) {
    606                 ++end;
    607             } else {
    608                 end = checkAndSkipEscape(end);
    609                 if (!end)
    610                     return false;
    611             }
    612         }
    613     }
    614 
    615     end = skipWhiteSpace(end);
    616     if (*end != ')')
    617         return false;
    618 
    619     return true;
    620 }
    621 
    622 template <typename SrcCharacterType>
    623 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
    624 {
    625     // The decoded form of a URI (after resolving escape sequences)
    626     // will not contain more characters (ASCII or UTF-16 codepoints)
    627     // than the input. This code can therefore ignore escape sequences
    628     // completely.
    629     SrcCharacterType* start = src;
    630     if (quote) {
    631         ASSERT(quote == '"' || quote == '\'');
    632         return peekMaxStringLen(src, quote);
    633     }
    634 
    635     while (isURILetter(*src)) {
    636         if (LIKELY(*src != '\\'))
    637             src++;
    638         else
    639             parseEscape<SrcCharacterType>(src);
    640     }
    641 
    642     return src - start;
    643 }
    644 
    645 template <typename SrcCharacterType, typename DestCharacterType>
    646 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
    647 {
    648     if (quote) {
    649         ASSERT(quote == '"' || quote == '\'');
    650         return parseStringInternal(src, dest, quote);
    651     }
    652 
    653     while (isURILetter(*src)) {
    654         if (LIKELY(*src != '\\')) {
    655             *dest++ = *src++;
    656         } else {
    657             unsigned unicode = parseEscape<SrcCharacterType>(src);
    658             if (unicode > 0xff && sizeof(DestCharacterType) == 1)
    659                 return false;
    660             UnicodeToChars(dest, unicode);
    661         }
    662     }
    663 
    664     return true;
    665 }
    666 
    667 template <typename CharacterType>
    668 inline void CSSTokenizer::parseURI(CSSParserString& string)
    669 {
    670     CharacterType* uriStart;
    671     CharacterType* uriEnd;
    672     UChar quote;
    673     if (!findURI(uriStart, uriEnd, quote))
    674         return;
    675 
    676     CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
    677     if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {
    678         string.init(uriStart, dest - uriStart);
    679     } else {
    680         // An escape sequence was encountered that can't be stored in 8 bits.
    681         // Reset the current character to the start of the URI and re-parse with
    682         // a 16-bit destination.
    683         ASSERT(is8BitSource());
    684         currentCharacter<CharacterType>() = uriStart;
    685         UChar* result16 = allocateStringBuffer16(peekMaxURILen(currentCharacter<CharacterType>(), quote));
    686         UChar* uriStart16 = result16;
    687         bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
    688         ASSERT_UNUSED(result, result);
    689         string.init(uriStart16, result16 - uriStart16);
    690     }
    691 
    692     currentCharacter<CharacterType>() = uriEnd + 1;
    693     m_token = URI;
    694 }
    695 
    696 template <typename CharacterType>
    697 inline bool CSSTokenizer::parseUnicodeRange()
    698 {
    699     CharacterType* character = currentCharacter<CharacterType>() + 1;
    700     int length = 6;
    701     ASSERT(*currentCharacter<CharacterType>() == '+');
    702 
    703     while (isASCIIHexDigit(*character) && length) {
    704         ++character;
    705         --length;
    706     }
    707 
    708     if (length && *character == '?') {
    709         // At most 5 hex digit followed by a question mark.
    710         do {
    711             ++character;
    712             --length;
    713         } while (*character == '?' && length);
    714         currentCharacter<CharacterType>() = character;
    715         return true;
    716     }
    717 
    718     if (length < 6) {
    719         // At least one hex digit.
    720         if (character[0] == '-' && isASCIIHexDigit(character[1])) {
    721             // Followed by a dash and a hex digit.
    722             ++character;
    723             length = 6;
    724             do {
    725                 ++character;
    726             } while (--length && isASCIIHexDigit(*character));
    727         }
    728         currentCharacter<CharacterType>() = character;
    729         return true;
    730     }
    731     return false;
    732 }
    733 
    734 template <typename CharacterType>
    735 bool CSSTokenizer::parseNthChild()
    736 {
    737     // Accepts zero or more ASCII digits followed by an 'n' (either case).
    738     CharacterType* cursor = currentCharacter<CharacterType>();
    739     for (; isASCIIDigit(*cursor); ++cursor) { }
    740 
    741     if (!isASCIIAlphaCaselessEqual(*cursor, 'n'))
    742         return false; // The current position is left untouched on failure.
    743 
    744     currentCharacter<CharacterType>() = cursor + 1;
    745     return true;
    746 }
    747 
    748 template <typename CharacterType>
    749 bool CSSTokenizer::parseNthChildExtra()
    750 {
    751     // Applies skipWhiteSpace(), expects a '+' or '-', applies skipWhiteSpace()
    752     // again and expects one or more digits. The position moves only on a match.
    753     CharacterType* sign = skipWhiteSpace(currentCharacter<CharacterType>());
    754     const bool hasSign = *sign == '+' || *sign == '-';
    755     if (!hasSign)
    756         return false;
    757 
    758     CharacterType* digits = skipWhiteSpace(sign + 1);
    759     if (!isASCIIDigit(*digits))
    760         return false;
    761 
    762     for (++digits; isASCIIDigit(*digits); ++digits) { }
    763     currentCharacter<CharacterType>() = digits;
    764     return true;
    765 }
    766 
    767 template <typename CharacterType>
    768 inline bool CSSTokenizer::detectFunctionTypeToken(int length)
    769 {
            // Compares the `length` characters starting at tokenStart() against a
            // fixed list of function names. For "not", "url", "cue", "calc", "host"
            // and "host-context" the matching token id is stored in m_token. For the
            // four nth-* names only m_parsingMode is switched to NthChildMode and
            // m_token is not written here. Returns true when a name matched and
            // false otherwise.
            // NOTE(review): SWITCH/CASE are not defined in this chunk; their exact
            // matching rules (e.g. case sensitivity) should be confirmed at their
            // definition.
    770     ASSERT(length > 0);
    771     CharacterType* name = tokenStart<CharacterType>();
    772     SWITCH(name, length) {
    773         CASE("not") {
    774             m_token = NOTFUNCTION;
    775             return true;
    776         }
    777         CASE("url") {
                    // The caller (realLex) checks for URI afterwards and then tries
                    // to parse the argument as a URI.
    778             m_token = URI;
    779             return true;
    780         }
    781         CASE("cue") {
    782             m_token = CUEFUNCTION;
    783             return true;
    784         }
    785         CASE("calc") {
    786             m_token = CALCFUNCTION;
    787             return true;
    788         }
    789         CASE("host") {
    790             m_token = HOSTFUNCTION;
    791             return true;
    792         }
    793         CASE("host-context") {
    794             m_token = HOSTCONTEXTFUNCTION;
    795             return true;
    796         }
    797         CASE("nth-child") {
    798             m_parsingMode = NthChildMode;
    799             return true;
    800         }
    801         CASE("nth-of-type") {
    802             m_parsingMode = NthChildMode;
    803             return true;
    804         }
    805         CASE("nth-last-child") {
    806             m_parsingMode = NthChildMode;
    807             return true;
    808         }
    809         CASE("nth-last-of-type") {
    810             m_parsingMode = NthChildMode;
    811             return true;
    812         }
    813     }
    814     return false;
    815 }
    816 
    817 template <typename CharacterType>
    818 inline void CSSTokenizer::detectMediaQueryToken(int length)
    819 {
            // Must only be called in MediaQueryMode (asserted below). Replaces
            // m_token with the media-query keyword token when the `length`
            // characters at tokenStart() are "and", "not", "only" or "or"; for any
            // other text m_token is left as the caller set it.
    820     ASSERT(m_parsingMode == MediaQueryMode);
    821     CharacterType* name = tokenStart<CharacterType>();
    822 
    823     SWITCH(name, length) {
    824         CASE("and") {
    825             m_token = MEDIA_AND;
    826         }
    827         CASE("not") {
    828             m_token = MEDIA_NOT;
    829         }
    830         CASE("only") {
    831             m_token = MEDIA_ONLY;
    832         }
    833         CASE("or") {
    834             m_token = MEDIA_OR;
    835         }
    836     }
    837 }
    838 
    839 template <typename CharacterType>
    840 inline void CSSTokenizer::detectNumberToken(CharacterType* type, int length)
    841 {
            // Maps the unit suffix of a number (the `length` characters at `type`)
            // to its dedicated token id and stores it in m_token. When the suffix is
            // not one of the units listed below, m_token is not modified; the caller
            // in realLex sets it to DIMEN beforehand.
    842     ASSERT(length > 0);
    843 
    844     SWITCH(type, length) {
    845         CASE("cm") {
    846             m_token = CMS;
    847         }
    848         CASE("ch") {
    849             m_token = CHS;
    850         }
    851         CASE("deg") {
    852             m_token = DEGS;
    853         }
    854         CASE("dppx") {
    855             // There is a discussion about the name of this unit on www-style.
    856             // Keep this compile time guard in place until that is resolved.
    857             // http://lists.w3.org/Archives/Public/www-style/2012May/0915.html
                    // NOTE(review): no compile-time guard is visible around this
                    // case in this part of the file; the sentence above may be stale.
    858             m_token = DPPX;
    859         }
    860         CASE("dpcm") {
    861             m_token = DPCM;
    862         }
    863         CASE("dpi") {
    864             m_token = DPI;
    865         }
    866         CASE("em") {
    867             m_token = EMS;
    868         }
    869         CASE("ex") {
    870             m_token = EXS;
    871         }
    872         CASE("fr") {
    873             m_token = FR;
    874         }
    875         CASE("grad") {
    876             m_token = GRADS;
    877         }
    878         CASE("hz") {
    879             m_token = HERTZ;
    880         }
    881         CASE("in") {
    882             m_token = INS;
    883         }
    884         CASE("khz") {
    885             m_token = KHERTZ;
    886         }
    887         CASE("mm") {
    888             m_token = MMS;
    889         }
    890         CASE("ms") {
    891             m_token = MSECS;
    892         }
    893         CASE("px") {
    894             m_token = PXS;
    895         }
    896         CASE("pt") {
    897             m_token = PTS;
    898         }
    899         CASE("pc") {
    900             m_token = PCS;
    901         }
    902         CASE("rad") {
    903             m_token = RADS;
    904         }
    905         CASE("rem") {
    906             m_token = REMS;
    907         }
    908         CASE("s") {
    909             m_token = SECS;
    910         }
    911         CASE("turn") {
    912             m_token = TURNS;
    913         }
    914         CASE("vw") {
    915             m_token = VW;
    916         }
    917         CASE("vh") {
    918             m_token = VH;
    919         }
    920         CASE("vmin") {
    921             m_token = VMIN;
    922         }
    923         CASE("vmax") {
    924             m_token = VMAX;
    925         }
    926         CASE("__qem") {
    927             m_token = QEMS;
    928         }
    929     }
    930 }
    931 
    932 template <typename CharacterType>
    933 inline void CSSTokenizer::detectDashToken(int length)
    934 {
            // Recognizes vendor-prefixed function names. `length` counts the
            // characters from tokenStart() including the leading '-', which is
            // stripped before matching. "-webkit-any" yields ANYFUNCTION and
            // "-webkit-calc" yields CALCFUNCTION; otherwise m_token is unchanged.
    935     CharacterType* name = tokenStart<CharacterType>();
    936 
    937     // Ignore leading dash.
    938     ++name;
    939     --length;
    940 
    941     SWITCH(name, length) {
    942         CASE("webkit-any") {
    943             m_token = ANYFUNCTION;
    944         }
    945         CASE("webkit-calc") {
    946             m_token = CALCFUNCTION;
    947         }
    948     }
    949 }
    950 
    951 template <typename CharacterType>
    952 inline void CSSTokenizer::detectAtToken(int length, bool hasEscape)
    953 {
            // Classifies an at-keyword. `length` counts the characters from
            // tokenStart() including the leading '@' (asserted below). On a match
            // m_token receives the specific *_SYM token and, for some rules,
            // m_parsingMode is switched as well. When nothing matches, or a guard
            // inside a case fails, m_token keeps the value set by the caller.
            // `hasEscape` tells whether the identifier contained an escape; the
            // cases that test it refuse to match escaped names.
    954     CharacterType* name = tokenStart<CharacterType>();
    955     ASSERT(name[0] == '@' && length >= 2);
    956 
    957     // Ignore leading @.
    958     ++name;
    959     --length;
    960 
    961     // charset, font-face, import, media, namespace, page, supports,
    962     // -webkit-keyframes, keyframes, and -webkit-mediaquery are not affected by hasEscape.
            // NOTE(review): there is no "-webkit-mediaquery" case below, and the
            // "viewport", "-internal-keyframe-key-list" and
            // "-internal-supports-condition" cases also do not test hasEscape.
    963     SWITCH(name, length) {
    964         CASE("bottom-left") {
    965             if (LIKELY(!hasEscape))
    966                 m_token = BOTTOMLEFT_SYM;
    967         }
    968         CASE("bottom-right") {
    969             if (LIKELY(!hasEscape))
    970                 m_token = BOTTOMRIGHT_SYM;
    971         }
    972         CASE("bottom-center") {
    973             if (LIKELY(!hasEscape))
    974                 m_token = BOTTOMCENTER_SYM;
    975         }
    976         CASE("bottom-left-corner") {
    977             if (LIKELY(!hasEscape))
    978                 m_token = BOTTOMLEFTCORNER_SYM;
    979         }
    980         CASE("bottom-right-corner") {
    981             if (LIKELY(!hasEscape))
    982                 m_token = BOTTOMRIGHTCORNER_SYM;
    983         }
    984         CASE("charset") {
                    // Only accepted when the '@' is the very first character of the
                    // data buffer (name - 1 points at the '@').
    985             if (name - 1 == dataStart<CharacterType>())
    986                 m_token = CHARSET_SYM;
    987         }
    988         CASE("font-face") {
    989             m_token = FONT_FACE_SYM;
    990         }
    991         CASE("import") {
    992             m_parsingMode = MediaQueryMode;
    993             m_token = IMPORT_SYM;
    994         }
    995         CASE("keyframes") {
                    // Unprefixed keyframes are only recognized when the runtime
                    // feature flag is on.
    996             if (RuntimeEnabledFeatures::cssAnimationUnprefixedEnabled())
    997                 m_token = KEYFRAMES_SYM;
    998         }
    999         CASE("left-top") {
   1000             if (LIKELY(!hasEscape))
   1001                 m_token = LEFTTOP_SYM;
   1002         }
   1003         CASE("left-middle") {
   1004             if (LIKELY(!hasEscape))
   1005                 m_token = LEFTMIDDLE_SYM;
   1006         }
   1007         CASE("left-bottom") {
   1008             if (LIKELY(!hasEscape))
   1009                 m_token = LEFTBOTTOM_SYM;
   1010         }
   1011         CASE("media") {
   1012             m_parsingMode = MediaQueryMode;
   1013             m_token = MEDIA_SYM;
   1014         }
   1015         CASE("namespace") {
   1016             m_token = NAMESPACE_SYM;
   1017         }
   1018         CASE("page") {
   1019             m_token = PAGE_SYM;
   1020         }
   1021         CASE("right-top") {
   1022             if (LIKELY(!hasEscape))
   1023                 m_token = RIGHTTOP_SYM;
   1024         }
   1025         CASE("right-middle") {
   1026             if (LIKELY(!hasEscape))
   1027                 m_token = RIGHTMIDDLE_SYM;
   1028         }
   1029         CASE("right-bottom") {
   1030             if (LIKELY(!hasEscape))
   1031                 m_token = RIGHTBOTTOM_SYM;
   1032         }
   1033         CASE("supports") {
   1034             m_parsingMode = SupportsMode;
   1035             m_token = SUPPORTS_SYM;
   1036         }
   1037         CASE("top-left") {
   1038             if (LIKELY(!hasEscape))
   1039                 m_token = TOPLEFT_SYM;
   1040         }
   1041         CASE("top-right") {
   1042             if (LIKELY(!hasEscape))
   1043                 m_token = TOPRIGHT_SYM;
   1044         }
   1045         CASE("top-center") {
   1046             if (LIKELY(!hasEscape))
   1047                 m_token = TOPCENTER_SYM;
   1048         }
   1049         CASE("top-left-corner") {
   1050             if (LIKELY(!hasEscape))
   1051                 m_token = TOPLEFTCORNER_SYM;
   1052         }
   1053         CASE("top-right-corner") {
   1054             if (LIKELY(!hasEscape))
   1055                 m_token = TOPRIGHTCORNER_SYM;
   1056         }
   1057         CASE("viewport") {
   1058             m_token = VIEWPORT_RULE_SYM;
   1059         }
                // The -internal-* keywords below are only honoured when m_internal
                // is set.
   1060         CASE("-internal-rule") {
   1061             if (LIKELY(!hasEscape && m_internal))
   1062                 m_token = INTERNAL_RULE_SYM;
   1063         }
   1064         CASE("-internal-decls") {
   1065             if (LIKELY(!hasEscape && m_internal))
   1066                 m_token = INTERNAL_DECLS_SYM;
   1067         }
   1068         CASE("-internal-value") {
   1069             if (LIKELY(!hasEscape && m_internal))
   1070                 m_token = INTERNAL_VALUE_SYM;
   1071         }
   1072         CASE("-webkit-keyframes") {
   1073             m_token = WEBKIT_KEYFRAMES_SYM;
   1074         }
   1075         CASE("-internal-selector") {
   1076             if (LIKELY(!hasEscape && m_internal))
   1077                 m_token = INTERNAL_SELECTOR_SYM;
   1078         }
   1079         CASE("-internal-keyframe-rule") {
   1080             if (LIKELY(!hasEscape && m_internal))
   1081                 m_token = INTERNAL_KEYFRAME_RULE_SYM;
   1082         }
   1083         CASE("-internal-keyframe-key-list") {
                    // NOTE(review): unlike the other -internal-* cases, hasEscape is
                    // not consulted here; confirm this is intended.
   1084             if (!m_internal)
   1085                 return;
   1086             m_token = INTERNAL_KEYFRAME_KEY_LIST_SYM;
   1087         }
   1088         CASE("-internal-supports-condition") {
                    // NOTE(review): hasEscape is not consulted here either.
   1089             if (!m_internal)
   1090                 return;
   1091             m_parsingMode = SupportsMode;
   1092             m_token = INTERNAL_SUPPORTS_CONDITION_SYM;
   1093         }
   1094     }
   1095 }
   1096 
   1097 template <typename CharacterType>
   1098 inline void CSSTokenizer::detectSupportsToken(int length)
   1099 {
            // Must only be called in SupportsMode (asserted below). Replaces m_token
            // with SUPPORTS_OR / SUPPORTS_AND / SUPPORTS_NOT when the `length`
            // characters at tokenStart() are "or", "and" or "not"; otherwise m_token
            // is left as the caller set it.
   1100     ASSERT(m_parsingMode == SupportsMode);
   1101     CharacterType* name = tokenStart<CharacterType>();
   1102 
   1103     SWITCH(name, length) {
   1104         CASE("or") {
   1105             m_token = SUPPORTS_OR;
   1106         }
   1107         CASE("and") {
   1108             m_token = SUPPORTS_AND;
   1109         }
   1110         CASE("not") {
   1111             m_token = SUPPORTS_NOT;
   1112         }
   1113     }
   1114 }
   1115 
   1116 template <typename SrcCharacterType>
   1117 int CSSTokenizer::realLex(void* yylvalWithoutType)
   1118 {
            // Scans one token starting at the current character and returns its id
            // (also left in m_token). `yylvalWithoutType` is cast to YYSTYPE*; its
            // `string` or `number` member is filled in for the token kinds that
            // carry a value. Comments are skipped by restarting the scan. Depending
            // on the token, m_parsingMode and m_lineNumber are updated as well.
   1119     YYSTYPE* yylval = static_cast<YYSTYPE*>(yylvalWithoutType);
   1120     // Write pointer for the next character.
   1121     SrcCharacterType* result;
   1122     CSSParserString resultString;
   1123     bool hasEscape;
   1124 
   1125     // The input buffer is terminated by a \0 character, so
   1126     // it is safe to read one character ahead of a known non-null.
   1127 #if ENABLE(ASSERT)
   1128     // In debug we check with an ASSERT that the length is > 0 for string types.
   1129     yylval->string.clear();
   1130 #endif
   1131 
        // Re-entered via goto from the comment branch once a comment was skipped.
   1132 restartAfterComment:
   1133     result = currentCharacter<SrcCharacterType>();
   1134     setTokenStart(result);
   1135     m_tokenStartLineNumber = m_lineNumber;
            // Provisionally the token is the raw character itself; the branches
            // below overwrite m_token when they recognize something more specific.
   1136     m_token = *currentCharacter<SrcCharacterType>();
   1137     ++currentCharacter<SrcCharacterType>();
   1138 
            // ASCII characters are classified through the typesOfASCIICharacters
            // table; everything above 127 is treated as an identifier start.
   1139     switch ((m_token <= 127) ? typesOfASCIICharacters[m_token] : CharacterIdentifierStart) {
   1140     case CharacterCaselessU:
   1141         if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '+')) {
   1142             if (parseUnicodeRange<SrcCharacterType>()) {
   1143                 m_token = UNICODERANGE;
   1144                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1145                 break;
   1146             }
   1147         }
   1148         // Fall through to CharacterIdentifierStart.
   1149 
   1150     case CharacterIdentifierStart:
                // Step back onto the first character so parseIdentifier sees the
                // whole identifier.
   1151         --currentCharacter<SrcCharacterType>();
   1152         parseIdentifier(result, yylval->string, hasEscape);
   1153         m_token = IDENT;
   1154 
   1155         if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '(')) {
   1156             if (m_parsingMode == SupportsMode && !hasEscape) {
   1157                 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
   1158                 if (m_token != IDENT)
   1159                     break;
   1160             }
   1161 
   1162             m_token = FUNCTION;
   1163             if (!hasEscape)
   1164                 detectFunctionTypeToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
   1165 
   1166             // Skip parenthesis
   1167             ++currentCharacter<SrcCharacterType>();
   1168             ++result;
   1169 
   1170             if (m_token == URI) {
                        // Reset to FUNCTION before the URI attempt; the visible tail
                        // of parseURI sets m_token back to URI when it completes.
   1171                 m_token = FUNCTION;
   1172                 // Check whether it is really an URI.
   1173                 if (yylval->string.is8Bit())
   1174                     parseURI<LChar>(yylval->string);
   1175                 else
   1176                     parseURI<UChar>(yylval->string);
   1177             }
   1178         } else if (UNLIKELY(m_parsingMode != NormalMode) && !hasEscape) {
   1179             if (m_parsingMode == MediaQueryMode) {
   1180                 detectMediaQueryToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
   1181             } else if (m_parsingMode == SupportsMode) {
   1182                 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
   1183             } else if (m_parsingMode == NthChildMode && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[0], 'n')) {
   1184                 if (result - tokenStart<SrcCharacterType>() == 1) {
   1185                     // String "n" is IDENT but "n+1" is NTH.
   1186                     if (parseNthChildExtra<SrcCharacterType>()) {
   1187                         m_token = NTH;
                                // NOTE(review): this branch writes m_length directly
                                // while the "n-" branch below calls setLength().
   1188                         yylval->string.m_length = currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>();
   1189                     }
   1190                 } else if (result - tokenStart<SrcCharacterType>() >= 2 && tokenStart<SrcCharacterType>()[1] == '-') {
   1191                     // String "n-" is IDENT but "n-1" is NTH.
   1192                     // Set currentCharacter to '-' to continue parsing.
   1193                     SrcCharacterType* nextCharacter = result;
   1194                     currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 1;
   1195                     if (parseNthChildExtra<SrcCharacterType>()) {
   1196                         m_token = NTH;
   1197                         yylval->string.setLength(currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1198                     } else {
   1199                         // Revert the change to currentCharacter if unsuccessful.
   1200                         currentCharacter<SrcCharacterType>() = nextCharacter;
   1201                     }
   1202                 }
   1203             }
   1204         }
   1205         break;
   1206 
   1207     case CharacterDot:
                // A dot that is not followed by a digit stays a plain '.' token.
   1208         if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0]))
   1209             break;
   1210         // Fall through to CharacterNumber.
   1211 
   1212     case CharacterNumber: {
   1213         bool dotSeen = (m_token == '.');
   1214 
   1215         while (true) {
   1216             if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0])) {
   1217                 // Only one dot is allowed for a number,
   1218                 // and it must be followed by a digit.
   1219                 if (currentCharacter<SrcCharacterType>()[0] != '.' || dotSeen || !isASCIIDigit(currentCharacter<SrcCharacterType>()[1]))
   1220                     break;
   1221                 dotSeen = true;
   1222             }
   1223             ++currentCharacter<SrcCharacterType>();
   1224         }
   1225 
   1226         if (UNLIKELY(m_parsingMode == NthChildMode) && !dotSeen && isASCIIAlphaCaselessEqual(*currentCharacter<SrcCharacterType>(), 'n')) {
   1227             // "[0-9]+n" is always an NthChild.
   1228             ++currentCharacter<SrcCharacterType>();
                    // The return value is ignored: the "+b"/"-b" tail is optional.
   1229             parseNthChildExtra<SrcCharacterType>();
   1230             m_token = NTH;
   1231             yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1232             break;
   1233         }
   1234 
   1235         // We need to take care of units like 'em' or 'ex'.
   1236         SrcCharacterType* character = currentCharacter<SrcCharacterType>();
   1237         if (isASCIIAlphaCaselessEqual(*character, 'e')) {
   1238             ASSERT(character - tokenStart<SrcCharacterType>() > 0);
   1239             ++character;
                    // NOTE(review): a '+' or '-' that is not followed by any digit is
                    // still consumed as part of the exponent here (e.g. "1e-");
                    // confirm that this is the intended behavior.
   1240             if (*character == '-' || *character == '+' || isASCIIDigit(*character)) {
   1241                 ++character;
   1242                 while (isASCIIDigit(*character))
   1243                     ++character;
   1244                 // Use FLOATTOKEN if the string contains exponents.
   1245                 dotSeen = true;
   1246                 currentCharacter<SrcCharacterType>() = character;
   1247             }
   1248         }
   1249 
   1250         yylval->number = charactersToDouble(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1251 
   1252         // Type of the function.
   1253         if (isIdentifierStart<SrcCharacterType>()) {
   1254             SrcCharacterType* type = currentCharacter<SrcCharacterType>();
   1255             result = currentCharacter<SrcCharacterType>();
   1256 
   1257             parseIdentifier(result, resultString, hasEscape);
   1258 
                    // DIMEN is the default; detectNumberToken replaces it when the
                    // suffix is one of the units it knows.
   1259             m_token = DIMEN;
   1260             if (!hasEscape)
   1261                 detectNumberToken(type, currentCharacter<SrcCharacterType>() - type);
   1262 
   1263             if (m_token == DIMEN) {
   1264                 // The decoded number is overwritten, but this is intentional.
   1265                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1266             }
   1267         } else if (*currentCharacter<SrcCharacterType>() == '%') {
   1268             // Although the CSS grammar says {num}% we follow
   1269             // webkit at the moment which uses {num}%+.
   1270             do {
   1271                 ++currentCharacter<SrcCharacterType>();
   1272             } while (*currentCharacter<SrcCharacterType>() == '%');
   1273             m_token = PERCENTAGE;
   1274         } else {
   1275             m_token = dotSeen ? FLOATTOKEN : INTEGER;
   1276         }
   1277         break;
   1278     }
   1279 
   1280     case CharacterDash:
   1281         if (isIdentifierStartAfterDash(currentCharacter<SrcCharacterType>())) {
   1282             --currentCharacter<SrcCharacterType>();
   1283             parseIdentifier(result, resultString, hasEscape);
   1284             m_token = IDENT;
   1285 
   1286             if (*currentCharacter<SrcCharacterType>() == '(') {
   1287                 m_token = FUNCTION;
   1288                 if (!hasEscape)
   1289                     detectDashToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
   1290                 ++currentCharacter<SrcCharacterType>();
   1291                 ++result;
   1292             } else if (UNLIKELY(m_parsingMode == NthChildMode) && !hasEscape && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[1], 'n')) {
   1293                 if (result - tokenStart<SrcCharacterType>() == 2) {
   1294                     // String "-n" is IDENT but "-n+1" is NTH.
   1295                     if (parseNthChildExtra<SrcCharacterType>()) {
   1296                         m_token = NTH;
   1297                         result = currentCharacter<SrcCharacterType>();
   1298                     }
   1299                 } else if (result - tokenStart<SrcCharacterType>() >= 3 && tokenStart<SrcCharacterType>()[2] == '-') {
   1300                     // String "-n-" is IDENT but "-n-1" is NTH.
   1301                     // Set currentCharacter to second '-' of '-n-' to continue parsing.
   1302                     SrcCharacterType* nextCharacter = result;
   1303                     currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 2;
   1304                     if (parseNthChildExtra<SrcCharacterType>()) {
   1305                         m_token = NTH;
   1306                         result = currentCharacter<SrcCharacterType>();
   1307                     } else {
   1308                         // Revert the change to currentCharacter if unsuccessful.
   1309                         currentCharacter<SrcCharacterType>() = nextCharacter;
   1310                     }
   1311                 }
   1312                 resultString.setLength(result - tokenStart<SrcCharacterType>());
   1313             }
   1314             yylval->string = resultString;
   1315         } else if (currentCharacter<SrcCharacterType>()[0] == '-' && currentCharacter<SrcCharacterType>()[1] == '>') {
                    // "-->" : the first '-' was already consumed above.
   1316             currentCharacter<SrcCharacterType>() += 2;
   1317             m_token = SGML_CD;
   1318         } else if (UNLIKELY(m_parsingMode == NthChildMode)) {
   1319             // "-[0-9]+n" is always an NthChild.
   1320             if (parseNthChild<SrcCharacterType>()) {
   1321                 parseNthChildExtra<SrcCharacterType>();
   1322                 m_token = NTH;
   1323                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1324             }
   1325         }
   1326         break;
   1327 
   1328     case CharacterOther:
   1329         // m_token is simply the current character.
   1330         break;
   1331 
   1332     case CharacterNull:
   1333         // Do not advance pointer at the end of input.
   1334         --currentCharacter<SrcCharacterType>();
   1335         break;
   1336 
   1337     case CharacterWhiteSpace:
   1338         m_token = WHITESPACE;
   1339         // Might start with a '\n'.
   1340         --currentCharacter<SrcCharacterType>();
                // Consume the whole whitespace run, counting newlines on the way.
   1341         do {
   1342             if (*currentCharacter<SrcCharacterType>() == '\n')
   1343                 ++m_lineNumber;
   1344             ++currentCharacter<SrcCharacterType>();
   1345         } while (*currentCharacter<SrcCharacterType>() <= ' ' && (typesOfASCIICharacters[*currentCharacter<SrcCharacterType>()] == CharacterWhiteSpace));
   1346         break;
   1347 
   1348     case CharacterEndMediaQueryOrSupports:
   1349         if (m_parsingMode == MediaQueryMode || m_parsingMode == SupportsMode)
   1350             m_parsingMode = NormalMode;
   1351         break;
   1352 
   1353     case CharacterEndNthChild:
   1354         if (m_parsingMode == NthChildMode)
   1355             m_parsingMode = NormalMode;
   1356         break;
   1357 
   1358     case CharacterQuote:
                // m_token still holds the opening quote character at this point and
                // is passed on as the quote to look for.
   1359         if (checkAndSkipString(currentCharacter<SrcCharacterType>(), m_token, AbortIfInvalid)) {
   1360             ++result;
   1361             parseString<SrcCharacterType>(result, yylval->string, m_token);
   1362             m_token = STRING;
   1363         }
   1364         break;
   1365 
   1366     case CharacterExclamationMark: {
   1367         SrcCharacterType* start = skipWhiteSpace(currentCharacter<SrcCharacterType>());
   1368         if (isEqualToCSSIdentifier(start, "important")) {
   1369             m_token = IMPORTANT_SYM;
                    // 9 is the length of "important".
   1370             currentCharacter<SrcCharacterType>() = start + 9;
   1371         }
   1372         break;
   1373     }
   1374 
   1375     case CharacterHashmark: {
   1376         SrcCharacterType* start = currentCharacter<SrcCharacterType>();
   1377         result = currentCharacter<SrcCharacterType>();
   1378 
   1379         if (isASCIIDigit(*currentCharacter<SrcCharacterType>())) {
   1380             // This must be a valid hex number token.
   1381             do {
   1382                 ++currentCharacter<SrcCharacterType>();
   1383             } while (isASCIIHexDigit(*currentCharacter<SrcCharacterType>()));
   1384             m_token = HEX;
   1385             yylval->string.init(start, currentCharacter<SrcCharacterType>() - start);
   1386         } else if (isIdentifierStart<SrcCharacterType>()) {
   1387             m_token = IDSEL;
   1388             parseIdentifier(result, yylval->string, hasEscape);
   1389             if (!hasEscape) {
   1390                 // Check whether the identifier is also a valid hex number.
   1391                 SrcCharacterType* current = start;
   1392                 m_token = HEX;
   1393                 do {
   1394                     if (!isASCIIHexDigit(*current)) {
   1395                         m_token = IDSEL;
   1396                         break;
   1397                     }
   1398                     ++current;
   1399                 } while (current < result);
   1400             }
   1401         }
   1402         break;
   1403     }
   1404 
   1405     case CharacterSlash:
   1406         // Ignore comments. They are not even considered as white spaces.
   1407         if (*currentCharacter<SrcCharacterType>() == '*') {
   1408             const CSSParserLocation startLocation = currentLocation();
   1409             if (m_parser.m_observer) {
   1410                 unsigned startOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>() - 1; // Start with a slash.
   1411                 m_parser.m_observer->startComment(startOffset - m_parsedTextPrefixLength);
   1412             }
   1413             ++currentCharacter<SrcCharacterType>();
   1414             while (currentCharacter<SrcCharacterType>()[0] != '*' || currentCharacter<SrcCharacterType>()[1] != '/') {
   1415                 if (*currentCharacter<SrcCharacterType>() == '\n')
   1416                     ++m_lineNumber;
   1417                 if (*currentCharacter<SrcCharacterType>() == '\0') {
   1418                     // Unterminated comments are simply ignored.
                            // Backing up by two cancels the "+= 2" after the loop,
                            // so the position ends up on the terminating '\0'.
   1419                     currentCharacter<SrcCharacterType>() -= 2;
   1420                     m_parser.reportError(startLocation, UnterminatedCommentCSSError);
   1421                     break;
   1422                 }
   1423                 ++currentCharacter<SrcCharacterType>();
   1424             }
   1425             currentCharacter<SrcCharacterType>() += 2;
   1426             if (m_parser.m_observer) {
   1427                 unsigned endOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>();
   1428                 unsigned userTextEndOffset = static_cast<unsigned>(m_length - 1 - m_parsedTextSuffixLength);
   1429                 m_parser.m_observer->endComment(std::min(endOffset, userTextEndOffset) - m_parsedTextPrefixLength);
   1430             }
   1431             goto restartAfterComment;
   1432         }
   1433         break;
   1434 
   1435     case CharacterDollar:
   1436         if (*currentCharacter<SrcCharacterType>() == '=') {
   1437             ++currentCharacter<SrcCharacterType>();
   1438             m_token = ENDSWITH;
   1439         }
   1440         break;
   1441 
   1442     case CharacterAsterisk:
   1443         if (*currentCharacter<SrcCharacterType>() == '=') {
   1444             ++currentCharacter<SrcCharacterType>();
   1445             m_token = CONTAINS;
   1446         }
   1447         break;
   1448 
   1449     case CharacterPlus:
   1450         if (UNLIKELY(m_parsingMode == NthChildMode)) {
   1451             // Simplest case. "+[0-9]*n" is always NthChild.
   1452             if (parseNthChild<SrcCharacterType>()) {
   1453                 parseNthChildExtra<SrcCharacterType>();
   1454                 m_token = NTH;
   1455                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
   1456             }
   1457         }
   1458         break;
   1459 
   1460     case CharacterLess:
                // "<!--" : the '<' was already consumed above.
   1461         if (currentCharacter<SrcCharacterType>()[0] == '!' && currentCharacter<SrcCharacterType>()[1] == '-' && currentCharacter<SrcCharacterType>()[2] == '-') {
   1462             currentCharacter<SrcCharacterType>() += 3;
   1463             m_token = SGML_CD;
   1464         }
   1465         break;
   1466 
   1467     case CharacterAt:
   1468         if (isIdentifierStart<SrcCharacterType>()) {
   1469             m_token = ATKEYWORD;
   1470             ++result;
   1471             parseIdentifier(result, resultString, hasEscape);
   1472             // The standard enables unicode escapes in at-rules. In this case only the resultString will contain the
   1473             // correct identifier, hence we have to use it to determine its length instead of the usual pointer arithmetic.
                    // The "+ 1" accounts for the leading '@'.
   1474             detectAtToken<SrcCharacterType>(resultString.length() + 1, hasEscape);
   1475         }
   1476         break;
   1477 
   1478     case CharacterBackSlash:
   1479         if (isCSSEscape(*currentCharacter<SrcCharacterType>())) {
   1480             --currentCharacter<SrcCharacterType>();
   1481             parseIdentifier(result, yylval->string, hasEscape);
   1482             m_token = IDENT;
   1483         }
   1484         break;
   1485 
   1486     case CharacterXor:
   1487         if (*currentCharacter<SrcCharacterType>() == '=') {
   1488             ++currentCharacter<SrcCharacterType>();
   1489             m_token = BEGINSWITH;
   1490         }
   1491         break;
   1492 
   1493     case CharacterVerticalBar:
   1494         if (*currentCharacter<SrcCharacterType>() == '=') {
   1495             ++currentCharacter<SrcCharacterType>();
   1496             m_token = DASHMATCH;
   1497         }
   1498         break;
   1499 
   1500     case CharacterTilde:
   1501         if (*currentCharacter<SrcCharacterType>() == '=') {
   1502             ++currentCharacter<SrcCharacterType>();
   1503             m_token = INCLUDES;
   1504         }
   1505         break;
   1506 
   1507     default:
   1508         ASSERT_NOT_REACHED();
   1509         break;
   1510     }
   1511 
   1512     return m_token;
   1513 }
   1514 
   1515 template <>
   1516 inline void CSSTokenizer::setTokenStart<LChar>(LChar* tokenStart)
   1517 {
   1518     m_tokenStart.ptr8 = tokenStart;
   1519 }
   1520 
   1521 template <>
   1522 inline void CSSTokenizer::setTokenStart<UChar>(UChar* tokenStart)
   1523 {
   1524     m_tokenStart.ptr16 = tokenStart;
   1525 }
   1526 
   1527 void CSSTokenizer::setupTokenizer(const char* prefix, unsigned prefixLength, const String& string, const char* suffix, unsigned suffixLength)
   1528 {
   1529     m_parsedTextPrefixLength = prefixLength;
   1530     m_parsedTextSuffixLength = suffixLength;
   1531     unsigned stringLength = string.length();
   1532     unsigned length = stringLength + m_parsedTextPrefixLength + m_parsedTextSuffixLength + 1;
   1533     m_length = length;
   1534 
   1535     if (!stringLength || string.is8Bit()) {
   1536         m_dataStart8 = adoptArrayPtr(new LChar[length]);
   1537         for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
   1538             m_dataStart8[i] = prefix[i];
   1539 
   1540         if (stringLength)
   1541             memcpy(m_dataStart8.get() + m_parsedTextPrefixLength, string.characters8(), stringLength * sizeof(LChar));
   1542 
   1543         unsigned start = m_parsedTextPrefixLength + stringLength;
   1544         unsigned end = start + suffixLength;
   1545         for (unsigned i = start; i < end; i++)
   1546             m_dataStart8[i] = suffix[i - start];
   1547 
   1548         m_dataStart8[length - 1] = 0;
   1549 
   1550         m_is8BitSource = true;
   1551         m_currentCharacter8 = m_dataStart8.get();
   1552         m_currentCharacter16 = 0;
   1553         setTokenStart<LChar>(m_currentCharacter8);
   1554         m_lexFunc = &CSSTokenizer::realLex<LChar>;
   1555         return;
   1556     }
   1557 
   1558     m_dataStart16 = adoptArrayPtr(new UChar[length]);
   1559     for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
   1560         m_dataStart16[i] = prefix[i];
   1561 
   1562     ASSERT(stringLength);
   1563     memcpy(m_dataStart16.get() + m_parsedTextPrefixLength, string.characters16(), stringLength * sizeof(UChar));
   1564 
   1565     unsigned start = m_parsedTextPrefixLength + stringLength;
   1566     unsigned end = start + suffixLength;
   1567     for (unsigned i = start; i < end; i++)
   1568         m_dataStart16[i] = suffix[i - start];
   1569 
   1570     m_dataStart16[length - 1] = 0;
   1571 
   1572     m_is8BitSource = false;
   1573     m_currentCharacter8 = 0;
   1574     m_currentCharacter16 = m_dataStart16.get();
   1575     setTokenStart<UChar>(m_currentCharacter16);
   1576     m_lexFunc = &CSSTokenizer::realLex<UChar>;
   1577 }
   1578 
   1579 } // namespace blink
   1580