Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2010 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1.  Redistributions of source code must retain the above copyright
      8  *     notice, this list of conditions and the following disclaimer.
      9  * 2.  Redistributions in binary form must reproduce the above copyright
     10  *     notice, this list of conditions and the following disclaimer in the
     11  *     documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     15  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     16  * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
     17  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     18  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     19  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     20  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     21  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     22  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     23  */
     24 
     25 #include "config.h"
     26 #include "core/html/parser/HTMLParserIdioms.h"
     27 
     28 #include "core/HTMLNames.h"
     29 #include <limits>
     30 #include "wtf/MathExtras.h"
     31 #include "wtf/text/AtomicString.h"
     32 #include "wtf/text/StringBuilder.h"
     33 #include "wtf/text/StringHash.h"
     34 #include "wtf/text/TextEncoding.h"
     35 
     36 namespace WebCore {
     37 
     38 using namespace HTMLNames;
     39 
     40 template <typename CharType>
     41 static String stripLeadingAndTrailingHTMLSpaces(String string, const CharType* characters, unsigned length)
     42 {
     43     unsigned numLeadingSpaces = 0;
     44     unsigned numTrailingSpaces = 0;
     45 
     46     for (; numLeadingSpaces < length; ++numLeadingSpaces) {
     47         if (isNotHTMLSpace<CharType>(characters[numLeadingSpaces]))
     48             break;
     49     }
     50 
     51     if (numLeadingSpaces == length)
     52         return string.isNull() ? string : emptyAtom.string();
     53 
     54     for (; numTrailingSpaces < length; ++numTrailingSpaces) {
     55         if (isNotHTMLSpace<CharType>(characters[length - numTrailingSpaces - 1]))
     56             break;
     57     }
     58 
     59     ASSERT(numLeadingSpaces + numTrailingSpaces < length);
     60 
     61     if (!(numLeadingSpaces | numTrailingSpaces))
     62         return string;
     63 
     64     return string.substring(numLeadingSpaces, length - (numLeadingSpaces + numTrailingSpaces));
     65 }
     66 
     67 String stripLeadingAndTrailingHTMLSpaces(const String& string)
     68 {
     69     unsigned length = string.length();
     70 
     71     if (!length)
     72         return string.isNull() ? string : emptyAtom.string();
     73 
     74     if (string.is8Bit())
     75         return stripLeadingAndTrailingHTMLSpaces<LChar>(string, string.characters8(), length);
     76 
     77     return stripLeadingAndTrailingHTMLSpaces<UChar>(string, string.characters16(), length);
     78 }
     79 
     80 String serializeForNumberType(const Decimal& number)
     81 {
     82     if (number.isZero()) {
     83         // Decimal::toString appends exponent, e.g. "0e-18"
     84         return number.isNegative() ? "-0" : "0";
     85     }
     86     return number.toString();
     87 }
     88 
     89 String serializeForNumberType(double number)
     90 {
     91     // According to HTML5, "the best representation of the number n as a floating
     92     // point number" is a string produced by applying ToString() to n.
     93     return String::numberToStringECMAScript(number);
     94 }
     95 
     96 Decimal parseToDecimalForNumberType(const String& string, const Decimal& fallbackValue)
     97 {
     98     // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers and parseToDoubleForNumberType
     99     // String::toDouble() accepts leading + and whitespace characters, which are not valid here.
    100     const UChar firstCharacter = string[0];
    101     if (firstCharacter != '-' && firstCharacter != '.' && !isASCIIDigit(firstCharacter))
    102         return fallbackValue;
    103 
    104     const Decimal value = Decimal::fromString(string);
    105     if (!value.isFinite())
    106         return fallbackValue;
    107 
    108     // Numbers are considered finite IEEE 754 Double-precision floating point values.
    109     const Decimal doubleMax = Decimal::fromDouble(std::numeric_limits<double>::max());
    110     if (value < -doubleMax || value > doubleMax)
    111         return fallbackValue;
    112 
    113     // We return +0 for -0 case.
    114     return value.isZero() ? Decimal(0) : value;
    115 }
    116 
    117 double parseToDoubleForNumberType(const String& string, double fallbackValue)
    118 {
    119     // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers
    120     // String::toDouble() accepts leading + and whitespace characters, which are not valid here.
    121     UChar firstCharacter = string[0];
    122     if (firstCharacter != '-' && firstCharacter != '.' && !isASCIIDigit(firstCharacter))
    123         return fallbackValue;
    124 
    125     bool valid = false;
    126     double value = string.toDouble(&valid);
    127     if (!valid)
    128         return fallbackValue;
    129 
    130     // NaN and infinity are considered valid by String::toDouble, but not valid here.
    131     if (!std::isfinite(value))
    132         return fallbackValue;
    133 
    134     // Numbers are considered finite IEEE 754 Double-precision floating point values.
    135     if (-std::numeric_limits<double>::max() > value || value > std::numeric_limits<double>::max())
    136         return fallbackValue;
    137 
    138     // The following expression converts -0 to +0.
    139     return value ? value : 0;
    140 }
    141 
    142 template <typename CharacterType>
    143 static bool parseHTMLIntegerInternal(const CharacterType* position, const CharacterType* end, int& value)
    144 {
    145     // Step 3
    146     int sign = 1;
    147 
    148     // Step 4
    149     while (position < end) {
    150         if (!isHTMLSpace<CharacterType>(*position))
    151             break;
    152         ++position;
    153     }
    154 
    155     // Step 5
    156     if (position == end)
    157         return false;
    158     ASSERT(position < end);
    159 
    160     // Step 6
    161     if (*position == '-') {
    162         sign = -1;
    163         ++position;
    164     } else if (*position == '+')
    165         ++position;
    166     if (position == end)
    167         return false;
    168     ASSERT(position < end);
    169 
    170     // Step 7
    171     if (!isASCIIDigit(*position))
    172         return false;
    173 
    174     // Step 8
    175     StringBuilder digits;
    176     while (position < end) {
    177         if (!isASCIIDigit(*position))
    178             break;
    179         digits.append(*position++);
    180     }
    181 
    182     // Step 9
    183     bool ok;
    184     if (digits.is8Bit())
    185         value = sign * charactersToIntStrict(digits.characters8(), digits.length(), &ok);
    186     else
    187         value = sign * charactersToIntStrict(digits.characters16(), digits.length(), &ok);
    188     return ok;
    189 }
    190 
    191 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-integers
    192 bool parseHTMLInteger(const String& input, int& value)
    193 {
    194     // Step 1
    195     // Step 2
    196     unsigned length = input.length();
    197     if (!length || input.is8Bit()) {
    198         const LChar* start = input.characters8();
    199         return parseHTMLIntegerInternal(start, start + length, value);
    200     }
    201 
    202     const UChar* start = input.characters16();
    203     return parseHTMLIntegerInternal(start, start + length, value);
    204 }
    205 
    206 template <typename CharacterType>
    207 static bool parseHTMLNonNegativeIntegerInternal(const CharacterType* position, const CharacterType* end, unsigned& value)
    208 {
    209     // Step 3
    210     while (position < end) {
    211         if (!isHTMLSpace<CharacterType>(*position))
    212             break;
    213         ++position;
    214     }
    215 
    216     // Step 4
    217     if (position == end)
    218         return false;
    219     ASSERT(position < end);
    220 
    221     // Step 5
    222     if (*position == '+')
    223         ++position;
    224 
    225     // Step 6
    226     if (position == end)
    227         return false;
    228     ASSERT(position < end);
    229 
    230     // Step 7
    231     if (!isASCIIDigit(*position))
    232         return false;
    233 
    234     // Step 8
    235     StringBuilder digits;
    236     while (position < end) {
    237         if (!isASCIIDigit(*position))
    238             break;
    239         digits.append(*position++);
    240     }
    241 
    242     // Step 9
    243     bool ok;
    244     if (digits.is8Bit())
    245         value = charactersToUIntStrict(digits.characters8(), digits.length(), &ok);
    246     else
    247         value = charactersToUIntStrict(digits.characters16(), digits.length(), &ok);
    248     return ok;
    249 }
    250 
    251 
    252 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-non-negative-integers
    253 bool parseHTMLNonNegativeInteger(const String& input, unsigned& value)
    254 {
    255     // Step 1
    256     // Step 2
    257     unsigned length = input.length();
    258     if (length && input.is8Bit()) {
    259         const LChar* start = input.characters8();
    260         return parseHTMLNonNegativeIntegerInternal(start, start + length, value);
    261     }
    262 
    263     const UChar* start = input.characters16();
    264     return parseHTMLNonNegativeIntegerInternal(start, start + length, value);
    265 }
    266 
    267 static const char charsetString[] = "charset";
    268 static const size_t charsetLength = sizeof("charset") - 1;
    269 
    270 String extractCharset(const String& value)
    271 {
    272     size_t pos = 0;
    273     unsigned length = value.length();
    274 
    275     while (pos < length) {
    276         pos = value.find(charsetString, pos, false);
    277         if (pos == kNotFound)
    278             break;
    279 
    280         pos += charsetLength;
    281 
    282         // Skip whitespace.
    283         while (pos < length && value[pos] <= ' ')
    284             ++pos;
    285 
    286         if (value[pos] != '=')
    287             continue;
    288 
    289         ++pos;
    290 
    291         while (pos < length && value[pos] <= ' ')
    292             ++pos;
    293 
    294         char quoteMark = 0;
    295         if (pos < length && (value[pos] == '"' || value[pos] == '\'')) {
    296             quoteMark = static_cast<char>(value[pos++]);
    297             ASSERT(!(quoteMark & 0x80));
    298         }
    299 
    300         if (pos == length)
    301             break;
    302 
    303         unsigned end = pos;
    304         while (end < length && ((quoteMark && value[end] != quoteMark) || (!quoteMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[end] != ';')))
    305             ++end;
    306 
    307         if (quoteMark && (end == length))
    308             break; // Close quote not found.
    309 
    310         return value.substring(pos, end - pos);
    311     }
    312 
    313     return "";
    314 }
    315 
    316 enum Mode {
    317     None,
    318     Charset,
    319     Pragma,
    320 };
    321 
    322 WTF::TextEncoding encodingFromMetaAttributes(const HTMLAttributeList& attributes)
    323 {
    324     bool gotPragma = false;
    325     Mode mode = None;
    326     String charset;
    327 
    328     for (HTMLAttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) {
    329         const String& attributeName = iter->first;
    330         const String& attributeValue = AtomicString(iter->second);
    331 
    332         if (threadSafeMatch(attributeName, http_equivAttr)) {
    333             if (equalIgnoringCase(attributeValue, "content-type"))
    334                 gotPragma = true;
    335         } else if (charset.isEmpty()) {
    336             if (threadSafeMatch(attributeName, charsetAttr)) {
    337                 charset = attributeValue;
    338                 mode = Charset;
    339             } else if (threadSafeMatch(attributeName, contentAttr)) {
    340                 charset = extractCharset(attributeValue);
    341                 if (charset.length())
    342                     mode = Pragma;
    343             }
    344         }
    345     }
    346 
    347     if (mode == Charset || (mode == Pragma && gotPragma))
    348         return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset));
    349 
    350     return WTF::TextEncoding();
    351 }
    352 
    353 static bool threadSafeEqual(const StringImpl* a, const StringImpl* b)
    354 {
    355     if (a == b)
    356         return true;
    357     if (a->hash() != b->hash())
    358         return false;
    359     return equalNonNull(a, b);
    360 }
    361 
    362 bool threadSafeMatch(const QualifiedName& a, const QualifiedName& b)
    363 {
    364     return threadSafeEqual(a.localName().impl(), b.localName().impl());
    365 }
    366 
    367 bool threadSafeMatch(const String& localName, const QualifiedName& qName)
    368 {
    369     return threadSafeEqual(localName.impl(), qName.localName().impl());
    370 }
    371 
    372 template<typename CharType>
    373 inline StringImpl* findStringIfStatic(const CharType* characters, unsigned length)
    374 {
    375     // We don't need to try hashing if we know the string is too long.
    376     if (length > StringImpl::highestStaticStringLength())
    377         return 0;
    378     // computeHashAndMaskTop8Bits is the function StringImpl::hash() uses.
    379     unsigned hash = StringHasher::computeHashAndMaskTop8Bits(characters, length);
    380     const WTF::StaticStringsTable& table = StringImpl::allStaticStrings();
    381     ASSERT(!table.isEmpty());
    382 
    383     WTF::StaticStringsTable::const_iterator it = table.find(hash);
    384     if (it == table.end())
    385         return 0;
    386     // It's possible to have hash collisions between arbitrary strings and
    387     // known identifiers (e.g. "bvvfg" collides with "script").
    388     // However ASSERTs in StringImpl::createStatic guard against there ever being collisions
    389     // between static strings.
    390     if (!equal(it->value, characters, length))
    391         return 0;
    392     return it->value;
    393 }
    394 
    395 String attemptStaticStringCreation(const LChar* characters, size_t size)
    396 {
    397     String string(findStringIfStatic(characters, size));
    398     if (string.impl())
    399         return string;
    400     return String(characters, size);
    401 }
    402 
    403 String attemptStaticStringCreation(const UChar* characters, size_t size, CharacterWidth width)
    404 {
    405     String string(findStringIfStatic(characters, size));
    406     if (string.impl())
    407         return string;
    408     if (width == Likely8Bit)
    409         string = StringImpl::create8BitIfPossible(characters, size);
    410     else if (width == Force8Bit)
    411         string = String::make8BitFrom16BitSource(characters, size);
    412     else
    413         string = String(characters, size);
    414 
    415     return string;
    416 }
    417 
    418 }
    419