1 /* 2 * Copyright (C) 2010 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 */ 24 25 #include "config.h" 26 #include "core/html/parser/HTMLParserIdioms.h" 27 28 #include "core/HTMLNames.h" 29 #include <limits> 30 #include "wtf/MathExtras.h" 31 #include "wtf/text/AtomicString.h" 32 #include "wtf/text/StringBuilder.h" 33 #include "wtf/text/StringHash.h" 34 #include "wtf/text/TextEncoding.h" 35 36 namespace blink { 37 38 using namespace HTMLNames; 39 40 template <typename CharType> 41 static String stripLeadingAndTrailingHTMLSpaces(String string, const CharType* characters, unsigned length) 42 { 43 unsigned numLeadingSpaces = 0; 44 unsigned numTrailingSpaces = 0; 45 46 for (; numLeadingSpaces < length; ++numLeadingSpaces) { 47 if (isNotHTMLSpace<CharType>(characters[numLeadingSpaces])) 48 break; 49 } 50 51 if (numLeadingSpaces == length) 52 return string.isNull() ? string : emptyAtom.string(); 53 54 for (; numTrailingSpaces < length; ++numTrailingSpaces) { 55 if (isNotHTMLSpace<CharType>(characters[length - numTrailingSpaces - 1])) 56 break; 57 } 58 59 ASSERT(numLeadingSpaces + numTrailingSpaces < length); 60 61 if (!(numLeadingSpaces | numTrailingSpaces)) 62 return string; 63 64 return string.substring(numLeadingSpaces, length - (numLeadingSpaces + numTrailingSpaces)); 65 } 66 67 String stripLeadingAndTrailingHTMLSpaces(const String& string) 68 { 69 unsigned length = string.length(); 70 71 if (!length) 72 return string.isNull() ? string : emptyAtom.string(); 73 74 if (string.is8Bit()) 75 return stripLeadingAndTrailingHTMLSpaces<LChar>(string, string.characters8(), length); 76 77 return stripLeadingAndTrailingHTMLSpaces<UChar>(string, string.characters16(), length); 78 } 79 80 String serializeForNumberType(const Decimal& number) 81 { 82 if (number.isZero()) { 83 // Decimal::toString appends exponent, e.g. "0e-18" 84 return number.isNegative() ? "-0" : "0"; 85 } 86 return number.toString(); 87 } 88 89 String serializeForNumberType(double number) 90 { 91 // According to HTML5, "the best representation of the number n as a floating 92 // point number" is a string produced by applying ToString() to n. 93 return String::numberToStringECMAScript(number); 94 } 95 96 Decimal parseToDecimalForNumberType(const String& string, const Decimal& fallbackValue) 97 { 98 // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers and parseToDoubleForNumberType 99 // String::toDouble() accepts leading + and whitespace characters, which are not valid here. 100 const UChar firstCharacter = string[0]; 101 if (firstCharacter != '-' && firstCharacter != '.' && !isASCIIDigit(firstCharacter)) 102 return fallbackValue; 103 104 const Decimal value = Decimal::fromString(string); 105 if (!value.isFinite()) 106 return fallbackValue; 107 108 // Numbers are considered finite IEEE 754 Double-precision floating point values. 109 const Decimal doubleMax = Decimal::fromDouble(std::numeric_limits<double>::max()); 110 if (value < -doubleMax || value > doubleMax) 111 return fallbackValue; 112 113 // We return +0 for -0 case. 114 return value.isZero() ? Decimal(0) : value; 115 } 116 117 double parseToDoubleForNumberType(const String& string, double fallbackValue) 118 { 119 // http://www.whatwg.org/specs/web-apps/current-work/#floating-point-numbers 120 // String::toDouble() accepts leading + and whitespace characters, which are not valid here. 121 UChar firstCharacter = string[0]; 122 if (firstCharacter != '-' && firstCharacter != '.' && !isASCIIDigit(firstCharacter)) 123 return fallbackValue; 124 125 bool valid = false; 126 double value = string.toDouble(&valid); 127 if (!valid) 128 return fallbackValue; 129 130 // NaN and infinity are considered valid by String::toDouble, but not valid here. 131 if (!std::isfinite(value)) 132 return fallbackValue; 133 134 // Numbers are considered finite IEEE 754 Double-precision floating point values. 135 if (-std::numeric_limits<double>::max() > value || value > std::numeric_limits<double>::max()) 136 return fallbackValue; 137 138 // The following expression converts -0 to +0. 139 return value ? value : 0; 140 } 141 142 template <typename CharacterType> 143 static bool parseHTMLIntegerInternal(const CharacterType* position, const CharacterType* end, int& value) 144 { 145 // Step 3 146 int sign = 1; 147 148 // Step 4 149 while (position < end) { 150 if (!isHTMLSpace<CharacterType>(*position)) 151 break; 152 ++position; 153 } 154 155 // Step 5 156 if (position == end) 157 return false; 158 ASSERT(position < end); 159 160 // Step 6 161 if (*position == '-') { 162 sign = -1; 163 ++position; 164 } else if (*position == '+') 165 ++position; 166 if (position == end) 167 return false; 168 ASSERT(position < end); 169 170 // Step 7 171 if (!isASCIIDigit(*position)) 172 return false; 173 174 // Step 8 175 StringBuilder digits; 176 while (position < end) { 177 if (!isASCIIDigit(*position)) 178 break; 179 digits.append(*position++); 180 } 181 182 // Step 9 183 bool ok; 184 if (digits.is8Bit()) 185 value = sign * charactersToIntStrict(digits.characters8(), digits.length(), &ok); 186 else 187 value = sign * charactersToIntStrict(digits.characters16(), digits.length(), &ok); 188 return ok; 189 } 190 191 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-integers 192 bool parseHTMLInteger(const String& input, int& value) 193 { 194 // Step 1 195 // Step 2 196 unsigned length = input.length(); 197 if (!length || input.is8Bit()) { 198 const LChar* start = input.characters8(); 199 return parseHTMLIntegerInternal(start, start + length, value); 200 } 201 202 const UChar* start = input.characters16(); 203 return parseHTMLIntegerInternal(start, start + length, value); 204 } 205 206 template <typename CharacterType> 207 static bool parseHTMLNonNegativeIntegerInternal(const CharacterType* position, const CharacterType* end, unsigned& value) 208 { 209 // Step 3 210 while (position < end) { 211 if (!isHTMLSpace<CharacterType>(*position)) 212 break; 213 ++position; 214 } 215 216 // Step 4 217 if (position == end) 218 return false; 219 ASSERT(position < end); 220 221 // Step 5 222 if (*position == '+') 223 ++position; 224 225 // Step 6 226 if (position == end) 227 return false; 228 ASSERT(position < end); 229 230 // Step 7 231 if (!isASCIIDigit(*position)) 232 return false; 233 234 // Step 8 235 StringBuilder digits; 236 while (position < end) { 237 if (!isASCIIDigit(*position)) 238 break; 239 digits.append(*position++); 240 } 241 242 // Step 9 243 bool ok; 244 if (digits.is8Bit()) 245 value = charactersToUIntStrict(digits.characters8(), digits.length(), &ok); 246 else 247 value = charactersToUIntStrict(digits.characters16(), digits.length(), &ok); 248 return ok; 249 } 250 251 252 // http://www.whatwg.org/specs/web-apps/current-work/#rules-for-parsing-non-negative-integers 253 bool parseHTMLNonNegativeInteger(const String& input, unsigned& value) 254 { 255 // Step 1 256 // Step 2 257 unsigned length = input.length(); 258 if (length && input.is8Bit()) { 259 const LChar* start = input.characters8(); 260 return parseHTMLNonNegativeIntegerInternal(start, start + length, value); 261 } 262 263 const UChar* start = input.characters16(); 264 return parseHTMLNonNegativeIntegerInternal(start, start + length, value); 265 } 266 267 static const char charsetString[] = "charset"; 268 static const size_t charsetLength = sizeof("charset") - 1; 269 270 String extractCharset(const String& value) 271 { 272 size_t pos = 0; 273 unsigned length = value.length(); 274 275 while (pos < length) { 276 pos = value.find(charsetString, pos, false); 277 if (pos == kNotFound) 278 break; 279 280 pos += charsetLength; 281 282 // Skip whitespace. 283 while (pos < length && value[pos] <= ' ') 284 ++pos; 285 286 if (value[pos] != '=') 287 continue; 288 289 ++pos; 290 291 while (pos < length && value[pos] <= ' ') 292 ++pos; 293 294 char quoteMark = 0; 295 if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { 296 quoteMark = static_cast<char>(value[pos++]); 297 ASSERT(!(quoteMark & 0x80)); 298 } 299 300 if (pos == length) 301 break; 302 303 unsigned end = pos; 304 while (end < length && ((quoteMark && value[end] != quoteMark) || (!quoteMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[end] != ';'))) 305 ++end; 306 307 if (quoteMark && (end == length)) 308 break; // Close quote not found. 309 310 return value.substring(pos, end - pos); 311 } 312 313 return ""; 314 } 315 316 enum Mode { 317 None, 318 Charset, 319 Pragma, 320 }; 321 322 WTF::TextEncoding encodingFromMetaAttributes(const HTMLAttributeList& attributes) 323 { 324 bool gotPragma = false; 325 Mode mode = None; 326 String charset; 327 328 for (HTMLAttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) { 329 const String& attributeName = iter->first; 330 const String& attributeValue = AtomicString(iter->second); 331 332 if (threadSafeMatch(attributeName, http_equivAttr)) { 333 if (equalIgnoringCase(attributeValue, "content-type")) 334 gotPragma = true; 335 } else if (charset.isEmpty()) { 336 if (threadSafeMatch(attributeName, charsetAttr)) { 337 charset = attributeValue; 338 mode = Charset; 339 } else if (threadSafeMatch(attributeName, contentAttr)) { 340 charset = extractCharset(attributeValue); 341 if (charset.length()) 342 mode = Pragma; 343 } 344 } 345 } 346 347 if (mode == Charset || (mode == Pragma && gotPragma)) 348 return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); 349 350 return WTF::TextEncoding(); 351 } 352 353 static bool threadSafeEqual(const StringImpl* a, const StringImpl* b) 354 { 355 if (a == b) 356 return true; 357 if (a->hash() != b->hash()) 358 return false; 359 return equalNonNull(a, b); 360 } 361 362 bool threadSafeMatch(const QualifiedName& a, const QualifiedName& b) 363 { 364 return threadSafeEqual(a.localName().impl(), b.localName().impl()); 365 } 366 367 bool threadSafeMatch(const String& localName, const QualifiedName& qName) 368 { 369 return threadSafeEqual(localName.impl(), qName.localName().impl()); 370 } 371 372 template<typename CharType> 373 inline StringImpl* findStringIfStatic(const CharType* characters, unsigned length) 374 { 375 // We don't need to try hashing if we know the string is too long. 376 if (length > StringImpl::highestStaticStringLength()) 377 return 0; 378 // computeHashAndMaskTop8Bits is the function StringImpl::hash() uses. 379 unsigned hash = StringHasher::computeHashAndMaskTop8Bits(characters, length); 380 const WTF::StaticStringsTable& table = StringImpl::allStaticStrings(); 381 ASSERT(!table.isEmpty()); 382 383 WTF::StaticStringsTable::const_iterator it = table.find(hash); 384 if (it == table.end()) 385 return 0; 386 // It's possible to have hash collisions between arbitrary strings and 387 // known identifiers (e.g. "bvvfg" collides with "script"). 388 // However ASSERTs in StringImpl::createStatic guard against there ever being collisions 389 // between static strings. 390 if (!equal(it->value, characters, length)) 391 return 0; 392 return it->value; 393 } 394 395 String attemptStaticStringCreation(const LChar* characters, size_t size) 396 { 397 String string(findStringIfStatic(characters, size)); 398 if (string.impl()) 399 return string; 400 return String(characters, size); 401 } 402 403 String attemptStaticStringCreation(const UChar* characters, size_t size, CharacterWidth width) 404 { 405 String string(findStringIfStatic(characters, size)); 406 if (string.impl()) 407 return string; 408 if (width == Likely8Bit) 409 string = StringImpl::create8BitIfPossible(characters, size); 410 else if (width == Force8Bit) 411 string = String::make8BitFrom16BitSource(characters, size); 412 else 413 string = String(characters, size); 414 415 return string; 416 } 417 418 } 419