1 /* 2 Copyright (C) 1999 Lars Knoll (knoll (at) mpi-hd.mpg.de) 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap (at) nypop.com) 5 6 This library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Library General Public 8 License as published by the Free Software Foundation; either 9 version 2 of the License, or (at your option) any later version. 10 11 This library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Library General Public License for more details. 15 16 You should have received a copy of the GNU Library General Public License 17 along with this library; see the file COPYING.LIB. If not, write to 18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 Boston, MA 02110-1301, USA. 20 */ 21 22 23 #include "config.h" 24 #include "core/loader/TextResourceDecoder.h" 25 26 #include "HTMLNames.h" 27 #include "core/dom/DOMImplementation.h" 28 #include "core/html/parser/HTMLMetaCharsetParser.h" 29 #include "core/platform/text/TextEncodingDetector.h" 30 #include "wtf/StringExtras.h" 31 #include "wtf/text/TextCodec.h" 32 #include "wtf/text/TextEncoding.h" 33 #include "wtf/text/TextEncodingRegistry.h" 34 35 using namespace WTF; 36 37 namespace WebCore { 38 39 using namespace HTMLNames; 40 41 static inline bool bytesEqual(const char* p, char b0, char b1) 42 { 43 return p[0] == b0 && p[1] == b1; 44 } 45 46 static inline bool bytesEqual(const char* p, char b0, char b1, char b2) 47 { 48 return p[0] == b0 && p[1] == b1 && p[2] == b2; 49 } 50 51 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4) 52 { 53 return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4; 54 } 55 56 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5) 57 { 58 return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5; 59 } 60 61 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7) 62 { 63 return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5 && p[6] == b6 && p[7] == b7; 64 } 65 66 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9) 67 { 68 return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5 && p[6] == b6 && p[7] == b7 && p[8] == b8 && p[9] == b9; 69 } 70 71 // You might think we should put these find functions elsewhere, perhaps with the 72 // similar functions that operate on UChar, but arguably only the decoder has 73 // a reason to process strings of char rather than UChar. 74 75 static int find(const char* subject, size_t subjectLength, const char* target) 76 { 77 size_t targetLength = strlen(target); 78 if (targetLength > subjectLength) 79 return -1; 80 for (size_t i = 0; i <= subjectLength - targetLength; ++i) { 81 bool match = true; 82 for (size_t j = 0; j < targetLength; ++j) { 83 if (subject[i + j] != target[j]) { 84 match = false; 85 break; 86 } 87 } 88 if (match) 89 return i; 90 } 91 return -1; 92 } 93 94 static WTF::TextEncoding findTextEncoding(const char* encodingName, int length) 95 { 96 Vector<char, 64> buffer(length + 1); 97 memcpy(buffer.data(), encodingName, length); 98 buffer[length] = '\0'; 99 return buffer.data(); 100 } 101 102 class KanjiCode { 103 public: 104 enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 }; 105 static enum Type judge(const char* str, int length); 106 static const int ESC = 0x1b; 107 static const unsigned char sjisMap[256]; 108 static int ISkanji(int code) 109 { 110 if (code >= 0x100) 111 return 0; 112 return sjisMap[code & 0xff] & 1; 113 } 114 static int ISkana(int code) 115 { 116 if (code >= 0x100) 117 return 0; 118 return sjisMap[code & 0xff] & 2; 119 } 120 }; 121 122 const unsigned char KanjiCode::sjisMap[256] = { 123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 126 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 132 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 133 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 134 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 135 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 136 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 137 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 138 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 139 }; 140 141 /* 142 * EUC-JP is 143 * [0xa1 - 0xfe][0xa1 - 0xfe] 144 * 0x8e[0xa1 - 0xfe](SS2) 145 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3) 146 * 147 * Shift_Jis is 148 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc] 149 * 150 * Shift_Jis Hankaku Kana is 151 * [0xa1 - 0xdf] 152 */ 153 154 /* 155 * KanjiCode::judge() is based on judge_jcode() from jvim 156 * http://hp.vector.co.jp/authors/VA003457/vim/ 157 * 158 * Special Thanks to Kenichi Tsuchida 159 */ 160 161 enum KanjiCode::Type KanjiCode::judge(const char* str, int size) 162 { 163 enum Type code; 164 int i; 165 int bfr = false; /* Kana Moji */ 166 int bfk = 0; /* EUC Kana */ 167 int sjis = 0; 168 int euc = 0; 169 170 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str); 171 172 code = ASCII; 173 174 i = 0; 175 while (i < size) { 176 if (ptr[i] == ESC && (size - i >= 3)) { 177 if (bytesEqual(str + i + 1, '$', 'B') 178 || bytesEqual(str + i + 1, '(', 'B') 179 || bytesEqual(str + i + 1, '$', '@') 180 || bytesEqual(str + i + 1, '(', 'J')) { 181 code = JIS; 182 goto breakBreak; 183 } 184 if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')', 'I')) { 185 code = JIS; 186 i += 3; 187 } else { 188 i++; 189 } 190 bfr = false; 191 bfk = 0; 192 } else { 193 if (ptr[i] < 0x20) { 194 bfr = false; 195 bfk = 0; 196 /* ?? check kudokuten ?? && ?? hiragana ?? */ 197 if ((i >= 2) && (ptr[i - 2] == 0x81) 198 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) { 199 code = SJIS; 200 sjis += 100; /* kudokuten */ 201 } else if ((i >= 2) && (ptr[i - 2] == 0xa1) 202 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) { 203 code = EUC; 204 euc += 100; /* kudokuten */ 205 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) { 206 sjis += 40; /* hiragana */ 207 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) { 208 euc += 40; /* hiragana */ 209 } 210 } else { 211 /* ?? check hiragana or katana ?? */ 212 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) { 213 sjis++; /* hiragana */ 214 } else if ((size - i > 1) && (ptr[i] == 0x83) 215 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) { 216 sjis++; /* katakana */ 217 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) { 218 euc++; /* hiragana */ 219 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) { 220 euc++; /* katakana */ 221 } 222 if (bfr) { 223 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) { 224 code = SJIS; 225 goto breakBreak; 226 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) { 227 code = SJIS; 228 goto breakBreak; 229 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) { 230 code = EUC; 231 goto breakBreak; 232 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) { 233 code = EUC; 234 goto breakBreak; 235 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) { 236 code = SJIS; 237 goto breakBreak; 238 } else if (ptr[i] <= 0x7f) { 239 code = SJIS; 240 goto breakBreak; 241 } else { 242 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) { 243 euc++; /* sjis hankaku kana kigo */ 244 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) { 245 ; /* sjis hankaku kana */ 246 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) { 247 euc++; 248 } else if (0x8e == ptr[i]) { 249 euc++; 250 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) { 251 sjis++; 252 } 253 bfr = false; 254 bfk = 0; 255 } 256 } else if (0x8e == ptr[i]) { 257 if (size - i <= 1) { 258 ; 259 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) { 260 /* EUC KANA or SJIS KANJI */ 261 if (bfk == 1) { 262 euc += 100; 263 } 264 bfk++; 265 i++; 266 } else { 267 /* SJIS only */ 268 code = SJIS; 269 goto breakBreak; 270 } 271 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) { 272 /* SJIS only */ 273 code = SJIS; 274 if ((size - i >= 1) 275 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) 276 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) { 277 goto breakBreak; 278 } 279 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) { 280 /* EUC only */ 281 code = EUC; 282 if ((size - i >= 1) 283 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) { 284 goto breakBreak; 285 } 286 } else if (ptr[i] <= 0x7f) { 287 ; 288 } else { 289 bfr = true; 290 bfk = 0; 291 } 292 } 293 i++; 294 } 295 } 296 if (code == ASCII) { 297 if (sjis > euc) { 298 code = SJIS; 299 } else if (sjis < euc) { 300 code = EUC; 301 } 302 } 303 breakBreak: 304 return (code); 305 } 306 307 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType) 308 { 309 if (equalIgnoringCase(mimeType, "text/css")) 310 return CSS; 311 if (equalIgnoringCase(mimeType, "text/html")) 312 return HTML; 313 if (DOMImplementation::isXMLMIMEType(mimeType)) 314 return XML; 315 return PlainText; 316 } 317 318 const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const WTF::TextEncoding& specifiedDefaultEncoding) 319 { 320 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII 321 // for text/xml. This matches Firefox. 322 if (contentType == XML) 323 return UTF8Encoding(); 324 if (!specifiedDefaultEncoding.isValid()) 325 return Latin1Encoding(); 326 return specifiedDefaultEncoding; 327 } 328 329 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) 330 : m_contentType(determineContentType(mimeType)) 331 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) 332 , m_source(DefaultEncoding) 333 , m_hintEncoding(0) 334 , m_checkedForBOM(false) 335 , m_checkedForCSSCharset(false) 336 , m_checkedForXMLCharset(false) 337 , m_checkedForMetaCharset(false) 338 , m_useLenientXMLDecoding(false) 339 , m_sawError(false) 340 , m_usesEncodingDetector(usesEncodingDetector) 341 { 342 } 343 344 TextResourceDecoder::~TextResourceDecoder() 345 { 346 } 347 348 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source) 349 { 350 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 351 if (!encoding.isValid()) 352 return; 353 354 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR), 355 // treat x-user-defined as windows-1252 (bug 18270) 356 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0) 357 m_encoding = "windows-1252"; 358 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset) 359 m_encoding = encoding.closestByteBasedEquivalent(); 360 else 361 m_encoding = encoding; 362 363 m_codec.clear(); 364 m_source = source; 365 } 366 367 // Returns the position of the encoding string. 368 static int findXMLEncoding(const char* str, int len, int& encodingLength) 369 { 370 int pos = find(str, len, "encoding"); 371 if (pos == -1) 372 return -1; 373 pos += 8; 374 375 // Skip spaces and stray control characters. 376 while (pos < len && str[pos] <= ' ') 377 ++pos; 378 379 // Skip equals sign. 380 if (pos >= len || str[pos] != '=') 381 return -1; 382 ++pos; 383 384 // Skip spaces and stray control characters. 385 while (pos < len && str[pos] <= ' ') 386 ++pos; 387 388 // Skip quotation mark. 389 if (pos >= len) 390 return - 1; 391 char quoteMark = str[pos]; 392 if (quoteMark != '"' && quoteMark != '\'') 393 return -1; 394 ++pos; 395 396 // Find the trailing quotation mark. 397 int end = pos; 398 while (end < len && str[end] != quoteMark) 399 ++end; 400 if (end >= len) 401 return -1; 402 403 encodingLength = end - pos; 404 return pos; 405 } 406 407 // true if there is more to parse 408 static inline bool skipWhitespace(const char*& pos, const char* dataEnd) 409 { 410 while (pos < dataEnd && (*pos == '\t' || *pos == ' ')) 411 ++pos; 412 return pos != dataEnd; 413 } 414 415 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len) 416 { 417 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 418 // We let it override even a user-chosen encoding. 419 ASSERT(!m_checkedForBOM); 420 421 size_t lengthOfBOM = 0; 422 423 size_t bufferLength = m_buffer.size(); 424 425 size_t buf1Len = bufferLength; 426 size_t buf2Len = len; 427 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); 428 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 429 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 430 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 431 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 432 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 433 434 // Check for the BOM. 435 if (c1 == 0xFF && c2 == 0xFE) { 436 if (c3 != 0 || c4 != 0) { 437 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 438 lengthOfBOM = 2; 439 } else { 440 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 441 lengthOfBOM = 4; 442 } 443 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 444 setEncoding(UTF8Encoding(), AutoDetectedEncoding); 445 lengthOfBOM = 3; 446 } else if (c1 == 0xFE && c2 == 0xFF) { 447 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 448 lengthOfBOM = 2; 449 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { 450 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 451 lengthOfBOM = 4; 452 } 453 454 if (lengthOfBOM || bufferLength + len >= 4) 455 m_checkedForBOM = true; 456 457 return lengthOfBOM; 458 } 459 460 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer) 461 { 462 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { 463 m_checkedForCSSCharset = true; 464 return true; 465 } 466 467 size_t oldSize = m_buffer.size(); 468 m_buffer.grow(oldSize + len); 469 memcpy(m_buffer.data() + oldSize, data, len); 470 471 movedDataToBuffer = true; 472 473 if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13 474 return false; 475 476 const char* dataStart = m_buffer.data(); 477 const char* dataEnd = dataStart + m_buffer.size(); 478 479 if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) { 480 dataStart += 10; 481 const char* pos = dataStart; 482 483 while (pos < dataEnd && *pos != '"') 484 ++pos; 485 if (pos == dataEnd) 486 return false; 487 488 int encodingNameLength = pos - dataStart; 489 490 ++pos; 491 492 if (*pos == ';') 493 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset); 494 } 495 496 m_checkedForCSSCharset = true; 497 return true; 498 } 499 500 bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer) 501 { 502 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { 503 m_checkedForXMLCharset = true; 504 return true; 505 } 506 507 // This is not completely efficient, since the function might go 508 // through the HTML head several times. 509 510 size_t oldSize = m_buffer.size(); 511 m_buffer.grow(oldSize + len); 512 memcpy(m_buffer.data() + oldSize, data, len); 513 514 movedDataToBuffer = true; 515 516 const char* ptr = m_buffer.data(); 517 const char* pEnd = ptr + m_buffer.size(); 518 519 // Is there enough data available to check for XML declaration? 520 if (m_buffer.size() < 8) 521 return false; 522 523 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents. 524 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case. 525 if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) { 526 const char* xmlDeclarationEnd = ptr; 527 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>') 528 ++xmlDeclarationEnd; 529 if (xmlDeclarationEnd == pEnd) 530 return false; 531 // No need for +1, because we have an extra "?" to lose at the end of XML declaration. 532 int len = 0; 533 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len); 534 if (pos != -1) 535 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader); 536 // continue looking for a charset - it may be specified in an HTTP-Equiv meta 537 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0)) 538 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 539 else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x')) 540 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 541 else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0)) 542 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 543 else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?')) 544 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 545 546 m_checkedForXMLCharset = true; 547 return true; 548 } 549 550 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length) 551 { 552 if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) { 553 m_checkedForMetaCharset = true; 554 return; 555 } 556 557 if (!m_charsetParser) 558 m_charsetParser = HTMLMetaCharsetParser::create(); 559 560 if (!m_charsetParser->checkForMetaCharset(data, length)) 561 return; 562 563 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); 564 m_charsetParser.clear(); 565 m_checkedForMetaCharset = true; 566 return; 567 } 568 569 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len) 570 { 571 switch (KanjiCode::judge(data, len)) { 572 case KanjiCode::JIS: 573 setEncoding("ISO-2022-JP", EncodingFromContentSniffing); 574 break; 575 case KanjiCode::EUC: 576 setEncoding("EUC-JP", EncodingFromContentSniffing); 577 break; 578 case KanjiCode::SJIS: 579 setEncoding("Shift_JIS", EncodingFromContentSniffing); 580 break; 581 case KanjiCode::ASCII: 582 case KanjiCode::UTF16: 583 case KanjiCode::UTF8: 584 break; 585 } 586 } 587 588 // We use the encoding detector in two cases: 589 // 1. Encoding detector is turned ON and no other encoding source is 590 // available (that is, it's DefaultEncoding). 591 // 2. Encoding detector is turned ON and the encoding is set to 592 // the encoding of the parent frame, which is also auto-detected. 593 // Note that condition #2 is NOT satisfied unless parent-child frame 594 // relationship is compliant to the same-origin policy. If they're from 595 // different domains, |m_source| would not be set to EncodingFromParentFrame 596 // in the first place. 597 bool TextResourceDecoder::shouldAutoDetect() const 598 { 599 // Just checking m_hintEncoding suffices here because it's only set 600 // in setHintEncoding when the source is AutoDetectedEncoding. 601 return m_usesEncodingDetector 602 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); 603 } 604 605 String TextResourceDecoder::decode(const char* data, size_t len) 606 { 607 size_t lengthOfBOM = 0; 608 if (!m_checkedForBOM) 609 lengthOfBOM = checkForBOM(data, len); 610 611 bool movedDataToBuffer = false; 612 613 if (m_contentType == CSS && !m_checkedForCSSCharset) 614 if (!checkForCSSCharset(data, len, movedDataToBuffer)) 615 return emptyString(); 616 617 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLCharset) 618 if (!checkForXMLCharset(data, len, movedDataToBuffer)) 619 return emptyString(); 620 621 // FIXME: It would be more efficient to move this logic below checkForMetaCharset because 622 // checkForMetaCharset can overrule these detections. 623 if (shouldAutoDetect()) { 624 if (m_encoding.isJapanese()) 625 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages. 626 else { 627 WTF::TextEncoding detectedEncoding; 628 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) 629 setEncoding(detectedEncoding, EncodingFromContentSniffing); 630 } 631 } 632 633 ASSERT(m_encoding.isValid()); 634 635 const char* dataForDecode = data + lengthOfBOM; 636 size_t lengthForDecode = len - lengthOfBOM; 637 638 if (!m_buffer.isEmpty()) { 639 if (!movedDataToBuffer) { 640 size_t oldSize = m_buffer.size(); 641 m_buffer.grow(oldSize + len); 642 memcpy(m_buffer.data() + oldSize, data, len); 643 } 644 645 dataForDecode = m_buffer.data() + lengthOfBOM; 646 lengthForDecode = m_buffer.size() - lengthOfBOM; 647 } 648 649 if (m_contentType == HTML && !m_checkedForMetaCharset) 650 checkForMetaCharset(dataForDecode, lengthForDecode); 651 652 if (!m_codec) 653 m_codec = newTextCodec(m_encoding); 654 655 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); 656 657 m_buffer.clear(); 658 return result; 659 } 660 661 String TextResourceDecoder::flush() 662 { 663 // If we can not identify the encoding even after a document is completely 664 // loaded, we need to detect the encoding if other conditions for 665 // autodetection is satisfied. 666 if (m_buffer.size() && shouldAutoDetect() 667 && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) { 668 WTF::TextEncoding detectedEncoding; 669 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding)) 670 setEncoding(detectedEncoding, EncodingFromContentSniffing); 671 } 672 673 if (!m_codec) 674 m_codec = newTextCodec(m_encoding); 675 676 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); 677 m_buffer.clear(); 678 m_codec.clear(); 679 m_checkedForBOM = false; // Skip BOM again when re-decoding. 680 return result; 681 } 682 683 } 684