1 /* 2 Copyright (C) 1999 Lars Knoll (knoll (at) mpi-hd.mpg.de) 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap (at) nypop.com) 5 6 This library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Library General Public 8 License as published by the Free Software Foundation; either 9 version 2 of the License, or (at your option) any later version. 10 11 This library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Library General Public License for more details. 15 16 You should have received a copy of the GNU Library General Public License 17 along with this library; see the file COPYING.LIB. If not, write to 18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 Boston, MA 02110-1301, USA. 20 */ 21 22 23 #include "config.h" 24 #include "TextResourceDecoder.h" 25 26 #include "DOMImplementation.h" 27 #include "HTMLMetaCharsetParser.h" 28 #include "HTMLNames.h" 29 #include "TextCodec.h" 30 #include "TextEncoding.h" 31 #include "TextEncodingDetector.h" 32 #include "TextEncodingRegistry.h" 33 #include <wtf/ASCIICType.h> 34 #include <wtf/StringExtras.h> 35 36 using namespace WTF; 37 38 namespace WebCore { 39 40 using namespace HTMLNames; 41 42 // You might think we should put these find functions elsewhere, perhaps with the 43 // similar functions that operate on UChar, but arguably only the decoder has 44 // a reason to process strings of char rather than UChar. 45 46 static int find(const char* subject, size_t subjectLength, const char* target) 47 { 48 size_t targetLength = strlen(target); 49 if (targetLength > subjectLength) 50 return -1; 51 for (size_t i = 0; i <= subjectLength - targetLength; ++i) { 52 bool match = true; 53 for (size_t j = 0; j < targetLength; ++j) { 54 if (subject[i + j] != target[j]) { 55 match = false; 56 break; 57 } 58 } 59 if (match) 60 return i; 61 } 62 return -1; 63 } 64 65 static TextEncoding findTextEncoding(const char* encodingName, int length) 66 { 67 Vector<char, 64> buffer(length + 1); 68 memcpy(buffer.data(), encodingName, length); 69 buffer[length] = '\0'; 70 return buffer.data(); 71 } 72 73 class KanjiCode { 74 public: 75 enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 }; 76 static enum Type judge(const char* str, int length); 77 static const int ESC = 0x1b; 78 static const unsigned char sjisMap[256]; 79 static int ISkanji(int code) 80 { 81 if (code >= 0x100) 82 return 0; 83 return sjisMap[code & 0xff] & 1; 84 } 85 static int ISkana(int code) 86 { 87 if (code >= 0x100) 88 return 0; 89 return sjisMap[code & 0xff] & 2; 90 } 91 }; 92 93 const unsigned char KanjiCode::sjisMap[256] = { 94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 104 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 105 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 106 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 107 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 110 }; 111 112 /* 113 * EUC-JP is 114 * [0xa1 - 0xfe][0xa1 - 0xfe] 115 * 0x8e[0xa1 - 0xfe](SS2) 116 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3) 117 * 118 * Shift_Jis is 119 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc] 120 * 121 * Shift_Jis Hankaku Kana is 122 * [0xa1 - 0xdf] 123 */ 124 125 /* 126 * KanjiCode::judge() is based on judge_jcode() from jvim 127 * http://hp.vector.co.jp/authors/VA003457/vim/ 128 * 129 * Special Thanks to Kenichi Tsuchida 130 */ 131 132 enum KanjiCode::Type KanjiCode::judge(const char* str, int size) 133 { 134 enum Type code; 135 int i; 136 int bfr = false; /* Kana Moji */ 137 int bfk = 0; /* EUC Kana */ 138 int sjis = 0; 139 int euc = 0; 140 141 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str); 142 143 code = ASCII; 144 145 i = 0; 146 while (i < size) { 147 if (ptr[i] == ESC && (size - i >= 3)) { 148 if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B') 149 || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) { 150 code = JIS; 151 goto breakBreak; 152 } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@') 153 || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) { 154 code = JIS; 155 goto breakBreak; 156 } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') { 157 code = JIS; 158 i += 3; 159 } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') { 160 code = JIS; 161 i += 3; 162 } else { 163 i++; 164 } 165 bfr = false; 166 bfk = 0; 167 } else { 168 if (ptr[i] < 0x20) { 169 bfr = false; 170 bfk = 0; 171 /* ?? check kudokuten ?? && ?? hiragana ?? */ 172 if ((i >= 2) && (ptr[i - 2] == 0x81) 173 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) { 174 code = SJIS; 175 sjis += 100; /* kudokuten */ 176 } else if ((i >= 2) && (ptr[i - 2] == 0xa1) 177 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) { 178 code = EUC; 179 euc += 100; /* kudokuten */ 180 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) { 181 sjis += 40; /* hiragana */ 182 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) { 183 euc += 40; /* hiragana */ 184 } 185 } else { 186 /* ?? check hiragana or katana ?? */ 187 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) { 188 sjis++; /* hiragana */ 189 } else if ((size - i > 1) && (ptr[i] == 0x83) 190 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) { 191 sjis++; /* katakana */ 192 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) { 193 euc++; /* hiragana */ 194 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) { 195 euc++; /* katakana */ 196 } 197 if (bfr) { 198 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) { 199 code = SJIS; 200 goto breakBreak; 201 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) { 202 code = SJIS; 203 goto breakBreak; 204 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) { 205 code = EUC; 206 goto breakBreak; 207 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) { 208 code = EUC; 209 goto breakBreak; 210 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) { 211 code = SJIS; 212 goto breakBreak; 213 } else if (ptr[i] <= 0x7f) { 214 code = SJIS; 215 goto breakBreak; 216 } else { 217 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) { 218 euc++; /* sjis hankaku kana kigo */ 219 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) { 220 ; /* sjis hankaku kana */ 221 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) { 222 euc++; 223 } else if (0x8e == ptr[i]) { 224 euc++; 225 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) { 226 sjis++; 227 } 228 bfr = false; 229 bfk = 0; 230 } 231 } else if (0x8e == ptr[i]) { 232 if (size - i <= 1) { 233 ; 234 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) { 235 /* EUC KANA or SJIS KANJI */ 236 if (bfk == 1) { 237 euc += 100; 238 } 239 bfk++; 240 i++; 241 } else { 242 /* SJIS only */ 243 code = SJIS; 244 goto breakBreak; 245 } 246 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) { 247 /* SJIS only */ 248 code = SJIS; 249 if ((size - i >= 1) 250 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) 251 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) { 252 goto breakBreak; 253 } 254 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) { 255 /* EUC only */ 256 code = EUC; 257 if ((size - i >= 1) 258 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) { 259 goto breakBreak; 260 } 261 } else if (ptr[i] <= 0x7f) { 262 ; 263 } else { 264 bfr = true; 265 bfk = 0; 266 } 267 } 268 i++; 269 } 270 } 271 if (code == ASCII) { 272 if (sjis > euc) { 273 code = SJIS; 274 } else if (sjis < euc) { 275 code = EUC; 276 } 277 } 278 breakBreak: 279 return (code); 280 } 281 282 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType) 283 { 284 if (equalIgnoringCase(mimeType, "text/css")) 285 return CSS; 286 if (equalIgnoringCase(mimeType, "text/html")) 287 return HTML; 288 if (DOMImplementation::isXMLMIMEType(mimeType)) 289 return XML; 290 return PlainText; 291 } 292 293 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding) 294 { 295 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII 296 // for text/xml. This matches Firefox. 297 if (contentType == XML) 298 return UTF8Encoding(); 299 if (!specifiedDefaultEncoding.isValid()) 300 return Latin1Encoding(); 301 return specifiedDefaultEncoding; 302 } 303 304 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) 305 : m_contentType(determineContentType(mimeType)) 306 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) 307 , m_source(DefaultEncoding) 308 , m_hintEncoding(0) 309 , m_checkedForBOM(false) 310 , m_checkedForCSSCharset(false) 311 , m_checkedForHeadCharset(false) 312 , m_useLenientXMLDecoding(false) 313 , m_sawError(false) 314 , m_usesEncodingDetector(usesEncodingDetector) 315 { 316 } 317 318 TextResourceDecoder::~TextResourceDecoder() 319 { 320 } 321 322 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source) 323 { 324 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 325 if (!encoding.isValid()) 326 return; 327 328 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR), 329 // treat x-user-defined as windows-1252 (bug 18270) 330 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0) 331 m_encoding = "windows-1252"; 332 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset) 333 m_encoding = encoding.closestByteBasedEquivalent(); 334 else 335 m_encoding = encoding; 336 337 m_codec.clear(); 338 m_source = source; 339 } 340 341 // Returns the position of the encoding string. 342 static int findXMLEncoding(const char* str, int len, int& encodingLength) 343 { 344 int pos = find(str, len, "encoding"); 345 if (pos == -1) 346 return -1; 347 pos += 8; 348 349 // Skip spaces and stray control characters. 350 while (pos < len && str[pos] <= ' ') 351 ++pos; 352 353 // Skip equals sign. 354 if (pos >= len || str[pos] != '=') 355 return -1; 356 ++pos; 357 358 // Skip spaces and stray control characters. 359 while (pos < len && str[pos] <= ' ') 360 ++pos; 361 362 // Skip quotation mark. 363 if (pos >= len) 364 return - 1; 365 char quoteMark = str[pos]; 366 if (quoteMark != '"' && quoteMark != '\'') 367 return -1; 368 ++pos; 369 370 // Find the trailing quotation mark. 371 int end = pos; 372 while (end < len && str[end] != quoteMark) 373 ++end; 374 if (end >= len) 375 return -1; 376 377 encodingLength = end - pos; 378 return pos; 379 } 380 381 // true if there is more to parse 382 static inline bool skipWhitespace(const char*& pos, const char* dataEnd) 383 { 384 while (pos < dataEnd && (*pos == '\t' || *pos == ' ')) 385 ++pos; 386 return pos != dataEnd; 387 } 388 389 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len) 390 { 391 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 392 // We let it override even a user-chosen encoding. 393 ASSERT(!m_checkedForBOM); 394 395 size_t lengthOfBOM = 0; 396 397 size_t bufferLength = m_buffer.size(); 398 399 size_t buf1Len = bufferLength; 400 size_t buf2Len = len; 401 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); 402 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 403 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 404 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 405 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 406 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 407 408 // Check for the BOM. 409 if (c1 == 0xFF && c2 == 0xFE) { 410 if (c3 != 0 || c4 != 0) { 411 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 412 lengthOfBOM = 2; 413 } else { 414 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 415 lengthOfBOM = 4; 416 } 417 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 418 setEncoding(UTF8Encoding(), AutoDetectedEncoding); 419 lengthOfBOM = 3; 420 } else if (c1 == 0xFE && c2 == 0xFF) { 421 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 422 lengthOfBOM = 2; 423 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { 424 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 425 lengthOfBOM = 4; 426 } 427 428 if (lengthOfBOM || bufferLength + len >= 4) 429 m_checkedForBOM = true; 430 431 return lengthOfBOM; 432 } 433 434 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer) 435 { 436 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { 437 m_checkedForCSSCharset = true; 438 return true; 439 } 440 441 size_t oldSize = m_buffer.size(); 442 m_buffer.grow(oldSize + len); 443 memcpy(m_buffer.data() + oldSize, data, len); 444 445 movedDataToBuffer = true; 446 447 if (m_buffer.size() > 8) { // strlen("@charset") == 8 448 const char* dataStart = m_buffer.data(); 449 const char* dataEnd = dataStart + m_buffer.size(); 450 451 if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' && 452 dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') { 453 454 dataStart += 8; 455 const char* pos = dataStart; 456 if (!skipWhitespace(pos, dataEnd)) 457 return false; 458 459 if (*pos == '"' || *pos == '\'') { 460 char quotationMark = *pos; 461 ++pos; 462 dataStart = pos; 463 464 while (pos < dataEnd && *pos != quotationMark) 465 ++pos; 466 if (pos == dataEnd) 467 return false; 468 469 int encodingNameLength = pos - dataStart; 470 471 ++pos; 472 if (!skipWhitespace(pos, dataEnd)) 473 return false; 474 475 if (*pos == ';') 476 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset); 477 } 478 } 479 m_checkedForCSSCharset = true; 480 return true; 481 } 482 return false; 483 } 484 485 // Other browsers allow comments in the head section, so we need to also. 486 // It's important not to look for tags inside the comments. 487 static inline void skipComment(const char*& ptr, const char* pEnd) 488 { 489 const char* p = ptr; 490 if (p == pEnd) 491 return; 492 // Allow <!-->; other browsers do. 493 if (*p == '>') { 494 p++; 495 } else { 496 while (p + 2 < pEnd) { 497 if (*p == '-') { 498 // This is the real end of comment, "-->". 499 if (p[1] == '-' && p[2] == '>') { 500 p += 3; 501 break; 502 } 503 // This is the incorrect end of comment that other browsers allow, "--!>". 504 if (p + 3 < pEnd && p[1] == '-' && p[2] == '!' && p[3] == '>') { 505 p += 4; 506 break; 507 } 508 } 509 p++; 510 } 511 } 512 ptr = p; 513 } 514 515 bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer) 516 { 517 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { 518 m_checkedForHeadCharset = true; 519 return true; 520 } 521 522 // This is not completely efficient, since the function might go 523 // through the HTML head several times. 524 525 size_t oldSize = m_buffer.size(); 526 m_buffer.grow(oldSize + len); 527 memcpy(m_buffer.data() + oldSize, data, len); 528 529 movedDataToBuffer = true; 530 531 // Continue with checking for an HTML meta tag if we were already doing so. 532 if (m_charsetParser) 533 return checkForMetaCharset(data, len); 534 535 const char* ptr = m_buffer.data(); 536 const char* pEnd = ptr + m_buffer.size(); 537 538 // Is there enough data available to check for XML declaration? 539 if (m_buffer.size() < 8) 540 return false; 541 542 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents. 543 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case. 544 if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') { 545 const char* xmlDeclarationEnd = ptr; 546 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>') 547 ++xmlDeclarationEnd; 548 if (xmlDeclarationEnd == pEnd) 549 return false; 550 // No need for +1, because we have an extra "?" to lose at the end of XML declaration. 551 int len = 0; 552 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len); 553 if (pos != -1) 554 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader); 555 // continue looking for a charset - it may be specified in an HTTP-Equiv meta 556 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) { 557 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 558 return true; 559 } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') { 560 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 561 return true; 562 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) { 563 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 564 return true; 565 } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') { 566 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 567 return true; 568 } 569 570 // The HTTP-EQUIV meta has no effect on XHTML. 571 if (m_contentType == XML) 572 return true; 573 574 m_charsetParser = HTMLMetaCharsetParser::create(); 575 return checkForMetaCharset(data, len); 576 } 577 578 bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length) 579 { 580 if (!m_charsetParser->checkForMetaCharset(data, length)) 581 return false; 582 583 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); 584 m_charsetParser.clear(); 585 m_checkedForHeadCharset = true; 586 return true; 587 } 588 589 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len) 590 { 591 switch (KanjiCode::judge(data, len)) { 592 case KanjiCode::JIS: 593 setEncoding("ISO-2022-JP", AutoDetectedEncoding); 594 break; 595 case KanjiCode::EUC: 596 setEncoding("EUC-JP", AutoDetectedEncoding); 597 break; 598 case KanjiCode::SJIS: 599 setEncoding("Shift_JIS", AutoDetectedEncoding); 600 break; 601 case KanjiCode::ASCII: 602 case KanjiCode::UTF16: 603 case KanjiCode::UTF8: 604 break; 605 } 606 } 607 608 // We use the encoding detector in two cases: 609 // 1. Encoding detector is turned ON and no other encoding source is 610 // available (that is, it's DefaultEncoding). 611 // 2. Encoding detector is turned ON and the encoding is set to 612 // the encoding of the parent frame, which is also auto-detected. 613 // Note that condition #2 is NOT satisfied unless parent-child frame 614 // relationship is compliant to the same-origin policy. If they're from 615 // different domains, |m_source| would not be set to EncodingFromParentFrame 616 // in the first place. 617 bool TextResourceDecoder::shouldAutoDetect() const 618 { 619 // Just checking m_hintEncoding suffices here because it's only set 620 // in setHintEncoding when the source is AutoDetectedEncoding. 621 return m_usesEncodingDetector 622 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); 623 } 624 625 String TextResourceDecoder::decode(const char* data, size_t len) 626 { 627 size_t lengthOfBOM = 0; 628 if (!m_checkedForBOM) 629 lengthOfBOM = checkForBOM(data, len); 630 631 bool movedDataToBuffer = false; 632 633 if (m_contentType == CSS && !m_checkedForCSSCharset) 634 if (!checkForCSSCharset(data, len, movedDataToBuffer)) 635 return ""; 636 637 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML 638 if (!checkForHeadCharset(data, len, movedDataToBuffer)) 639 return ""; 640 641 // FIXME: It is wrong to change the encoding downstream after we have already done some decoding. 642 if (shouldAutoDetect()) { 643 if (m_encoding.isJapanese()) 644 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages. 645 else { 646 TextEncoding detectedEncoding; 647 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) 648 setEncoding(detectedEncoding, AutoDetectedEncoding); 649 } 650 } 651 652 ASSERT(m_encoding.isValid()); 653 654 if (!m_codec) 655 m_codec = newTextCodec(m_encoding); 656 657 if (m_buffer.isEmpty()) 658 return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError); 659 660 if (!movedDataToBuffer) { 661 size_t oldSize = m_buffer.size(); 662 m_buffer.grow(oldSize + len); 663 memcpy(m_buffer.data() + oldSize, data, len); 664 } 665 666 String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); 667 m_buffer.clear(); 668 return result; 669 } 670 671 String TextResourceDecoder::flush() 672 { 673 // If we can not identify the encoding even after a document is completely 674 // loaded, we need to detect the encoding if other conditions for 675 // autodetection is satisfied. 676 if (m_buffer.size() && shouldAutoDetect() 677 && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) { 678 TextEncoding detectedEncoding; 679 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), 680 m_hintEncoding, &detectedEncoding)) 681 setEncoding(detectedEncoding, AutoDetectedEncoding); 682 } 683 684 if (!m_codec) 685 m_codec = newTextCodec(m_encoding); 686 687 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); 688 m_buffer.clear(); 689 m_codec.clear(); 690 m_checkedForBOM = false; // Skip BOM again when re-decoding. 691 return result; 692 } 693 694 } 695