Home | History | Annotate | Download | only in loader
      1 /*
      2     Copyright (C) 1999 Lars Knoll (knoll (at) mpi-hd.mpg.de)
      3     Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
      4     Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap (at) nypop.com)
      5 
      6     This library is free software; you can redistribute it and/or
      7     modify it under the terms of the GNU Library General Public
      8     License as published by the Free Software Foundation; either
      9     version 2 of the License, or (at your option) any later version.
     10 
     11     This library is distributed in the hope that it will be useful,
     12     but WITHOUT ANY WARRANTY; without even the implied warranty of
     13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14     Library General Public License for more details.
     15 
     16     You should have received a copy of the GNU Library General Public License
     17     along with this library; see the file COPYING.LIB.  If not, write to
     18     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     19     Boston, MA 02110-1301, USA.
     20 */
     21 
     22 
     23 #include "config.h"
     24 #include "core/loader/TextResourceDecoder.h"
     25 
     26 #include "HTMLNames.h"
     27 #include "core/dom/DOMImplementation.h"
     28 #include "core/html/parser/HTMLMetaCharsetParser.h"
     29 #include "core/platform/text/TextEncodingDetector.h"
     30 #include "wtf/StringExtras.h"
     31 #include "wtf/text/TextCodec.h"
     32 #include "wtf/text/TextEncoding.h"
     33 #include "wtf/text/TextEncodingRegistry.h"
     34 
     35 using namespace WTF;
     36 
     37 namespace WebCore {
     38 
     39 using namespace HTMLNames;
     40 
     41 static inline bool bytesEqual(const char* p, char b0, char b1)
     42 {
     43     return p[0] == b0 && p[1] == b1;
     44 }
     45 
     46 static inline bool bytesEqual(const char* p, char b0, char b1, char b2)
     47 {
     48     return p[0] == b0 && p[1] == b1 && p[2] == b2;
     49 }
     50 
     51 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
     52 {
     53     return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4;
     54 }
     55 
     56 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
     57 {
     58     return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5;
     59 }
     60 
     61 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
     62 {
     63     return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5 && p[6] == b6 && p[7] == b7;
     64 }
     65 
     66 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
     67 {
     68     return p[0] == b0 && p[1] == b1 && p[2] == b2 && p[3] == b3 && p[4] == b4 && p[5] == b5 && p[6] == b6 && p[7] == b7 && p[8] == b8 && p[9] == b9;
     69 }
     70 
     71 // You might think we should put these find functions elsewhere, perhaps with the
     72 // similar functions that operate on UChar, but arguably only the decoder has
     73 // a reason to process strings of char rather than UChar.
     74 
     75 static int find(const char* subject, size_t subjectLength, const char* target)
     76 {
     77     size_t targetLength = strlen(target);
     78     if (targetLength > subjectLength)
     79         return -1;
     80     for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
     81         bool match = true;
     82         for (size_t j = 0; j < targetLength; ++j) {
     83             if (subject[i + j] != target[j]) {
     84                 match = false;
     85                 break;
     86             }
     87         }
     88         if (match)
     89             return i;
     90     }
     91     return -1;
     92 }
     93 
     94 static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)
     95 {
     96     Vector<char, 64> buffer(length + 1);
     97     memcpy(buffer.data(), encodingName, length);
     98     buffer[length] = '\0';
     99     return buffer.data();
    100 }
    101 
    102 class KanjiCode {
    103 public:
    104     enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
    105     static enum Type judge(const char* str, int length);
    106     static const int ESC = 0x1b;
    107     static const unsigned char sjisMap[256];
    108     static int ISkanji(int code)
    109     {
    110         if (code >= 0x100)
    111             return 0;
    112         return sjisMap[code & 0xff] & 1;
    113     }
    114     static int ISkana(int code)
    115     {
    116         if (code >= 0x100)
    117             return 0;
    118         return sjisMap[code & 0xff] & 2;
    119     }
    120 };
    121 
    122 const unsigned char KanjiCode::sjisMap[256] = {
    123     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    124     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    125     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    126     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    127     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    128     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    129     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    130     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    131     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    132     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    133     0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    134     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    135     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    136     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    137     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    138     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
    139 };
    140 
    141 /*
    142  * EUC-JP is
    143  *     [0xa1 - 0xfe][0xa1 - 0xfe]
    144  *     0x8e[0xa1 - 0xfe](SS2)
    145  *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
    146  *
    147  * Shift_Jis is
    148  *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
    149  *
    150  * Shift_Jis Hankaku Kana is
    151  *     [0xa1 - 0xdf]
    152  */
    153 
    154 /*
    155  * KanjiCode::judge() is based on judge_jcode() from jvim
    156  *     http://hp.vector.co.jp/authors/VA003457/vim/
    157  *
    158  * Special Thanks to Kenichi Tsuchida
    159  */
    160 
    161 enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
    162 {
    163     enum Type code;
    164     int i;
    165     int bfr = false;            /* Kana Moji */
    166     int bfk = 0;                /* EUC Kana */
    167     int sjis = 0;
    168     int euc = 0;
    169 
    170     const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
    171 
    172     code = ASCII;
    173 
    174     i = 0;
    175     while (i < size) {
    176         if (ptr[i] == ESC && (size - i >= 3)) {
    177             if (bytesEqual(str + i + 1, '$', 'B')
    178                     || bytesEqual(str + i + 1, '(', 'B')
    179                     || bytesEqual(str + i + 1, '$', '@')
    180                     || bytesEqual(str + i + 1, '(', 'J')) {
    181                 code = JIS;
    182                 goto breakBreak;
    183             }
    184             if (bytesEqual(str + i + 1, '(', 'I') || bytesEqual(str + i + 1, ')', 'I')) {
    185                 code = JIS;
    186                 i += 3;
    187             } else {
    188                 i++;
    189             }
    190             bfr = false;
    191             bfk = 0;
    192         } else {
    193             if (ptr[i] < 0x20) {
    194                 bfr = false;
    195                 bfk = 0;
    196                 /* ?? check kudokuten ?? && ?? hiragana ?? */
    197                 if ((i >= 2) && (ptr[i - 2] == 0x81)
    198                         && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
    199                     code = SJIS;
    200                     sjis += 100;        /* kudokuten */
    201                 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
    202                         && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
    203                     code = EUC;
    204                     euc += 100;         /* kudokuten */
    205                 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
    206                     sjis += 40;         /* hiragana */
    207                 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
    208                     euc += 40;          /* hiragana */
    209                 }
    210             } else {
    211                 /* ?? check hiragana or katana ?? */
    212                 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
    213                     sjis++;     /* hiragana */
    214                 } else if ((size - i > 1) && (ptr[i] == 0x83)
    215                          && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
    216                     sjis++;     /* katakana */
    217                 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
    218                     euc++;      /* hiragana */
    219                 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
    220                     euc++;      /* katakana */
    221                 }
    222                 if (bfr) {
    223                     if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
    224                         code = SJIS;
    225                         goto breakBreak;
    226                     } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
    227                         code = SJIS;
    228                         goto breakBreak;
    229                     } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
    230                         code = EUC;
    231                         goto breakBreak;
    232                     } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
    233                         code = EUC;
    234                         goto breakBreak;
    235                     } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
    236                         code = SJIS;
    237                         goto breakBreak;
    238                     } else if (ptr[i] <= 0x7f) {
    239                         code = SJIS;
    240                         goto breakBreak;
    241                     } else {
    242                         if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
    243                             euc++;      /* sjis hankaku kana kigo */
    244                         } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
    245                             ;           /* sjis hankaku kana */
    246                         } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
    247                             euc++;
    248                         } else if (0x8e == ptr[i]) {
    249                             euc++;
    250                         } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
    251                             sjis++;
    252                         }
    253                         bfr = false;
    254                         bfk = 0;
    255                     }
    256                 } else if (0x8e == ptr[i]) {
    257                     if (size - i <= 1) {
    258                         ;
    259                     } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
    260                         /* EUC KANA or SJIS KANJI */
    261                         if (bfk == 1) {
    262                             euc += 100;
    263                         }
    264                         bfk++;
    265                         i++;
    266                     } else {
    267                         /* SJIS only */
    268                         code = SJIS;
    269                         goto breakBreak;
    270                     }
    271                 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
    272                     /* SJIS only */
    273                     code = SJIS;
    274                     if ((size - i >= 1)
    275                             && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
    276                             || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
    277                         goto breakBreak;
    278                     }
    279                 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
    280                     /* EUC only */
    281                     code = EUC;
    282                     if ((size - i >= 1)
    283                             && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
    284                         goto breakBreak;
    285                     }
    286                 } else if (ptr[i] <= 0x7f) {
    287                     ;
    288                 } else {
    289                     bfr = true;
    290                     bfk = 0;
    291                 }
    292             }
    293             i++;
    294         }
    295     }
    296     if (code == ASCII) {
    297         if (sjis > euc) {
    298             code = SJIS;
    299         } else if (sjis < euc) {
    300             code = EUC;
    301         }
    302     }
    303 breakBreak:
    304     return (code);
    305 }
    306 
    307 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
    308 {
    309     if (equalIgnoringCase(mimeType, "text/css"))
    310         return CSS;
    311     if (equalIgnoringCase(mimeType, "text/html"))
    312         return HTML;
    313     if (DOMImplementation::isXMLMIMEType(mimeType))
    314         return XML;
    315     return PlainText;
    316 }
    317 
    318 const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const WTF::TextEncoding& specifiedDefaultEncoding)
    319 {
    320     // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
    321     // for text/xml. This matches Firefox.
    322     if (contentType == XML)
    323         return UTF8Encoding();
    324     if (!specifiedDefaultEncoding.isValid())
    325         return Latin1Encoding();
    326     return specifiedDefaultEncoding;
    327 }
    328 
// Creates a decoder for a resource of the given MIME type.
//
// The content type is derived from |mimeType| and the initial encoding from
// defaultEncoding(). The encoding source starts as DefaultEncoding, there is
// no hint encoding, and none of the BOM / CSS / XML / meta checks have run
// yet. |usesEncodingDetector| is stored and later consulted by
// shouldAutoDetect().
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    : m_contentType(determineContentType(mimeType))
    , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) // reads m_contentType, initialized just above
    , m_source(DefaultEncoding)
    , m_hintEncoding(0)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForXMLCharset(false)
    , m_checkedForMetaCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
    , m_usesEncodingDetector(usesEncodingDetector)
{
}
    343 
// Nothing to release explicitly here; the body is intentionally empty.
TextResourceDecoder::~TextResourceDecoder()
{
}
    347 
    348 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
    349 {
    350     // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
    351     if (!encoding.isValid())
    352         return;
    353 
    354     // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
    355     // treat x-user-defined as windows-1252 (bug 18270)
    356     if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
    357         m_encoding = "windows-1252";
    358     else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
    359         m_encoding = encoding.closestByteBasedEquivalent();
    360     else
    361         m_encoding = encoding;
    362 
    363     m_codec.clear();
    364     m_source = source;
    365 }
    366 
    367 // Returns the position of the encoding string.
    368 static int findXMLEncoding(const char* str, int len, int& encodingLength)
    369 {
    370     int pos = find(str, len, "encoding");
    371     if (pos == -1)
    372         return -1;
    373     pos += 8;
    374 
    375     // Skip spaces and stray control characters.
    376     while (pos < len && str[pos] <= ' ')
    377         ++pos;
    378 
    379     // Skip equals sign.
    380     if (pos >= len || str[pos] != '=')
    381         return -1;
    382     ++pos;
    383 
    384     // Skip spaces and stray control characters.
    385     while (pos < len && str[pos] <= ' ')
    386         ++pos;
    387 
    388     // Skip quotation mark.
    389     if (pos >= len)
    390         return - 1;
    391     char quoteMark = str[pos];
    392     if (quoteMark != '"' && quoteMark != '\'')
    393         return -1;
    394     ++pos;
    395 
    396     // Find the trailing quotation mark.
    397     int end = pos;
    398     while (end < len && str[end] != quoteMark)
    399         ++end;
    400     if (end >= len)
    401         return -1;
    402 
    403     encodingLength = end - pos;
    404     return pos;
    405 }
    406 
    407 // true if there is more to parse
    408 static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
    409 {
    410     while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
    411         ++pos;
    412     return pos != dataEnd;
    413 }
    414 
    415 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
    416 {
    417     // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
    418     // We let it override even a user-chosen encoding.
    419     ASSERT(!m_checkedForBOM);
    420 
    421     size_t lengthOfBOM = 0;
    422 
    423     size_t bufferLength = m_buffer.size();
    424 
    425     size_t buf1Len = bufferLength;
    426     size_t buf2Len = len;
    427     const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
    428     const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
    429     unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    430     unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    431     unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    432     unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
    433 
    434     // Check for the BOM.
    435     if (c1 == 0xFF && c2 == 0xFE) {
    436         if (c3 != 0 || c4 != 0) {
    437             setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
    438             lengthOfBOM = 2;
    439         } else {
    440             setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
    441             lengthOfBOM = 4;
    442         }
    443     } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
    444         setEncoding(UTF8Encoding(), AutoDetectedEncoding);
    445         lengthOfBOM = 3;
    446     } else if (c1 == 0xFE && c2 == 0xFF) {
    447         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
    448         lengthOfBOM = 2;
    449     } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
    450         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
    451         lengthOfBOM = 4;
    452     }
    453 
    454     if (lengthOfBOM || bufferLength + len >= 4)
    455         m_checkedForBOM = true;
    456 
    457     return lengthOfBOM;
    458 }
    459 
    460 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
    461 {
    462     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
    463         m_checkedForCSSCharset = true;
    464         return true;
    465     }
    466 
    467     size_t oldSize = m_buffer.size();
    468     m_buffer.grow(oldSize + len);
    469     memcpy(m_buffer.data() + oldSize, data, len);
    470 
    471     movedDataToBuffer = true;
    472 
    473     if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13
    474         return false;
    475 
    476     const char* dataStart = m_buffer.data();
    477     const char* dataEnd = dataStart + m_buffer.size();
    478 
    479     if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {
    480         dataStart += 10;
    481         const char* pos = dataStart;
    482 
    483         while (pos < dataEnd && *pos != '"')
    484             ++pos;
    485         if (pos == dataEnd)
    486             return false;
    487 
    488         int encodingNameLength = pos - dataStart;
    489 
    490         ++pos;
    491 
    492         if (*pos == ';')
    493             setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
    494     }
    495 
    496     m_checkedForCSSCharset = true;
    497     return true;
    498 }
    499 
    500 bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)
    501 {
    502     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
    503         m_checkedForXMLCharset = true;
    504         return true;
    505     }
    506 
    507     // This is not completely efficient, since the function might go
    508     // through the HTML head several times.
    509 
    510     size_t oldSize = m_buffer.size();
    511     m_buffer.grow(oldSize + len);
    512     memcpy(m_buffer.data() + oldSize, data, len);
    513 
    514     movedDataToBuffer = true;
    515 
    516     const char* ptr = m_buffer.data();
    517     const char* pEnd = ptr + m_buffer.size();
    518 
    519     // Is there enough data available to check for XML declaration?
    520     if (m_buffer.size() < 8)
    521         return false;
    522 
    523     // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
    524     // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
    525     if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {
    526         const char* xmlDeclarationEnd = ptr;
    527         while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
    528             ++xmlDeclarationEnd;
    529         if (xmlDeclarationEnd == pEnd)
    530             return false;
    531         // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
    532         int len = 0;
    533         int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
    534         if (pos != -1)
    535             setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
    536         // continue looking for a charset - it may be specified in an HTTP-Equiv meta
    537     } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0))
    538         setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
    539     else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x'))
    540         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
    541     else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0))
    542         setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
    543     else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?'))
    544         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
    545 
    546     m_checkedForXMLCharset = true;
    547     return true;
    548 }
    549 
    550 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
    551 {
    552     if (m_source == UserChosenEncoding || m_source == EncodingFromHTTPHeader || m_source == AutoDetectedEncoding) {
    553         m_checkedForMetaCharset = true;
    554         return;
    555     }
    556 
    557     if (!m_charsetParser)
    558         m_charsetParser = HTMLMetaCharsetParser::create();
    559 
    560     if (!m_charsetParser->checkForMetaCharset(data, length))
    561         return;
    562 
    563     setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
    564     m_charsetParser.clear();
    565     m_checkedForMetaCharset = true;
    566     return;
    567 }
    568 
    569 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
    570 {
    571     switch (KanjiCode::judge(data, len)) {
    572         case KanjiCode::JIS:
    573             setEncoding("ISO-2022-JP", EncodingFromContentSniffing);
    574             break;
    575         case KanjiCode::EUC:
    576             setEncoding("EUC-JP", EncodingFromContentSniffing);
    577             break;
    578         case KanjiCode::SJIS:
    579             setEncoding("Shift_JIS", EncodingFromContentSniffing);
    580             break;
    581         case KanjiCode::ASCII:
    582         case KanjiCode::UTF16:
    583         case KanjiCode::UTF8:
    584             break;
    585     }
    586 }
    587 
    588 // We use the encoding detector in two cases:
    589 //   1. Encoding detector is turned ON and no other encoding source is
    590 //      available (that is, it's DefaultEncoding).
    591 //   2. Encoding detector is turned ON and the encoding is set to
    592 //      the encoding of the parent frame, which is also auto-detected.
    593 //   Note that condition #2 is NOT satisfied unless parent-child frame
    594 //   relationship is compliant to the same-origin policy. If they're from
    595 //   different domains, |m_source| would not be set to EncodingFromParentFrame
    596 //   in the first place.
    597 bool TextResourceDecoder::shouldAutoDetect() const
    598 {
    599     // Just checking m_hintEncoding suffices here because it's only set
    600     // in setHintEncoding when the source is AutoDetectedEncoding.
    601     return m_usesEncodingDetector
    602         && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
    603 }
    604 
    605 String TextResourceDecoder::decode(const char* data, size_t len)
    606 {
    607     size_t lengthOfBOM = 0;
    608     if (!m_checkedForBOM)
    609         lengthOfBOM = checkForBOM(data, len);
    610 
    611     bool movedDataToBuffer = false;
    612 
    613     if (m_contentType == CSS && !m_checkedForCSSCharset)
    614         if (!checkForCSSCharset(data, len, movedDataToBuffer))
    615             return emptyString();
    616 
    617     if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForXMLCharset)
    618         if (!checkForXMLCharset(data, len, movedDataToBuffer))
    619             return emptyString();
    620 
    621     // FIXME: It would be more efficient to move this logic below checkForMetaCharset because
    622     //        checkForMetaCharset can overrule these detections.
    623     if (shouldAutoDetect()) {
    624         if (m_encoding.isJapanese())
    625             detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
    626         else {
    627             WTF::TextEncoding detectedEncoding;
    628             if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
    629                 setEncoding(detectedEncoding, EncodingFromContentSniffing);
    630         }
    631     }
    632 
    633     ASSERT(m_encoding.isValid());
    634 
    635     const char* dataForDecode = data + lengthOfBOM;
    636     size_t lengthForDecode = len - lengthOfBOM;
    637 
    638     if (!m_buffer.isEmpty()) {
    639         if (!movedDataToBuffer) {
    640             size_t oldSize = m_buffer.size();
    641             m_buffer.grow(oldSize + len);
    642             memcpy(m_buffer.data() + oldSize, data, len);
    643         }
    644 
    645         dataForDecode = m_buffer.data() + lengthOfBOM;
    646         lengthForDecode = m_buffer.size() - lengthOfBOM;
    647     }
    648 
    649     if (m_contentType == HTML && !m_checkedForMetaCharset)
    650         checkForMetaCharset(dataForDecode, lengthForDecode);
    651 
    652     if (!m_codec)
    653         m_codec = newTextCodec(m_encoding);
    654 
    655     String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
    656 
    657     m_buffer.clear();
    658     return result;
    659 }
    660 
    661 String TextResourceDecoder::flush()
    662 {
    663    // If we can not identify the encoding even after a document is completely
    664    // loaded, we need to detect the encoding if other conditions for
    665    // autodetection is satisfied.
    666     if (m_buffer.size() && shouldAutoDetect()
    667         && ((!m_checkedForXMLCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
    668         WTF::TextEncoding detectedEncoding;
    669         if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
    670             setEncoding(detectedEncoding, EncodingFromContentSniffing);
    671     }
    672 
    673     if (!m_codec)
    674         m_codec = newTextCodec(m_encoding);
    675 
    676     String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
    677     m_buffer.clear();
    678     m_codec.clear();
    679     m_checkedForBOM = false; // Skip BOM again when re-decoding.
    680     return result;
    681 }
    682 
    683 }
    684